gpu: add TLimit sensor
This commit introduces a new thermal limit (TLimit) sensor for the GPU,
enhancing the existing temperature monitoring capabilities.
Tested.
The TEMP_0 update is disabled while testing this patch as it requires
MCTP request queueing, since OCP MCTP VDM specifies at most one
outstanding request to the device. The MCTP request queueing is being
introduced with this patch -
https://gerrit.openbmc.org/c/openbmc/dbus-sensors/+/80023
Build an image for the gb200nvl-obmc machine with the following patches
cherry-picked. These patches are needed to enable the MCTP stack.
https://gerrit.openbmc.org/c/openbmc/openbmc/+/79312
https://gerrit.openbmc.org/c/openbmc/openbmc/+/79410
https://gerrit.openbmc.org/c/openbmc/openbmc/+/79422
Copy the configuration file to the gb200nvl-obmc machine and restart the
entity-manager service.
```
root@gb200nvl-obmc:~# rm -rf /var/configuration/
root@gb200nvl-obmc:~# systemctl restart xyz.openbmc_project.EntityManager.service
```
Copy the gpusensor app and run it.
```
root@gb200nvl-obmc:~# ./gpusensor
```
```
$ curl -k -u 'root:0penBmc' https://10.137.203.137/redfish/v1/Chassis/NVIDIA_GB200_1/Sensors/temperature_NVIDIA_GB200_GPU_TEMP_1
{
"@odata.id": "/redfish/v1/Chassis/NVIDIA_GB200_1/Sensors/temperature_NVIDIA_GB200_GPU_TEMP_1",
"@odata.type": "#Sensor.v1_2_0.Sensor",
"Id": "temperature_NVIDIA_GB200_GPU_TEMP_1",
"Name": "NVIDIA GB200 GPU TEMP 1",
"Reading": 47.875,
"ReadingRangeMax": 127.0,
"ReadingRangeMin": -128.0,
"ReadingType": "Temperature",
"ReadingUnits": "Cel",
"Status": {
"Health": "OK",
"State": "Enabled"
}
}%
root@gb200nvl-obmc:~# busctl tree xyz.openbmc_project.GpuSensor
└─ /xyz
└─ /xyz/openbmc_project
└─ /xyz/openbmc_project/sensors
└─ /xyz/openbmc_project/sensors/temperature
├─ /xyz/openbmc_project/sensors/temperature/NVIDIA_GB200_GPU_TEMP_0
└─ /xyz/openbmc_project/sensors/temperature/NVIDIA_GB200_GPU_TEMP_1
root@gb200nvl-obmc:~# busctl introspect xyz.openbmc_project.GpuSensor /xyz/openbmc_project/sensors/temperature/NVIDIA_GB200_GPU_TEMP_1
NAME TYPE SIGNATURE RESULT/VALUE FLAGS
org.freedesktop.DBus.Introspectable interface - - -
.Introspect method - s -
org.freedesktop.DBus.Peer interface - - -
.GetMachineId method - s -
.Ping method - - -
org.freedesktop.DBus.Properties interface - - -
.Get method ss v -
.GetAll method s a{sv} -
.Set method ssv - -
.PropertiesChanged signal sa{sv}as - -
xyz.openbmc_project.Association.Definitions interface - - -
.Associations property a(sss) 1 "chassis" "all_sensors" "/xyz/openbmc… emits-change
xyz.openbmc_project.Sensor.Value interface - - -
.MaxValue property d 127 emits-change
.MinValue property d -128 emits-change
.Unit property s "xyz.openbmc_project.Sensor.Value.Unit.… emits-change
.Value property d 48 emits-change writable
xyz.openbmc_project.Sensor.ValueMutability interface - - -
.Mutable property b true emits-change
xyz.openbmc_project.State.Decorator.Availability interface - - -
.Available property b true emits-change writable
xyz.openbmc_project.State.Decorator.OperationalStatus interface - - -
.Functional property b true emits-change
```
Change-Id: Ib8e0ef93a4acbb8870671665b098fb61d0205cb2
Signed-off-by: Harshit Aghera <haghera@nvidia.com>
diff --git a/src/gpu/GpuDevice.cpp b/src/gpu/GpuDevice.cpp
index dccd730..11423dd 100644
--- a/src/gpu/GpuDevice.cpp
+++ b/src/gpu/GpuDevice.cpp
@@ -6,6 +6,7 @@
#include "GpuDevice.hpp"
#include "GpuSensor.hpp"
+#include "GpuTLimitSensor.hpp"
#include "Thresholds.hpp"
#include "Utils.hpp"
@@ -58,13 +59,17 @@
conn, mctpRequester, name + "_TEMP_0", path, eid, objectServer,
std::vector<thresholds::Threshold>{}));
- lg2::info("Added GPU Temperature Sensor {NAME} with chassis path: {PATH}.",
- "NAME", name, "PATH", path);
+ sensors.push_back(std::make_shared<GpuTLimitSensor>(
+ conn, mctpRequester, name + "_TEMP_1", path, eid, objectServer,
+ std::vector<thresholds::Threshold>{}));
+
+ lg2::info("Added GPU {NAME} Sensors with chassis path: {PATH}.", "NAME",
+ name, "PATH", path);
}
void GpuDevice::read()
{
- for ([[maybe_unused]] const auto& sensor : sensors)
+ for (const auto& sensor : sensors)
{
sensor->update();
}