nvidia-gpu: add dram temperature sensor
This commit introduces a DRAM temperature sensor for the GPU, enhancing
the existing temperature monitoring capabilities.
Tested: Built an image for the gb200nvl-obmc machine with the following
patches cherry-picked. These patches are needed to enable the MCTP stack.
https://gerrit.openbmc.org/c/openbmc/openbmc/+/79422
```
$ curl -s -k -u 'root:0penBmc' https://10.137.203.137/redfish/v1/Chassis/NVIDIA_GB200_1/Sensors/temperature_NVIDIA_GB200_GPU_0_DRAM_0_TEMP_0
{
"@odata.id": "/redfish/v1/Chassis/NVIDIA_GB200_1/Sensors/temperature_NVIDIA_GB200_GPU_0_DRAM_0_TEMP_0",
"@odata.type": "#Sensor.v1_2_0.Sensor",
"Id": "temperature_NVIDIA_GB200_GPU_0_DRAM_0_TEMP_0",
"Name": "NVIDIA GB200 GPU 0 DRAM 0 TEMP 0",
"Reading": 30.0,
"ReadingRangeMax": 127.0,
"ReadingRangeMin": -128.0,
"ReadingType": "Temperature",
"ReadingUnits": "Cel",
"Status": {
"Health": "OK",
"State": "Enabled"
},
"Thresholds": {
"UpperCritical": {
"Reading": 95.0
}
}
}
```
Change-Id: I914bb94f85e2d4163397b71a08b4ddd8b171e7d7
Signed-off-by: Harshit Aghera <haghera@nvidia.com>
diff --git a/src/nvidia-gpu/NvidiaGpuDevice.cpp b/src/nvidia-gpu/NvidiaGpuDevice.cpp
index 2c14a1c..d25860d 100644
--- a/src/nvidia-gpu/NvidiaGpuDevice.cpp
+++ b/src/nvidia-gpu/NvidiaGpuDevice.cpp
@@ -59,6 +59,12 @@
mctpRequester,
std::bind_front(&GpuDevice::processTLimitThresholds, this));
+ dramTempSensor = std::make_shared<NvidiaGpuTempSensor>(
+ conn, mctpRequester, name + "_DRAM_0_TEMP_0", path, eid,
+ gpuDramTempSensorId, objectServer,
+ std::vector<thresholds::Threshold>{thresholds::Threshold{
+ thresholds::Level::CRITICAL, thresholds::Direction::HIGH, 95.0}});
+
powerSensor = std::make_shared<NvidiaGpuPowerSensor>(
conn, mctpRequester, name + "_Power_0", path, eid, gpuPowerSensorId,
objectServer, std::vector<thresholds::Threshold>{});
@@ -107,6 +113,7 @@
{
tLimitSensor->update();
}
+ dramTempSensor->update();
powerSensor->update();
energySensor->update();
voltageSensor->update();