nvidia-gpu: add TLimit sensor This commit introduces a new thermal limit (TLimit) sensor for the GPU, enhancing the existing temperature monitoring capabilities. Tested: Build an image for gb200nvl-obmc machine with the following patches cherry-picked. These patches are needed to enable the MCTP stack. https://gerrit.openbmc.org/c/openbmc/openbmc/+/79422 ``` $ curl -s -k -u 'root:0penBmc' https://10.137.203.137/redfish/v1/Chassis/NVIDIA_GB200_1/Sensors/temperature_NVIDIA_GB200_GPU_0_TEMP_1 { "@odata.id": "/redfish/v1/Chassis/NVIDIA_GB200_1/Sensors/temperature_NVIDIA_GB200_GPU_0_TEMP_1", "@odata.type": "#Sensor.v1_2_0.Sensor", "Id": "temperature_NVIDIA_GB200_GPU_0_TEMP_1", "Name": "NVIDIA GB200 GPU 0 TEMP 1", "Reading": 57.3984375, "ReadingRangeMax": 127.0, "ReadingRangeMin": -128.0, "ReadingType": "Temperature", "ReadingUnits": "Cel", "Status": { "Health": "OK", "State": "Enabled" } }% ``` Change-Id: Ib8e0ef93a4acbb8870671665b098fb61d0205cb2 Signed-off-by: Harshit Aghera <haghera@nvidia.com>

commit: ba138dae62cfd96571372a3e22317dc57ab72c80 [log] [tgz]
author: Harshit Aghera <haghera@nvidia.com> Mon May 05 12:26:35 2025 +0530
committer: Ed Tanous <ed@tanous.net> Thu Jul 10 15:01:22 2025 +0000
tree: 1b00cc22de6359dba7b387943d9f60fcbd681f1c
parent: 4ecdfaaaf39039bd6d73ee67aa44830672675a81 [diff] [blame]
diff --git a/src/nvidia-gpu/NvidiaGpuDevice.cpp b/src/nvidia-gpu/NvidiaGpuDevice.cpp
index cd39b56..ab476da 100644
--- a/src/nvidia-gpu/NvidiaGpuDevice.cpp
+++ b/src/nvidia-gpu/NvidiaGpuDevice.cpp

@@ -42,8 +42,12 @@
 void GpuDevice::makeSensors()
 {
     tempSensor = std::make_shared<NvidiaGpuTempSensor>(
-        conn, mctpRequester, name + "_TEMP_0", path, eid, objectServer,
-        std::vector<thresholds::Threshold>{});
+        conn, mctpRequester, name + "_TEMP_0", path, eid, gpuTempSensorId,
+        objectServer, std::vector<thresholds::Threshold>{});
+
+    tLimitSensor = std::make_shared<NvidiaGpuTempSensor>(
+        conn, mctpRequester, name + "_TEMP_1", path, eid, gpuTLimitSensorId,
+        objectServer, std::vector<thresholds::Threshold>{});
 
     lg2::info("Added GPU {NAME} Sensors with chassis path: {PATH}.", "NAME",
               name, "PATH", path);
@@ -54,6 +58,7 @@
 void GpuDevice::read()
 {
     tempSensor->update();
+    tLimitSensor->update();
 
     waitTimer.expires_after(std::chrono::milliseconds(sensorPollMs));
     waitTimer.async_wait([this](const boost::system::error_code& ec) {
commit	ba138dae62cfd96571372a3e22317dc57ab72c80	[log] [tgz]
author	Harshit Aghera <haghera@nvidia.com>	Mon May 05 12:26:35 2025 +0530
committer	Ed Tanous <ed@tanous.net>	Thu Jul 10 15:01:22 2025 +0000
tree	1b00cc22de6359dba7b387943d9f60fcbd681f1c
parent	4ecdfaaaf39039bd6d73ee67aa44830672675a81 [diff] [blame]