nvidia-gpu: add thresholds support to TLimit This patch introduces support for retrieving GPU TLimit thresholds directly from the GPU device. TLimit Temperature represents the difference in degrees Celsius between the current GPU temperature and the initial throttle threshold. The patch also enables the extraction of three critical throttle thresholds — Warning Low, Critical Low, and Hard Shutdown Low — from the GPU hardware. Tested: Built an image for the gb200nvl-obmc machine with the following patches cherry-picked. These patches are needed to enable the MCTP stack. https://gerrit.openbmc.org/c/openbmc/openbmc/+/79422 ``` $ curl -s -k -u 'root:0penBmc' https://10.137.203.137/redfish/v1/Chassis/NVIDIA_GB200_1/Sensors/temperature_NVIDIA_GB200_GPU_0_TEMP_1 { "@odata.id": "/redfish/v1/Chassis/NVIDIA_GB200_1/Sensors/temperature_NVIDIA_GB200_GPU_0_TEMP_1", "@odata.type": "#Sensor.v1_2_0.Sensor", "Id": "temperature_NVIDIA_GB200_GPU_0_TEMP_1", "Name": "NVIDIA GB200 GPU 0 TEMP 1", "Reading": 57.3984375, "ReadingRangeMax": 127.0, "ReadingRangeMin": -128.0, "ReadingType": "Temperature", "ReadingUnits": "Cel", "Status": { "Health": "OK", "State": "Enabled" }, "Thresholds": { "LowerCaution": { "Reading": 0.0 }, "LowerCritical": { "Reading": 0.0 }, "LowerFatal": { "Reading": 0.0 } } }% ``` Change-Id: I6f2ff2652ce9246287f9bd63c4297d9ad3229963 Signed-off-by: Harshit Aghera <haghera@nvidia.com>

commit: 5e7deccd14dcac790028a6641291cc019c1c4e52 [log] [tgz]
author: Harshit Aghera <haghera@nvidia.com> Wed May 07 16:20:16 2025 +0530
committer: Ed Tanous <ed@tanous.net> Thu Jul 10 15:01:22 2025 +0000
tree: 84807211fb4ed2e7959c95f4c8c39b410e3bb3b3
parent: ba138dae62cfd96571372a3e22317dc57ab72c80 [diff] [blame]
diff --git a/src/nvidia-gpu/NvidiaGpuMctpVdm.cpp b/src/nvidia-gpu/NvidiaGpuMctpVdm.cpp
index 17f71e0..7a48b30 100644
--- a/src/nvidia-gpu/NvidiaGpuMctpVdm.cpp
+++ b/src/nvidia-gpu/NvidiaGpuMctpVdm.cpp

@@ -152,5 +152,70 @@
 
     return 0;
 }
+
+// Encode a READ_THERMAL_PARAMETERS request for sensorId into buf.
+// Returns 0 on success, EINVAL if buf is smaller than the request struct, or
+// the non-zero status reported by packHeader().
+int encodeReadThermalParametersRequest(uint8_t instanceId, uint8_t sensorId,
+                                       std::span<uint8_t> buf)
+{
+    namespace oam = ocp::accelerator_management;
+    if (sizeof(ReadThermalParametersRequest) > buf.size())
+    {
+        return EINVAL;
+    }
+
+    oam::BindingPciVidInfo bindingInfo{};
+    bindingInfo.msg_type =
+        static_cast<uint8_t>(MessageType::PLATFORM_ENVIRONMENTAL);
+    bindingInfo.instance_id = instanceId & oam::instanceIdBitMask;
+    bindingInfo.ocp_accelerator_management_msg_type =
+        static_cast<uint8_t>(oam::MessageType::REQUEST);
+
+    auto* req = reinterpret_cast<ReadThermalParametersRequest*>(buf.data());
+    if (const auto status = packHeader(bindingInfo, req->hdr.msgHdr.hdr);
+        status != 0)
+    {
+        return status;
+    }
+    req->hdr.data_size = sizeof(sensorId);
+    req->hdr.command = static_cast<uint8_t>(
+        PlatformEnvironmentalCommands::READ_THERMAL_PARAMETERS);
+    req->sensor_id = sensorId;
+    return 0;
+}
+
+// Decode a ReadThermalParametersResponse from buf into threshold.
+// If decodeReasonCodeAndCC() fails or cc is not SUCCESS, its status is
+// returned and threshold is left untouched. Returns EINVAL when buf is too
+// short or data_size is not sizeof(int32_t), and 0 once threshold is set.
+int decodeReadThermalParametersResponse(
+    std::span<const uint8_t> buf,
+    ocp::accelerator_management::CompletionCode& cc, uint16_t& reasonCode,
+    int32_t& threshold)
+{
+    namespace oam = ocp::accelerator_management;
+    const auto status = oam::decodeReasonCodeAndCC(buf, cc, reasonCode);
+    const bool succeeded =
+        (status == 0) && (cc == oam::CompletionCode::SUCCESS);
+    if (!succeeded)
+    {
+        return status;
+    }
+    if (sizeof(ReadThermalParametersResponse) > buf.size())
+    {
+        return EINVAL;
+    }
+
+    const auto* reply =
+        reinterpret_cast<const ReadThermalParametersResponse*>(buf.data());
+    if (const uint16_t payloadSize = le16toh(reply->hdr.data_size);
+        payloadSize != sizeof(int32_t))
+    {
+        return EINVAL;
+    }
+    threshold = le32toh(reply->threshold);
+    return 0;
+}
 // NOLINTEND(cppcoreguidelines-pro-type-reinterpret-cast)
 } // namespace gpu
commit	5e7deccd14dcac790028a6641291cc019c1c4e52	[log] [tgz]
author	Harshit Aghera <haghera@nvidia.com>	Wed May 07 16:20:16 2025 +0530
committer	Ed Tanous <ed@tanous.net>	Thu Jul 10 15:01:22 2025 +0000
tree	84807211fb4ed2e7959c95f4c8c39b410e3bb3b3
parent	ba138dae62cfd96571372a3e22317dc57ab72c80 [diff] [blame]