nvidia-gpu: add thresholds support to TLimit
This patch introduces support for retrieving GPU TLimit thresholds
directly from the GPU device. TLimit Temperature represents the
difference in degrees Celsius between the current GPU temperature and
the initial throttle threshold. The patch also enables the extraction of
three critical throttle thresholds — Warning Low, Critical Low, and Hard
Shutdown Low — from the GPU hardware.
Tested: Built an image for the gb200nvl-obmc machine with the following
patches cherry-picked; they are needed to enable the MCTP stack.
https://gerrit.openbmc.org/c/openbmc/openbmc/+/79422
```
$ curl -s -k -u 'root:0penBmc' https://10.137.203.137/redfish/v1/Chassis/NVIDIA_GB200_1/Sensors/temperature_NVIDIA_GB200_GPU_0_TEMP_1
{
  "@odata.id": "/redfish/v1/Chassis/NVIDIA_GB200_1/Sensors/temperature_NVIDIA_GB200_GPU_0_TEMP_1",
  "@odata.type": "#Sensor.v1_2_0.Sensor",
  "Id": "temperature_NVIDIA_GB200_GPU_0_TEMP_1",
  "Name": "NVIDIA GB200 GPU 0 TEMP 1",
  "Reading": 57.3984375,
  "ReadingRangeMax": 127.0,
  "ReadingRangeMin": -128.0,
  "ReadingType": "Temperature",
  "ReadingUnits": "Cel",
  "Status": {
    "Health": "OK",
    "State": "Enabled"
  },
  "Thresholds": {
    "LowerCaution": {
      "Reading": 0.0
    },
    "LowerCritical": {
      "Reading": 0.0
    },
    "LowerFatal": {
      "Reading": 0.0
    }
  }
}
```
Change-Id: I6f2ff2652ce9246287f9bd63c4297d9ad3229963
Signed-off-by: Harshit Aghera <haghera@nvidia.com>
diff --git a/src/nvidia-gpu/NvidiaGpuDevice.cpp b/src/nvidia-gpu/NvidiaGpuDevice.cpp
index ab476da..509a353 100644
--- a/src/nvidia-gpu/NvidiaGpuDevice.cpp
+++ b/src/nvidia-gpu/NvidiaGpuDevice.cpp
@@ -14,6 +14,7 @@
 #include <bits/basic_string.h>
 
 #include <MctpRequester.hpp>
+#include <NvidiaGpuThresholds.hpp>
 #include <boost/asio/io_context.hpp>
 #include <phosphor-logging/lg2.hpp>
 #include <sdbusplus/asio/connection.hpp>
@@ -21,8 +22,10 @@
 
 #include <chrono>
 #include <cstdint>
+#include <functional>
 #include <memory>
 #include <string>
+#include <utility>
 #include <vector>
 
 GpuDevice::GpuDevice(const SensorConfigs& configs, const std::string& name,
@@ -45,9 +48,13 @@
         conn, mctpRequester, name + "_TEMP_0", path, eid, gpuTempSensorId,
         objectServer, std::vector<thresholds::Threshold>{});
 
-    tLimitSensor = std::make_shared<NvidiaGpuTempSensor>(
-        conn, mctpRequester, name + "_TEMP_1", path, eid, gpuTLimitSensorId,
-        objectServer, std::vector<thresholds::Threshold>{});
+    readThermalParameters(
+        eid,
+        std::vector<gpuThresholdId>{gpuTLimitWarnringThresholdId,
+                                    gpuTLimitCriticalThresholdId,
+                                    gpuTLimitHardshutDownThresholdId},
+        mctpRequester,
+        std::bind_front(&GpuDevice::processTLimitThresholds, this));
 
     lg2::info("Added GPU {NAME} Sensors with chassis path: {PATH}.", "NAME",
               name, "PATH", path);
@@ -55,10 +62,54 @@
     read();
 }
 
+/**
+ * @brief Create the TLimit sensor once its threshold query has completed.
+ *
+ * Used as the completion callback of readThermalParameters(). On success the
+ * values are mapped, in request order, to the WARNING, CRITICAL and
+ * HARDSHUTDOWN low thresholds. If the query failed, or delivered fewer than
+ * three values, the sensor is still created but without thresholds.
+ *
+ * @param rc         0 on success, non-zero if reading a threshold failed
+ * @param thresholds threshold values in the order they were requested
+ */
+void GpuDevice::processTLimitThresholds(uint8_t rc,
+                                        const std::vector<int32_t>& thresholds)
+{
+    std::vector<thresholds::Threshold> tLimitThresholds{};
+    if (rc == 0 && thresholds.size() >= 3)
+    {
+        tLimitThresholds = {
+            thresholds::Threshold{thresholds::Level::WARNING,
+                                  thresholds::Direction::LOW,
+                                  static_cast<double>(thresholds[0])},
+            thresholds::Threshold{thresholds::Level::CRITICAL,
+                                  thresholds::Direction::LOW,
+                                  static_cast<double>(thresholds[1])},
+            thresholds::Threshold{thresholds::Level::HARDSHUTDOWN,
+                                  thresholds::Direction::LOW,
+                                  static_cast<double>(thresholds[2])}};
+    }
+    else if (rc == 0)
+    {
+        // A short reply would otherwise be indexed out of bounds above.
+        lg2::error(
+            "Error reading TLimit thresholds for {NAME}: expected 3 values, got {COUNT}",
+            "NAME", name, "COUNT", thresholds.size());
+    }
+
+    tLimitSensor = std::make_shared<NvidiaGpuTempSensor>(
+        conn, mctpRequester, name + "_TEMP_1", path, eid, gpuTLimitSensorId,
+        objectServer, std::move(tLimitThresholds));
+}
+
 void GpuDevice::read()
 {
     tempSensor->update();
-    tLimitSensor->update();
+    if (tLimitSensor)
+    {
+        tLimitSensor->update();
+    }
 
     waitTimer.expires_after(std::chrono::milliseconds(sensorPollMs));
     waitTimer.async_wait([this](const boost::system::error_code& ec) {
diff --git a/src/nvidia-gpu/NvidiaGpuDevice.hpp b/src/nvidia-gpu/NvidiaGpuDevice.hpp
index 78b3e4a..b9e0791 100644
--- a/src/nvidia-gpu/NvidiaGpuDevice.hpp
+++ b/src/nvidia-gpu/NvidiaGpuDevice.hpp
@@ -19,6 +19,7 @@
 #include <cstdint>
 #include <memory>
 #include <string>
+#include <vector>
 
 class GpuDevice
 {
@@ -40,6 +41,9 @@
 
     void read();
 
+    void processTLimitThresholds(uint8_t rc,
+                                 const std::vector<int32_t>& thresholds);
+
     uint8_t eid{};
 
     std::chrono::milliseconds sensorPollMs;
diff --git a/src/nvidia-gpu/NvidiaGpuMctpVdm.cpp b/src/nvidia-gpu/NvidiaGpuMctpVdm.cpp
index 17f71e0..7a48b30 100644
--- a/src/nvidia-gpu/NvidiaGpuMctpVdm.cpp
+++ b/src/nvidia-gpu/NvidiaGpuMctpVdm.cpp
@@ -152,5 +152,96 @@
 
     return 0;
 }
+
+/**
+ * @brief Serialize a READ_THERMAL_PARAMETERS request into @p buf.
+ *
+ * @param instanceId instance id; masked with instanceIdBitMask before use
+ * @param sensorId   id of the thermal parameter to read
+ * @param buf        destination; must hold a ReadThermalParametersRequest
+ * @return 0 on success, EINVAL if @p buf is too small, otherwise the
+ *         non-zero result of packHeader()
+ */
+int encodeReadThermalParametersRequest(uint8_t instanceId, uint8_t sensorId,
+                                       std::span<uint8_t> buf)
+{
+    namespace oam = ocp::accelerator_management;
+
+    if (buf.size() < sizeof(ReadThermalParametersRequest))
+    {
+        return EINVAL;
+    }
+
+    oam::BindingPciVidInfo info{};
+    info.ocp_accelerator_management_msg_type =
+        static_cast<uint8_t>(oam::MessageType::REQUEST);
+    info.instance_id = instanceId & oam::instanceIdBitMask;
+    info.msg_type = static_cast<uint8_t>(MessageType::PLATFORM_ENVIRONMENTAL);
+
+    auto* req = reinterpret_cast<ReadThermalParametersRequest*>(buf.data());
+
+    if (const auto rc = packHeader(info, req->hdr.msgHdr.hdr); rc != 0)
+    {
+        return rc;
+    }
+
+    req->hdr.command = static_cast<uint8_t>(
+        PlatformEnvironmentalCommands::READ_THERMAL_PARAMETERS);
+    req->hdr.data_size = sizeof(sensorId);
+    req->sensor_id = sensorId;
+
+    return 0;
+}
+
+/**
+ * @brief Parse a READ_THERMAL_PARAMETERS response.
+ *
+ * @param buf        raw response message
+ * @param cc         filled in by decodeReasonCodeAndCC()
+ * @param reasonCode filled in by decodeReasonCodeAndCC()
+ * @param threshold  receives the threshold; written only when 0 is returned
+ *                   and @p cc is SUCCESS
+ * @return the result of decodeReasonCodeAndCC() if it is non-zero or @p cc is
+ *         not SUCCESS (0 in that second case); EINVAL if @p buf is shorter
+ *         than ReadThermalParametersResponse or data_size is not
+ *         sizeof(int32_t); 0 otherwise
+ */
+int decodeReadThermalParametersResponse(
+    std::span<const uint8_t> buf,
+    ocp::accelerator_management::CompletionCode& cc, uint16_t& reasonCode,
+    int32_t& threshold)
+{
+    namespace oam = ocp::accelerator_management;
+
+    const auto rc = oam::decodeReasonCodeAndCC(buf, cc, reasonCode);
+    if (rc != 0)
+    {
+        return rc;
+    }
+
+    if (cc != oam::CompletionCode::SUCCESS)
+    {
+        // Device-reported failure: decoding itself succeeded, so rc is 0.
+        return rc;
+    }
+
+    if (buf.size() < sizeof(ReadThermalParametersResponse))
+    {
+        return EINVAL;
+    }
+
+    const auto* resp =
+        reinterpret_cast<const ReadThermalParametersResponse*>(buf.data());
+
+    const uint16_t payloadSize = le16toh(resp->hdr.data_size);
+    if (payloadSize != sizeof(int32_t))
+    {
+        return EINVAL;
+    }
+
+    threshold = le32toh(resp->threshold);
+
+    return 0;
+}
 // NOLINTEND(cppcoreguidelines-pro-type-reinterpret-cast)
 } // namespace gpu
diff --git a/src/nvidia-gpu/NvidiaGpuMctpVdm.hpp b/src/nvidia-gpu/NvidiaGpuMctpVdm.hpp
index ce3b393..f7c78b8 100644
--- a/src/nvidia-gpu/NvidiaGpuMctpVdm.hpp
+++ b/src/nvidia-gpu/NvidiaGpuMctpVdm.hpp
@@ -30,6 +30,7 @@
 enum class PlatformEnvironmentalCommands : uint8_t
 {
     GET_TEMPERATURE_READING = 0x00,
+    READ_THERMAL_PARAMETERS = 0x02, // set by encodeReadThermalParametersRequest
 };
 
 enum class DeviceIdentification : uint8_t
@@ -57,12 +58,20 @@
 
 using GetTemperatureReadingRequest = GetNumericSensorReadingRequest;
 
+using ReadThermalParametersRequest = GetNumericSensorReadingRequest;
+
 struct GetTemperatureReadingResponse
 {
     ocp::accelerator_management::CommonResponse hdr;
     int32_t reading;
 } __attribute__((packed));
 
+struct ReadThermalParametersResponse
+{
+    ocp::accelerator_management::CommonResponse hdr;
+    int32_t threshold; // wire value; converted with le32toh() when decoded
+} __attribute__((packed));
+
 int packHeader(const ocp::accelerator_management::BindingPciVidInfo& hdr,
                ocp::accelerator_management::BindingPciVid& msg);
 
@@ -82,4 +91,12 @@
     ocp::accelerator_management::CompletionCode& cc, uint16_t& reasonCode,
     double& temperatureReading);
 
+int encodeReadThermalParametersRequest(uint8_t instanceId, uint8_t sensorId,
+                                       std::span<uint8_t> buf);
+
+int decodeReadThermalParametersResponse(
+    std::span<const uint8_t> buf,
+    ocp::accelerator_management::CompletionCode& cc, uint16_t& reasonCode,
+    int32_t& threshold);
+
 } // namespace gpu
diff --git a/src/nvidia-gpu/NvidiaGpuThresholds.cpp b/src/nvidia-gpu/NvidiaGpuThresholds.cpp
new file mode 100644
index 0000000..16141f1
--- /dev/null
+++ b/src/nvidia-gpu/NvidiaGpuThresholds.cpp
@@ -0,0 +1,188 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION &
+ * AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include "NvidiaGpuThresholds.hpp"
+
+#include <MctpRequester.hpp>
+#include <NvidiaGpuMctpVdm.hpp>
+#include <OcpMctpVdm.hpp>
+#include <phosphor-logging/lg2.hpp>
+
+#include <array>
+#include <cerrno>
+#include <cstddef>
+#include <cstdint>
+#include <functional>
+#include <memory>
+#include <span>
+#include <vector>
+
+/**
+ * @brief Decode the response to one thermal-parameter read and report the
+ *        outcome through @p callback.
+ *
+ * @param callback          invoked exactly once: (0, threshold) on success,
+ *                          (EPROTO, 0) if the exchange or the decode failed
+ * @param respMsg           raw response message
+ * @param sendRecvMsgResult result of the MCTP exchange; if non-zero the
+ *                          response is not decoded
+ */
+void processReadThermalParameterResponse(
+    const std::function<void(uint8_t, int32_t)>& callback,
+    const std::span<const uint8_t> respMsg, int sendRecvMsgResult)
+{
+    if (sendRecvMsgResult != 0)
+    {
+        lg2::error(
+            "Error reading thermal parameter: sending message over MCTP failed, rc={RC}",
+            "RC", sendRecvMsgResult);
+        callback(EPROTO, 0);
+        return;
+    }
+
+    ocp::accelerator_management::CompletionCode cc{};
+    uint16_t reasonCode = 0;
+    int32_t threshold = 0;
+
+    auto rc = gpu::decodeReadThermalParametersResponse(respMsg, cc, reasonCode,
+                                                       threshold);
+
+    // The decoder returns 0 for a well-formed error response, so the
+    // completion code has to be checked as well.
+    if (rc != 0 || cc != ocp::accelerator_management::CompletionCode::SUCCESS)
+    {
+        lg2::error(
+            "Error reading thermal parameter: decode failed, rc={RC}, cc={CC}, reasonCode={RESC}",
+            "RC", rc, "CC", cc, "RESC", reasonCode);
+        callback(EPROTO, 0);
+        return;
+    }
+
+    callback(0, threshold);
+}
+
+/**
+ * @brief Send one READ_THERMAL_PARAMETERS request and report the decoded
+ *        threshold through @p callback.
+ *
+ * @param eid           MCTP endpoint id the request is sent to
+ * @param id            id of the thermal parameter to read
+ * @param mctpRequester requester used for the exchange
+ * @param callback      invoked with (0, threshold) on success, or with a
+ *                      non-zero code and 0 on failure
+ */
+void readThermalParameter(uint8_t eid, uint8_t id,
+                          mctp::MctpRequester& mctpRequester,
+                          const std::function<void(uint8_t, int32_t)>& callback)
+{
+    auto reqMsg = std::make_shared<
+        std::array<uint8_t, sizeof(gpu::ReadThermalParametersRequest)>>();
+
+    auto respMsg = std::make_shared<
+        std::array<uint8_t, sizeof(gpu::ReadThermalParametersResponse)>>();
+
+    auto rc = gpu::encodeReadThermalParametersRequest(0, id, *reqMsg);
+    if (rc != 0)
+    {
+        lg2::error(
+            "Error reading thermal parameter for eid {EID} and parameter id {PID} : encode failed. rc={RC}",
+            "EID", eid, "PID", id, "RC", rc);
+        callback(static_cast<uint8_t>(rc), 0);
+        return;
+    }
+
+    // The handler holds both buffers via shared_ptr so they stay alive for
+    // as long as the handler itself exists.
+    mctpRequester.sendRecvMsg(
+        eid, *reqMsg, *respMsg,
+        [reqMsg, respMsg, callback](int sendRecvMsgResult) {
+            processReadThermalParameterResponse(callback, *respMsg,
+                                                sendRecvMsgResult);
+        });
+}
+
+/**
+ * @brief Continuation run after each single read of a multi-parameter read.
+ *
+ * Stores the value just read, then either requests the next id or, when all
+ * ids are done or a read failed, invokes the final @p callback.
+ *
+ * @param eid           MCTP endpoint id the requests are sent to
+ * @param ids           all parameter ids being read
+ * @param mctpRequester requester used for the exchanges
+ * @param callback      final callback; receives 0 and all values, or the
+ *                      failing code and the values read before the failure
+ * @param index         position in @p ids of the read that just completed
+ * @param thresholds    values read so far, in the order of @p ids
+ * @param rc            result of the read that just completed
+ * @param threshold     value just read; used only if @p rc is 0
+ */
+void readThermalParameterCallback(
+    uint8_t eid, const std::shared_ptr<std::vector<uint8_t>>& ids,
+    mctp::MctpRequester& mctpRequester,
+    const std::function<void(uint8_t, std::vector<int32_t>)>& callback,
+    size_t index, const std::shared_ptr<std::vector<int32_t>>& thresholds,
+    uint8_t rc, int32_t threshold)
+{
+    if (rc != 0)
+    {
+        lg2::error(
+            "Error reading thermal parameter for eid {EID} and parameter id {PID}. rc={RC}",
+            "EID", eid, "PID", (*ids)[index], "RC", rc);
+        callback(rc, *thresholds);
+        return;
+    }
+
+    thresholds->push_back(threshold);
+
+    ++index;
+    if (index == ids->size())
+    {
+        callback(rc, *thresholds);
+    }
+    else
+    {
+        readThermalParameter(eid, (*ids)[index], mctpRequester,
+                             std::bind_front(readThermalParameterCallback, eid,
+                                             ids, std::ref(mctpRequester),
+                                             callback, index, thresholds));
+    }
+}
+
+/**
+ * @brief Read several thermal parameters from a device, one after another.
+ *
+ * @param eid           MCTP endpoint id the requests are sent to
+ * @param ids           parameter ids to read, in the desired result order
+ * @param mctpRequester requester used for the exchanges
+ * @param callback      invoked with (0, values in the order of @p ids) when
+ *                      every read succeeds, with (rc, values read so far)
+ *                      on the first failure, or with (EINVAL, {}) if
+ *                      @p ids is empty
+ */
+void readThermalParameters(
+    uint8_t eid, const std::vector<uint8_t>& ids,
+    mctp::MctpRequester& mctpRequester,
+    const std::function<void(uint8_t, std::vector<int32_t>)>& callback)
+{
+    if (ids.empty())
+    {
+        // Without this guard ids[0] below would be read out of bounds.
+        lg2::error(
+            "Error reading thermal parameters for eid {EID}: no parameter ids given",
+            "EID", eid);
+        callback(EINVAL, std::vector<int32_t>{});
+        return;
+    }
+
+    auto thresholds = std::make_shared<std::vector<int32_t>>();
+    size_t index = 0;
+
+    readThermalParameter(
+        eid, ids[index], mctpRequester,
+        std::bind_front(readThermalParameterCallback, eid,
+                        std::make_shared<std::vector<uint8_t>>(ids),
+                        std::ref(mctpRequester), callback, index, thresholds));
+}
diff --git a/src/nvidia-gpu/NvidiaGpuThresholds.hpp b/src/nvidia-gpu/NvidiaGpuThresholds.hpp
new file mode 100644
index 0000000..9d1970f
--- /dev/null
+++ b/src/nvidia-gpu/NvidiaGpuThresholds.hpp
@@ -0,0 +1,39 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION &
+ * AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#pragma once
+
+#include "MctpRequester.hpp"
+
+#include <cstdint>
+#include <functional>
+#include <vector>
+
+/** Id of a thermal parameter that readThermalParameters() can query. */
+using gpuThresholdId = uint8_t;
+
+constexpr gpuThresholdId gpuTLimitCriticalThresholdId{1};
+constexpr gpuThresholdId gpuTLimitWarningThresholdId{2};
+constexpr gpuThresholdId gpuTLimitHardshutDownThresholdId{4};
+
+// Misspelled name kept so existing users keep compiling; prefer
+// gpuTLimitWarningThresholdId.
+constexpr gpuThresholdId gpuTLimitWarnringThresholdId{
+    gpuTLimitWarningThresholdId};
+
+/**
+ * @brief Read the given thermal parameters from a device.
+ *
+ * @param eid           MCTP endpoint id of the device
+ * @param ids           ids of the parameters to read
+ * @param mctpRequester requester used to exchange the messages
+ * @param callback      receives a result code (0 on success) and the values
+ *                      read
+ */
+void readThermalParameters(
+    uint8_t eid, const std::vector<gpuThresholdId>& ids,
+    mctp::MctpRequester& mctpRequester,
+    const std::function<void(uint8_t, std::vector<int32_t>)>& callback);
diff --git a/src/nvidia-gpu/meson.build b/src/nvidia-gpu/meson.build
index 57bab75..4044eb3 100644
--- a/src/nvidia-gpu/meson.build
+++ b/src/nvidia-gpu/meson.build
@@ -5,6 +5,7 @@
     'NvidiaGpuMctpVdm.cpp',
     'NvidiaGpuSensor.cpp',
     'NvidiaGpuSensorMain.cpp',
+    'NvidiaGpuThresholds.cpp',
     'OcpMctpVdm.cpp',
 )
 
diff --git a/src/nvidia-gpu/tests/NvidiaGpuSensorTest.cpp b/src/nvidia-gpu/tests/NvidiaGpuSensorTest.cpp
index c630ffa..9455326 100644
--- a/src/nvidia-gpu/tests/NvidiaGpuSensorTest.cpp
+++ b/src/nvidia-gpu/tests/NvidiaGpuSensorTest.cpp
@@ -491,6 +491,156 @@
     EXPECT_EQ(result, EINVAL); // Should indicate error for invalid data size
 }
 
+// Encoding into an exactly-sized buffer succeeds and fills header and body
+TEST_F(GpuMctpVdmTests, EncodeReadThermalParametersRequestSuccess)
+{
+    const uint8_t instanceId = 5;
+    const uint8_t sensorId = 1;
+    std::array<uint8_t, sizeof(gpu::ReadThermalParametersRequest)> buf{};
+
+    int result =
+        gpu::encodeReadThermalParametersRequest(instanceId, sensorId, buf);
+
+    EXPECT_EQ(result, 0);
+
+    gpu::ReadThermalParametersRequest request{};
+    std::memcpy(&request, buf.data(), sizeof(request));
+
+    EXPECT_EQ(request.hdr.msgHdr.hdr.pci_vendor_id,
+              htobe16(gpu::nvidiaPciVendorId));
+    EXPECT_EQ(request.hdr.msgHdr.hdr.instance_id &
+                  ocp::accelerator_management::instanceIdBitMask,
+              instanceId & ocp::accelerator_management::instanceIdBitMask);
+    EXPECT_NE(request.hdr.msgHdr.hdr.instance_id &
+                  ocp::accelerator_management::requestBitMask,
+              0);
+    EXPECT_EQ(request.hdr.msgHdr.hdr.ocp_accelerator_management_msg_type,
+              static_cast<uint8_t>(gpu::MessageType::PLATFORM_ENVIRONMENTAL));
+
+    // Verify command code, payload size and sensor id of the request body
+    EXPECT_EQ(request.hdr.command,
+              static_cast<uint8_t>(
+                  gpu::PlatformEnvironmentalCommands::READ_THERMAL_PARAMETERS));
+    EXPECT_EQ(request.hdr.data_size, sizeof(sensorId));
+    EXPECT_EQ(request.sensor_id, sensorId);
+}
+
+// A well-formed SUCCESS response decodes to the raw threshold value
+TEST_F(GpuMctpVdmTests, DecodeReadThermalParametersResponseSuccess)
+{
+    // Create a mock successful response
+    std::array<uint8_t, sizeof(gpu::ReadThermalParametersResponse)> buf{};
+
+    gpu::ReadThermalParametersResponse response{};
+    ocp::accelerator_management::BindingPciVidInfo headerInfo{};
+    headerInfo.ocp_accelerator_management_msg_type = static_cast<uint8_t>(
+        ocp::accelerator_management::MessageType::RESPONSE);
+    headerInfo.instance_id = 5;
+    headerInfo.msg_type =
+        static_cast<uint8_t>(gpu::MessageType::PLATFORM_ENVIRONMENTAL);
+
+    gpu::packHeader(headerInfo, response.hdr.msgHdr.hdr);
+
+    // Populate response data
+    response.hdr.command = static_cast<uint8_t>(
+        gpu::PlatformEnvironmentalCommands::READ_THERMAL_PARAMETERS);
+    response.hdr.completion_code = static_cast<uint8_t>(
+        ocp::accelerator_management::CompletionCode::SUCCESS);
+    response.hdr.reserved = 0;
+    response.hdr.data_size = htole16(sizeof(int32_t));
+
+    // Raw value 21760 (85 * 256); the decoder must return it unscaled
+    response.threshold = htole32(21760);
+
+    std::memcpy(buf.data(), &response, sizeof(response));
+
+    // Decode and check every output, including the untouched reason code
+    ocp::accelerator_management::CompletionCode cc{};
+    uint16_t reasonCode{};
+    int32_t threshold{};
+
+    int result = gpu::decodeReadThermalParametersResponse(
+        buf, cc, reasonCode, threshold);
+
+    EXPECT_EQ(result, 0);
+    EXPECT_EQ(cc, ocp::accelerator_management::CompletionCode::SUCCESS);
+    EXPECT_EQ(reasonCode, 0);
+    EXPECT_EQ(threshold, 21760);
+}
+
+TEST_F(GpuMctpVdmTests, DecodeReadThermalParametersResponseError)
+{
+    std::array<uint8_t,
+               sizeof(ocp::accelerator_management::CommonNonSuccessResponse)>
+        buf{};
+
+    // Build a non-success response: ERR_NOT_READY with reason code 0x5678
+    ocp::accelerator_management::CommonNonSuccessResponse errorResponse{};
+    ocp::accelerator_management::BindingPciVidInfo headerInfo{};
+    headerInfo.ocp_accelerator_management_msg_type = static_cast<uint8_t>(
+        ocp::accelerator_management::MessageType::RESPONSE);
+    headerInfo.instance_id = 5;
+    headerInfo.msg_type =
+        static_cast<uint8_t>(gpu::MessageType::PLATFORM_ENVIRONMENTAL);
+
+    gpu::packHeader(headerInfo, errorResponse.msgHdr.hdr);
+
+    errorResponse.command = static_cast<uint8_t>(
+        gpu::PlatformEnvironmentalCommands::READ_THERMAL_PARAMETERS);
+    errorResponse.completion_code = static_cast<uint8_t>(
+        ocp::accelerator_management::CompletionCode::ERR_NOT_READY);
+    errorResponse.reason_code = htole16(0x5678);
+
+    std::memcpy(buf.data(), &errorResponse, sizeof(errorResponse));
+
+    // Decoding returns 0; the failure is reported through cc and reasonCode
+    ocp::accelerator_management::CompletionCode cc{};
+    uint16_t reasonCode{};
+    int32_t threshold{};
+
+    int result = gpu::decodeReadThermalParametersResponse(
+        buf, cc, reasonCode, threshold);
+
+    EXPECT_EQ(result, 0);
+    EXPECT_EQ(cc, ocp::accelerator_management::CompletionCode::ERR_NOT_READY);
+    EXPECT_EQ(reasonCode, 0x5678);
+}
+
+TEST_F(GpuMctpVdmTests, DecodeReadThermalParametersResponseInvalidSize)
+{
+    // Build a SUCCESS response whose data_size field does not match int32_t
+    std::array<uint8_t, sizeof(gpu::ReadThermalParametersResponse)> buf{};
+
+    gpu::ReadThermalParametersResponse response{};
+    ocp::accelerator_management::BindingPciVidInfo headerInfo{};
+    headerInfo.ocp_accelerator_management_msg_type = static_cast<uint8_t>(
+        ocp::accelerator_management::MessageType::RESPONSE);
+    headerInfo.instance_id = 5;
+    headerInfo.msg_type =
+        static_cast<uint8_t>(gpu::MessageType::PLATFORM_ENVIRONMENTAL);
+
+    gpu::packHeader(headerInfo, response.hdr.msgHdr.hdr);
+
+    response.hdr.command = static_cast<uint8_t>(
+        gpu::PlatformEnvironmentalCommands::READ_THERMAL_PARAMETERS);
+    response.hdr.completion_code = static_cast<uint8_t>(
+        ocp::accelerator_management::CompletionCode::SUCCESS);
+    response.hdr.reserved = 0;
+    response.hdr.data_size = htole16(2); // Invalid - should be sizeof(int32_t)
+    response.threshold = htole32(21760);
+
+    std::memcpy(buf.data(), &response, sizeof(response));
+
+    // The decoder must reject the data_size mismatch
+    ocp::accelerator_management::CompletionCode cc{};
+    uint16_t reasonCode{};
+    int32_t threshold{};
+
+    int result = gpu::decodeReadThermalParametersResponse(
+        buf, cc, reasonCode, threshold);
+
+    EXPECT_EQ(result, EINVAL); // Should indicate error for invalid data size
+}
 } // namespace gpu_mctp_tests
 
 int main(int argc, char** argv)