gpu : add thresholds support to TLimit

This patch adds support for fetching TLimit thresholds from the GPU.

Tested.

The TEMP_0 update was disabled while testing this patch because it
requires MCTP request queueing: OCP MCTP VDM allows at most one
outstanding request to the device. MCTP request queueing is being
introduced by this patch -
https://gerrit.openbmc.org/c/openbmc/dbus-sensors/+/80023

Build an image for the gb200nvl-obmc machine with the following patches
cherry-picked. These patches are needed to enable the MCTP stack.

https://gerrit.openbmc.org/c/openbmc/openbmc/+/79312
https://gerrit.openbmc.org/c/openbmc/openbmc/+/79410
https://gerrit.openbmc.org/c/openbmc/openbmc/+/79422

Copy the configuration file to the gb200nvl-obmc machine and restart the
entity-manager service.
```
root@gb200nvl-obmc:~# rm -rf /var/configuration/
root@gb200nvl-obmc:~# systemctl restart xyz.openbmc_project.EntityManager.service
```

Copy the gpusensor app and run it.
```
root@gb200nvl-obmc:~# ./gpusensor
```

```
$ curl -k -u 'root:0penBmc' https://10.137.203.137/redfish/v1/Chassis/NVIDIA_GB200_1/Sensors/temperature_NVIDIA_GB200_GPU_TEMP_1
{
  "@odata.id": "/redfish/v1/Chassis/NVIDIA_GB200_1/Sensors/temperature_NVIDIA_GB200_GPU_TEMP_1",
  "@odata.type": "#Sensor.v1_2_0.Sensor",
  "Id": "temperature_NVIDIA_GB200_GPU_TEMP_1",
  "Name": "NVIDIA GB200 GPU TEMP 1",
  "Reading": 49.0,
  "ReadingRangeMax": 127.0,
  "ReadingRangeMin": -128.0,
  "ReadingType": "Temperature",
  "ReadingUnits": "Cel",
  "Status": {
    "Health": "OK",
    "State": "Enabled"
  },
  "Thresholds": {
    "LowerCaution": {
      "Reading": 0.0
    },
    "LowerCritical": {
      "Reading": 0.0
    }
  }
}%

root@gb200nvl-obmc:~# busctl introspect xyz.openbmc_project.GpuSensor /xyz/openbmc_project/sensors/temperature/NVIDIA_GB200_GPU_TEMP_1
NAME                                                  TYPE      SIGNATURE RESULT/VALUE                             FLAGS
org.freedesktop.DBus.Introspectable                   interface -         -                                        -
.Introspect                                           method    -         s                                        -
org.freedesktop.DBus.Peer                             interface -         -                                        -
.GetMachineId                                         method    -         s                                        -
.Ping                                                 method    -         -                                        -
org.freedesktop.DBus.Properties                       interface -         -                                        -
.Get                                                  method    ss        v                                        -
.GetAll                                               method    s         a{sv}                                    -
.Set                                                  method    ssv       -                                        -
.PropertiesChanged                                    signal    sa{sv}as  -                                        -
xyz.openbmc_project.Association.Definitions           interface -         -                                        -
.Associations                                         property  a(sss)    1 "chassis" "all_sensors" "/xyz/openbmc… emits-change
xyz.openbmc_project.Inventory.Item                    interface -         -                                        -
.PrettyName                                           property  s         "Thermal Limit(TLIMIT) Temperature is t… emits-change
xyz.openbmc_project.Sensor.Threshold.Critical         interface -         -                                        -
.CriticalAlarmHigh                                    property  b         false                                    emits-change
.CriticalAlarmLow                                     property  b         false                                    emits-change
.CriticalHigh                                         property  d         nan                                      emits-change writable
.CriticalLow                                          property  d         0                                        emits-change writable
xyz.openbmc_project.Sensor.Threshold.HardShutdown     interface -         -                                        -
.HardShutdownAlarmHigh                                property  b         false                                    emits-change
.HardShutdownAlarmLow                                 property  b         false                                    emits-change
.HardShutdownHigh                                     property  d         nan                                      emits-change writable
.HardShutdownLow                                      property  d         0                                        emits-change writable
xyz.openbmc_project.Sensor.Threshold.Warning          interface -         -                                        -
.WarningAlarmHigh                                     property  b         false                                    emits-change
.WarningAlarmLow                                      property  b         false                                    emits-change
.WarningHigh                                          property  d         nan                                      emits-change writable
.WarningLow                                           property  d         0                                        emits-change writable
xyz.openbmc_project.Sensor.Value                      interface -         -                                        -
.MaxValue                                             property  d         127                                      emits-change
.MinValue                                             property  d         -128                                     emits-change
.Unit                                                 property  s         "xyz.openbmc_project.Sensor.Value.Unit.… emits-change
.Value                                                property  d         48.9688                                  emits-change writable
xyz.openbmc_project.Sensor.ValueMutability            interface -         -                                        -
.Mutable                                              property  b         true                                     emits-change
xyz.openbmc_project.State.Decorator.Availability      interface -         -                                        -
.Available                                            property  b         true                                     emits-change writable
xyz.openbmc_project.State.Decorator.OperationalStatus interface -         -                                        -
.Functional                                           property  b         true                                     emits-change
```

Change-Id: I6f2ff2652ce9246287f9bd63c4297d9ad3229963
Signed-off-by: Harshit Aghera <haghera@nvidia.com>
diff --git a/src/gpu/GpuDevice.cpp b/src/gpu/GpuDevice.cpp
index 11423dd..54b6df2 100644
--- a/src/gpu/GpuDevice.cpp
+++ b/src/gpu/GpuDevice.cpp
@@ -13,6 +13,7 @@
 #include <bits/basic_string.h>
 
 #include <GpuMctpVdm.hpp>
+#include <GpuThresholds.hpp>
 #include <MctpRequester.hpp>
 #include <OcpMctpVdm.hpp>
 #include <boost/asio/io_context.hpp>
@@ -59,12 +60,33 @@
         conn, mctpRequester, name + "_TEMP_0", path, eid, objectServer,
         std::vector<thresholds::Threshold>{}));
 
-    sensors.push_back(std::make_shared<GpuTLimitSensor>(
-        conn, mctpRequester, name + "_TEMP_1", path, eid, objectServer,
-        std::vector<thresholds::Threshold>{}));
-
     lg2::info("Added GPU {NAME} Sensors with chassis path: {PATH}.", "NAME",
               name, "PATH", path);
+
+    readThermalParametersBatched(
+        eid,
+        std::make_shared<std::vector<uint8_t>>(std::vector<uint8_t>{1, 2, 4}),
+        mctpRequester, [this](uint8_t rc, std::vector<int32_t> thresholds) {
+            if (rc)
+            {
+                return;
+            }
+
+            std::vector<thresholds::Threshold> tLimitThresholds{
+                thresholds::Threshold{thresholds::Level::CRITICAL,
+                                      thresholds::Direction::LOW,
+                                      static_cast<double>(thresholds[0])},
+                thresholds::Threshold{thresholds::Level::WARNING,
+                                      thresholds::Direction::LOW,
+                                      static_cast<double>(thresholds[1])},
+                thresholds::Threshold{thresholds::Level::HARDSHUTDOWN,
+                                      thresholds::Direction::LOW,
+                                      static_cast<double>(thresholds[2])}};
+
+            sensors.push_back(std::make_shared<GpuTLimitSensor>(
+                conn, mctpRequester, name + "_TEMP_1", path, eid, objectServer,
+                std::move(tLimitThresholds)));
+        });
 }
 
 void GpuDevice::read()
diff --git a/src/gpu/GpuMctpVdm.cpp b/src/gpu/GpuMctpVdm.cpp
index 1afc6ca..24bc9b3 100644
--- a/src/gpu/GpuMctpVdm.cpp
+++ b/src/gpu/GpuMctpVdm.cpp
@@ -270,4 +270,129 @@
 
     return ocp::accelerator_management::CompletionCode::SUCCESS;
 }
+
+ocp::accelerator_management::CompletionCode encodeReadThermalParametersRequest(
+    uint8_t instanceId, uint8_t sensorId,
+    ocp::accelerator_management::Message& msg)
+{
+    ocp::accelerator_management::BindingPciVidInfo header{};
+    header.ocp_accelerator_management_msg_type =
+        static_cast<uint8_t>(ocp::accelerator_management::MessageType::REQUEST);
+    header.instance_id = instanceId &
+                         ocp::accelerator_management::instanceIdMask;
+    header.msg_type = static_cast<uint8_t>(MessageType::PLATFORM_ENVIRONMENTAL);
+    // Pack the header into msg.hdr; a packing failure is returned unchanged.
+    auto rc = packHeader(header, msg.hdr);
+    if (rc != ocp::accelerator_management::CompletionCode::SUCCESS)
+    {
+        return rc;
+    }
+    // The payload carries the command, its data size and the sensor id.
+    ReadThermalParametersRequest request{};
+    request.hdr.command = static_cast<uint8_t>(
+        PlatformEnvironmentalCommands::READ_THERMAL_PARAMETERS);
+    request.hdr.data_size = sizeof(sensorId);
+    request.sensor_id = sensorId;
+    // Copy the finished request into the message body.
+    std::memcpy(&msg.data, &request, sizeof(request));
+
+    return ocp::accelerator_management::CompletionCode::SUCCESS;
+}
+
+ocp::accelerator_management::CompletionCode decodeReadThermalParametersRequest(
+    const ocp::accelerator_management::Message& msg, size_t msgLen,
+    uint8_t& sensorId)
+{
+    if (msgLen < sizeof(ocp::accelerator_management::BindingPciVid) +
+                     sizeof(ReadThermalParametersRequest))
+    {
+        return ocp::accelerator_management::CompletionCode::
+            ERR_INVALID_DATA_LENGTH;
+    }
+    // Copy the payload into a local struct before reading its fields.
+    ReadThermalParametersRequest request{};
+    std::memcpy(&request, &msg.data, sizeof(request));
+    // The declared data size must be large enough to hold the sensor id.
+    if (request.hdr.data_size < sizeof(request.sensor_id))
+    {
+        return ocp::accelerator_management::CompletionCode::ERR_INVALID_DATA;
+    }
+
+    sensorId = request.sensor_id;
+
+    return ocp::accelerator_management::CompletionCode::SUCCESS;
+}
+
+ocp::accelerator_management::CompletionCode encodeReadThermalParametersResponse(
+    uint8_t instanceId, uint8_t cc, uint16_t reasonCode, int32_t threshold,
+    ocp::accelerator_management::Message& msg)
+{
+    ocp::accelerator_management::BindingPciVidInfo header{};
+    header.ocp_accelerator_management_msg_type = static_cast<uint8_t>(
+        ocp::accelerator_management::MessageType::RESPONSE);
+    header.instance_id = instanceId &
+                         ocp::accelerator_management::instanceIdMask;
+    header.msg_type = static_cast<uint8_t>(MessageType::PLATFORM_ENVIRONMENTAL);
+
+    auto rc = packHeader(header, msg.hdr);
+    if (rc != ocp::accelerator_management::CompletionCode::SUCCESS)
+    {
+        return rc;
+    }
+    // A non-success cc yields a reason-code response without a threshold.
+    if (cc != static_cast<uint8_t>(
+                  ocp::accelerator_management::CompletionCode::SUCCESS))
+    {
+        return gpu::encodeReasonCode(
+            cc, reasonCode,
+            static_cast<uint8_t>(
+                PlatformEnvironmentalCommands::READ_THERMAL_PARAMETERS),
+            msg);
+    }
+
+    ReadThermalParametersResponse response{};
+    response.hdr.command = static_cast<uint8_t>(
+        PlatformEnvironmentalCommands::READ_THERMAL_PARAMETERS);
+    response.hdr.completion_code = cc;
+    response.hdr.data_size = htole16(sizeof(uint32_t));
+    // Multi-byte fields are stored in little-endian byte order.
+    response.threshold = htole32(threshold);
+
+    std::memcpy(&msg.data, &response, sizeof(response));
+
+    return ocp::accelerator_management::CompletionCode::SUCCESS;
+}
+
+ocp::accelerator_management::CompletionCode decodeReadThermalParametersResponse(
+    const ocp::accelerator_management::Message& msg, size_t msgLen, uint8_t& cc,
+    uint16_t& reasonCode, int32_t& threshold)
+{
+    auto rc = gpu::decodeReasonCodeAndCC(msg, msgLen, cc, reasonCode);
+    if (rc != ocp::accelerator_management::CompletionCode::SUCCESS ||
+        cc != static_cast<uint8_t>(
+                  ocp::accelerator_management::CompletionCode::SUCCESS))
+    {
+        return rc; // no threshold to decode; the caller must also check cc
+    }
+    // A successful response must be long enough to hold the threshold.
+    if (msgLen < sizeof(ocp::accelerator_management::BindingPciVid) +
+                     sizeof(ReadThermalParametersResponse))
+    {
+        return ocp::accelerator_management::CompletionCode::
+            ERR_INVALID_DATA_LENGTH;
+    }
+
+    ReadThermalParametersResponse response{};
+    std::memcpy(&response, &msg.data, sizeof(response));
+
+    uint16_t dataSize = le16toh(response.hdr.data_size);
+    if (dataSize != sizeof(int32_t))
+    {
+        return ocp::accelerator_management::CompletionCode::ERR_INVALID_DATA;
+    }
+    // The encoder stores the threshold little-endian; convert to host order.
+    threshold = static_cast<int32_t>(le32toh(response.threshold));
+
+    return ocp::accelerator_management::CompletionCode::SUCCESS;
+}
 } // namespace gpu
diff --git a/src/gpu/GpuMctpVdm.hpp b/src/gpu/GpuMctpVdm.hpp
index 21c69cd..6663df1 100644
--- a/src/gpu/GpuMctpVdm.hpp
+++ b/src/gpu/GpuMctpVdm.hpp
@@ -42,6 +42,7 @@
 enum class PlatformEnvironmentalCommands : uint8_t
 {
     GET_TEMPERATURE_READING = 0x00,
+    READ_THERMAL_PARAMETERS = 0x02, // reads a sensor's thermal threshold
 };
 
 /** @brief device identification types
@@ -92,6 +93,12 @@
  */
 using GetTemperatureReadingRequest = GetNumericSensorReadingRequest;
 
+/** @struct ReadThermalParametersRequest
+ *
+ *  Request to read thermal parameters (alias of the numeric reading request).
+ */
+using ReadThermalParametersRequest = GetNumericSensorReadingRequest;
+
 /** @struct GetTemperatureReadingResponse
  *
  *  Structure representing get temperature reading response.
@@ -102,6 +109,17 @@
     int32_t reading;
 } __attribute__((packed));
 
+/** @struct ReadThermalParametersResponse
+ *
+ *  Payload of a successful response to a read thermal parameters request:
+ *  the common response header followed by one 32-bit threshold value.
+ */
+struct ReadThermalParametersResponse
+{
+    ocp::accelerator_management::CommonResponse hdr;
+    int32_t threshold; // little-endian in an encoded message
+} __attribute__((packed));
+
 /**
  * @brief Populate the GPU message with the GPU header.
  *        The caller of this API allocates buffer for the GPU header
@@ -243,4 +261,56 @@
     const ocp::accelerator_management::Message& msg, size_t msgLen, uint8_t& cc,
     uint16_t& reasonCode, double& temperatureReading);
 
+/** @brief Encode a Read thermal parameters request message
+ *
+ *  @param[in] instanceId - instance ID; masked with instanceIdMask
+ *  @param[in] sensorId - id of the sensor whose threshold is requested
+ *  @param[out] msg - Reference to message that will be written to
+ *  @return ocp::accelerator_management::CompletionCode::SUCCESS on success,
+ *  otherwise appropriate error code.
+ */
+ocp::accelerator_management::CompletionCode encodeReadThermalParametersRequest(
+    uint8_t instanceId, uint8_t sensorId,
+    ocp::accelerator_management::Message& msg);
+
+/** @brief Decode a Read thermal parameters request message
+ *
+ *  @param[in] msg - request message
+ *  @param[in] msgLen - Length of request message in bytes
+ *  @param[out] sensorId - reference to sensor id; written only on success
+ *  @return ocp::accelerator_management::CompletionCode::SUCCESS on success,
+ *  otherwise appropriate error code.
+ */
+ocp::accelerator_management::CompletionCode decodeReadThermalParametersRequest(
+    const ocp::accelerator_management::Message& msg, size_t msgLen,
+    uint8_t& sensorId);
+
+/** @brief Encode a Read thermal parameters response message
+ *
+ *  @param[in] instanceId - instance ID; masked with instanceIdMask
+ *  @param[in] cc - completion code
+ *  @param[in] reasonCode - reason code; used only when cc is not SUCCESS
+ *  @param[in] threshold - thermal threshold; used only when cc is SUCCESS
+ *  @param[out] msg - Reference to message that will be written to
+ *  @return ocp::accelerator_management::CompletionCode::SUCCESS on success,
+ *  otherwise appropriate error code.
+ */
+ocp::accelerator_management::CompletionCode encodeReadThermalParametersResponse(
+    uint8_t instanceId, uint8_t cc, uint16_t reasonCode, int32_t threshold,
+    ocp::accelerator_management::Message& msg);
+
+/** @brief Decode a Read thermal parameters response message
+ *
+ *  @param[in] msg - response message
+ *  @param[in] msgLen - Length of response message in bytes
+ *  @param[out] cc - reference to completion code
+ *  @param[out] reasonCode - reference to reason code
+ *  @param[out] threshold - thermal threshold; written only when cc is SUCCESS
+ *  @return ocp::accelerator_management::CompletionCode::SUCCESS on success,
+ *  otherwise appropriate error code.
+ */
+ocp::accelerator_management::CompletionCode decodeReadThermalParametersResponse(
+    const ocp::accelerator_management::Message& msg, size_t msgLen, uint8_t& cc,
+    uint16_t& reasonCode, int32_t& threshold);
+
 } // namespace gpu
diff --git a/src/gpu/GpuThresholds.cpp b/src/gpu/GpuThresholds.cpp
new file mode 100644
index 0000000..9eb17c4
--- /dev/null
+++ b/src/gpu/GpuThresholds.cpp
@@ -0,0 +1,122 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION &
+ * AFFILIATES. All rights reserved. SPDX-License-Identifier: Apache-2.0
+ */
+
+#include <GpuMctpVdm.hpp>
+#include <MctpRequester.hpp>
+#include <OcpMctpVdm.hpp>
+#include <phosphor-logging/lg2.hpp>
+
+#include <cstddef>
+#include <cstdint>
+#include <functional>
+#include <memory>
+#include <vector>
+
+void readThermalParameter(uint8_t eid, uint8_t id,
+                          mctp::MctpRequester& mctpRequester,
+                          const std::function<void(uint8_t, int32_t)>& callback)
+{
+    std::vector<uint8_t> reqMsg(
+        sizeof(ocp::accelerator_management::BindingPciVid) +
+        sizeof(gpu::ReadThermalParametersRequest));
+    // Placement-new a Message on the buffer so it is encoded in place.
+    auto* msg = new (reqMsg.data()) ocp::accelerator_management::Message;
+    // Every request is encoded with instance id 0.
+    auto rc = gpu::encodeReadThermalParametersRequest(0, id, *msg);
+    if (rc != ocp::accelerator_management::CompletionCode::SUCCESS)
+    {
+        lg2::error("encodeReadThermalParametersRequest failed, rc={RC}", "RC",
+                   static_cast<int>(rc));
+        // The callback takes uint8_t, so -1 is delivered as 255.
+        callback(-1, 0);
+        return;
+    }
+
+    mctpRequester.sendRecvMsg(
+        eid, reqMsg,
+        [callback](int sendRecvMsgResult, std::vector<uint8_t> respMsg) {
+            if (sendRecvMsgResult != 0)
+            {
+                lg2::error("MctpRequester::sendRecvMsg() failed, rc={RC}", "RC",
+                           sendRecvMsgResult);
+                // sendRecvMsg reported failure: -2 is delivered as 254.
+                callback(-2, 0);
+                return;
+            }
+
+            if (respMsg.empty())
+            {
+                lg2::error("MctpRequester::sendRecvMsg() failed, respMsgLen=0");
+                // Empty response: -3 is delivered as 253.
+                callback(-3, 0);
+                return;
+            }
+
+            uint8_t cc = 0;
+            uint16_t reasonCode = 0;
+            int32_t threshold = 0;
+
+            auto rc = gpu::decodeReadThermalParametersResponse(
+                *new (respMsg.data()) ocp::accelerator_management::Message,
+                respMsg.size(), cc, reasonCode, threshold);
+
+            if (rc != ocp::accelerator_management::CompletionCode::SUCCESS ||
+                cc != static_cast<uint8_t>(
+                          ocp::accelerator_management::CompletionCode::SUCCESS))
+            {
+                lg2::error(
+                    "decodeReadThermalParametersResponse() failed, rc={RC} cc={CC} reasonCode={RESC}",
+                    "RC", static_cast<int>(rc), "CC", cc, "RESC", reasonCode);
+                // Decode failure or non-success cc: -4 is delivered as 252.
+                callback(-4, 0);
+                return;
+            }
+            // Success: report rc 0 together with the decoded threshold.
+            callback(0, threshold);
+        });
+}
+
+void readThermalParameterCallback(
+    uint8_t eid, const std::shared_ptr<std::vector<uint8_t>>& ids,
+    mctp::MctpRequester& mctpRequester,
+    const std::function<void(uint8_t, std::vector<int32_t>)>& callback,
+    size_t index, const std::shared_ptr<std::vector<int32_t>>& thresholds,
+    uint8_t rc, int32_t threshold)
+{
+    if (rc != 0)
+    {
+        callback(rc, *thresholds); // stop early; passes what was read so far
+        return;
+    }
+    // Keep this reading; results accumulate in the same order as ids.
+    thresholds->push_back(threshold);
+    // Finish after the last id, otherwise chain a read of the next one.
+    ++index;
+    if (index == ids->size())
+    {
+        callback(rc, *thresholds);
+    }
+    else
+    {
+        readThermalParameter(eid, (*ids)[index], mctpRequester,
+                             std::bind_front(readThermalParameterCallback, eid,
+                                             ids, std::ref(mctpRequester),
+                                             callback, index, thresholds));
+    }
+}
+
+void readThermalParametersBatched(
+    uint8_t eid, const std::shared_ptr<std::vector<uint8_t>>& ids,
+    mctp::MctpRequester& mctpRequester,
+    const std::function<void(uint8_t, std::vector<int32_t>)>& callback)
+{
+    auto thresholds = std::make_shared<std::vector<int32_t>>();
+    size_t index = 0;
+    // ids must be non-null and non-empty: ids[0] is read unconditionally.
+    readThermalParameter(
+        eid, (*ids)[index], mctpRequester,
+        std::bind_front(readThermalParameterCallback, eid, ids,
+                        std::ref(mctpRequester), callback, index, thresholds));
+}
diff --git a/src/gpu/GpuThresholds.hpp b/src/gpu/GpuThresholds.hpp
new file mode 100644
index 0000000..2b426b4
--- /dev/null
+++ b/src/gpu/GpuThresholds.hpp
@@ -0,0 +1,26 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION &
+ * AFFILIATES. All rights reserved. SPDX-License-Identifier: Apache-2.0
+ */
+
+#pragma once
+
+#include "MctpRequester.hpp"
+
+#include <cstdint>
+#include <functional>
+#include <memory>
+#include <vector>
+
+/** @brief Read thermal parameters for multiple sensors, one request at a time
+ *
+ *  @param[in] eid - Endpoint ID
+ *  @param[in] ids - Sensor IDs to read; must be non-null and non-empty
+ *  @param[in] mctpRequester - Reference to MCTP requester
+ *  @param[in] callback - Callback function to process results
+ *              Takes a result code (0 on success) and the threshold values
+ */
+void readThermalParametersBatched(
+    uint8_t eid, const std::shared_ptr<std::vector<uint8_t>>& ids,
+    mctp::MctpRequester& mctpRequester,
+    const std::function<void(uint8_t, std::vector<int32_t>)>& callback);
diff --git a/src/gpu/meson.build b/src/gpu/meson.build
index c38d254..da9a4b9 100644
--- a/src/gpu/meson.build
+++ b/src/gpu/meson.build
@@ -4,6 +4,7 @@
     'GpuSensor.cpp',
     'GpuSensorMain.cpp',
     'GpuTLimitSensor.cpp',
+    'GpuThresholds.cpp',
     'MctpRequester.cpp',
     'OcpMctpVdm.cpp',
 )