gpu : add thresholds support to TLimit

This patch adds support to fetch TLimit thresholds from gpu

Tested.

The TEMP_0 update is disabled while testing this patch as it requires
MCTP request queueing since OCP MCTP VDM specifies at max one
outstanding request to the device. The MCTP request queueing is being
introduces with this patch -
https://gerrit.openbmc.org/c/openbmc/dbus-sensors/+/80023

Build an image for gb200nvl-obmc machine with the following patches
cherry picked. This patches are needed to enable the mctp stack.

https://gerrit.openbmc.org/c/openbmc/openbmc/+/79312
https://gerrit.openbmc.org/c/openbmc/openbmc/+/79410
https://gerrit.openbmc.org/c/openbmc/openbmc/+/79422

Copy the configuration file on gb200nvl-obmc machine and restart the
entity-manager service.
```
root@gb200nvl-obmc:~# rm -rf /var/configuration/
root@gb200nvl-obmc:~# systemctl restart xyz.openbmc_project.EntityManager.service
```

Copy the gpusensor app and run it.
```
root@gb200nvl-obmc:~# ./gpusensor
```

```
$ curl -k -u 'root:0penBmc' https://10.137.203.137/redfish/v1/Chassis/NVIDIA_GB200_1/Sensors/temperature_NVIDIA_GB200_GPU_TEMP_1
{
  "@odata.id": "/redfish/v1/Chassis/NVIDIA_GB200_1/Sensors/temperature_NVIDIA_GB200_GPU_TEMP_1",
  "@odata.type": "#Sensor.v1_2_0.Sensor",
  "Id": "temperature_NVIDIA_GB200_GPU_TEMP_1",
  "Name": "NVIDIA GB200 GPU TEMP 1",
  "Reading": 49.0,
  "ReadingRangeMax": 127.0,
  "ReadingRangeMin": -128.0,
  "ReadingType": "Temperature",
  "ReadingUnits": "Cel",
  "Status": {
    "Health": "OK",
    "State": "Enabled"
  },
  "Thresholds": {
    "LowerCaution": {
      "Reading": 0.0
    },
    "LowerCritical": {
      "Reading": 0.0
    }
  }
}%

root@gb200nvl-obmc:~# busctl introspect xyz.openbmc_project.GpuSensor /xyz/openbmc_project/sensors/temperature/NVIDIA_GB200_GPU_TEMP_1
NAME                                                  TYPE      SIGNATURE RESULT/VALUE                             FLAGS
org.freedesktop.DBus.Introspectable                   interface -         -                                        -
.Introspect                                           method    -         s                                        -
org.freedesktop.DBus.Peer                             interface -         -                                        -
.GetMachineId                                         method    -         s                                        -
.Ping                                                 method    -         -                                        -
org.freedesktop.DBus.Properties                       interface -         -                                        -
.Get                                                  method    ss        v                                        -
.GetAll                                               method    s         a{sv}                                    -
.Set                                                  method    ssv       -                                        -
.PropertiesChanged                                    signal    sa{sv}as  -                                        -
xyz.openbmc_project.Association.Definitions           interface -         -                                        -
.Associations                                         property  a(sss)    1 "chassis" "all_sensors" "/xyz/openbmc… emits-change
xyz.openbmc_project.Inventory.Item                    interface -         -                                        -
.PrettyName                                           property  s         "Thermal Limit(TLIMIT) Temperature is t… emits-change
xyz.openbmc_project.Sensor.Threshold.Critical         interface -         -                                        -
.CriticalAlarmHigh                                    property  b         false                                    emits-change
.CriticalAlarmLow                                     property  b         false                                    emits-change
.CriticalHigh                                         property  d         nan                                      emits-change writable
.CriticalLow                                          property  d         0                                        emits-change writable
xyz.openbmc_project.Sensor.Threshold.HardShutdown     interface -         -                                        -
.HardShutdownAlarmHigh                                property  b         false                                    emits-change
.HardShutdownAlarmLow                                 property  b         false                                    emits-change
.HardShutdownHigh                                     property  d         nan                                      emits-change writable
.HardShutdownLow                                      property  d         0                                        emits-change writable
xyz.openbmc_project.Sensor.Threshold.Warning          interface -         -                                        -
.WarningAlarmHigh                                     property  b         false                                    emits-change
.WarningAlarmLow                                      property  b         false                                    emits-change
.WarningHigh                                          property  d         nan                                      emits-change writable
.WarningLow                                           property  d         0                                        emits-change writable
xyz.openbmc_project.Sensor.Value                      interface -         -                                        -
.MaxValue                                             property  d         127                                      emits-change
.MinValue                                             property  d         -128                                     emits-change
.Unit                                                 property  s         "xyz.openbmc_project.Sensor.Value.Unit.… emits-change
.Value                                                property  d         48.9688                                  emits-change writable
xyz.openbmc_project.Sensor.ValueMutability            interface -         -                                        -
.Mutable                                              property  b         true                                     emits-change
xyz.openbmc_project.State.Decorator.Availability      interface -         -                                        -
.Available                                            property  b         true                                     emits-change writable
xyz.openbmc_project.State.Decorator.OperationalStatus interface -         -                                        -
.Functional                                           property  b         true                                     emits-change
```

Change-Id: I6f2ff2652ce9246287f9bd63c4297d9ad3229963
Signed-off-by: Harshit Aghera <haghera@nvidia.com>
diff --git a/src/gpu/GpuThresholds.cpp b/src/gpu/GpuThresholds.cpp
new file mode 100644
index 0000000..9eb17c4
--- /dev/null
+++ b/src/gpu/GpuThresholds.cpp
@@ -0,0 +1,122 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION &
+ * AFFILIATES. All rights reserved. SPDX-License-Identifier: Apache-2.0
+ */
+
+#include <GpuMctpVdm.hpp>
+#include <MctpRequester.hpp>
+#include <OcpMctpVdm.hpp>
+#include <phosphor-logging/lg2.hpp>
+
+#include <cstddef>
+#include <cstdint>
+#include <functional>
+#include <memory>
+#include <vector>
+
+void readThermalParameter(uint8_t eid, uint8_t id,
+                          mctp::MctpRequester& mctpRequester,
+                          const std::function<void(uint8_t, int32_t)>& callback)
+{
+    std::vector<uint8_t> reqMsg(
+        sizeof(ocp::accelerator_management::BindingPciVid) +
+        sizeof(gpu::ReadThermalParametersRequest));
+
+    auto* msg = new (reqMsg.data()) ocp::accelerator_management::Message;
+
+    auto rc = gpu::encodeReadThermalParametersRequest(0, id, *msg);
+    if (rc != ocp::accelerator_management::CompletionCode::SUCCESS)
+    {
+        lg2::error("encodeReadThermalParametersRequest failed, rc={RC}", "RC",
+                   static_cast<int>(rc));
+
+        callback(-1, 0);
+        return;
+    }
+
+    mctpRequester.sendRecvMsg(
+        eid, reqMsg,
+        [callback](int sendRecvMsgResult, std::vector<uint8_t> respMsg) {
+            if (sendRecvMsgResult != 0)
+            {
+                lg2::error("MctpRequester::sendRecvMsg() failed, rc={RC}", "RC",
+                           sendRecvMsgResult);
+
+                callback(-2, 0);
+                return;
+            }
+
+            if (respMsg.empty())
+            {
+                lg2::error("MctpRequester::sendRecvMsg() failed, respMsgLen=0");
+
+                callback(-3, 0);
+                return;
+            }
+
+            uint8_t cc = 0;
+            uint16_t reasonCode = 0;
+            int32_t threshold = 0;
+
+            auto rc = gpu::decodeReadThermalParametersResponse(
+                *new (respMsg.data()) ocp::accelerator_management::Message,
+                respMsg.size(), cc, reasonCode, threshold);
+
+            if (rc != ocp::accelerator_management::CompletionCode::SUCCESS ||
+                cc != static_cast<uint8_t>(
+                          ocp::accelerator_management::CompletionCode::SUCCESS))
+            {
+                lg2::error(
+                    "decodeReadThermalParametersResponse() failed, rc={RC} cc={CC} reasonCode={RESC}",
+                    "RC", static_cast<int>(rc), "CC", cc, "RESC", reasonCode);
+
+                callback(-4, 0);
+                return;
+            }
+
+            callback(0, threshold);
+        });
+}
+
+void readThermalParameterCallback(
+    uint8_t eid, const std::shared_ptr<std::vector<uint8_t>>& ids,
+    mctp::MctpRequester& mctpRequester,
+    const std::function<void(uint8_t, std::vector<int32_t>)>& callback,
+    size_t index, const std::shared_ptr<std::vector<int32_t>>& thresholds,
+    uint8_t rc, int32_t threshold)
+{
+    if (rc != 0)
+    {
+        callback(rc, *thresholds);
+        return;
+    }
+
+    thresholds->push_back(threshold);
+
+    ++index;
+    if (index == ids->size())
+    {
+        callback(rc, *thresholds);
+    }
+    else
+    {
+        readThermalParameter(eid, (*ids)[index], mctpRequester,
+                             std::bind_front(readThermalParameterCallback, eid,
+                                             ids, std::ref(mctpRequester),
+                                             callback, index, thresholds));
+    }
+}
+
+void readThermalParametersBatched(
+    uint8_t eid, const std::shared_ptr<std::vector<uint8_t>>& ids,
+    mctp::MctpRequester& mctpRequester,
+    const std::function<void(uint8_t, std::vector<int32_t>)>& callback)
+{
+    auto thresholds = std::make_shared<std::vector<int32_t>>();
+    size_t index = 0;
+
+    readThermalParameter(
+        eid, (*ids)[index], mctpRequester,
+        std::bind_front(readThermalParameterCallback, eid, ids,
+                        std::ref(mctpRequester), callback, index, thresholds));
+}