nvidia-gpu: add thresholds support to TLimit

This patch introduces support for retrieving GPU TLimit thresholds
directly from the GPU device. TLimit Temperature represents the
difference in degrees Celsius between the current GPU temperature and
the initial throttle threshold. The patch also enables the extraction of
three critical throttle thresholds — Warning Low, Critical Low, and Hard
Shutdown Low — from the GPU hardware.

Tested: Built an image for the gb200nvl-obmc machine with the following
patches cherry-picked. These patches are needed to enable the MCTP stack.

https://gerrit.openbmc.org/c/openbmc/openbmc/+/79422

```
$ curl -s -k -u 'root:0penBmc' https://10.137.203.137/redfish/v1/Chassis/NVIDIA_GB200_1/Sensors/temperature_NVIDIA_GB200_GPU_0_TEMP_1
{
  "@odata.id": "/redfish/v1/Chassis/NVIDIA_GB200_1/Sensors/temperature_NVIDIA_GB200_GPU_0_TEMP_1",
  "@odata.type": "#Sensor.v1_2_0.Sensor",
  "Id": "temperature_NVIDIA_GB200_GPU_0_TEMP_1",
  "Name": "NVIDIA GB200 GPU 0 TEMP 1",
  "Reading": 57.3984375,
  "ReadingRangeMax": 127.0,
  "ReadingRangeMin": -128.0,
  "ReadingType": "Temperature",
  "ReadingUnits": "Cel",
  "Status": {
    "Health": "OK",
    "State": "Enabled"
  },
  "Thresholds": {
    "LowerCaution": {
      "Reading": 0.0
    },
    "LowerCritical": {
      "Reading": 0.0
    },
    "LowerFatal": {
      "Reading": 0.0
    }
  }
}
```

Change-Id: I6f2ff2652ce9246287f9bd63c4297d9ad3229963
Signed-off-by: Harshit Aghera <haghera@nvidia.com>
diff --git a/src/nvidia-gpu/NvidiaGpuThresholds.cpp b/src/nvidia-gpu/NvidiaGpuThresholds.cpp
new file mode 100644
index 0000000..16141f1
--- /dev/null
+++ b/src/nvidia-gpu/NvidiaGpuThresholds.cpp
@@ -0,0 +1,189 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION &
+ * AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include "NvidiaGpuThresholds.hpp"
+
+#include <MctpRequester.hpp>
+#include <NvidiaGpuMctpVdm.hpp>
+#include <OcpMctpVdm.hpp>
+#include <phosphor-logging/lg2.hpp>
+
+#include <array>
+#include <cerrno>
+#include <cstddef>
+#include <cstdint>
+#include <functional>
+#include <memory>
+#include <span>
+#include <vector>
+
+/**
+ * @brief Decode a Read Thermal Parameters response and report the result.
+ *
+ * Invokes @p callback exactly once: with (EPROTO, 0) when the MCTP exchange
+ * or the decode failed, otherwise with (0, threshold).
+ *
+ * @param callback          Receives the status code and the decoded value.
+ * @param respMsg           Raw response bytes to decode.
+ * @param sendRecvMsgResult Result of the MCTP send/receive; 0 means success.
+ */
+void processReadThermalParameterResponse(
+    const std::function<void(uint8_t, int32_t)>& callback,
+    const std::span<const uint8_t> respMsg, int sendRecvMsgResult)
+{
+    if (sendRecvMsgResult != 0)
+    {
+        lg2::error(
+            "Error reading thermal parameter: sending message over MCTP failed, rc={RC}",
+            "RC", sendRecvMsgResult);
+        callback(EPROTO, 0);
+        return;
+    }
+
+    ocp::accelerator_management::CompletionCode cc{};
+    uint16_t reasonCode = 0;
+    int32_t threshold = 0;
+
+    auto rc = gpu::decodeReadThermalParametersResponse(respMsg, cc, reasonCode,
+                                                       threshold);
+
+    if (rc != 0 || cc != ocp::accelerator_management::CompletionCode::SUCCESS)
+    {
+        lg2::error(
+            "Error reading thermal parameter: decode failed, rc={RC}, cc={CC}, reasonCode={RESC}",
+            "RC", rc, "CC", cc, "RESC", reasonCode);
+        callback(EPROTO, 0);
+        return;
+    }
+
+    callback(0, threshold);
+}
+
+/**
+ * @brief Request a single thermal parameter from a GPU over MCTP.
+ *
+ * Encodes a Read Thermal Parameters request for @p id, sends it to @p eid and
+ * hands the response to processReadThermalParameterResponse(). If encoding
+ * fails, @p callback is invoked immediately with the encoder's return code.
+ *
+ * @param eid           MCTP endpoint ID the request is sent to.
+ * @param id            Identifier of the thermal parameter to read.
+ * @param mctpRequester Requester used to perform the MCTP exchange.
+ * @param callback      Receives the status code (0 on success) and the value.
+ */
+void readThermalParameter(uint8_t eid, uint8_t id,
+                          mctp::MctpRequester& mctpRequester,
+                          const std::function<void(uint8_t, int32_t)>& callback)
+{
+    auto reqMsg = std::make_shared<
+        std::array<uint8_t, sizeof(gpu::ReadThermalParametersRequest)>>();
+
+    auto respMsg = std::make_shared<
+        std::array<uint8_t, sizeof(gpu::ReadThermalParametersResponse)>>();
+
+    auto rc = gpu::encodeReadThermalParametersRequest(0, id, *reqMsg);
+    if (rc != 0)
+    {
+        lg2::error(
+            "Error reading thermal parameter for eid {EID} and parameter id {PID} : encode failed. rc={RC}",
+            "EID", eid, "PID", id, "RC", rc);
+        callback(rc, 0);
+        return;
+    }
+
+    // The handler keeps shared_ptr copies of both buffers, so they stay alive
+    // for as long as the requester holds the handler.
+    mctpRequester.sendRecvMsg(
+        eid, *reqMsg, *respMsg,
+        [reqMsg, respMsg, callback](int sendRecvMsgResult) {
+            processReadThermalParameterResponse(callback, *respMsg,
+                                                sendRecvMsgResult);
+        });
+}
+
+/**
+ * @brief Completion step for one parameter in a sequential multi-read.
+ *
+ * On failure, reports @p rc together with the thresholds collected so far and
+ * stops. On success, appends @p threshold and either starts the read of the
+ * next id or, after the last id, reports success with all thresholds.
+ *
+ * @param eid           MCTP endpoint ID being queried.
+ * @param ids           Parameter ids to read, in order.
+ * @param mctpRequester Requester used for the follow-up reads.
+ * @param callback      Final consumer of the status and collected values.
+ * @param index         Position in @p ids of the read that just completed.
+ * @param thresholds    Values accumulated from the completed reads.
+ * @param rc            Status of the read that just completed; 0 on success.
+ * @param threshold     Value returned by the read that just completed.
+ */
+void readThermalParameterCallback(
+    uint8_t eid, const std::shared_ptr<std::vector<uint8_t>>& ids,
+    mctp::MctpRequester& mctpRequester,
+    const std::function<void(uint8_t, std::vector<int32_t>)>& callback,
+    size_t index, const std::shared_ptr<std::vector<int32_t>>& thresholds,
+    uint8_t rc, int32_t threshold)
+{
+    if (rc != 0)
+    {
+        lg2::error(
+            "Error reading thermal parameter for eid {EID} and parameter id {PID}. rc={RC}",
+            "EID", eid, "PID", (*ids)[index], "RC", rc);
+        callback(rc, *thresholds);
+        return;
+    }
+
+    thresholds->push_back(threshold);
+
+    ++index;
+    if (index == ids->size())
+    {
+        callback(rc, *thresholds);
+    }
+    else
+    {
+        readThermalParameter(eid, (*ids)[index], mctpRequester,
+                             std::bind_front(readThermalParameterCallback, eid,
+                                             ids, std::ref(mctpRequester),
+                                             callback, index, thresholds));
+    }
+}
+
+/**
+ * @brief Read several thermal parameters from a GPU, one after another.
+ *
+ * The reads are chained: each one is issued only after the previous one has
+ * completed. @p callback is invoked with status 0 and one value per id on
+ * success, or with the failing status and the values read before the failure.
+ *
+ * @param eid           MCTP endpoint ID being queried.
+ * @param ids           Parameter ids to read, in order. May be empty.
+ * @param mctpRequester Requester used to perform the MCTP exchanges.
+ * @param callback      Receives the overall status and the values read.
+ */
+void readThermalParameters(
+    uint8_t eid, const std::vector<uint8_t>& ids,
+    mctp::MctpRequester& mctpRequester,
+    const std::function<void(uint8_t, std::vector<int32_t>)>& callback)
+{
+    // Nothing to read: report success with no values instead of indexing into
+    // an empty vector below.
+    if (ids.empty())
+    {
+        callback(0, std::vector<int32_t>{});
+        return;
+    }
+
+    auto thresholds = std::make_shared<std::vector<int32_t>>();
+    thresholds->reserve(ids.size());
+    size_t index = 0;
+
+    readThermalParameter(
+        eid, ids[index], mctpRequester,
+        std::bind_front(readThermalParameterCallback, eid,
+                        std::make_shared<std::vector<uint8_t>>(ids),
+                        std::ref(mctpRequester), callback, index, thresholds));
+}