nvidia-gpu: add thresholds support to TLimit This patch introduces support for retrieving GPU TLimit thresholds directly from the GPU device. TLimit Temperature represents the difference in degrees Celsius between the current GPU temperature and the initial throttle threshold. The patch also enables the extraction of three critical throttle thresholds — Warning Low, Critical Low, and Hard Shutdown Low — from the GPU hardware. Tested: Build an image for the gb200nvl-obmc machine with the following patches cherry-picked. These patches are needed to enable the mctp stack. https://gerrit.openbmc.org/c/openbmc/openbmc/+/79422 ``` $ curl -s -k -u 'root:0penBmc' https://10.137.203.137/redfish/v1/Chassis/NVIDIA_GB200_1/Sensors/temperature_NVIDIA_GB200_GPU_0_TEMP_1 { "@odata.id": "/redfish/v1/Chassis/NVIDIA_GB200_1/Sensors/temperature_NVIDIA_GB200_GPU_0_TEMP_1", "@odata.type": "#Sensor.v1_2_0.Sensor", "Id": "temperature_NVIDIA_GB200_GPU_0_TEMP_1", "Name": "NVIDIA GB200 GPU 0 TEMP 1", "Reading": 57.3984375, "ReadingRangeMax": 127.0, "ReadingRangeMin": -128.0, "ReadingType": "Temperature", "ReadingUnits": "Cel", "Status": { "Health": "OK", "State": "Enabled" }, "Thresholds": { "LowerCaution": { "Reading": 0.0 }, "LowerCritical": { "Reading": 0.0 }, "LowerFatal": { "Reading": 0.0 } } }% ``` Change-Id: I6f2ff2652ce9246287f9bd63c4297d9ad3229963 Signed-off-by: Harshit Aghera <haghera@nvidia.com>

commit: 5e7deccd14dcac790028a6641291cc019c1c4e52 [log] [tgz]
author: Harshit Aghera <haghera@nvidia.com> Wed May 07 16:20:16 2025 +0530
committer: Ed Tanous <ed@tanous.net> Thu Jul 10 15:01:22 2025 +0000
tree: 84807211fb4ed2e7959c95f4c8c39b410e3bb3b3
parent: ba138dae62cfd96571372a3e22317dc57ab72c80 [diff] [blame]
diff --git a/src/nvidia-gpu/NvidiaGpuThresholds.cpp b/src/nvidia-gpu/NvidiaGpuThresholds.cpp
new file mode 100644
index 0000000..16141f1
--- /dev/null
+++ b/src/nvidia-gpu/NvidiaGpuThresholds.cpp

@@ -0,0 +1,128 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION &
+ * AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include "NvidiaGpuThresholds.hpp"
+
+#include <MctpRequester.hpp>
+#include <NvidiaGpuMctpVdm.hpp>
+#include <OcpMctpVdm.hpp>
+#include <phosphor-logging/lg2.hpp>
+
+#include <array>
+#include <cerrno>
+#include <cstddef>
+#include <cstdint>
+#include <functional>
+#include <memory>
+#include <span>
+#include <vector>
+
+// Decode the response to a Read Thermal Parameters request and report the
+// outcome through `callback`.
+//
+// callback          - invoked exactly once as callback(status, threshold):
+//                     (EPROTO, 0) when the MCTP exchange failed, the response
+//                     could not be decoded, or the completion code is not
+//                     SUCCESS; (0, threshold) otherwise.
+// respMsg           - raw response bytes; only examined when
+//                     sendRecvMsgResult is 0.
+// sendRecvMsgResult - result of the MCTP send/receive; 0 means success.
+void processReadThermalParameterResponse(
+    const std::function<void(uint8_t, int32_t)>& callback,
+    const std::span<const uint8_t> respMsg, int sendRecvMsgResult)
+{
+    // Transport failure: the response buffer holds nothing meaningful, so
+    // do not attempt to decode it.
+    if (sendRecvMsgResult != 0)
+    {
+        lg2::error(
+            "Error reading thermal parameter: sending message over MCTP failed, rc={RC}",
+            "RC", sendRecvMsgResult);
+        callback(EPROTO, 0);
+        return;
+    }
+
+    ocp::accelerator_management::CompletionCode cc{};
+    uint16_t reasonCode = 0;
+    int32_t threshold = 0;
+
+    auto rc = gpu::decodeReadThermalParametersResponse(respMsg, cc, reasonCode,
+                                                       threshold);
+
+    // Both a decoder failure and a non-SUCCESS completion code are collapsed
+    // into EPROTO for the caller; the details only go to the log.
+    if (rc != 0 || cc != ocp::accelerator_management::CompletionCode::SUCCESS)
+    {
+        lg2::error(
+            "Error reading thermal parameter: decode failed, rc={RC}, cc={CC}, reasonCode={RESC}",
+            "RC", rc, "CC", cc, "RESC", reasonCode);
+        callback(EPROTO, 0);
+        return;
+    }
+
+    callback(0, threshold);
+}
+
+// Request a single thermal parameter from the device at `eid`.
+//
+// eid           - MCTP endpoint id the request is sent to.
+// id            - thermal parameter id placed in the encoded request.
+// mctpRequester - transport used for the send/receive exchange.
+// callback      - invoked as callback(status, value). If encoding fails it is
+//                 called right away with the encoder's return code and 0;
+//                 otherwise it is called from the send/receive completion
+//                 handler via processReadThermalParameterResponse().
+void readThermalParameter(uint8_t eid, uint8_t id,
+                          mctp::MctpRequester& mctpRequester,
+                          const std::function<void(uint8_t, int32_t)>& callback)
+{
+    using RequestBuffer =
+        std::array<uint8_t, sizeof(gpu::ReadThermalParametersRequest)>;
+    using ResponseBuffer =
+        std::array<uint8_t, sizeof(gpu::ReadThermalParametersResponse)>;
+
+    // Both buffers are shared-owned so the completion handler below can keep
+    // them alive after this function has returned.
+    auto request = std::make_shared<RequestBuffer>();
+    auto response = std::make_shared<ResponseBuffer>();
+
+    // NOTE(review): the leading 0 is presumably the message instance id -
+    // confirm against encodeReadThermalParametersRequest().
+    auto encodeResult =
+        gpu::encodeReadThermalParametersRequest(0, id, *request);
+    if (encodeResult != 0)
+    {
+        lg2::error(
+            "Error reading thermal parameter for eid {EID} and parameter id {PID} : encode failed. rc={RC}",
+            "EID", eid, "PID", id, "RC", encodeResult);
+        callback(encodeResult, 0);
+        return;
+    }
+
+    // `request` is captured only to extend its lifetime; the handler reads
+    // just the response buffer. `callback` is captured by copy.
+    mctpRequester.sendRecvMsg(
+        eid, *request, *response,
+        [request, response, callback](int transferResult) {
+            processReadThermalParameterResponse(callback, *response,
+                                                transferResult);
+        });
+}
+
+// Completion handler for one step of a sequential multi-parameter read.
+//
+// The leading arguments (eid .. thresholds) are bound ahead of time with
+// std::bind_front; `rc` and `threshold` are supplied by readThermalParameter()
+// when the read of (*ids)[index] finishes.
+//
+// eid           - MCTP endpoint id being queried.
+// ids           - full list of parameter ids to read.
+// mctpRequester - transport handed to the next readThermalParameter() call.
+// callback      - final consumer, called once as callback(status, values).
+// index         - position in `ids` of the read that just completed.
+// thresholds    - values collected so far, in the same order as `ids`.
+// rc            - status of the read that just completed; 0 means success.
+// threshold     - value returned by that read (only used when rc is 0).
+void readThermalParameterCallback(
+    uint8_t eid, const std::shared_ptr<std::vector<uint8_t>>& ids,
+    mctp::MctpRequester& mctpRequester,
+    const std::function<void(uint8_t, std::vector<int32_t>)>& callback,
+    size_t index, const std::shared_ptr<std::vector<int32_t>>& thresholds,
+    uint8_t rc, int32_t threshold)
+{
+    // A failed read ends the chain: the values gathered up to this point are
+    // handed back together with the error code.
+    if (rc != 0)
+    {
+        lg2::error(
+            "Error reading thermal parameter for eid {EID} and parameter id {PID}. rc={RC}",
+            "EID", eid, "PID", (*ids)[index], "RC", rc);
+        callback(rc, *thresholds);
+        return;
+    }
+
+    thresholds->push_back(threshold);
+
+    const size_t nextIndex = index + 1;
+    if (nextIndex != ids->size())
+    {
+        // More ids remain: issue the next read and re-enter this handler with
+        // the advanced index once it completes.
+        readThermalParameter(
+            eid, (*ids)[nextIndex], mctpRequester,
+            std::bind_front(readThermalParameterCallback, eid, ids,
+                            std::ref(mctpRequester), callback, nextIndex,
+                            thresholds));
+        return;
+    }
+
+    // Every id has been read; rc is 0 on this path.
+    callback(rc, *thresholds);
+}
+
+// Read several thermal parameters from the device at `eid`, one after
+// another, and deliver all results in a single callback.
+//
+// eid           - MCTP endpoint id to query.
+// ids           - parameter ids to read, in order. The list is copied, so the
+//                 caller's vector need not outlive this call.
+// mctpRequester - transport used for every request. It is passed on by
+//                 reference, so it must stay valid until `callback` has run.
+// callback      - called once as callback(status, values). On success status
+//                 is 0 and `values` holds one entry per id, in `ids` order.
+//                 On failure status is non-zero and `values` holds only the
+//                 entries read before the failure. An empty `ids` is reported
+//                 as EINVAL with no values.
+void readThermalParameters(
+    uint8_t eid, const std::vector<uint8_t>& ids,
+    mctp::MctpRequester& mctpRequester,
+    const std::function<void(uint8_t, std::vector<int32_t>)>& callback)
+{
+    // With no ids there is no first request to send, and ids[0] below would
+    // read out of bounds. Report the bad request instead.
+    if (ids.empty())
+    {
+        lg2::error(
+            "Error reading thermal parameters for eid {EID}: no parameter ids given",
+            "EID", eid);
+        callback(EINVAL, std::vector<int32_t>{});
+        return;
+    }
+
+    auto thresholds = std::make_shared<std::vector<int32_t>>();
+    size_t index = 0;
+
+    // Start the chain with the first id; readThermalParameterCallback()
+    // issues the remaining reads and finally invokes `callback`.
+    readThermalParameter(
+        eid, ids[index], mctpRequester,
+        std::bind_front(readThermalParameterCallback, eid,
+                        std::make_shared<std::vector<uint8_t>>(ids),
+                        std::ref(mctpRequester), callback, index, thresholds));
+}
commit	5e7deccd14dcac790028a6641291cc019c1c4e52	[log] [tgz]
author	Harshit Aghera <haghera@nvidia.com>	Wed May 07 16:20:16 2025 +0530
committer	Ed Tanous <ed@tanous.net>	Thu Jul 10 15:01:22 2025 +0000
tree	84807211fb4ed2e7959c95f4c8c39b410e3bb3b3
parent	ba138dae62cfd96571372a3e22317dc57ab72c80 [diff] [blame]