gpu : add thresholds support to TLimit
This patch adds support to fetch TLimit thresholds from gpu
Tested.
The TEMP_0 update is disabled while testing this patch because it requires
MCTP request queueing: the OCP MCTP VDM specification allows at most one
outstanding request to the device. MCTP request queueing is being
introduced with this patch -
https://gerrit.openbmc.org/c/openbmc/dbus-sensors/+/80023
Build an image for the gb200nvl-obmc machine with the following patches
cherry-picked. These patches are needed to enable the MCTP stack.
https://gerrit.openbmc.org/c/openbmc/openbmc/+/79312
https://gerrit.openbmc.org/c/openbmc/openbmc/+/79410
https://gerrit.openbmc.org/c/openbmc/openbmc/+/79422
Copy the configuration file to the gb200nvl-obmc machine and restart the
entity-manager service.
```
root@gb200nvl-obmc:~# rm -rf /var/configuration/
root@gb200nvl-obmc:~# systemctl restart xyz.openbmc_project.EntityManager.service
```
Copy the gpusensor app and run it.
```
root@gb200nvl-obmc:~# ./gpusensor
```
```
$ curl -k -u 'root:0penBmc' https://10.137.203.137/redfish/v1/Chassis/NVIDIA_GB200_1/Sensors/temperature_NVIDIA_GB200_GPU_TEMP_1
{
"@odata.id": "/redfish/v1/Chassis/NVIDIA_GB200_1/Sensors/temperature_NVIDIA_GB200_GPU_TEMP_1",
"@odata.type": "#Sensor.v1_2_0.Sensor",
"Id": "temperature_NVIDIA_GB200_GPU_TEMP_1",
"Name": "NVIDIA GB200 GPU TEMP 1",
"Reading": 49.0,
"ReadingRangeMax": 127.0,
"ReadingRangeMin": -128.0,
"ReadingType": "Temperature",
"ReadingUnits": "Cel",
"Status": {
"Health": "OK",
"State": "Enabled"
},
"Thresholds": {
"LowerCaution": {
"Reading": 0.0
},
"LowerCritical": {
"Reading": 0.0
}
}
}%
root@gb200nvl-obmc:~# busctl introspect xyz.openbmc_project.GpuSensor /xyz/openbmc_project/sensors/temperature/NVIDIA_GB200_GPU_TEMP_1
NAME TYPE SIGNATURE RESULT/VALUE FLAGS
org.freedesktop.DBus.Introspectable interface - - -
.Introspect method - s -
org.freedesktop.DBus.Peer interface - - -
.GetMachineId method - s -
.Ping method - - -
org.freedesktop.DBus.Properties interface - - -
.Get method ss v -
.GetAll method s a{sv} -
.Set method ssv - -
.PropertiesChanged signal sa{sv}as - -
xyz.openbmc_project.Association.Definitions interface - - -
.Associations property a(sss) 1 "chassis" "all_sensors" "/xyz/openbmc… emits-change
xyz.openbmc_project.Inventory.Item interface - - -
.PrettyName property s "Thermal Limit(TLIMIT) Temperature is t… emits-change
xyz.openbmc_project.Sensor.Threshold.Critical interface - - -
.CriticalAlarmHigh property b false emits-change
.CriticalAlarmLow property b false emits-change
.CriticalHigh property d nan emits-change writable
.CriticalLow property d 0 emits-change writable
xyz.openbmc_project.Sensor.Threshold.HardShutdown interface - - -
.HardShutdownAlarmHigh property b false emits-change
.HardShutdownAlarmLow property b false emits-change
.HardShutdownHigh property d nan emits-change writable
.HardShutdownLow property d 0 emits-change writable
xyz.openbmc_project.Sensor.Threshold.Warning interface - - -
.WarningAlarmHigh property b false emits-change
.WarningAlarmLow property b false emits-change
.WarningHigh property d nan emits-change writable
.WarningLow property d 0 emits-change writable
xyz.openbmc_project.Sensor.Value interface - - -
.MaxValue property d 127 emits-change
.MinValue property d -128 emits-change
.Unit property s "xyz.openbmc_project.Sensor.Value.Unit.… emits-change
.Value property d 48.9688 emits-change writable
xyz.openbmc_project.Sensor.ValueMutability interface - - -
.Mutable property b true emits-change
xyz.openbmc_project.State.Decorator.Availability interface - - -
.Available property b true emits-change writable
xyz.openbmc_project.State.Decorator.OperationalStatus interface - - -
.Functional property b true emits-change
```
Change-Id: I6f2ff2652ce9246287f9bd63c4297d9ad3229963
Signed-off-by: Harshit Aghera <haghera@nvidia.com>
diff --git a/src/gpu/GpuDevice.cpp b/src/gpu/GpuDevice.cpp
index 11423dd..54b6df2 100644
--- a/src/gpu/GpuDevice.cpp
+++ b/src/gpu/GpuDevice.cpp
@@ -13,6 +13,7 @@
#include <bits/basic_string.h>
#include <GpuMctpVdm.hpp>
+#include <GpuThresholds.hpp>
#include <MctpRequester.hpp>
#include <OcpMctpVdm.hpp>
#include <boost/asio/io_context.hpp>
@@ -59,12 +60,33 @@
conn, mctpRequester, name + "_TEMP_0", path, eid, objectServer,
std::vector<thresholds::Threshold>{}));
- sensors.push_back(std::make_shared<GpuTLimitSensor>(
- conn, mctpRequester, name + "_TEMP_1", path, eid, objectServer,
- std::vector<thresholds::Threshold>{}));
-
lg2::info("Added GPU {NAME} Sensors with chassis path: {PATH}.", "NAME",
name, "PATH", path);
+
+ readThermalParametersBatched(
+ eid,
+ std::make_shared<std::vector<uint8_t>>(std::vector<uint8_t>{1, 2, 4}),
+ mctpRequester, [this](uint8_t rc, std::vector<int32_t> thresholds) {
+ if (rc)
+ {
+ return;
+ }
+
+ std::vector<thresholds::Threshold> tLimitThresholds{
+ thresholds::Threshold{thresholds::Level::CRITICAL,
+ thresholds::Direction::LOW,
+ static_cast<double>(thresholds[0])},
+ thresholds::Threshold{thresholds::Level::WARNING,
+ thresholds::Direction::LOW,
+ static_cast<double>(thresholds[1])},
+ thresholds::Threshold{thresholds::Level::HARDSHUTDOWN,
+ thresholds::Direction::LOW,
+ static_cast<double>(thresholds[2])}};
+
+ sensors.push_back(std::make_shared<GpuTLimitSensor>(
+ conn, mctpRequester, name + "_TEMP_1", path, eid, objectServer,
+ std::move(tLimitThresholds)));
+ });
}
void GpuDevice::read()
diff --git a/src/gpu/GpuMctpVdm.cpp b/src/gpu/GpuMctpVdm.cpp
index 1afc6ca..24bc9b3 100644
--- a/src/gpu/GpuMctpVdm.cpp
+++ b/src/gpu/GpuMctpVdm.cpp
@@ -270,4 +270,129 @@
return ocp::accelerator_management::CompletionCode::SUCCESS;
}
+
+/**
+ * Build a READ_THERMAL_PARAMETERS request for a single sensor.
+ *
+ * A PLATFORM_ENVIRONMENTAL request header is packed into msg.hdr, then the
+ * request payload (command, data size and sensor id) is copied into msg.data.
+ *
+ * @param[in] instanceId - instance ID; masked with instanceIdMask
+ * @param[in] sensorId - id of the sensor whose thermal parameter is requested
+ * @param[out] msg - message that receives the encoded request
+ * @return SUCCESS, or the error reported by packHeader()
+ */
+ocp::accelerator_management::CompletionCode encodeReadThermalParametersRequest(
+    uint8_t instanceId, uint8_t sensorId,
+    ocp::accelerator_management::Message& msg)
+{
+    namespace am = ocp::accelerator_management;
+
+    am::BindingPciVidInfo info{};
+    info.msg_type = static_cast<uint8_t>(MessageType::PLATFORM_ENVIRONMENTAL);
+    info.instance_id = instanceId & am::instanceIdMask;
+    info.ocp_accelerator_management_msg_type =
+        static_cast<uint8_t>(am::MessageType::REQUEST);
+
+    const auto headerStatus = packHeader(info, msg.hdr);
+    if (headerStatus != am::CompletionCode::SUCCESS)
+    {
+        return headerStatus;
+    }
+
+    ReadThermalParametersRequest payload{};
+    payload.sensor_id = sensorId;
+    payload.hdr.data_size = sizeof(sensorId);
+    payload.hdr.command = static_cast<uint8_t>(
+        PlatformEnvironmentalCommands::READ_THERMAL_PARAMETERS);
+
+    std::memcpy(&msg.data, &payload, sizeof(payload));
+
+    return am::CompletionCode::SUCCESS;
+}
+
+/**
+ * Extract the sensor id from a READ_THERMAL_PARAMETERS request.
+ *
+ * @param[in] msg - request message
+ * @param[in] msgLen - length of the request message in bytes
+ * @param[out] sensorId - receives the sensor id; written only on SUCCESS
+ * @return ERR_INVALID_DATA_LENGTH when msgLen is smaller than the binding
+ *         header plus the request payload, ERR_INVALID_DATA when the payload
+ *         declares fewer data bytes than a sensor id needs, else SUCCESS
+ */
+ocp::accelerator_management::CompletionCode decodeReadThermalParametersRequest(
+    const ocp::accelerator_management::Message& msg, size_t msgLen,
+    uint8_t& sensorId)
+{
+    namespace am = ocp::accelerator_management;
+
+    constexpr size_t minimumLength =
+        sizeof(am::BindingPciVid) + sizeof(ReadThermalParametersRequest);
+    if (msgLen < minimumLength)
+    {
+        return am::CompletionCode::ERR_INVALID_DATA_LENGTH;
+    }
+
+    // Copy out of the message instead of aliasing it.
+    ReadThermalParametersRequest payload{};
+    std::memcpy(&payload, &msg.data, sizeof(payload));
+
+    if (payload.hdr.data_size < sizeof(payload.sensor_id))
+    {
+        return am::CompletionCode::ERR_INVALID_DATA;
+    }
+
+    sensorId = payload.sensor_id;
+    return am::CompletionCode::SUCCESS;
+}
+
+/**
+ * Build a READ_THERMAL_PARAMETERS response.
+ *
+ * A PLATFORM_ENVIRONMENTAL response header is always packed into msg.hdr.
+ * When cc is not SUCCESS the payload is produced by gpu::encodeReasonCode();
+ * otherwise the payload carries the threshold, stored with htole32().
+ *
+ * @param[in] instanceId - instance ID; masked with instanceIdMask
+ * @param[in] cc - completion code to report
+ * @param[in] reasonCode - reason code, used only when cc is not SUCCESS
+ * @param[in] threshold - thermal threshold, used only when cc is SUCCESS
+ * @param[out] msg - message that receives the encoded response
+ * @return the error from packHeader(), the result of encodeReasonCode() for
+ *         a non-success cc, else SUCCESS
+ */
+ocp::accelerator_management::CompletionCode encodeReadThermalParametersResponse(
+    uint8_t instanceId, uint8_t cc, uint16_t reasonCode, int32_t threshold,
+    ocp::accelerator_management::Message& msg)
+{
+    namespace am = ocp::accelerator_management;
+
+    const auto commandCode = static_cast<uint8_t>(
+        PlatformEnvironmentalCommands::READ_THERMAL_PARAMETERS);
+    const auto successCode = static_cast<uint8_t>(am::CompletionCode::SUCCESS);
+
+    am::BindingPciVidInfo info{};
+    info.msg_type = static_cast<uint8_t>(MessageType::PLATFORM_ENVIRONMENTAL);
+    info.instance_id = instanceId & am::instanceIdMask;
+    info.ocp_accelerator_management_msg_type =
+        static_cast<uint8_t>(am::MessageType::RESPONSE);
+
+    const auto headerStatus = packHeader(info, msg.hdr);
+    if (headerStatus != am::CompletionCode::SUCCESS)
+    {
+        return headerStatus;
+    }
+
+    if (cc != successCode)
+    {
+        // Failure responses carry a reason code instead of a threshold.
+        return gpu::encodeReasonCode(cc, reasonCode, commandCode, msg);
+    }
+
+    ReadThermalParametersResponse payload{};
+    payload.threshold = htole32(threshold);
+    payload.hdr.data_size = htole16(sizeof(uint32_t));
+    payload.hdr.completion_code = cc;
+    payload.hdr.command = commandCode;
+
+    std::memcpy(&msg.data, &payload, sizeof(payload));
+
+    return am::CompletionCode::SUCCESS;
+}
+
+/**
+ * Decode a READ_THERMAL_PARAMETERS response.
+ *
+ * The completion code and reason code are decoded first. threshold is
+ * written only when that step succeeds, cc reports SUCCESS and the payload
+ * passes the length checks below.
+ *
+ * @param[in] msg - response message
+ * @param[in] msgLen - length of the response message in bytes
+ * @param[out] cc - completion code carried by the response
+ * @param[out] reasonCode - reason code carried by the response
+ * @param[out] threshold - thermal threshold in host byte order
+ * @return the result of gpu::decodeReasonCodeAndCC() when it fails or when cc
+ *         is not SUCCESS (so callers must inspect cc as well as the return
+ *         value), ERR_INVALID_DATA_LENGTH when the message is too short,
+ *         ERR_INVALID_DATA when the declared data size is not 4 bytes,
+ *         else SUCCESS
+ */
+ocp::accelerator_management::CompletionCode decodeReadThermalParametersResponse(
+    const ocp::accelerator_management::Message& msg, size_t msgLen, uint8_t& cc,
+    uint16_t& reasonCode, int32_t& threshold)
+{
+    auto rc = gpu::decodeReasonCodeAndCC(msg, msgLen, cc, reasonCode);
+    if (rc != ocp::accelerator_management::CompletionCode::SUCCESS ||
+        cc != static_cast<uint8_t>(
+                  ocp::accelerator_management::CompletionCode::SUCCESS))
+    {
+        // Either the message could not be decoded, or it is a well-formed
+        // error response; in both cases there is no threshold to extract.
+        return rc;
+    }
+
+    if (msgLen < sizeof(ocp::accelerator_management::BindingPciVid) +
+                     sizeof(ReadThermalParametersResponse))
+    {
+        return ocp::accelerator_management::CompletionCode::
+            ERR_INVALID_DATA_LENGTH;
+    }
+
+    ReadThermalParametersResponse response{};
+    std::memcpy(&response, &msg.data, sizeof(response));
+
+    uint16_t dataSize = le16toh(response.hdr.data_size);
+    if (dataSize != sizeof(int32_t))
+    {
+        return ocp::accelerator_management::CompletionCode::ERR_INVALID_DATA;
+    }
+
+    // The encoder stores the threshold with htole32(); convert it back to
+    // host byte order so the value is also correct on big-endian hosts.
+    threshold = static_cast<int32_t>(le32toh(response.threshold));
+
+    return ocp::accelerator_management::CompletionCode::SUCCESS;
+}
} // namespace gpu
diff --git a/src/gpu/GpuMctpVdm.hpp b/src/gpu/GpuMctpVdm.hpp
index 21c69cd..6663df1 100644
--- a/src/gpu/GpuMctpVdm.hpp
+++ b/src/gpu/GpuMctpVdm.hpp
@@ -42,6 +42,7 @@
enum class PlatformEnvironmentalCommands : uint8_t
{
GET_TEMPERATURE_READING = 0x00,
+ READ_THERMAL_PARAMETERS = 0x02, // command byte of the ReadThermalParameters request/response
};
/** @brief device identification types
@@ -92,6 +93,12 @@
*/
using GetTemperatureReadingRequest = GetNumericSensorReadingRequest;
+/** @typedef ReadThermalParametersRequest
+ *
+ * Request to read thermal parameters. This is an alias of
+ * GetNumericSensorReadingRequest, so both requests share one layout.
+ */
+using ReadThermalParametersRequest = GetNumericSensorReadingRequest;
+
/** @struct GetTemperatureReadingResponse
*
* Structure representing get temperature reading response.
@@ -102,6 +109,17 @@
int32_t reading;
} __attribute__((packed));
+/** @struct ReadThermalParametersResponse
+ *
+ * Structure representing response to read thermal parameters request.
+ * Contains the thermal threshold value for the requested sensor.
+ *
+ * hdr - common response header; the encoder fills in its command,
+ * completion_code and data_size fields.
+ * threshold - thermal threshold; the encoder stores it with htole32(), so
+ * the field holds a little-endian value inside the message.
+ */
+struct ReadThermalParametersResponse
+{
+ ocp::accelerator_management::CommonResponse hdr;
+ int32_t threshold;
+} __attribute__((packed));
+
/**
* @brief Populate the GPU message with the GPU header.
* The caller of this API allocates buffer for the GPU header
@@ -243,4 +261,56 @@
const ocp::accelerator_management::Message& msg, size_t msgLen, uint8_t& cc,
uint16_t& reasonCode, double& temperatureReading);
+/** @brief Encode a Read thermal parameters request message
+ *
+ * @param[in] instanceId - instance ID
+ * @param[in] sensorId - sensor id
+ * @param[out] msg - Reference to message that will be written to
+ * @return ocp::accelerator_management::CompletionCode::SUCCESS on success,
+ * otherwise appropriate error code.
+ */
+ocp::accelerator_management::CompletionCode encodeReadThermalParametersRequest(
+ uint8_t instanceId, uint8_t sensorId,
+ ocp::accelerator_management::Message& msg);
+
+/** @brief Decode a Read thermal parameters request message
+ *
+ * @param[in] msg - request message
+ * @param[in] msgLen - Length of request message
+ * @param[out] sensorId - reference to sensor id
+ * @return ocp::accelerator_management::CompletionCode::SUCCESS on success,
+ * otherwise appropriate error code.
+ */
+ocp::accelerator_management::CompletionCode decodeReadThermalParametersRequest(
+ const ocp::accelerator_management::Message& msg, size_t msgLen,
+ uint8_t& sensorId);
+
+/** @brief Encode a Read thermal parameters response message
+ *
+ * @param[in] instanceId - instance ID
+ * @param[in] cc - completion code
+ * @param[in] reasonCode - reason code
+ * @param[in] threshold - thermal threshold
+ * @param[out] msg - Reference to message that will be written to
+ * @return ocp::accelerator_management::CompletionCode::SUCCESS on success,
+ * otherwise appropriate error code.
+ */
+ocp::accelerator_management::CompletionCode encodeReadThermalParametersResponse(
+ uint8_t instanceId, uint8_t cc, uint16_t reasonCode, int32_t threshold,
+ ocp::accelerator_management::Message& msg);
+
+/** @brief Decode a Read thermal parameters response message
+ *
+ * @param[in] msg - response message
+ * @param[in] msgLen - Length of response message
+ * @param[out] cc - reference to completion code
+ * @param[out] reasonCode - reference to reason code
+ * @param[out] threshold - reference to thermal threshold
+ * @return ocp::accelerator_management::CompletionCode::SUCCESS on success,
+ * otherwise appropriate error code.
+ */
+ocp::accelerator_management::CompletionCode decodeReadThermalParametersResponse(
+ const ocp::accelerator_management::Message& msg, size_t msgLen, uint8_t& cc,
+ uint16_t& reasonCode, int32_t& threshold);
+
} // namespace gpu
diff --git a/src/gpu/GpuThresholds.cpp b/src/gpu/GpuThresholds.cpp
new file mode 100644
index 0000000..9eb17c4
--- /dev/null
+++ b/src/gpu/GpuThresholds.cpp
@@ -0,0 +1,122 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION &
+ * AFFILIATES. All rights reserved. SPDX-License-Identifier: Apache-2.0
+ */
+
+#include <GpuMctpVdm.hpp>
+#include <MctpRequester.hpp>
+#include <OcpMctpVdm.hpp>
+#include <phosphor-logging/lg2.hpp>
+
+#include <cstddef>
+#include <cstdint>
+#include <functional>
+#include <memory>
+#include <vector>
+
+/**
+ * @brief Request one thermal parameter from a device and report the result.
+ *
+ * Encodes a ReadThermalParameters request for sensor `id`, hands it to
+ * `mctpRequester.sendRecvMsg()` for endpoint `eid` and decodes the reply in
+ * the completion handler.
+ *
+ * `callback` receives (0, threshold) on success. On failure it receives a
+ * threshold of 0 and one of the codes below; because the first callback
+ * argument is a uint8_t, the negative values arrive converted to uint8_t:
+ *   -1  the request could not be encoded (callback runs before returning)
+ *   -2  sendRecvMsg() reported a non-zero result
+ *   -3  the response was empty
+ *   -4  the response failed to decode or carried a non-success completion
+ *       code
+ *
+ * @param[in] eid - Endpoint ID
+ * @param[in] id - sensor id to query
+ * @param[in] mctpRequester - requester used to send the message
+ * @param[in] callback - completion callback; copied into the response handler
+ */
+void readThermalParameter(uint8_t eid, uint8_t id,
+                          mctp::MctpRequester& mctpRequester,
+                          const std::function<void(uint8_t, int32_t)>& callback)
+{
+    namespace am = ocp::accelerator_management;
+
+    constexpr size_t requestLength =
+        sizeof(am::BindingPciVid) + sizeof(gpu::ReadThermalParametersRequest);
+    std::vector<uint8_t> request(requestLength);
+
+    auto* outgoing = new (request.data()) am::Message;
+
+    const auto encodeStatus =
+        gpu::encodeReadThermalParametersRequest(0, id, *outgoing);
+    if (encodeStatus != am::CompletionCode::SUCCESS)
+    {
+        lg2::error("encodeReadThermalParametersRequest failed, rc={RC}", "RC",
+                   static_cast<int>(encodeStatus));
+
+        callback(-1, 0);
+        return;
+    }
+
+    mctpRequester.sendRecvMsg(
+        eid, request,
+        [callback](int transportResult, std::vector<uint8_t> reply) {
+            if (transportResult != 0)
+            {
+                lg2::error("MctpRequester::sendRecvMsg() failed, rc={RC}", "RC",
+                           transportResult);
+
+                callback(-2, 0);
+                return;
+            }
+
+            if (reply.empty())
+            {
+                lg2::error("MctpRequester::sendRecvMsg() failed, respMsgLen=0");
+
+                callback(-3, 0);
+                return;
+            }
+
+            uint8_t completionCode = 0;
+            uint16_t reason = 0;
+            int32_t value = 0;
+
+            const am::Message& incoming = *new (reply.data()) am::Message;
+            const auto decodeStatus = gpu::decodeReadThermalParametersResponse(
+                incoming, reply.size(), completionCode, reason, value);
+
+            const bool decoded =
+                decodeStatus == am::CompletionCode::SUCCESS &&
+                completionCode ==
+                    static_cast<uint8_t>(am::CompletionCode::SUCCESS);
+            if (!decoded)
+            {
+                lg2::error(
+                    "decodeReadThermalParametersResponse() failed, rc={RC} cc={CC} reasonCode={RESC}",
+                    "RC", static_cast<int>(decodeStatus), "CC", completionCode,
+                    "RESC", reason);
+
+                callback(-4, 0);
+                return;
+            }
+
+            callback(0, value);
+        });
+}
+
+/**
+ * @brief Continuation that drives a batched thermal-parameter read.
+ *
+ * Called with the outcome of the read for `(*ids)[index]`. A non-zero `rc`
+ * ends the batch: `callback` receives that code together with the values
+ * collected so far. Otherwise `threshold` is appended to `thresholds`; the
+ * batch then either completes through `callback` (last id) or issues the
+ * read for the next id, bound to this function again.
+ *
+ * @param[in] eid - Endpoint ID
+ * @param[in] ids - sensor ids being read
+ * @param[in] mctpRequester - requester used for the follow-up reads
+ * @param[in] callback - completion callback for the whole batch
+ * @param[in] index - position in `ids` of the read that just finished
+ * @param[in] thresholds - accumulator shared by all steps of the batch
+ * @param[in] rc - status of the read that just finished (0 on success)
+ * @param[in] threshold - value produced by the read that just finished
+ */
+void readThermalParameterCallback(
+    uint8_t eid, const std::shared_ptr<std::vector<uint8_t>>& ids,
+    mctp::MctpRequester& mctpRequester,
+    const std::function<void(uint8_t, std::vector<int32_t>)>& callback,
+    size_t index, const std::shared_ptr<std::vector<int32_t>>& thresholds,
+    uint8_t rc, int32_t threshold)
+{
+    if (rc != 0)
+    {
+        callback(rc, *thresholds);
+        return;
+    }
+
+    thresholds->push_back(threshold);
+
+    const size_t next = index + 1;
+    if (next != ids->size())
+    {
+        // More ids to go: chain the next read back into this function.
+        readThermalParameter(
+            eid, (*ids)[next], mctpRequester,
+            std::bind_front(readThermalParameterCallback, eid, ids,
+                            std::ref(mctpRequester), callback, next,
+                            thresholds));
+        return;
+    }
+
+    callback(rc, *thresholds);
+}
+
+/**
+ * @brief Read thermal parameters for several sensors, one request at a time.
+ *
+ * Starts the read for the first id; each completion triggers the read for
+ * the next id until every id has been read or one read fails.
+ *
+ * A null or empty `ids` has nothing to read, so `callback` is invoked right
+ * away with status 0 and an empty vector instead of indexing past the end of
+ * the id list.
+ *
+ * @param[in] eid - Endpoint ID
+ * @param[in] ids - sensor ids to read, in order
+ * @param[in] mctpRequester - requester used for every read; it is captured
+ *                            by reference in the chained callbacks
+ * @param[in] callback - receives a status code (0 on success) and the
+ *                       thresholds read so far, in the order of `ids`
+ */
+void readThermalParametersBatched(
+    uint8_t eid, const std::shared_ptr<std::vector<uint8_t>>& ids,
+    mctp::MctpRequester& mctpRequester,
+    const std::function<void(uint8_t, std::vector<int32_t>)>& callback)
+{
+    if (!ids || ids->empty())
+    {
+        callback(0, std::vector<int32_t>{});
+        return;
+    }
+
+    auto thresholds = std::make_shared<std::vector<int32_t>>();
+    size_t index = 0;
+
+    readThermalParameter(
+        eid, (*ids)[index], mctpRequester,
+        std::bind_front(readThermalParameterCallback, eid, ids,
+                        std::ref(mctpRequester), callback, index, thresholds));
+}
diff --git a/src/gpu/GpuThresholds.hpp b/src/gpu/GpuThresholds.hpp
new file mode 100644
index 0000000..2b426b4
--- /dev/null
+++ b/src/gpu/GpuThresholds.hpp
@@ -0,0 +1,26 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION &
+ * AFFILIATES. All rights reserved. SPDX-License-Identifier: Apache-2.0
+ */
+
+#pragma once
+
+#include "MctpRequester.hpp"
+
+#include <cstdint>
+#include <functional>
+#include <memory>
+#include <vector>
+
+/** @brief Read thermal parameters for multiple sensors in a batch operation
+ *
+ * The sensors are read one after another in the order given by ids, and
+ * reading stops at the first failure.
+ *
+ * @param[in] eid - Endpoint ID
+ * @param[in] ids - Shared pointer to vector of sensor IDs to read
+ * @param[in] mctpRequester - Reference to MCTP requester
+ * @param[in] callback - Callback function to process results
+ * Takes a status code (0 on success, non-zero on failure) and the vector
+ * of threshold values read so far, in the order of ids
+ */
+void readThermalParametersBatched(
+ uint8_t eid, const std::shared_ptr<std::vector<uint8_t>>& ids,
+ mctp::MctpRequester& mctpRequester,
+ const std::function<void(uint8_t, std::vector<int32_t>)>& callback);
diff --git a/src/gpu/meson.build b/src/gpu/meson.build
index c38d254..da9a4b9 100644
--- a/src/gpu/meson.build
+++ b/src/gpu/meson.build
@@ -4,6 +4,7 @@
'GpuSensor.cpp',
'GpuSensorMain.cpp',
'GpuTLimitSensor.cpp',
+ 'GpuThresholds.cpp',
'MctpRequester.cpp',
'OcpMctpVdm.cpp',
)