gpu: add TLimit sensor
This commit introduces a new thermal limit (TLimit) sensor for the GPU,
enhancing the existing temperature monitoring capabilities.
Tested.
The TEMP_0 update is disabled while testing this patch, as it requires
MCTP request queueing, since OCP MCTP VDM specifies at most one
outstanding request to the device. MCTP request queueing is being
introduced with this patch -
https://gerrit.openbmc.org/c/openbmc/dbus-sensors/+/80023
Build an image for the gb200nvl-obmc machine with the following patches
cherry-picked. These patches are needed to enable the MCTP stack.
https://gerrit.openbmc.org/c/openbmc/openbmc/+/79312
https://gerrit.openbmc.org/c/openbmc/openbmc/+/79410
https://gerrit.openbmc.org/c/openbmc/openbmc/+/79422
Copy the configuration file to the gb200nvl-obmc machine and restart the
entity-manager service.
```
root@gb200nvl-obmc:~# rm -rf /var/configuration/
root@gb200nvl-obmc:~# systemctl restart xyz.openbmc_project.EntityManager.service
```
Copy the gpusensor app and run it.
```
root@gb200nvl-obmc:~# ./gpusensor
```
```
$ curl -k -u 'root:0penBmc' https://10.137.203.137/redfish/v1/Chassis/NVIDIA_GB200_1/Sensors/temperature_NVIDIA_GB200_GPU_TEMP_1
{
"@odata.id": "/redfish/v1/Chassis/NVIDIA_GB200_1/Sensors/temperature_NVIDIA_GB200_GPU_TEMP_1",
"@odata.type": "#Sensor.v1_2_0.Sensor",
"Id": "temperature_NVIDIA_GB200_GPU_TEMP_1",
"Name": "NVIDIA GB200 GPU TEMP 1",
"Reading": 47.875,
"ReadingRangeMax": 127.0,
"ReadingRangeMin": -128.0,
"ReadingType": "Temperature",
"ReadingUnits": "Cel",
"Status": {
"Health": "OK",
"State": "Enabled"
}
}%
root@gb200nvl-obmc:~# busctl tree xyz.openbmc_project.GpuSensor
└─ /xyz
└─ /xyz/openbmc_project
└─ /xyz/openbmc_project/sensors
└─ /xyz/openbmc_project/sensors/temperature
├─ /xyz/openbmc_project/sensors/temperature/NVIDIA_GB200_GPU_TEMP_0
└─ /xyz/openbmc_project/sensors/temperature/NVIDIA_GB200_GPU_TEMP_1
root@gb200nvl-obmc:~# busctl introspect xyz.openbmc_project.GpuSensor /xyz/openbmc_project/sensors/temperature/NVIDIA_GB200_GPU_TEMP_1
NAME TYPE SIGNATURE RESULT/VALUE FLAGS
org.freedesktop.DBus.Introspectable interface - - -
.Introspect method - s -
org.freedesktop.DBus.Peer interface - - -
.GetMachineId method - s -
.Ping method - - -
org.freedesktop.DBus.Properties interface - - -
.Get method ss v -
.GetAll method s a{sv} -
.Set method ssv - -
.PropertiesChanged signal sa{sv}as - -
xyz.openbmc_project.Association.Definitions interface - - -
.Associations property a(sss) 1 "chassis" "all_sensors" "/xyz/openbmc… emits-change
xyz.openbmc_project.Sensor.Value interface - - -
.MaxValue property d 127 emits-change
.MinValue property d -128 emits-change
.Unit property s "xyz.openbmc_project.Sensor.Value.Unit.… emits-change
.Value property d 48 emits-change writable
xyz.openbmc_project.Sensor.ValueMutability interface - - -
.Mutable property b true emits-change
xyz.openbmc_project.State.Decorator.Availability interface - - -
.Available property b true emits-change writable
xyz.openbmc_project.State.Decorator.OperationalStatus interface - - -
.Functional property b true emits-change
```
Change-Id: Ib8e0ef93a4acbb8870671665b098fb61d0205cb2
Signed-off-by: Harshit Aghera <haghera@nvidia.com>
diff --git a/src/gpu/GpuDevice.cpp b/src/gpu/GpuDevice.cpp
index dccd730..11423dd 100644
--- a/src/gpu/GpuDevice.cpp
+++ b/src/gpu/GpuDevice.cpp
@@ -6,6 +6,7 @@
#include "GpuDevice.hpp"
#include "GpuSensor.hpp"
+#include "GpuTLimitSensor.hpp"
#include "Thresholds.hpp"
#include "Utils.hpp"
@@ -58,13 +59,17 @@
conn, mctpRequester, name + "_TEMP_0", path, eid, objectServer,
std::vector<thresholds::Threshold>{}));
- lg2::info("Added GPU Temperature Sensor {NAME} with chassis path: {PATH}.",
- "NAME", name, "PATH", path);
+ sensors.push_back(std::make_shared<GpuTLimitSensor>(
+ conn, mctpRequester, name + "_TEMP_1", path, eid, objectServer,
+ std::vector<thresholds::Threshold>{}));
+
+ lg2::info("Added GPU {NAME} Sensors with chassis path: {PATH}.", "NAME",
+ name, "PATH", path);
}
void GpuDevice::read()
{
- for ([[maybe_unused]] const auto& sensor : sensors)
+ for (const auto& sensor : sensors)
{
sensor->update();
}
diff --git a/src/gpu/GpuSensor.hpp b/src/gpu/GpuSensor.hpp
index 2961404..c74c57e 100644
--- a/src/gpu/GpuSensor.hpp
+++ b/src/gpu/GpuSensor.hpp
@@ -18,16 +18,6 @@
#include <vector>
/**
- * @struct DeviceInfo
- * @brief Contains information about a device
- */
-struct DeviceInfo
-{
- uint8_t deviceType;
- uint8_t instanceId;
-};
-
-/**
* @struct GpuTempSensor
* @brief Implements a GPU temperature sensor that monitors temperature values
* @details Inherits from Sensor base class and enables shared pointer
diff --git a/src/gpu/GpuTLimitSensor.cpp b/src/gpu/GpuTLimitSensor.cpp
new file mode 100644
index 0000000..5a02326
--- /dev/null
+++ b/src/gpu/GpuTLimitSensor.cpp
@@ -0,0 +1,137 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION &
+ * AFFILIATES. All rights reserved. SPDX-License-Identifier: Apache-2.0
+ */
+
+#include "GpuTLimitSensor.hpp"
+
+#include "SensorPaths.hpp"
+#include "Thresholds.hpp"
+#include "UpdatableSensor.hpp"
+#include "Utils.hpp"
+
+#include <bits/basic_string.h>
+
+#include <GpuDevice.hpp>
+#include <GpuMctpVdm.hpp>
+#include <MctpRequester.hpp>
+#include <OcpMctpVdm.hpp>
+#include <phosphor-logging/lg2.hpp>
+#include <sdbusplus/asio/connection.hpp>
+#include <sdbusplus/asio/object_server.hpp>
+
+#include <cstddef>
+#include <cstdint>
+#include <functional>
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+
+using namespace std::literals;
+
+constexpr uint8_t gpuTLimitSensorIdm{2}; // sensor ID put in the MCTP request
+static constexpr double gpuTLimitSensorMaxReading = 127; // max given to GpuSensor
+static constexpr double gpuTLimitSensorMinReading = -128; // min given to GpuSensor
+
+GpuTLimitSensor::GpuTLimitSensor(
+    std::shared_ptr<sdbusplus::asio::connection>& conn,
+    mctp::MctpRequester& mctpRequester, const std::string& name,
+    const std::string& sensorConfiguration, uint8_t eid,
+    sdbusplus::asio::object_server& objectServer,
+    std::vector<thresholds::Threshold>&& thresholdData) :
+    GpuSensor(escapeName(name), std::move(thresholdData), sensorConfiguration,
+              "temperature", false, true, gpuTLimitSensorMaxReading,
+              gpuTLimitSensorMinReading, conn),
+    eid(eid), sensorId{gpuTLimitSensorIdm}, mctpRequester(mctpRequester),
+    objectServer(objectServer)
+{
+    // All interfaces of this sensor are registered on one object path.
+    const std::string objPath =
+        sensorPathPrefix + "temperature/"s + escapeName(name);
+
+    sensorInterface = objectServer.add_interface(
+        objPath, "xyz.openbmc_project.Sensor.Value");
+
+    // Threshold interfaces are stored at the index of their threshold level.
+    for (const auto& limit : thresholds)
+    {
+        thresholdInterfaces[static_cast<size_t>(limit.level)] =
+            objectServer.add_interface(objPath,
+                                       thresholds::getInterface(limit.level));
+    }
+    association = objectServer.add_interface(objPath, association::interface);
+    setInitialProperties(sensor_paths::unitDegreesC);
+}
+
+GpuTLimitSensor::~GpuTLimitSensor()
+{
+    for (const auto& thresholdIface : thresholdInterfaces)
+    {
+        objectServer.remove_interface(thresholdIface);
+    }
+    objectServer.remove_interface(sensorInterface); // Sensor.Value interface
+    objectServer.remove_interface(association);     // association interface
+}
+
+void GpuTLimitSensor::checkThresholds()
+{
+ thresholds::checkThresholds(this); // delegate to the thresholds helper
+}
+
+// Requests a reading over MCTP; the response callback publishes the value.
+void GpuTLimitSensor::update()
+{
+    std::vector<uint8_t> reqMsg(
+        sizeof(ocp::accelerator_management::BindingPciVid) +
+        sizeof(gpu::GetTemperatureReadingRequest));
+    auto* msg = new (reqMsg.data()) ocp::accelerator_management::Message;
+    auto rc = gpu::encodeGetTemperatureReadingRequest(0, sensorId, *msg);
+    if (rc != ocp::accelerator_management::CompletionCode::SUCCESS)
+    {
+        lg2::error(
+            "GpuTLimitSensor::update(): gpuEncodeGetTemperatureReadingRequest failed, rc={RC}",
+            "RC", static_cast<int>(rc));
+        return;
+    }
+    // The callback could run after this sensor is destroyed, so capture a weak
+    // reference instead of `this` and drop the response if the sensor is gone.
+    mctpRequester.sendRecvMsg(
+        eid, reqMsg,
+        [weak{weak_from_this()}](int result, std::vector<uint8_t> respMsg) {
+            const std::shared_ptr<GpuTLimitSensor> self = weak.lock();
+            if (!self)
+            {
+                return;
+            }
+            if (result != 0)
+            {
+                lg2::error(
+                    "GpuTLimitSensor::update(): MctpRequester::sendRecvMsg() failed, rc={RC}",
+                    "RC", result);
+                return;
+            }
+            if (respMsg.empty())
+            {
+                lg2::error(
+                    "GpuTLimitSensor::update(): MctpRequester::sendRecvMsg() failed, respMsgLen=0");
+                return;
+            }
+            uint8_t cc = 0;
+            uint16_t reasonCode = 0;
+            double tempValue = 0;
+            auto rc = gpu::decodeGetTemperatureReadingResponse(
+                *new (respMsg.data()) ocp::accelerator_management::Message,
+                respMsg.size(), cc, reasonCode, tempValue);
+            if (rc != ocp::accelerator_management::CompletionCode::SUCCESS ||
+                cc != static_cast<uint8_t>(
+                          ocp::accelerator_management::CompletionCode::SUCCESS))
+            {
+                lg2::error(
+                    "GpuTLimitSensor::update(): gpuDecodeGetTemperatureReadingResponse() failed, rc={RC} cc={CC} reasonCode={RESC}",
+                    "RC", static_cast<int>(rc), "CC", cc, "RESC", reasonCode);
+                return;
+            }
+            self->updateValue(tempValue);
+        });
+}
diff --git a/src/gpu/GpuTLimitSensor.hpp b/src/gpu/GpuTLimitSensor.hpp
new file mode 100644
index 0000000..0407814
--- /dev/null
+++ b/src/gpu/GpuTLimitSensor.hpp
@@ -0,0 +1,86 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION &
+ * AFFILIATES. All rights reserved. SPDX-License-Identifier: Apache-2.0
+ */
+
+#pragma once
+
+#include "MctpRequester.hpp"
+#include "Thresholds.hpp"
+#include "UpdatableSensor.hpp"
+
+#include <sdbusplus/asio/connection.hpp>
+#include <sdbusplus/asio/object_server.hpp>
+
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <vector>
+
+/**
+ * @struct GpuTLimitSensor
+ * @brief GPU thermal limit (TLimit) sensor read from the device over MCTP
+ * @details Inherits from the GpuSensor base class and enables shared pointer
+ * management via std::enable_shared_from_this
+ */
+struct GpuTLimitSensor :
+ public GpuSensor,
+ public std::enable_shared_from_this<GpuTLimitSensor>
+{
+ public:
+ /**
+ * @brief Constructor for GpuTLimitSensor
+ * @param conn D-Bus connection, forwarded to the GpuSensor base class
+ * @param mctpRequester MCTP requester used to send requests to the GPU
+ * @param name Sensor name; escaped to form the last D-Bus path element
+ * @param sensorConfiguration Configuration path forwarded to the GpuSensor
+ * base class
+ * @param eid MCTP endpoint ID (EID) of the device to query
+ * @param objectServer D-Bus object server for exposing sensor interfaces
+ * @param thresholdData Threshold configurations, moved into the GpuSensor
+ * base class
+ */
+ GpuTLimitSensor(std::shared_ptr<sdbusplus::asio::connection>& conn,
+ mctp::MctpRequester& mctpRequester, const std::string& name,
+ const std::string& sensorConfiguration, uint8_t eid,
+ sdbusplus::asio::object_server& objectServer,
+ std::vector<thresholds::Threshold>&& thresholdData);
+
+ /**
+ * @brief Destructor; removes this sensor's D-Bus interfaces
+ */
+ ~GpuTLimitSensor() override;
+
+ /**
+ * @brief Check if any thresholds have been crossed
+ * @details Overrides the base class method and delegates to
+ * thresholds::checkThresholds()
+ */
+ void checkThresholds() override;
+
+ private:
+ /**
+ * @brief Request a new reading from the device and publish it
+ */
+ void update() final;
+
+ /**
+ * @brief MCTP endpoint ID of the device that is queried
+ */
+ uint8_t eid{};
+
+ /**
+ * @brief Sensor ID placed in the temperature reading request
+ */
+ uint8_t sensorId;
+
+ /**
+ * @brief Reference to the MCTP requester for communication
+ */
+ mctp::MctpRequester& mctpRequester;
+
+ /**
+ * @brief D-Bus object server the sensor interfaces are added to
+ */
+ sdbusplus::asio::object_server& objectServer;
+};
diff --git a/src/gpu/meson.build b/src/gpu/meson.build
index f8cfe39..c38d254 100644
--- a/src/gpu/meson.build
+++ b/src/gpu/meson.build
@@ -3,6 +3,7 @@
'GpuMctpVdm.cpp',
'GpuSensor.cpp',
'GpuSensorMain.cpp',
+ 'GpuTLimitSensor.cpp',
'MctpRequester.cpp',
'OcpMctpVdm.cpp',
)