gpu: add TLimit sensor
This commit introduces a new thermal limit (TLimit) sensor for the GPU,
enhancing the existing temperature monitoring capabilities.
Tested.
The TEMP_0 update is disabled while testing this patch because it
requires MCTP request queueing: OCP MCTP VDM allows at most one
outstanding request to the device. MCTP request queueing is being
introduced by this patch -
https://gerrit.openbmc.org/c/openbmc/dbus-sensors/+/80023
Build an image for the gb200nvl-obmc machine with the following patches
cherry-picked. These patches are needed to enable the MCTP stack.
https://gerrit.openbmc.org/c/openbmc/openbmc/+/79312
https://gerrit.openbmc.org/c/openbmc/openbmc/+/79410
https://gerrit.openbmc.org/c/openbmc/openbmc/+/79422
Copy the configuration file to the gb200nvl-obmc machine and restart the
entity-manager service.
```
root@gb200nvl-obmc:~# rm -rf /var/configuration/
root@gb200nvl-obmc:~# systemctl restart xyz.openbmc_project.EntityManager.service
```
Copy the gpusensor app and run it.
```
root@gb200nvl-obmc:~# ./gpusensor
```
```
$ curl -k -u 'root:0penBmc' https://10.137.203.137/redfish/v1/Chassis/NVIDIA_GB200_1/Sensors/temperature_NVIDIA_GB200_GPU_TEMP_1
{
"@odata.id": "/redfish/v1/Chassis/NVIDIA_GB200_1/Sensors/temperature_NVIDIA_GB200_GPU_TEMP_1",
"@odata.type": "#Sensor.v1_2_0.Sensor",
"Id": "temperature_NVIDIA_GB200_GPU_TEMP_1",
"Name": "NVIDIA GB200 GPU TEMP 1",
"Reading": 47.875,
"ReadingRangeMax": 127.0,
"ReadingRangeMin": -128.0,
"ReadingType": "Temperature",
"ReadingUnits": "Cel",
"Status": {
"Health": "OK",
"State": "Enabled"
}
}%
root@gb200nvl-obmc:~# busctl tree xyz.openbmc_project.GpuSensor
└─ /xyz
└─ /xyz/openbmc_project
└─ /xyz/openbmc_project/sensors
└─ /xyz/openbmc_project/sensors/temperature
├─ /xyz/openbmc_project/sensors/temperature/NVIDIA_GB200_GPU_TEMP_0
└─ /xyz/openbmc_project/sensors/temperature/NVIDIA_GB200_GPU_TEMP_1
root@gb200nvl-obmc:~# busctl introspect xyz.openbmc_project.GpuSensor /xyz/openbmc_project/sensors/temperature/NVIDIA_GB200_GPU_TEMP_1
NAME TYPE SIGNATURE RESULT/VALUE FLAGS
org.freedesktop.DBus.Introspectable interface - - -
.Introspect method - s -
org.freedesktop.DBus.Peer interface - - -
.GetMachineId method - s -
.Ping method - - -
org.freedesktop.DBus.Properties interface - - -
.Get method ss v -
.GetAll method s a{sv} -
.Set method ssv - -
.PropertiesChanged signal sa{sv}as - -
xyz.openbmc_project.Association.Definitions interface - - -
.Associations property a(sss) 1 "chassis" "all_sensors" "/xyz/openbmc… emits-change
xyz.openbmc_project.Sensor.Value interface - - -
.MaxValue property d 127 emits-change
.MinValue property d -128 emits-change
.Unit property s "xyz.openbmc_project.Sensor.Value.Unit.… emits-change
.Value property d 48 emits-change writable
xyz.openbmc_project.Sensor.ValueMutability interface - - -
.Mutable property b true emits-change
xyz.openbmc_project.State.Decorator.Availability interface - - -
.Available property b true emits-change writable
xyz.openbmc_project.State.Decorator.OperationalStatus interface - - -
.Functional property b true emits-change
```
Change-Id: Ib8e0ef93a4acbb8870671665b098fb61d0205cb2
Signed-off-by: Harshit Aghera <haghera@nvidia.com>
diff --git a/src/gpu/GpuTLimitSensor.hpp b/src/gpu/GpuTLimitSensor.hpp
new file mode 100644
index 0000000..0407814
--- /dev/null
+++ b/src/gpu/GpuTLimitSensor.hpp
@@ -0,0 +1,86 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION &
+ * AFFILIATES. All rights reserved. SPDX-License-Identifier: Apache-2.0
+ */
+
+#pragma once
+
+#include "MctpRequester.hpp"
+#include "Thresholds.hpp"
+#include "UpdatableSensor.hpp"
+
+#include <sdbusplus/asio/connection.hpp>
+#include <sdbusplus/asio/object_server.hpp>
+
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <vector>
+
+/**
+ * @struct GpuTLimitSensor
+ * @brief GPU thermal limit (TLimit) sensor
+ * @details Derives from GpuSensor and enables shared pointer management
+ * via std::enable_shared_from_this
+ */
+struct GpuTLimitSensor :
+    public GpuSensor,
+    public std::enable_shared_from_this<GpuTLimitSensor>
+{
+  public:
+    /**
+     * @brief Constructor for GpuTLimitSensor
+     * @param conn D-Bus connection for system communication
+     * @param mctpRequester MCTP protocol requester for GPU communication
+     * @param name Name of the sensor for identification in the system
+     * @param sensorConfiguration Configuration string for the sensor containing
+     * setup parameters
+     * @param eid MCTP endpoint ID (EID) of the device
+     * @param objectServer D-Bus object server for exposing sensor interfaces
+     * @param thresholdData Vector of threshold configurations for the sensor;
+     * taken by rvalue reference
+     */
+    GpuTLimitSensor(std::shared_ptr<sdbusplus::asio::connection>& conn,
+                    mctp::MctpRequester& mctpRequester, const std::string& name,
+                    const std::string& sensorConfiguration, uint8_t eid,
+                    sdbusplus::asio::object_server& objectServer,
+                    std::vector<thresholds::Threshold>&& thresholdData);
+
+    /**
+     * @brief Destructor
+     */
+    ~GpuTLimitSensor() override;
+
+    /**
+     * @brief Check if any thresholds have been crossed
+     * @details Overrides the base class method to implement the threshold
+     * checking for this sensor
+     */
+    void checkThresholds() override;
+
+  private:
+    /**
+     * @brief Update the sensor reading; final override, not re-overridable
+     */
+    void update() final;
+
+    /**
+     * @brief MCTP endpoint ID
+     */
+    uint8_t eid{};
+
+    /**
+     * @brief The sensor ID; zero unless the constructor sets another value
+     */
+    uint8_t sensorId{};
+
+    /**
+     * @brief MCTP requester, held by reference; it must outlive this sensor
+     */
+    mctp::MctpRequester& mctpRequester;
+
+    /**
+     * @brief D-Bus object server, held by reference
+     */
+    sdbusplus::asio::object_server& objectServer;
+};