nvidia-gpu: add TLimit sensor
This commit introduces a new thermal limit (TLimit) sensor for the GPU,
enhancing the existing temperature monitoring capabilities.
Tested: Built an image for the gb200nvl-obmc machine with the following
patches cherry-picked. These patches are needed to enable the MCTP
stack:
https://gerrit.openbmc.org/c/openbmc/openbmc/+/79422
```
$ curl -s -k -u 'root:0penBmc' https://10.137.203.137/redfish/v1/Chassis/NVIDIA_GB200_1/Sensors/temperature_NVIDIA_GB200_GPU_0_TEMP_1
{
"@odata.id": "/redfish/v1/Chassis/NVIDIA_GB200_1/Sensors/temperature_NVIDIA_GB200_GPU_0_TEMP_1",
"@odata.type": "#Sensor.v1_2_0.Sensor",
"Id": "temperature_NVIDIA_GB200_GPU_0_TEMP_1",
"Name": "NVIDIA GB200 GPU 0 TEMP 1",
"Reading": 57.3984375,
"ReadingRangeMax": 127.0,
"ReadingRangeMin": -128.0,
"ReadingType": "Temperature",
"ReadingUnits": "Cel",
"Status": {
"Health": "OK",
"State": "Enabled"
}
}
```
Change-Id: Ib8e0ef93a4acbb8870671665b098fb61d0205cb2
Signed-off-by: Harshit Aghera <haghera@nvidia.com>
diff --git a/src/nvidia-gpu/NvidiaGpuDevice.cpp b/src/nvidia-gpu/NvidiaGpuDevice.cpp
index cd39b56..ab476da 100644
--- a/src/nvidia-gpu/NvidiaGpuDevice.cpp
+++ b/src/nvidia-gpu/NvidiaGpuDevice.cpp
@@ -42,8 +42,12 @@
void GpuDevice::makeSensors()
{
tempSensor = std::make_shared<NvidiaGpuTempSensor>(
- conn, mctpRequester, name + "_TEMP_0", path, eid, objectServer,
- std::vector<thresholds::Threshold>{});
+ conn, mctpRequester, name + "_TEMP_0", path, eid, gpuTempSensorId,
+ objectServer, std::vector<thresholds::Threshold>{});
+
+ tLimitSensor = std::make_shared<NvidiaGpuTempSensor>(
+ conn, mctpRequester, name + "_TEMP_1", path, eid, gpuTLimitSensorId,
+ objectServer, std::vector<thresholds::Threshold>{});
lg2::info("Added GPU {NAME} Sensors with chassis path: {PATH}.", "NAME",
name, "PATH", path);
@@ -54,6 +58,7 @@
void GpuDevice::read()
{
tempSensor->update();
+ tLimitSensor->update();
waitTimer.expires_after(std::chrono::milliseconds(sensorPollMs));
waitTimer.async_wait([this](const boost::system::error_code& ec) {
diff --git a/src/nvidia-gpu/NvidiaGpuDevice.hpp b/src/nvidia-gpu/NvidiaGpuDevice.hpp
index 3653928..78b3e4a 100644
--- a/src/nvidia-gpu/NvidiaGpuDevice.hpp
+++ b/src/nvidia-gpu/NvidiaGpuDevice.hpp
@@ -53,6 +53,7 @@
sdbusplus::asio::object_server& objectServer;
std::shared_ptr<NvidiaGpuTempSensor> tempSensor;
+ std::shared_ptr<NvidiaGpuTempSensor> tLimitSensor;
SensorConfigs configs;
diff --git a/src/nvidia-gpu/NvidiaGpuSensor.cpp b/src/nvidia-gpu/NvidiaGpuSensor.cpp
index 1626545..3853048 100644
--- a/src/nvidia-gpu/NvidiaGpuSensor.cpp
+++ b/src/nvidia-gpu/NvidiaGpuSensor.cpp
@@ -31,20 +31,19 @@
using namespace std::literals;
-constexpr uint8_t gpuTempSensorId{0};
static constexpr double gpuTempSensorMaxReading = 127;
static constexpr double gpuTempSensorMinReading = -128;
NvidiaGpuTempSensor::NvidiaGpuTempSensor(
std::shared_ptr<sdbusplus::asio::connection>& conn,
mctp::MctpRequester& mctpRequester, const std::string& name,
- const std::string& sensorConfiguration, const uint8_t eid,
+ const std::string& sensorConfiguration, const uint8_t eid, uint8_t sensorId,
sdbusplus::asio::object_server& objectServer,
std::vector<thresholds::Threshold>&& thresholdData) :
Sensor(escapeName(name), std::move(thresholdData), sensorConfiguration,
"temperature", false, true, gpuTempSensorMaxReading,
gpuTempSensorMinReading, conn),
- eid(eid), sensorId{gpuTempSensorId}, mctpRequester(mctpRequester),
+ eid(eid), sensorId{sensorId}, mctpRequester(mctpRequester),
objectServer(objectServer)
{
std::string dbusPath =
diff --git a/src/nvidia-gpu/NvidiaGpuSensor.hpp b/src/nvidia-gpu/NvidiaGpuSensor.hpp
index 25fe069..1e21f4d 100644
--- a/src/nvidia-gpu/NvidiaGpuSensor.hpp
+++ b/src/nvidia-gpu/NvidiaGpuSensor.hpp
@@ -20,17 +20,20 @@
#include <string>
#include <vector>
+constexpr uint8_t gpuTempSensorId{0};
+constexpr uint8_t gpuTLimitSensorId{2};
+
struct NvidiaGpuTempSensor :
public Sensor,
public std::enable_shared_from_this<NvidiaGpuTempSensor>
{
public:
- NvidiaGpuTempSensor(std::shared_ptr<sdbusplus::asio::connection>& conn,
- mctp::MctpRequester& mctpRequester,
- const std::string& name,
- const std::string& sensorConfiguration, uint8_t eid,
- sdbusplus::asio::object_server& objectServer,
- std::vector<thresholds::Threshold>&& thresholdData);
+ NvidiaGpuTempSensor(
+ std::shared_ptr<sdbusplus::asio::connection>& conn,
+ mctp::MctpRequester& mctpRequester, const std::string& name,
+ const std::string& sensorConfiguration, uint8_t eid, uint8_t sensorId,
+ sdbusplus::asio::object_server& objectServer,
+ std::vector<thresholds::Threshold>&& thresholdData);
~NvidiaGpuTempSensor() override;