nvidia-gpu: introduce notion of a device
Perform device discovery tasks only once per device to prepare for
introducing additional GPU sensors.
In the current implementation, sensor updates and device discovery via
MCTP are managed within a single class for simplicity. However, since a
GPU device typically includes multiple sensors, performing device
discovery for each individual sensor is inefficient. Instead, it would
be more effective to execute device discovery once per device.
Tested: Build an image for the gb200nvl-obmc machine with the following
patches cherry-picked; they are needed to enable the MCTP stack.
https://gerrit.openbmc.org/c/openbmc/openbmc/+/79422
```
$ curl -k -u 'root:0penBmc' https://10.137.203.137/redfish/v1/Chassis/NVIDIA_GB200_1/Sensors/temperature_NVIDIA_GB200_GPU_0_TEMP_0
{
"@odata.id": "/redfish/v1/Chassis/NVIDIA_GB200_1/Sensors/temperature_NVIDIA_GB200_GPU_0_TEMP_0",
"@odata.type": "#Sensor.v1_2_0.Sensor",
"Id": "temperature_NVIDIA_GB200_GPU_0_TEMP_0",
"Name": "NVIDIA GB200 GPU 0 TEMP 0",
"Reading": 37.6875,
"ReadingRangeMax": 127.0,
"ReadingRangeMin": -128.0,
"ReadingType": "Temperature",
"ReadingUnits": "Cel",
"Status": {
"Health": "OK",
"State": "Enabled"
}
}
```
Change-Id: Ie3dcd43caa031b4aaa61d8be3f5d71aefd53bc9a
Signed-off-by: Harshit Aghera <haghera@nvidia.com>
diff --git a/src/nvidia-gpu/NvidiaGpuDevice.cpp b/src/nvidia-gpu/NvidiaGpuDevice.cpp
new file mode 100644
index 0000000..cd39b56
--- /dev/null
+++ b/src/nvidia-gpu/NvidiaGpuDevice.cpp
@@ -0,0 +1,66 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION &
+ * AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include "NvidiaGpuDevice.hpp"
+
+#include "NvidiaDeviceDiscovery.hpp"
+#include "NvidiaGpuSensor.hpp"
+#include "Thresholds.hpp"
+#include "Utils.hpp"
+
+#include <bits/basic_string.h>
+
+#include <MctpRequester.hpp>
+#include <boost/asio/io_context.hpp>
+#include <phosphor-logging/lg2.hpp>
+#include <sdbusplus/asio/connection.hpp>
+#include <sdbusplus/asio/object_server.hpp>
+
+#include <chrono>
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <vector>
+
+GpuDevice::GpuDevice(const SensorConfigs& configs, const std::string& name,
+                     const std::string& path,
+                     const std::shared_ptr<sdbusplus::asio::connection>& conn,
+                     uint8_t eid, boost::asio::io_context& io,
+                     mctp::MctpRequester& mctpRequester,
+                     sdbusplus::asio::object_server& objectServer) :
+    eid(eid), sensorPollMs(std::chrono::milliseconds{configs.pollRate}),
+    waitTimer(io, std::chrono::steady_clock::duration::zero()),
+    mctpRequester(mctpRequester), conn(conn), objectServer(objectServer),
+    configs(configs), name(escapeName(name)), path(path)
+{
+    makeSensors(); // builds the sensors and kicks off the first poll
+}
+
+// Create the GPU's temperature sensor, log it, and start the polling loop.
+void GpuDevice::makeSensors()
+{
+    const std::string tempSensorName = name + "_TEMP_0";
+    tempSensor = std::make_shared<NvidiaGpuTempSensor>(
+        conn, mctpRequester, tempSensorName, path, eid, objectServer,
+        std::vector<thresholds::Threshold>{});
+    lg2::info("Added GPU {NAME} Sensors with chassis path: {PATH}.", "NAME",
+              name, "PATH", path);
+    read(); // first poll; read() re-arms waitTimer for the following ones
+}
+
+// Refresh the temperature sensor, then re-arm waitTimer for the next poll.
+void GpuDevice::read()
+{
+    tempSensor->update();
+    waitTimer.expires_after(std::chrono::milliseconds(sensorPollMs));
+    waitTimer.async_wait([this](const boost::system::error_code& error) {
+        // Any error ends the polling chain: nothing is re-armed.
+        if (!error)
+        {
+            read();
+        }
+    });
+}