gpu : introduce notion of a device
The concept of a device is being introduced for the GPU, which enables a
more efficient and scalable approach to managing multiple sensors for a
single endpoint.
Refactoring of entity-manager configuration and endpoint discovery:
to support multiple sensors for an endpoint, the following changes are
being made:
Entity-Manager Configuration Discovery: This task will be moved out of
the GPU Temperature Sensor implementation.
Endpoint Discovery: Similarly, endpoint discovery will also be performed
outside of the GPU Temperature Sensor implementation.
Frequency of Task Execution: Both entity-manager configuration discovery
and endpoint discovery will be performed only once per endpoint, rather
than repeatedly for each sensor. This optimization will improve
performance and reduce redundancy.
Tested.
```
$ curl -k -u 'root:0penBmc' https://10.137.203.137/redfish/v1/Chassis/NVIDIA_GB200_1/Sensors/temperature_NVIDIA_GB200_GPU_TEMP_0
{
"@odata.id": "/redfish/v1/Chassis/NVIDIA_GB200_1/Sensors/temperature_NVIDIA_GB200_GPU_TEMP_0",
"@odata.type": "#Sensor.v1_2_0.Sensor",
"Id": "temperature_NVIDIA_GB200_GPU_TEMP_0",
"Name": "NVIDIA GB200 GPU TEMP 0",
"Reading": 35.96875,
"ReadingRangeMax": 127.0,
"ReadingRangeMin": -128.0,
"ReadingType": "Temperature",
"ReadingUnits": "Cel",
"Status": {
"Health": "OK",
"State": "Enabled"
}
}%
```
Change-Id: Ie3dcd43caa031b4aaa61d8be3f5d71aefd53bc9a
Signed-off-by: Harshit Aghera <haghera@nvidia.com>
diff --git a/src/gpu/GpuDevice.cpp b/src/gpu/GpuDevice.cpp
new file mode 100644
index 0000000..dccd730
--- /dev/null
+++ b/src/gpu/GpuDevice.cpp
@@ -0,0 +1,355 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION &
+ * AFFILIATES. All rights reserved. SPDX-License-Identifier: Apache-2.0
+ */
+
+#include "GpuDevice.hpp"
+
+#include "GpuSensor.hpp"
+#include "Thresholds.hpp"
+#include "Utils.hpp"
+
+#include <bits/basic_string.h>
+
+#include <GpuMctpVdm.hpp>
+#include <MctpRequester.hpp>
+#include <OcpMctpVdm.hpp>
+#include <boost/asio/io_context.hpp>
+#include <boost/container/flat_map.hpp>
+#include <phosphor-logging/lg2.hpp>
+#include <sdbusplus/asio/connection.hpp>
+#include <sdbusplus/asio/object_server.hpp>
+#include <sdbusplus/message.hpp>
+#include <sdbusplus/message/native_types.hpp>
+
+#include <algorithm>
+#include <chrono>
+#include <cstdint>
+#include <functional>
+#include <map>
+#include <memory>
+#include <string>
+#include <utility>
+#include <variant>
+#include <vector>
+
+using namespace std::chrono_literals;
+
+// Interval between two polling rounds; each GpuDevice copies it into its
+// sensorPollMs member, which read() uses to re-arm the timer.
+constexpr std::chrono::milliseconds samplingInterval{1000ms};
+
+GpuDevice::GpuDevice(
+    const std::string& name, const std::string& path,
+    std::shared_ptr<sdbusplus::asio::connection>& conn,
+    boost::asio::io_context& io, mctp::MctpRequester& mctpRequester,
+    sdbusplus::asio::object_server& objectServer) :
+    sensorPollMs{samplingInterval},
+    waitTimer{io, std::chrono::steady_clock::duration::zero()},
+    mctpRequester{mctpRequester}, conn{conn}, objectServer{objectServer},
+    name{escapeName(name)}, path{path}
+{
+    discoverGpus(); // asynchronous; sensors are created from its callbacks
+}
+
+void GpuDevice::createSensors()
+{
+    // Only a temperature sensor for now, bound to the discovered endpoint.
+    sensors.emplace_back(std::make_shared<GpuTempSensor>(
+        conn, mctpRequester, name + "_TEMP_0", path, eid, objectServer,
+        std::vector<thresholds::Threshold>{}));
+    lg2::info("Added GPU Temperature Sensor {NAME} with chassis path: {PATH}.",
+              "NAME", name, "PATH", path);
+}
+
+void GpuDevice::read()
+{
+    // Kick off one refresh per sensor, then schedule the next round.
+    std::for_each(sensors.begin(), sensors.end(),
+                  [](const auto& entry) { entry->update(); });
+
+    waitTimer.expires_after(sensorPollMs);
+    waitTimer.async_wait([this](const boost::system::error_code& error) {
+        // Any error reported for the wait ends the polling loop; a clean
+        // expiry starts the next round.
+        if (!error)
+        {
+            read();
+        }
+    });
+}
+
+void GpuDevice::processGpuEndpoint(uint8_t eid)
+{
+    namespace oam = ocp::accelerator_management;
+
+    // Room for the OCP binding header plus the identification request.
+    std::vector<uint8_t> request(
+        sizeof(oam::BindingPciVid) +
+        sizeof(gpu::QueryDeviceIdentificationRequest));
+    auto* header = new (request.data()) oam::Message;
+    const auto encodeRc =
+        gpu::encodeQueryDeviceIdentificationRequest(0, *header);
+    if (encodeRc != oam::CompletionCode::SUCCESS)
+    {
+        lg2::error(
+            "GpuDevice::processGpuEndPoint(): gpuEncodeQueryDeviceIdentificationRequest failed, rc={RC}",
+            "RC", static_cast<int>(encodeRc));
+        return;
+    }
+
+    // The handler runs once the exchange with the endpoint has finished;
+    // only an endpoint that identifies itself as a GPU gets sensors.
+    mctpRequester.sendRecvMsg(
+        eid, request,
+        [this, eid](int sendRecvMsgResult, std::vector<uint8_t> response) {
+            if (sendRecvMsgResult != 0)
+            {
+                lg2::error(
+                    "GpuDevice::processGpuEndPoint(): MctpRequester::sendRecvMsg() failed, rc={RC}",
+                    "RC", sendRecvMsgResult);
+                return;
+            }
+            if (response.empty())
+            {
+                lg2::error(
+                    "GpuDevice::processGpuEndPoint(): MctpRequester::sendRecvMsg() failed, respMsgLen=0");
+                return;
+            }
+
+            uint8_t cc{0};
+            uint16_t reasonCode{0};
+            uint8_t deviceType{0};
+            uint8_t instanceId{0};
+            auto* reply = new (response.data()) oam::Message;
+            const auto decodeRc = gpu::decodeQueryDeviceIdentificationResponse(
+                *reply, response.size(), cc, reasonCode, deviceType,
+                instanceId);
+            const bool accepted =
+                cc == static_cast<uint8_t>(oam::CompletionCode::SUCCESS);
+            if (decodeRc != oam::CompletionCode::SUCCESS || !accepted)
+            {
+                lg2::error(
+                    "GpuDevice::processGpuEndPoint(): gpuDecodeQueryDeviceIdentificationResponse() failed, rc={RC} cc={CC} reasonCode={RESC}",
+                    "RC", static_cast<int>(decodeRc), "CC", cc, "RESC",
+                    reasonCode);
+                return;
+            }
+            if (deviceType !=
+                static_cast<uint8_t>(gpu::DeviceIdentification::DEVICE_GPU))
+            {
+                return;
+            }
+            lg2::info(
+                "GpuDevice::processGpuEndPoint(): found the GPU with EID {EID}, DeviceType {DEVTYPE}, InstanceId {IID}.",
+                "EID", eid, "DEVTYPE", deviceType, "IID", instanceId);
+            this->eid = eid;
+            createSensors();
+            read();
+        });
+}
+
+void GpuDevice::processMctpEndpoints(const boost::system::error_code& ec,
+                                     const getSubTreeRet& ret)
+{
+    if (ec)
+    {
+        lg2::error("GpuDevice::discoverGpus(): Error:{ERROR}", "ERROR",
+                   ec.message());
+        return;
+    }
+
+    // Walk the GetSubTree reply (object path -> services -> interfaces) and
+    // request all properties of every MCTP endpoint interface found. An
+    // empty reply simply produces no requests.
+    for (const auto& [endpointPath, owners] : ret)
+    {
+        for (const auto& [owner, ifaceNames] : owners)
+        {
+            for (const auto& ifaceName : ifaceNames)
+            {
+                if (ifaceName != "xyz.openbmc_project.MCTP.Endpoint")
+                {
+                    continue;
+                }
+
+                conn->async_method_call(
+                    [this](const boost::system::error_code& callEc,
+                           const GpuSensorConfigMap& properties) {
+                        processEndpointConfigs(callEc, properties);
+                    },
+                    owner, endpointPath, "org.freedesktop.DBus.Properties",
+                    "GetAll", ifaceName);
+            }
+        }
+    }
+}
+
+// Inspects the property map returned for one MCTP endpoint. The endpoint
+// is handed to processGpuEndpoint() only if it exposes a uint8_t "EID" and
+// a "SupportedMessageTypes" byte list containing the OCP MCTP VDM type.
+void GpuDevice::processEndpointConfigs(const boost::system::error_code& ec,
+                                       const GpuSensorConfigMap& configs)
+{
+    if (ec)
+    {
+        lg2::error("GpuDevice::discoverGpus(): Error:{ERROR}", "ERROR",
+                   ec.message());
+        return;
+    }
+
+    // "EID" has to be present and hold a uint8_t; anything else is logged
+    // and ends the processing of this endpoint.
+    const auto eidEntry = configs.find("EID");
+    if (eidEntry == configs.end())
+    {
+        lg2::error(
+            "GpuDevice::discoverGpus(): Property EID not found in the configuration.");
+        return;
+    }
+
+    const auto* eidValue = std::get_if<uint8_t>(&eidEntry->second);
+    if (eidValue == nullptr)
+    {
+        lg2::error(
+            "GpuDevice::discoverGpus(): Property EID does not have valid type.");
+        return;
+    }
+    uint8_t endpointId = *eidValue;
+
+    // The same rule applies to "SupportedMessageTypes", which has to hold a
+    // list of bytes.
+    const auto typesEntry = configs.find("SupportedMessageTypes");
+    if (typesEntry == configs.end())
+    {
+        lg2::error(
+            "GpuDevice::discoverGpus(): Property SupportedMessageTypes not found in the configuration.");
+        return;
+    }
+
+    const auto* supportedTypes =
+        std::get_if<std::vector<uint8_t>>(&typesEntry->second);
+    if (supportedTypes == nullptr)
+    {
+        lg2::error(
+            "GpuDevice::discoverGpus(): Property SupportedMessageTypes does not have valid type.");
+        return;
+    }
+
+    // Endpoints that do not list the OCP MCTP VDM message type are skipped
+    // without any log entry.
+    const bool speaksOcpVdm =
+        std::find(supportedTypes->begin(), supportedTypes->end(),
+                  ocp::accelerator_management::messageType) !=
+        supportedTypes->end();
+    if (!speaksOcpVdm)
+    {
+        return;
+    }
+
+    lg2::info(
+        "GpuDevice::discoverGpus(): Found OCP MCTP VDM Endpoint with ID {EID}",
+        "EID", endpointId);
+    processGpuEndpoint(endpointId);
+}
+
+void GpuDevice::discoverGpus()
+{
+    // Look up MCTP endpoint objects below the search root via the mapper.
+    const std::string searchRoot{"/au/com/codeconstruct/"};
+    const std::vector<std::string> ifaces{"xyz.openbmc_project.MCTP.Endpoint"};
+    conn->async_method_call(
+        [this](const boost::system::error_code& callEc,
+               const getSubTreeRet& subTree) {
+            processMctpEndpoints(callEc, subTree);
+        },
+        "xyz.openbmc_project.ObjectMapper",
+        "/xyz/openbmc_project/object_mapper",
+        "xyz.openbmc_project.ObjectMapper", "GetSubTree", searchRoot, 0, ifaces);
+}
+
+void processSensorConfigs(
+    boost::asio::io_context& io, sdbusplus::asio::object_server& objectServer,
+    boost::container::flat_map<std::string, std::shared_ptr<GpuDevice>>&
+        gpuDevice,
+    std::shared_ptr<sdbusplus::asio::connection>& dbusConnection,
+    mctp::MctpRequester& mctpRequester, const ManagedObjectType& resp)
+{
+    // (Re)creates one GpuDevice per matching config interface, keyed by Name.
+    const auto wantedIface = configInterfaceName(sensorType);
+    for (const auto& [objectPath, ifaceMap] : resp)
+    {
+        for (const auto& [ifaceName, properties] : ifaceMap)
+        {
+            if (ifaceName == wantedIface)
+            {
+                const auto name = loadVariant<std::string>(properties, "Name");
+                gpuDevice[name] = std::make_shared<GpuDevice>(
+                    name, objectPath, dbusConnection, io, mctpRequester,
+                    objectServer);
+            }
+        }
+    }
+}
+
+void createSensors(
+    boost::asio::io_context& io, sdbusplus::asio::object_server& objectServer,
+    boost::container::flat_map<std::string, std::shared_ptr<GpuDevice>>&
+        gpuDevice,
+    std::shared_ptr<sdbusplus::asio::connection>& dbusConnection,
+    mctp::MctpRequester& mctpRequester)
+{
+    if (dbusConnection == nullptr)
+    {
+        lg2::error("Connection not created");
+        return;
+    }
+    // Captured references must stay valid until the reply handler has run.
+    dbusConnection->async_method_call(
+        [&io, &objectServer, &gpuDevice, &dbusConnection, &mctpRequester](
+            boost::system::error_code error, const ManagedObjectType& reply) {
+            if (!error)
+            {
+                processSensorConfigs(io, objectServer, gpuDevice,
+                                     dbusConnection, mctpRequester, reply);
+                return;
+            }
+            lg2::error("Error contacting entity manager");
+        },
+        entityManagerName, "/xyz/openbmc_project/inventory",
+        "org.freedesktop.DBus.ObjectManager", "GetManagedObjects");
+}
+
+void interfaceRemoved(
+    sdbusplus::message_t& message,
+    boost::container::flat_map<std::string, std::shared_ptr<GpuDevice>>&
+        gpuDevice)
+{
+    if (message.is_method_error())
+    {
+        lg2::error("interfacesRemoved callback method error");
+        return;
+    }
+
+    sdbusplus::message::object_path removedPath;
+    std::vector<std::string> removedIfaces;
+    message.read(removedPath, removedIfaces);
+
+    // Only a removed xyz.openbmc_project.Configuration.X interface matters.
+    const bool configRemoved =
+        std::find(removedIfaces.begin(), removedIfaces.end(),
+                  configInterfaceName(sensorType)) != removedIfaces.end();
+    if (!configRemoved)
+    {
+        return;
+    }
+    for (auto it = gpuDevice.begin(); it != gpuDevice.end();)
+    {
+        if (it->second->getPath() == removedPath)
+        {
+            it = gpuDevice.erase(it);
+            continue;
+        }
+        ++it;
+    }
+}
diff --git a/src/gpu/GpuDevice.hpp b/src/gpu/GpuDevice.hpp
new file mode 100644
index 0000000..e5128be
--- /dev/null
+++ b/src/gpu/GpuDevice.hpp
@@ -0,0 +1,201 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION &
+ * AFFILIATES. All rights reserved. SPDX-License-Identifier: Apache-2.0
+ */
+
+#pragma once
+
+#include "MctpRequester.hpp"
+#include "Thresholds.hpp"
+#include "UpdatableSensor.hpp"
+
+#include <boost/asio/io_context.hpp>
+#include <boost/asio/steady_timer.hpp>
+#include <boost/container/flat_map.hpp>
+#include <sdbusplus/asio/connection.hpp>
+#include <sdbusplus/asio/object_server.hpp>
+#include <sdbusplus/message.hpp>
+
+#include <chrono>
+#include <cstdint>
+#include <map>
+#include <memory>
+#include <string>
+#include <utility>
+#include <variant>
+#include <vector>
+
+constexpr const char* sensorPathPrefix = "/xyz/openbmc_project/sensors/";
+constexpr const char* sensorType = "NvidiaMctpVdm";
+// ObjectMapper GetSubTree reply: (path, [(service, [interface])]) entries.
+using getSubTreeRet = std::vector<std::pair<
+    std::string,
+    std::vector<std::pair<std::string, std::vector<std::string>>>>>;
+using GpuSensorConfigMap = std::map<
+    std::string, std::variant<std::string, bool, uint32_t, uint8_t, int64_t,
+                              std::vector<uint8_t>>>;
+
+/**
+ * @struct GpuDevice
+ * @brief Represents a GPU device in the system
+ * @details Looks up the MCTP endpoint of the GPU, creates the sensors of the
+ * device once the endpoint is known and polls them periodically.
+ * @note The D-Bus and MCTP completion handlers capture `this`, so the object
+ * has to stay alive while such requests are pending.
+ */
+struct GpuDevice
+{
+  public:
+    /**
+     * @brief Constructor for GpuDevice
+     * @details Stores the given collaborators and starts the asynchronous
+     *          discovery of MCTP endpoints; sensors are created once an
+     *          endpoint identifies itself as a GPU
+     *
+     * @param name Name of the GPU device, taken from its configuration
+     * @param path Object path of the configuration describing this device
+     * @param conn D-Bus connection for system communication
+     * @param io Boost ASIO I/O context for asynchronous operations
+     * @param mctpRequester MCTP protocol requester for GPU communication
+     * @param objectServer D-Bus object server for exposing interfaces
+     */
+    GpuDevice(const std::string& name, const std::string& path,
+              std::shared_ptr<sdbusplus::asio::connection>& conn,
+              boost::asio::io_context& io, mctp::MctpRequester& mctpRequester,
+              sdbusplus::asio::object_server& objectServer);
+
+    /**
+     * @brief Get the configuration path of this GPU device
+     * @return The path that was passed to the constructor
+     */
+    const std::string& getPath() const
+    {
+        return path;
+    }
+
+  private:
+    /**
+     * @brief Create sensors for this GPU device
+     * @details Currently adds one temperature sensor bound to the discovered
+     *          endpoint
+     */
+    void createSensors();
+
+    /**
+     * @brief Update every sensor of this device and schedule the next
+     *        polling round
+     */
+    void read();
+
+    /**
+     * @brief Discover available GPUs on the system
+     * @details Asks the object mapper for MCTP endpoint objects
+     */
+    void discoverGpus();
+
+    /**
+     * @brief Process MCTP endpoints discovered on the system
+     *
+     * @param[in] ec Error code from the D-Bus method call
+     * @param[in] ret Object tree results containing MCTP endpoint information
+     */
+    void processMctpEndpoints(const boost::system::error_code& ec,
+                              const getSubTreeRet& ret);
+
+    /**
+     * @brief Process configuration properties for MCTP endpoints
+     *
+     * @param[in] ec Error code from the D-Bus properties method call
+     * @param[in] configs Map of configuration properties for the endpoint
+     */
+    void processEndpointConfigs(const boost::system::error_code& ec,
+                                const GpuSensorConfigMap& configs);
+
+    /**
+     * @brief Process a discovered GPU endpoint
+     * @details Sends a device identification query to the endpoint; if the
+     *          reply reports a GPU, the EID is stored, the sensors are created
+     *          and polling starts
+     * @param eid The endpoint ID of the discovered GPU
+     */
+    void processGpuEndpoint(uint8_t eid);
+
+    /**
+     * @brief MCTP endpoint ID
+     */
+    uint8_t eid{};
+
+    /**
+     * @brief How often to poll the sensor in milliseconds
+     */
+    std::chrono::milliseconds sensorPollMs;
+
+    /**
+     * @brief Timer for scheduling sensor reads
+     */
+    boost::asio::steady_timer waitTimer;
+
+    /**
+     * @brief Reference to the MCTP requester for communication
+     */
+    mctp::MctpRequester& mctpRequester;
+
+    /**
+     * @brief D-Bus connection
+     */
+    std::shared_ptr<sdbusplus::asio::connection> conn;
+
+    /**
+     * @brief D-Bus object server
+     */
+    sdbusplus::asio::object_server& objectServer;
+
+    /**
+     * @brief Collection of sensors associated with this GPU device
+     * @details Stores all sensor objects created for this GPU
+     */
+    std::vector<std::shared_ptr<GpuSensor>> sensors;
+
+    /**
+     * @brief Name of this GPU device
+     * @details Escaped form of the name passed to the constructor
+     */
+    std::string name;
+
+    /**
+     * @brief Configuration object path for this GPU device
+     * @details Path of the configuration this device was created from; it is
+     *          also handed to the sensors as their configuration path
+     */
+    std::string path;
+};
+
+/**
+ * @brief Create GPU devices from the entity-manager configuration
+ * @details Discovers and creates GPU devices and their associated sensors in
+ * the system. This function is called at startup and whenever configuration
+ * changes are detected.
+ *
+ * @param io Boost ASIO I/O context for scheduling asynchronous operations
+ * @param objectServer D-Bus object server for exposing sensor interfaces
+ * @param gpuDevice Map to store created GPU device objects, keyed by their
+ * configured names
+ * @param dbusConnection D-Bus connection for system communication
+ * @param mctpRequester MCTP requester for GPU communication protocol
+ */
+void createSensors(
+ boost::asio::io_context& io, sdbusplus::asio::object_server& objectServer,
+ boost::container::flat_map<std::string, std::shared_ptr<GpuDevice>>&
+ gpuDevice,
+ std::shared_ptr<sdbusplus::asio::connection>& dbusConnection,
+ mctp::MctpRequester& mctpRequester);
+
+/**
+ * @brief Remove GPU devices whose configuration interface was removed
+ * @param message D-Bus message carrying the removed object path and interfaces
+ * @param gpuDevice Map of GPU devices to check for removal
+ */
+void interfaceRemoved(
+ sdbusplus::message_t& message,
+ boost::container::flat_map<std::string, std::shared_ptr<GpuDevice>>&
+ gpuDevice);
diff --git a/src/gpu/GpuSensor.cpp b/src/gpu/GpuSensor.cpp
index 119554d..a857862 100644
--- a/src/gpu/GpuSensor.cpp
+++ b/src/gpu/GpuSensor.cpp
@@ -7,54 +7,44 @@
#include "SensorPaths.hpp"
#include "Thresholds.hpp"
+#include "UpdatableSensor.hpp"
#include "Utils.hpp"
-#include "sensor.hpp"
#include <bits/basic_string.h>
+#include <GpuDevice.hpp>
#include <GpuMctpVdm.hpp>
#include <MctpRequester.hpp>
#include <OcpMctpVdm.hpp>
-#include <boost/asio/io_context.hpp>
-#include <boost/container/flat_map.hpp>
#include <phosphor-logging/lg2.hpp>
#include <sdbusplus/asio/connection.hpp>
#include <sdbusplus/asio/object_server.hpp>
-#include <sdbusplus/message.hpp>
-#include <sdbusplus/message/native_types.hpp>
-#include <algorithm>
-#include <chrono>
#include <cstddef>
#include <cstdint>
#include <functional>
-#include <map>
#include <memory>
#include <string>
#include <utility>
-#include <variant>
#include <vector>
using namespace std::literals;
constexpr uint8_t gpuTempSensorId{0};
-constexpr std::chrono::milliseconds samplingInterval{1000ms};
static constexpr double gpuTempSensorMaxReading = 127;
static constexpr double gpuTempSensorMinReading = -128;
GpuTempSensor::GpuTempSensor(
std::shared_ptr<sdbusplus::asio::connection>& conn,
- boost::asio::io_context& io, mctp::MctpRequester& mctpRequester,
- const std::string& name, const std::string& sensorConfiguration,
+ mctp::MctpRequester& mctpRequester, const std::string& name,
+ const std::string& sensorConfiguration, const uint8_t eid,
sdbusplus::asio::object_server& objectServer,
- std::vector<thresholds::Threshold>&& thresholdData,
- std::chrono::milliseconds pollRate) :
- Sensor(escapeName(name), std::move(thresholdData), sensorConfiguration,
- "temperature", false, true, gpuTempSensorMaxReading,
- gpuTempSensorMinReading, conn),
- sensorId{gpuTempSensorId}, sensorPollMs(pollRate),
- waitTimer(io, std::chrono::steady_clock::duration(0)),
- mctpRequester(mctpRequester), conn(conn), objectServer(objectServer)
+ std::vector<thresholds::Threshold>&& thresholdData) :
+ GpuSensor(escapeName(name), std::move(thresholdData), sensorConfiguration,
+ "temperature", false, true, gpuTempSensorMaxReading,
+ gpuTempSensorMinReading, conn),
+ eid(eid), sensorId{gpuTempSensorId}, mctpRequester(mctpRequester),
+ objectServer(objectServer)
{
std::string dbusPath =
sensorPathPrefix + "temperature/"s + escapeName(name);
@@ -71,12 +61,11 @@
association = objectServer.add_interface(dbusPath, association::interface);
- init();
+ setInitialProperties(sensor_paths::unitDegreesC);
}
GpuTempSensor::~GpuTempSensor()
{
- waitTimer.cancel();
for (const auto& iface : thresholdInterfaces)
{
objectServer.remove_interface(iface);
@@ -90,25 +79,6 @@
thresholds::checkThresholds(this);
}
-void GpuTempSensor::init()
-{
- discoverGpus();
-}
-
-void GpuTempSensor::read()
-{
- update();
-
- waitTimer.expires_after(std::chrono::milliseconds(sensorPollMs));
- waitTimer.async_wait([this](const boost::system::error_code& ec) {
- if (ec)
- {
- return;
- }
- read();
- });
-}
-
void GpuTempSensor::update()
{
std::vector<uint8_t> reqMsg(
@@ -165,283 +135,3 @@
updateValue(tempValue);
});
}
-
-void GpuTempSensor::processGpuEndpoint(uint8_t eid)
-{
- std::vector<uint8_t> reqMsg(
- sizeof(ocp::accelerator_management::BindingPciVid) +
- sizeof(gpu::QueryDeviceIdentificationRequest));
-
- auto* msg = new (reqMsg.data()) ocp::accelerator_management::Message;
-
- auto rc = gpu::encodeQueryDeviceIdentificationRequest(0, *msg);
- if (rc != ocp::accelerator_management::CompletionCode::SUCCESS)
- {
- lg2::error(
- "GpuTempSensor::processGpuEndPoint(): gpuEncodeQueryDeviceIdentificationRequest failed, rc={RC}",
- "RC", static_cast<int>(rc));
- return;
- }
-
- mctpRequester.sendRecvMsg(
- eid, reqMsg,
- [this, eid](int sendRecvMsgResult, std::vector<uint8_t> respMsg) {
- if (sendRecvMsgResult != 0)
- {
- lg2::error(
- "GpuTempSensor::processGpuEndPoint(): MctpRequester::sendRecvMsg() failed, rc={RC}",
- "RC", sendRecvMsgResult);
- return;
- }
-
- if (respMsg.empty())
- {
- lg2::error(
- "GpuTempSensor::processGpuEndPoint(): MctpRequester::sendRecvMsg() failed, respMsgLen=0");
- return;
- }
-
- uint8_t cc = 0;
- uint16_t reasonCode = 0;
- uint8_t responseDeviceType = 0;
- uint8_t responseInstanceId = 0;
-
- auto rc = gpu::decodeQueryDeviceIdentificationResponse(
- *new (respMsg.data()) ocp::accelerator_management::Message,
- respMsg.size(), cc, reasonCode, responseDeviceType,
- responseInstanceId);
-
- if (rc != ocp::accelerator_management::CompletionCode::SUCCESS ||
- cc != static_cast<uint8_t>(
- ocp::accelerator_management::CompletionCode::SUCCESS))
- {
- lg2::error(
- "GpuTempSensor::processGpuEndPoint(): gpuDecodeQueryDeviceIdentificationResponse() failed, rc={RC} cc={CC} reasonCode={RESC}",
- "RC", static_cast<int>(rc), "CC", cc, "RESC", reasonCode);
- return;
- }
-
- if (responseDeviceType ==
- static_cast<uint8_t>(gpu::DeviceIdentification::DEVICE_GPU))
- {
- lg2::info(
- "GpuTempSensor::processGpuEndPoint(): found the GPU with EID {EID}, DeviceType {DEVTYPE}, InstanceId {IID}.",
- "EID", eid, "DEVTYPE", responseDeviceType, "IID",
- responseInstanceId);
-
- this->eid = eid;
- setInitialProperties(sensor_paths::unitDegreesC);
- this->read();
- }
- });
-}
-
-void GpuTempSensor::processMctpEndpoints(const boost::system::error_code& ec,
- const getSubTreeRet& ret)
-{
- if (ec)
- {
- lg2::error("GpuTempSensor::discoverGpus(): Error:{ERROR}", "ERROR",
- ec.message());
- return;
- }
-
- if (ret.empty())
- {
- return;
- }
-
- for (const auto& [objPath, services] : ret)
- {
- for (const auto& [service, ifaces] : services)
- {
- for (const auto& iface : ifaces)
- {
- if (iface == "xyz.openbmc_project.MCTP.Endpoint")
- {
- conn->async_method_call(
- [this](const boost::system::error_code& ec,
- const GpuSensorConfigMap& configs) {
- this->processEndpointConfigs(ec, configs);
- },
- service, objPath, "org.freedesktop.DBus.Properties",
- "GetAll", iface);
- }
- }
- }
- }
-}
-
-void GpuTempSensor::processEndpointConfigs(const boost::system::error_code& ec,
- const GpuSensorConfigMap& configs)
-{
- if (ec)
- {
- lg2::error("GpuTempSensor::discoverGpus(): Error:{ERROR}", "ERROR",
- ec.message());
- return;
- }
-
- uint8_t eid{};
- std::vector<uint8_t> mctpTypes{};
-
- auto hasEid = configs.find("EID");
- if (hasEid != configs.end())
- {
- const auto* eidPtr = std::get_if<uint8_t>(&hasEid->second);
- if (eidPtr != nullptr)
- {
- eid = *eidPtr;
- }
- else
- {
- lg2::error(
- "GpuTempSensor::discoverGpus(): Property EID does not have valid type.");
- return;
- }
- }
- else
- {
- lg2::error(
- "GpuTempSensor::discoverGpus(): Property EID not found in the configuration.");
- return;
- }
-
- auto hasMctpTypes = configs.find("SupportedMessageTypes");
- if (hasMctpTypes != configs.end())
- {
- const auto* mctpTypePtr =
- std::get_if<std::vector<uint8_t>>(&hasMctpTypes->second);
- if (mctpTypePtr != nullptr)
- {
- mctpTypes = *mctpTypePtr;
- }
- else
- {
- lg2::error(
- "GpuTempSensor::discoverGpus(): Property SupportedMessageTypes does not have valid type.");
- return;
- }
- }
- else
- {
- lg2::error(
- "GpuTempSensor::discoverGpus(): Property SupportedMessageTypes not found in the configuration.");
- return;
- }
-
- if (std::find(mctpTypes.begin(), mctpTypes.end(),
- ocp::accelerator_management::messageType) != mctpTypes.end())
- {
- lg2::info(
- "GpuTempSensor::discoverGpus(): Found OCP MCTP VDM Endpoint with ID {EID}",
- "EID", eid);
- this->processGpuEndpoint(eid);
- }
-}
-
-void GpuTempSensor::discoverGpus()
-{
- std::string searchPath{"/au/com/codeconstruct/"};
- std::vector<std::string> ifaceList{{"xyz.openbmc_project.MCTP.Endpoint"}};
-
- conn->async_method_call(
- [this](const boost::system::error_code& ec, const getSubTreeRet& ret) {
- processMctpEndpoints(ec, ret);
- },
- "xyz.openbmc_project.ObjectMapper",
- "/xyz/openbmc_project/object_mapper",
- "xyz.openbmc_project.ObjectMapper", "GetSubTree", searchPath, 0,
- ifaceList);
-}
-
-void processSensorConfigs(
- boost::asio::io_context& io, sdbusplus::asio::object_server& objectServer,
- boost::container::flat_map<std::string, std::shared_ptr<GpuTempSensor>>&
- sensors,
- std::shared_ptr<sdbusplus::asio::connection>& dbusConnection,
- mctp::MctpRequester& mctpRequester, const ManagedObjectType& resp)
-{
- for (const auto& [path, interfaces] : resp)
- {
- for (const auto& [intf, cfg] : interfaces)
- {
- if (intf != configInterfaceName(sensorType))
- {
- continue;
- }
-
- std::string name = loadVariant<std::string>(cfg, "Name");
-
- sensors[name] = std::make_shared<GpuTempSensor>(
- dbusConnection, io, mctpRequester, name, path, objectServer,
- std::vector<thresholds::Threshold>{}, samplingInterval);
-
- lg2::info(
- "Added GPU Temperature Sensor {NAME} with chassis path: {PATH}.",
- "NAME", name, "PATH", path);
- }
- }
-}
-
-void createSensors(
- boost::asio::io_context& io, sdbusplus::asio::object_server& objectServer,
- boost::container::flat_map<std::string, std::shared_ptr<GpuTempSensor>>&
- sensors,
- std::shared_ptr<sdbusplus::asio::connection>& dbusConnection,
- mctp::MctpRequester& mctpRequester)
-{
- if (!dbusConnection)
- {
- lg2::error("Connection not created");
- return;
- }
- dbusConnection->async_method_call(
- [&sensors, &mctpRequester, &dbusConnection, &io, &objectServer](
- boost::system::error_code ec, const ManagedObjectType& resp) {
- if (ec)
- {
- lg2::error("Error contacting entity manager");
- return;
- }
-
- processSensorConfigs(io, objectServer, sensors, dbusConnection,
- mctpRequester, resp);
- },
- entityManagerName, "/xyz/openbmc_project/inventory",
- "org.freedesktop.DBus.ObjectManager", "GetManagedObjects");
-}
-
-void interfaceRemoved(
- sdbusplus::message_t& message,
- boost::container::flat_map<std::string, std::shared_ptr<GpuTempSensor>>&
- sensors)
-{
- if (message.is_method_error())
- {
- lg2::error("interfacesRemoved callback method error");
- return;
- }
-
- sdbusplus::message::object_path removedPath;
- std::vector<std::string> interfaces;
-
- message.read(removedPath, interfaces);
-
- // If the xyz.openbmc_project.Confguration.X interface was removed
- // for one or more sensors, delete those sensor objects.
- auto sensorIt = sensors.begin();
- while (sensorIt != sensors.end())
- {
- if ((sensorIt->second->configurationPath == removedPath) &&
- (std::find(interfaces.begin(), interfaces.end(),
- configInterfaceName(sensorType)) != interfaces.end()))
- {
- sensorIt = sensors.erase(sensorIt);
- }
- else
- {
- sensorIt++;
- }
- }
-}
diff --git a/src/gpu/GpuSensor.hpp b/src/gpu/GpuSensor.hpp
index 7c70d55..2961404 100644
--- a/src/gpu/GpuSensor.hpp
+++ b/src/gpu/GpuSensor.hpp
@@ -7,34 +7,16 @@
#include "MctpRequester.hpp"
#include "Thresholds.hpp"
-#include "sensor.hpp"
+#include "UpdatableSensor.hpp"
-#include <boost/asio/io_context.hpp>
-#include <boost/asio/steady_timer.hpp>
-#include <boost/container/flat_map.hpp>
#include <sdbusplus/asio/connection.hpp>
#include <sdbusplus/asio/object_server.hpp>
-#include <sdbusplus/message.hpp>
-#include <chrono>
#include <cstdint>
-#include <map>
#include <memory>
#include <string>
-#include <utility>
-#include <variant>
#include <vector>
-constexpr const char* sensorPathPrefix = "/xyz/openbmc_project/sensors/";
-constexpr const char* sensorType = "NvidiaMctpVdm";
-
-using getSubTreeRet = std::vector<
- std::pair<std::string,
- std::vector<std::pair<std::string, std::vector<std::string>>>>>;
-using GpuSensorConfigMap =
- std::map<std::string, std::variant<std::string, bool, uint32_t, uint8_t,
- int64_t, std::vector<uint8_t>>>;
-
/**
* @struct DeviceInfo
* @brief Contains information about a device
@@ -52,30 +34,27 @@
* management via std::enable_shared_from_this
*/
struct GpuTempSensor :
- public Sensor,
+ public GpuSensor,
public std::enable_shared_from_this<GpuTempSensor>
{
public:
/**
* @brief Constructor for GpuTempSensor
- * @param conn D-Bus connection
- * @param io Boost ASIO I/O context for asynchronous operations
+ * @param conn D-Bus connection for system communication
* @param mctpRequester MCTP protocol requester for GPU communication
- * @param name Name of the sensor
- * @param sensorConfiguration Configuration string for the sensor
- * @param objectServer D-Bus object server
- * @param thresholdData Vector of threshold configurations
- * @param pollRate How often to poll for new readings
- * @param deviceInfo Information about the GPU device
- * @param verbose Whether to enable verbose logging
+ * @param name Name of the sensor for identification in the system
+ * @param sensorConfiguration Configuration string for the sensor containing
+ * setup parameters
+ * @param eid EID of the device endpoint
+ * @param objectServer D-Bus object server for exposing sensor interfaces
+ * @param thresholdData Vector of threshold configurations for temperature
+ * monitoring
*/
GpuTempSensor(std::shared_ptr<sdbusplus::asio::connection>& conn,
- boost::asio::io_context& io,
mctp::MctpRequester& mctpRequester, const std::string& name,
- const std::string& sensorConfiguration,
+ const std::string& sensorConfiguration, uint8_t eid,
sdbusplus::asio::object_server& objectServer,
- std::vector<thresholds::Threshold>&& thresholdData,
- std::chrono::milliseconds pollRate);
+ std::vector<thresholds::Threshold>&& thresholdData);
/**
* @brief Destructor
@@ -91,47 +70,9 @@
private:
/**
- * @brief Read the current temperature value from the GPU
- */
- void read();
-
- /**
- * @brief Initialize the sensor
- */
- void init();
-
- /**
* @brief Update the sensor reading
*/
- void update();
-
- /**
- * @brief Discover available GPUs on the system
- */
- void discoverGpus();
-
- /**
- * @brief Process MCTP endpoints discovered on the system
- *
- * @param[in] ec Error code from the D-Bus method call
- * @param[in] ret Object tree results containing MCTP endpoint information
- */
- void processMctpEndpoints(const boost::system::error_code& ec,
- const getSubTreeRet& ret);
-
- /**
- * @brief Process configuration properties for MCTP endpoints
- *
- * @param[in] ec Error code from the D-Bus properties method call
- * @param[in] configs Map of configuration properties for the endpoint
- */
- void processEndpointConfigs(const boost::system::error_code& ec,
- const GpuSensorConfigMap& configs);
- /**
- * @brief Process a discovered GPU endpoint
- * @param eid The endpoint ID of the discovered GPU
- */
- void processGpuEndpoint(uint8_t eid);
+ void update() final;
/**
* @brief MCTP endpoint ID
@@ -144,52 +85,12 @@
uint8_t sensorId;
/**
- * @brief How often to poll the sensor in milliseconds
- */
- std::chrono::milliseconds sensorPollMs;
-
- /**
- * @brief Timer for scheduling sensor reads
- */
- boost::asio::steady_timer waitTimer;
-
- /**
* @brief Reference to the MCTP requester for communication
*/
mctp::MctpRequester& mctpRequester;
/**
- * @brief D-Bus connection
- */
- std::shared_ptr<sdbusplus::asio::connection> conn;
-
- /**
* @brief D-Bus object server
*/
sdbusplus::asio::object_server& objectServer;
};
-
-/**
- * @brief Create GPU temperature sensors
- * @param io Boost ASIO I/O context
- * @param objectServer D-Bus object server
- * @param sensors Map to store created sensors
- * @param dbusConnection D-Bus connection
- * @param mctpRequester MCTP requester for GPU communication
- */
-void createSensors(
- boost::asio::io_context& io, sdbusplus::asio::object_server& objectServer,
- boost::container::flat_map<std::string, std::shared_ptr<GpuTempSensor>>&
- sensors,
- std::shared_ptr<sdbusplus::asio::connection>& dbusConnection,
- mctp::MctpRequester& mctpRequester);
-
-/**
- * @brief Handle D-Bus interface removal events
- * @param message D-Bus message containing interface removal information
- * @param sensors Map of GPU temperature sensors to check for removal
- */
-void interfaceRemoved(
- sdbusplus::message_t& message,
- boost::container::flat_map<std::string, std::shared_ptr<GpuTempSensor>>&
- sensors);
diff --git a/src/gpu/GpuSensorMain.cpp b/src/gpu/GpuSensorMain.cpp
index 254a11a..9345b84 100644
--- a/src/gpu/GpuSensorMain.cpp
+++ b/src/gpu/GpuSensorMain.cpp
@@ -3,17 +3,16 @@
* AFFILIATES. All rights reserved. SPDX-License-Identifier: Apache-2.0
*/
-#include "GpuSensor.hpp"
#include "MctpRequester.hpp"
#include "OcpMctpVdm.hpp"
#include "Utils.hpp"
+#include <GpuDevice.hpp>
#include <boost/asio/error.hpp>
#include <boost/asio/io_context.hpp>
#include <boost/asio/post.hpp>
#include <boost/asio/steady_timer.hpp>
#include <boost/container/flat_map.hpp>
-#include <phosphor-logging/lg2.hpp>
#include <sdbusplus/asio/connection.hpp>
#include <sdbusplus/asio/object_server.hpp>
#include <sdbusplus/bus.hpp>
@@ -27,15 +26,24 @@
#include <string>
#include <vector>
-boost::container::flat_map<std::string, std::shared_ptr<GpuTempSensor>> sensors;
+/**
+ * @brief Global map of GPU devices keyed by their paths
+ * @details Stores all discovered GPU devices in the system for management
+ * and tracking throughout the application lifecycle
+ */
+boost::container::flat_map<std::string, std::shared_ptr<GpuDevice>> gpuDevice;
/**
- * @brief config timer expiry callback
- * @param io Boost ASIO I/O context
- * @param objectServer D-Bus object server
- * @param dbusConnection D-Bus connection
- * @param mctpRequester MCTP requester for GPU communication
- * @param ec Boost ASIO error code
+ * @brief Callback function executed when configuration timer expires
+ * @details Triggers the sensor creation or reconfiguration process when
+ * configuration changes are detected in the system. If the timer was canceled
+ * due to application shutdown or other reasons, the function returns early.
+ *
+ * @param io Boost ASIO I/O context for scheduling asynchronous operations
+ * @param objectServer D-Bus object server for exposing sensor interfaces
+ * @param dbusConnection D-Bus connection for system communication
+ * @param mctpRequester MCTP requester for GPU communication protocol
+ * @param ec Boost ASIO error code indicating success or failure reason
*/
void configTimerExpiryCallback(
boost::asio::io_context& io, sdbusplus::asio::object_server& objectServer,
@@ -46,11 +54,7 @@
{
return; // we're being canceled
}
- createSensors(io, objectServer, sensors, dbusConnection, mctpRequester);
- if (sensors.empty())
- {
- lg2::info("Configuration not detected");
- }
+ createSensors(io, objectServer, gpuDevice, dbusConnection, mctpRequester);
}
int main()
@@ -65,7 +69,7 @@
ocp::accelerator_management::messageType);
boost::asio::post(io, [&]() {
- createSensors(io, objectServer, sensors, systemBus, mctpRequester);
+ createSensors(io, objectServer, gpuDevice, systemBus, mctpRequester);
});
boost::asio::steady_timer configTimer(io);
@@ -90,7 +94,7 @@
static_cast<sdbusplus::bus_t&>(*systemBus),
"type='signal',member='InterfacesRemoved',arg0path='" +
std::string(inventoryPath) + "/'",
- [](sdbusplus::message_t& msg) { interfaceRemoved(msg, sensors); });
+ [](sdbusplus::message_t& msg) { interfaceRemoved(msg, gpuDevice); });
io.run();
return 0;
diff --git a/src/gpu/UpdatableSensor.hpp b/src/gpu/UpdatableSensor.hpp
new file mode 100644
index 0000000..a0d29b7
--- /dev/null
+++ b/src/gpu/UpdatableSensor.hpp
@@ -0,0 +1,16 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION &
+ * AFFILIATES. All rights reserved. SPDX-License-Identifier: Apache-2.0
+ */
+
+#pragma once
+
+#include "sensor.hpp"
+
+class GpuSensor : public Sensor // Sensor plus a pure-virtual update() hook
+{
+ public:
+ using Sensor::Sensor; // inherit Sensor's constructors
+
+ virtual void update() = 0; // implemented by each concrete sensor
+};
diff --git a/src/gpu/meson.build b/src/gpu/meson.build
index 1ec72c4..f8cfe39 100644
--- a/src/gpu/meson.build
+++ b/src/gpu/meson.build
@@ -1,4 +1,5 @@
gpusensor_sources = files(
+ 'GpuDevice.cpp',
'GpuMctpVdm.cpp',
'GpuSensor.cpp',
'GpuSensorMain.cpp',