nvidia-gpu: add entity-manager support
This commit adds support for reading entity-manager configurations
to the GPU D-Bus sensor app.
Tested.
Built an image for the gb200nvl-obmc machine with the following patches
cherry-picked. These patches are needed to enable the MCTP stack.
https://gerrit.openbmc.org/c/openbmc/openbmc/+/79312
https://gerrit.openbmc.org/c/openbmc/openbmc/+/79422
Copy the gpusensor app and run it.
```
root@gb200nvl-obmc:~# ./nvidiagpusensor
```
The app detects the entity-manager configuration on the gb200nvl-obmc
machine. The app is also able to detect all the endpoints from the MCTP
service D-Bus tree.
Change-Id: I05a0597964bcc0c135484fed714b6f677adc5891
Signed-off-by: Harshit Aghera <haghera@nvidia.com>
diff --git a/src/nvidia-gpu/NvidiaGpuSensor.cpp b/src/nvidia-gpu/NvidiaGpuSensor.cpp
new file mode 100644
index 0000000..3594c29
--- /dev/null
+++ b/src/nvidia-gpu/NvidiaGpuSensor.cpp
@@ -0,0 +1,285 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION &
+ * AFFILIATES. All rights reserved. SPDX-License-Identifier: Apache-2.0
+ */
+
+#include "NvidiaGpuSensor.hpp"
+
+#include "Thresholds.hpp"
+#include "Utils.hpp"
+#include "sensor.hpp"
+
+#include <bits/basic_string.h>
+
+#include <boost/asio/io_context.hpp>
+#include <boost/container/flat_map.hpp>
+#include <phosphor-logging/lg2.hpp>
+#include <sdbusplus/asio/connection.hpp>
+#include <sdbusplus/asio/object_server.hpp>
+#include <sdbusplus/message.hpp>
+#include <sdbusplus/message/native_types.hpp>
+
+#include <algorithm>
+#include <chrono>
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <utility>
+#include <variant>
+#include <vector>
+
+using namespace std::literals;
+
+static constexpr double gpuTempSensorMaxReading = 127; // reading limit handed to the Sensor base constructor
+static constexpr double gpuTempSensorMinReading = -128; // reading limit handed to the Sensor base constructor
+
+/**
+ * @brief Build the sensor, add its D-Bus interfaces and start discovery.
+ *
+ * The object path is sensorPathPrefix + "temperature/" + the escaped sensor
+ * name. One interface is added for the sensor value, one per configured
+ * threshold level, and one for the association.
+ */
+GpuTempSensor::GpuTempSensor(
+    std::shared_ptr<sdbusplus::asio::connection>& conn,
+    boost::asio::io_context& io, const std::string& name,
+    const std::string& sensorConfiguration,
+    sdbusplus::asio::object_server& objectServer,
+    std::vector<thresholds::Threshold>&& thresholdData) :
+    Sensor(escapeName(name), std::move(thresholdData), sensorConfiguration,
+           "temperature", false, true, gpuTempSensorMaxReading,
+           gpuTempSensorMinReading, conn),
+    waitTimer(io, std::chrono::steady_clock::duration(0)), conn(conn),
+    objectServer(objectServer)
+{
+    const std::string objectPath =
+        sensorPathPrefix + "temperature/"s + escapeName(name);
+
+    sensorInterface = objectServer.add_interface(
+        objectPath, "xyz.openbmc_project.Sensor.Value");
+
+    // Each threshold level owns one slot in thresholdInterfaces.
+    for (const auto& entry : thresholds)
+    {
+        const auto slot = static_cast<size_t>(entry.level);
+        thresholdInterfaces[slot] = objectServer.add_interface(
+            objectPath, thresholds::getInterface(entry.level));
+    }
+
+    association =
+        objectServer.add_interface(objectPath, association::interface);
+
+    discoverGpus();
+}
+
+/**
+ * @brief Cancel the wait timer and remove the interfaces held by this sensor
+ *        from the object server.
+ */
+GpuTempSensor::~GpuTempSensor()
+{
+    waitTimer.cancel();
+
+    // Removal order: threshold interfaces, association, then sensor value.
+    for (const auto& thresholdIface : thresholdInterfaces)
+    {
+        objectServer.remove_interface(thresholdIface);
+    }
+
+    objectServer.remove_interface(association);
+    objectServer.remove_interface(sensorInterface);
+}
+
+void GpuTempSensor::checkThresholds()
+{
+ thresholds::checkThresholds(this); // evaluation is delegated to the helper in the thresholds namespace
+}
+
+/**
+ * @brief Handle the ObjectMapper GetSubTree reply listing MCTP endpoints.
+ *
+ * For every object that implements xyz.openbmc_project.MCTP.Endpoint an
+ * asynchronous Properties.GetAll call is issued; its reply is forwarded to
+ * processEndpoint(). An empty result simply issues no calls.
+ *
+ * @param[in] ec Error code from the GetSubTree method call
+ * @param[in] ret Object path -> (service -> interface list) results
+ */
+void GpuTempSensor::queryEndpoints(const boost::system::error_code& ec,
+                                   const GetSubTreeType& ret)
+{
+    if (ec)
+    {
+        lg2::error("Error querying endpoints: {ERROR}", "ERROR", ec.message());
+        return;
+    }
+
+    for (const auto& [objPath, services] : ret)
+    {
+        for (const auto& [service, ifaces] : services)
+        {
+            for (const auto& iface : ifaces)
+            {
+                if (iface != "xyz.openbmc_project.MCTP.Endpoint")
+                {
+                    continue;
+                }
+
+                // The reply can arrive after this sensor has been erased from
+                // the sensor map, so keep only a weak reference and drop the
+                // reply when the sensor no longer exists. This relies on the
+                // sensor being owned by a std::shared_ptr.
+                conn->async_method_call(
+                    [weakThis = weak_from_this()](
+                        const boost::system::error_code& ec,
+                        const SensorBaseConfigMap& configs) {
+                        if (auto self = weakThis.lock())
+                        {
+                            self->processEndpoint(ec, configs);
+                        }
+                    },
+                    service, objPath, "org.freedesktop.DBus.Properties",
+                    "GetAll", iface);
+            }
+        }
+    }
+}
+
+/**
+ * @brief Validate the properties reported for one MCTP endpoint.
+ *
+ * Reads the "EID" and "SupportedMessageTypes" properties from the
+ * Properties.GetAll reply. An error is logged and processing stops when a
+ * property is missing or does not hold the expected type.
+ *
+ * @param[in] ec Error code from the D-Bus properties method call
+ * @param[in] endpoint Property name -> value map of the endpoint
+ */
+void GpuTempSensor::processEndpoint(const boost::system::error_code& ec,
+                                    const SensorBaseConfigMap& endpoint)
+{
+    if (ec)
+    {
+        lg2::error("Error processing MCTP endpoint: {ERROR}", "ERROR",
+                   ec.message());
+        return;
+    }
+
+    const auto eidIt = endpoint.find("EID");
+    if (eidIt == endpoint.end())
+    {
+        lg2::error(
+            "Error processing MCTP endpoint: Property EID not found in the configuration.");
+        return;
+    }
+
+    const auto* eidValue = std::get_if<uint8_t>(&eidIt->second);
+    if (eidValue == nullptr)
+    {
+        lg2::error(
+            "Error processing MCTP endpoint: Property EID does not have valid type.");
+        return;
+    }
+    [[maybe_unused]] const uint8_t eid = *eidValue;
+
+    const auto typesIt = endpoint.find("SupportedMessageTypes");
+    if (typesIt == endpoint.end())
+    {
+        lg2::error(
+            "Error processing MCTP endpoint: Property SupportedMessageTypes not found in the configuration.");
+        return;
+    }
+
+    const auto* typesValue =
+        std::get_if<std::vector<uint8_t>>(&typesIt->second);
+    if (typesValue == nullptr)
+    {
+        lg2::error(
+            "Error processing MCTP endpoint: Property SupportedMessageTypes does not have valid type.");
+        return;
+    }
+    const std::vector<uint8_t> mctpTypes = *typesValue;
+
+    // if the OCP MCTP VDM Message type (0x7E) is found in mctpTypes
+    // process the endpoint further.
+    (void)this;
+}
+
+/**
+ * @brief Start MCTP endpoint discovery.
+ *
+ * Asks the ObjectMapper for the objects below /au/com/codeconstruct/ that
+ * implement xyz.openbmc_project.MCTP.Endpoint. The reply is handed to
+ * queryEndpoints().
+ */
+void GpuTempSensor::discoverGpus()
+{
+    const std::string mctpSearchRoot = "/au/com/codeconstruct/";
+    const std::vector<std::string> endpointInterfaces = {
+        "xyz.openbmc_project.MCTP.Endpoint"};
+
+    // NOTE: this runs from the constructor, where weak_from_this() is not yet
+    // usable, so the handler captures the raw this pointer.
+    conn->async_method_call(
+        [this](const boost::system::error_code& ec,
+               const GetSubTreeType& subtree) {
+            this->queryEndpoints(ec, subtree);
+        },
+        "xyz.openbmc_project.ObjectMapper",
+        "/xyz/openbmc_project/object_mapper",
+        "xyz.openbmc_project.ObjectMapper", "GetSubTree", mctpSearchRoot, 0,
+        endpointInterfaces);
+}
+
+/**
+ * @brief Create a GpuTempSensor for each matching configuration interface.
+ *
+ * Walks the GetManagedObjects response and, for every interface equal to
+ * configInterfaceName(sensorType), creates a sensor named after the "Name"
+ * property and stores it in @p sensors under that name.
+ *
+ * @param io Boost ASIO I/O context
+ * @param objectServer D-Bus object server
+ * @param sensors Map receiving the created sensors
+ * @param dbusConnection D-Bus connection
+ * @param resp GetManagedObjects response to scan
+ */
+void processSensorConfigs(
+    boost::asio::io_context& io, sdbusplus::asio::object_server& objectServer,
+    boost::container::flat_map<std::string, std::shared_ptr<GpuTempSensor>>&
+        sensors,
+    std::shared_ptr<sdbusplus::asio::connection>& dbusConnection,
+    const ManagedObjectType& resp)
+{
+    const std::string wantedInterface = configInterfaceName(sensorType);
+
+    for (const auto& [path, interfaces] : resp)
+    {
+        for (const auto& [intf, cfg] : interfaces)
+        {
+            if (intf == wantedInterface)
+            {
+                const std::string name =
+                    loadVariant<std::string>(cfg, "Name");
+
+                sensors[name] = std::make_shared<GpuTempSensor>(
+                    dbusConnection, io, name, path, objectServer,
+                    std::vector<thresholds::Threshold>{});
+
+                lg2::info(
+                    "Added GPU Temperature Sensor {NAME} with chassis path: {PATH}.",
+                    "NAME", name, "PATH", path);
+            }
+        }
+    }
+}
+
+/**
+ * @brief Request the entity-manager configuration and create sensors from it.
+ *
+ * Logs an error and returns when @p dbusConnection is null. Otherwise issues
+ * an asynchronous GetManagedObjects call; the reply is passed on to
+ * processSensorConfigs().
+ */
+void createSensors(
+    boost::asio::io_context& io, sdbusplus::asio::object_server& objectServer,
+    boost::container::flat_map<std::string, std::shared_ptr<GpuTempSensor>>&
+        sensors,
+    std::shared_ptr<sdbusplus::asio::connection>& dbusConnection)
+{
+    if (dbusConnection == nullptr)
+    {
+        lg2::error("Connection not created");
+        return;
+    }
+
+    // Reply handler; it holds references to the arguments, which therefore
+    // have to stay alive until the reply has been processed.
+    auto onManagedObjects =
+        [&sensors, &dbusConnection, &io, &objectServer](
+            const boost::system::error_code& ec,
+            const ManagedObjectType& resp) {
+            if (!ec)
+            {
+                processSensorConfigs(io, objectServer, sensors,
+                                     dbusConnection, resp);
+                return;
+            }
+
+            lg2::error("Error contacting entity manager");
+        };
+
+    dbusConnection->async_method_call(
+        std::move(onManagedObjects), entityManagerName,
+        "/xyz/openbmc_project/inventory",
+        "org.freedesktop.DBus.ObjectManager", "GetManagedObjects");
+}
+
+/**
+ * @brief React to an InterfacesRemoved signal.
+ *
+ * When the removed interfaces include configInterfaceName(sensorType), every
+ * sensor whose configurationPath equals the removed object path is erased
+ * from @p sensors.
+ */
+void interfaceRemoved(
+    sdbusplus::message_t& message,
+    boost::container::flat_map<std::string, std::shared_ptr<GpuTempSensor>>&
+        sensors)
+{
+    if (message.is_method_error())
+    {
+        lg2::error("interfacesRemoved callback method error");
+        return;
+    }
+
+    sdbusplus::message::object_path removedPath;
+    std::vector<std::string> interfaces;
+    message.read(removedPath, interfaces);
+
+    // Only the removal of this daemon's configuration interface matters.
+    const bool configRemoved =
+        std::find(interfaces.begin(), interfaces.end(),
+                  configInterfaceName(sensorType)) != interfaces.end();
+    if (!configRemoved)
+    {
+        return;
+    }
+
+    // Drop each sensor that was created from the removed configuration.
+    for (auto it = sensors.begin(); it != sensors.end();)
+    {
+        if (it->second->configurationPath == removedPath)
+        {
+            it = sensors.erase(it);
+        }
+        else
+        {
+            ++it;
+        }
+    }
+}
diff --git a/src/nvidia-gpu/NvidiaGpuSensor.hpp b/src/nvidia-gpu/NvidiaGpuSensor.hpp
new file mode 100644
index 0000000..14627fc
--- /dev/null
+++ b/src/nvidia-gpu/NvidiaGpuSensor.hpp
@@ -0,0 +1,123 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION &
+ * AFFILIATES. All rights reserved. SPDX-License-Identifier: Apache-2.0
+ */
+
+#pragma once
+
+#include "Thresholds.hpp"
+#include "Utils.hpp"
+#include "sensor.hpp"
+
+#include <boost/asio/io_context.hpp>
+#include <boost/asio/steady_timer.hpp>
+#include <boost/container/flat_map.hpp>
+#include <sdbusplus/asio/connection.hpp>
+#include <sdbusplus/asio/object_server.hpp>
+#include <sdbusplus/message.hpp>
+
+#include <memory>
+#include <string>
+#include <vector>
+
+constexpr const char* sensorPathPrefix = "/xyz/openbmc_project/sensors/"; // prefix of every sensor object path
+constexpr const char* sensorType = "NvidiaMctpVdm"; // type passed to configInterfaceName() to match configurations
+
+struct GpuTempSensor :
+ public Sensor,
+ public std::enable_shared_from_this<GpuTempSensor>
+{
+ public:
+ /**
+ * @brief Constructor for GpuTempSensor
+ *
+ * Adds the sensor value, threshold and association interfaces under
+ * sensorPathPrefix + "temperature/" + the escaped sensor name, then
+ * starts MCTP endpoint discovery.
+ *
+ * @param conn D-Bus connection
+ * @param io Boost ASIO I/O context for asynchronous operations
+ * @param name Name of the sensor
+ * @param sensorConfiguration Configuration path of the sensor
+ * @param objectServer D-Bus object server
+ * @param thresholdData Vector of threshold configurations
+ */
+ GpuTempSensor(std::shared_ptr<sdbusplus::asio::connection>& conn,
+ boost::asio::io_context& io, const std::string& name,
+ const std::string& sensorConfiguration,
+ sdbusplus::asio::object_server& objectServer,
+ std::vector<thresholds::Threshold>&& thresholdData);
+
+ /**
+ * @brief Destructor; cancels the wait timer and removes the D-Bus
+ * interfaces added by the constructor
+ */
+ ~GpuTempSensor() override;
+
+ /**
+ * @brief Check if any thresholds have been crossed
+ * @details Overrides the base class method and forwards to
+ * thresholds::checkThresholds()
+ */
+ void checkThresholds() override;
+
+ private:
+ /**
+ * @brief Start discovery by asking the ObjectMapper for MCTP endpoints
+ */
+ void discoverGpus();
+
+ /**
+ * @brief Process MCTP endpoints discovered on the system
+ *
+ * @param[in] ec Error code from the D-Bus method call
+ * @param[in] ret Object tree results containing MCTP endpoint information
+ */
+ void queryEndpoints(const boost::system::error_code& ec,
+ const GetSubTreeType& ret);
+
+ /**
+ * @brief Process configuration properties for MCTP endpoints
+ *
+ * @param[in] ec Error code from the D-Bus properties method call
+ * @param[in] endpoint Map of properties reported for the endpoint
+ */
+ void processEndpoint(const boost::system::error_code& ec,
+ const SensorBaseConfigMap& endpoint);
+
+ /**
+ * @brief Timer for scheduling sensor reads
+ */
+ boost::asio::steady_timer waitTimer;
+
+ /**
+ * @brief D-Bus connection
+ */
+ std::shared_ptr<sdbusplus::asio::connection> conn;
+
+ /**
+ * @brief D-Bus object server
+ */
+ sdbusplus::asio::object_server& objectServer;
+};
+
+/**
+ * @brief Create GPU temperature sensors
+ *
+ * Issues an asynchronous GetManagedObjects call to entity-manager. Each
+ * configuration exposing the configuration interface for sensorType yields
+ * one GpuTempSensor, stored in the map under its configured "Name". Logs an
+ * error and returns when the connection pointer is null.
+ *
+ * @param io Boost ASIO I/O context
+ * @param objectServer D-Bus object server
+ * @param sensors Map to store created sensors
+ * @param dbusConnection D-Bus connection
+ */
+void createSensors(
+ boost::asio::io_context& io, sdbusplus::asio::object_server& objectServer,
+ boost::container::flat_map<std::string, std::shared_ptr<GpuTempSensor>>&
+ sensors,
+ std::shared_ptr<sdbusplus::asio::connection>& dbusConnection);
+
+/**
+ * @brief Handle D-Bus interface removal events
+ *
+ * Erases every sensor whose configurationPath equals the removed object
+ * path, provided the removed interfaces include the configuration interface
+ * for sensorType.
+ *
+ * @param message D-Bus message containing interface removal information
+ * @param sensors Map of GPU temperature sensors to check for removal
+ */
+void interfaceRemoved(
+ sdbusplus::message_t& message,
+ boost::container::flat_map<std::string, std::shared_ptr<GpuTempSensor>>&
+ sensors);
diff --git a/src/nvidia-gpu/NvidiaGpuSensorMain.cpp b/src/nvidia-gpu/NvidiaGpuSensorMain.cpp
index 85fec45..9879c2c 100644
--- a/src/nvidia-gpu/NvidiaGpuSensorMain.cpp
+++ b/src/nvidia-gpu/NvidiaGpuSensorMain.cpp
@@ -3,12 +3,45 @@
* AFFILIATES. All rights reserved. SPDX-License-Identifier: Apache-2.0
*/
+#include "NvidiaGpuSensor.hpp"
+#include "Utils.hpp"
+
+#include <boost/asio/error.hpp>
#include <boost/asio/io_context.hpp>
+#include <boost/asio/post.hpp>
+#include <boost/asio/steady_timer.hpp>
+#include <boost/container/flat_map.hpp>
+#include <phosphor-logging/lg2.hpp>
#include <sdbusplus/asio/connection.hpp>
#include <sdbusplus/asio/object_server.hpp>
+#include <sdbusplus/bus.hpp>
+#include <sdbusplus/bus/match.hpp>
+#include <sdbusplus/message.hpp>
+#include <array>
+#include <chrono>
+#include <functional>
#include <memory>
#include <string>
+#include <vector>
+
+boost::container::flat_map<std::string, std::shared_ptr<GpuTempSensor>> sensors; // all sensors created by this daemon
+
+/**
+ * @brief Rescan the entity-manager configuration when the config timer fires.
+ *
+ * @param io Boost ASIO I/O context
+ * @param objectServer D-Bus object server
+ * @param dbusConnection D-Bus connection
+ * @param ec Completion status of the timer wait
+ */
+void configTimerExpiryCallback(
+    boost::asio::io_context& io, sdbusplus::asio::object_server& objectServer,
+    std::shared_ptr<sdbusplus::asio::connection>& dbusConnection,
+    const boost::system::error_code& ec)
+{
+    if (ec == boost::asio::error::operation_aborted)
+    {
+        return; // we're being canceled
+    }
+    if (ec)
+    {
+        // Any other timer failure must not trigger a rescan.
+        lg2::error("Configuration timer error: {ERROR}", "ERROR",
+                   ec.message());
+        return;
+    }
+
+    createSensors(io, objectServer, sensors, dbusConnection);
+
+    // createSensors() only starts an asynchronous query, so this check sees
+    // the sensors that existed before the rescan was requested.
+    if (sensors.empty())
+    {
+        lg2::info("Configuration not detected");
+    }
+}
int main()
{
@@ -18,6 +51,33 @@
objectServer.add_manager("/xyz/openbmc_project/sensors");
systemBus->request_name("xyz.openbmc_project.GpuSensor");
+ boost::asio::post(io, [&]() {
+ createSensors(io, objectServer, sensors, systemBus);
+ });
+
+ boost::asio::steady_timer configTimer(io);
+
+ std::function<void(sdbusplus::message_t&)> eventHandler =
+ [&configTimer, &io, &objectServer, &systemBus](sdbusplus::message_t&) {
+ configTimer.expires_after(std::chrono::seconds(1));
+ // create a timer because normally multiple properties change
+ configTimer.async_wait(
+ std::bind_front(configTimerExpiryCallback, std::ref(io),
+ std::ref(objectServer), std::ref(systemBus)));
+ };
+
+ std::vector<std::unique_ptr<sdbusplus::bus::match_t>> matches =
+ setupPropertiesChangedMatches(
+ *systemBus, std::to_array<const char*>({sensorType}), eventHandler);
+
+ // Watch for entity-manager to remove configuration interfaces
+ // so the corresponding sensors can be removed.
+ auto ifaceRemovedMatch = std::make_shared<sdbusplus::bus::match_t>(
+ static_cast<sdbusplus::bus_t&>(*systemBus),
+ sdbusplus::bus::match::rules::interfacesRemovedAtPath(
+ std::string(inventoryPath)),
+ [](sdbusplus::message_t& msg) { interfaceRemoved(msg, sensors); });
+
io.run();
return 0;
}
diff --git a/src/nvidia-gpu/meson.build b/src/nvidia-gpu/meson.build
index 4a28491..9918435 100644
--- a/src/nvidia-gpu/meson.build
+++ b/src/nvidia-gpu/meson.build
@@ -1,4 +1,4 @@
-gpusensor_sources = files('NvidiaGpuSensorMain.cpp')
+gpusensor_sources = files('NvidiaGpuSensor.cpp', 'NvidiaGpuSensorMain.cpp')
gpusensor_include_dir = include_directories('.', is_system: true)
sensor_include_dir = include_directories('../..')