gpu : add entity-manager support
The commit add support for reading of the entity-manager configurations
to the gpu dbus sensor app.
Tested.
Build an image for gb200nvl-obmc machine with the following patches
cherry picked. This patches are needed to enable the mctp stack.
https://gerrit.openbmc.org/c/openbmc/openbmc/+/79312
https://gerrit.openbmc.org/c/openbmc/openbmc/+/79410
https://gerrit.openbmc.org/c/openbmc/openbmc/+/79422
Copy the configuration file on gb200nvl-obmc machine and restart the
entity-manager service.
```
root@gb200nvl-obmc:~# rm -rf /var/configuration/
root@gb200nvl-obmc:~# systemctl restart xyz.openbmc_project.EntityManager.service
```
Copy the gpusensor app and run it.
```
root@gb200nvl-obmc:~# ./gpusensor
```
The app is detecting entity-manager configuration on gb200nvl-obmc
machine. The app is also able to detect all the endpoints from the mctp
service dbus tree.
Change-Id: I05a0597964bcc0c135484fed714b6f677adc5891
Signed-off-by: Harshit Aghera <haghera@nvidia.com>
diff --git a/src/gpu/GpuSensor.hpp b/src/gpu/GpuSensor.hpp
new file mode 100644
index 0000000..01eb23f
--- /dev/null
+++ b/src/gpu/GpuSensor.hpp
@@ -0,0 +1,154 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION &
+ * AFFILIATES. All rights reserved. SPDX-License-Identifier: Apache-2.0
+ */
+
+#pragma once
+
+#include "Thresholds.hpp"
+#include "sensor.hpp"
+
+#include <boost/asio/io_context.hpp>
+#include <boost/asio/steady_timer.hpp>
+#include <boost/container/flat_map.hpp>
+#include <sdbusplus/asio/connection.hpp>
+#include <sdbusplus/asio/object_server.hpp>
+#include <sdbusplus/message.hpp>
+
+#include <cstdint>
+#include <map>
+#include <memory>
+#include <string>
+#include <utility>
+#include <variant>
+#include <vector>
+
+constexpr const char* sensorPathPrefix = "/xyz/openbmc_project/sensors/";
+constexpr const char* sensorType = "NvidiaMctpVdm";
+
+using getSubTreeRet = std::vector<
+ std::pair<std::string,
+ std::vector<std::pair<std::string, std::vector<std::string>>>>>;
+using GpuSensorConfigMap =
+ std::map<std::string, std::variant<std::string, bool, uint32_t, uint8_t,
+ int64_t, std::vector<uint8_t>>>;
+
+/**
+ * @struct DeviceInfo
+ * @brief Contains information about a device
+ */
+struct DeviceInfo
+{
+ uint8_t deviceType;
+ uint8_t instanceId;
+};
+
+/**
+ * @struct GpuTempSensor
+ * @brief Implements a GPU temperature sensor that monitors temperature values
+ * @details Inherits from Sensor base class and enables shared pointer
+ * management via std::enable_shared_from_this
+ */
+struct GpuTempSensor :
+ public Sensor,
+ public std::enable_shared_from_this<GpuTempSensor>
+{
+ public:
+ /**
+ * @brief Constructor for GpuTempSensor
+ * @param conn D-Bus connection
+ * @param io Boost ASIO I/O context for asynchronous operations
+ * @param mctpRequester MCTP protocol requester for GPU communication
+ * @param name Name of the sensor
+ * @param sensorConfiguration Configuration string for the sensor
+ * @param objectServer D-Bus object server
+ * @param thresholdData Vector of threshold configurations
+ * @param pollRate How often to poll for new readings
+ * @param deviceInfo Information about the GPU device
+ * @param verbose Whether to enable verbose logging
+ */
+ GpuTempSensor(std::shared_ptr<sdbusplus::asio::connection>& conn,
+ boost::asio::io_context& io, const std::string& name,
+ const std::string& sensorConfiguration,
+ sdbusplus::asio::object_server& objectServer,
+ std::vector<thresholds::Threshold>&& thresholdData);
+
+ /**
+ * @brief Destructor
+ */
+ ~GpuTempSensor() override;
+
+ /**
+ * @brief Check if any thresholds have been crossed
+ * @details Overrides the base class method to implement GPU-specific
+ * threshold checking
+ */
+ void checkThresholds() override;
+
+ private:
+ /**
+ * @brief Initialize the sensor
+ */
+ void init();
+
+ /**
+ * @brief Discover available GPUs on the system
+ */
+ void discoverGpus();
+
+ /**
+ * @brief Process MCTP endpoints discovered on the system
+ *
+ * @param[in] ec Error code from the D-Bus method call
+ * @param[in] ret Object tree results containing MCTP endpoint information
+ */
+ void processMctpEndpoints(const boost::system::error_code& ec,
+ const getSubTreeRet& ret);
+
+ /**
+ * @brief Process configuration properties for MCTP endpoints
+ *
+ * @param[in] ec Error code from the D-Bus properties method call
+ * @param[in] configs Map of configuration properties for the endpoint
+ */
+ void processEndpointConfigs(const boost::system::error_code& ec,
+ const GpuSensorConfigMap& configs);
+
+ /**
+ * @brief Timer for scheduling sensor reads
+ */
+ boost::asio::steady_timer waitTimer;
+
+ /**
+ * @brief D-Bus connection
+ */
+ std::shared_ptr<sdbusplus::asio::connection> conn;
+
+ /**
+ * @brief D-Bus object server
+ */
+ sdbusplus::asio::object_server& objectServer;
+};
+
+/**
+ * @brief Create GPU temperature sensors
+ * @param io Boost ASIO I/O context
+ * @param objectServer D-Bus object server
+ * @param sensors Map to store created sensors
+ * @param dbusConnection D-Bus connection
+ */
+void createSensors(
+ boost::asio::io_context& io, sdbusplus::asio::object_server& objectServer,
+ boost::container::flat_map<std::string, std::shared_ptr<GpuTempSensor>>&
+ sensors,
+ std::shared_ptr<sdbusplus::asio::connection>& dbusConnection);
+
+/**
+ * @brief Handle D-Bus interface removal events
+ * @param message D-Bus message containing interface removal information
+ * @param sensors Map of GPU temperature sensors to check for removal
+ */
+void interfaceRemoved(
+ sdbusplus::message_t& message,
+ boost::container::flat_map<std::string, std::shared_ptr<GpuTempSensor>>&
+ sensors);