gpu: add entity-manager support

This commit adds support for reading entity-manager configurations in
the gpu dbus sensor app.

Tested.

Build an image for the gb200nvl-obmc machine with the following patches
cherry-picked. These patches are needed to enable the mctp stack.

https://gerrit.openbmc.org/c/openbmc/openbmc/+/79312
https://gerrit.openbmc.org/c/openbmc/openbmc/+/79410
https://gerrit.openbmc.org/c/openbmc/openbmc/+/79422

Copy the configuration file to the gb200nvl-obmc machine and restart the
entity-manager service.
```
root@gb200nvl-obmc:~# rm -rf /var/configuration/
root@gb200nvl-obmc:~# systemctl restart xyz.openbmc_project.EntityManager.service
```

Copy the gpusensor app and run it.
```
root@gb200nvl-obmc:~# ./gpusensor
```

The app detects the entity-manager configuration on the gb200nvl-obmc
machine. The app is also able to detect all the endpoints from the mctp
service dbus tree.

Change-Id: I05a0597964bcc0c135484fed714b6f677adc5891
Signed-off-by: Harshit Aghera <haghera@nvidia.com>
diff --git a/src/gpu/GpuSensor.cpp b/src/gpu/GpuSensor.cpp
new file mode 100644
index 0000000..ed81339
--- /dev/null
+++ b/src/gpu/GpuSensor.cpp
@@ -0,0 +1,292 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION &
+ * AFFILIATES. All rights reserved. SPDX-License-Identifier: Apache-2.0
+ */
+
+#include "GpuSensor.hpp"
+
+#include "Thresholds.hpp"
+#include "Utils.hpp"
+#include "sensor.hpp"
+
+#include <bits/basic_string.h>
+
+#include <boost/asio/io_context.hpp>
+#include <boost/container/flat_map.hpp>
+#include <phosphor-logging/lg2.hpp>
+#include <sdbusplus/asio/connection.hpp>
+#include <sdbusplus/asio/object_server.hpp>
+#include <sdbusplus/message.hpp>
+#include <sdbusplus/message/native_types.hpp>
+
+#include <algorithm>
+#include <chrono>
+#include <cstddef>
+#include <cstdint>
+#include <map>
+#include <memory>
+#include <string>
+#include <utility>
+#include <variant>
+#include <vector>
+
+using namespace std::literals;
+
+static constexpr double gpuTempSensorMaxReading = 127;  // Sensor max, INT8_MAX
+static constexpr double gpuTempSensorMinReading = -128; // Sensor min, INT8_MIN
+
+GpuTempSensor::GpuTempSensor(
+    std::shared_ptr<sdbusplus::asio::connection>& conn,
+    boost::asio::io_context& io, const std::string& name,
+    const std::string& sensorConfiguration,
+    sdbusplus::asio::object_server& objectServer,
+    std::vector<thresholds::Threshold>&& thresholdData) :
+    Sensor(escapeName(name), std::move(thresholdData), sensorConfiguration,
+           "temperature", false, true, gpuTempSensorMaxReading,
+           gpuTempSensorMinReading, conn),
+    waitTimer(io, std::chrono::steady_clock::duration(0)), conn(conn),
+    objectServer(objectServer)
+{
+    std::string dbusPath = // <sensorPathPrefix>temperature/<escaped name>
+        sensorPathPrefix + "temperature/"s + escapeName(name);
+
+    sensorInterface = objectServer.add_interface(
+        dbusPath, "xyz.openbmc_project.Sensor.Value");
+
+    for (const auto& threshold : thresholds) // interface per threshold level
+    {
+        std::string interface = thresholds::getInterface(threshold.level);
+        thresholdInterfaces[static_cast<size_t>(threshold.level)] =
+            objectServer.add_interface(dbusPath, interface);
+    }
+
+    association = objectServer.add_interface(dbusPath, association::interface);
+
+    init(); // kicks off asynchronous GPU discovery over D-Bus
+}
+
+GpuTempSensor::~GpuTempSensor()
+{
+    waitTimer.cancel(); // abort any wait still pending on the timer
+    for (const auto& iface : thresholdInterfaces) // indexed by threshold level
+    {
+        objectServer.remove_interface(iface);
+    }
+    objectServer.remove_interface(sensorInterface);
+    objectServer.remove_interface(association);
+}
+
+void GpuTempSensor::checkThresholds()
+{
+    thresholds::checkThresholds(this); // defer to the shared thresholds helper
+}
+
+void GpuTempSensor::init()
+{
+    discoverGpus(); // only step so far: query the mapper for MCTP endpoints
+}
+
+void GpuTempSensor::processMctpEndpoints(const boost::system::error_code& ec,
+                                         const getSubTreeRet& ret)
+{
+    if (ec) // the GetSubTree call issued by discoverGpus() failed
+    {
+        lg2::error("GpuTempSensor::discoverGpus(): Error:{ERROR}", "ERROR",
+                   ec.message());
+        return;
+    }
+
+    if (ret.empty()) // mapper reported no matching objects
+    {
+        return;
+    }
+
+    for (const auto& [objPath, services] : ret)
+    {
+        for (const auto& [service, ifaces] : services)
+        {
+            for (const auto& iface : ifaces)
+            {
+                if (iface == "xyz.openbmc_project.MCTP.Endpoint")
+                {
+                    conn->async_method_call( // read all endpoint properties
+                        [this](const boost::system::error_code& ec,
+                               const GpuSensorConfigMap& configs) {
+                            this->processEndpointConfigs(ec, configs);
+                        },
+                        service, objPath, "org.freedesktop.DBus.Properties",
+                        "GetAll", iface);
+                }
+            }
+        }
+    }
+}
+
+void GpuTempSensor::processEndpointConfigs(const boost::system::error_code& ec,
+                                           const GpuSensorConfigMap& configs)
+{
+    if (ec) // the Properties.GetAll call on the endpoint failed
+    {
+        lg2::error("GpuTempSensor::discoverGpus(): Error:{ERROR}", "ERROR",
+                   ec.message());
+        return;
+    }
+
+    [[maybe_unused]] uint8_t eid{};   // parsed but not consumed yet
+    std::vector<uint8_t> mctpTypes{}; // filled from SupportedMessageTypes
+
+    auto hasEid = configs.find("EID"); // EID must exist and hold a uint8_t
+    if (hasEid != configs.end())
+    {
+        const auto* eidPtr = std::get_if<uint8_t>(&hasEid->second);
+        if (eidPtr != nullptr)
+        {
+            eid = *eidPtr;
+        }
+        else
+        {
+            lg2::error(
+                "GpuTempSensor::discoverGpus(): Property EID does not have valid type.");
+            return;
+        }
+    }
+    else
+    {
+        lg2::error(
+            "GpuTempSensor::discoverGpus(): Property EID not found in the configuration.");
+        return;
+    }
+
+    auto hasMctpTypes = configs.find("SupportedMessageTypes"); // byte array
+    if (hasMctpTypes != configs.end())
+    {
+        const auto* mctpTypePtr =
+            std::get_if<std::vector<uint8_t>>(&hasMctpTypes->second);
+        if (mctpTypePtr != nullptr)
+        {
+            mctpTypes = *mctpTypePtr;
+        }
+        else
+        {
+            lg2::error(
+                "GpuTempSensor::discoverGpus(): Property SupportedMessageTypes does not have valid type.");
+            return;
+        }
+    }
+    else
+    {
+        lg2::error(
+            "GpuTempSensor::discoverGpus(): Property SupportedMessageTypes not found in the configuration.");
+        return;
+    }
+
+    // TODO: if the OCP MCTP VDM Message type (0x7E) is found in mctpTypes
+    // process the endpoint further; nothing is done with the values yet.
+    (void)this; // presumably silences a static-method lint; TODO confirm
+}
+
+void GpuTempSensor::discoverGpus()
+{
+    std::string searchPath{"/au/com/codeconstruct/"}; // subtree root to search
+    std::vector<std::string> ifaceList{{"xyz.openbmc_project.MCTP.Endpoint"}};
+
+    conn->async_method_call( // NOTE(review): callback captures raw `this`
+        [this](const boost::system::error_code& ec, const getSubTreeRet& ret) {
+            processMctpEndpoints(ec, ret);
+        },
+        "xyz.openbmc_project.ObjectMapper",
+        "/xyz/openbmc_project/object_mapper",
+        "xyz.openbmc_project.ObjectMapper", "GetSubTree", searchPath, 0,
+        ifaceList);
+}
+
+void processSensorConfigs(
+    boost::asio::io_context& io, sdbusplus::asio::object_server& objectServer,
+    boost::container::flat_map<std::string, std::shared_ptr<GpuTempSensor>>&
+        sensors,
+    std::shared_ptr<sdbusplus::asio::connection>& dbusConnection,
+    const ManagedObjectType& resp)
+{
+    for (const auto& [path, interfaces] : resp) // each managed object
+    {
+        for (const auto& [intf, cfg] : interfaces)
+        {
+            if (intf != configInterfaceName(sensorType)) // not our config type
+            {
+                continue;
+            }
+
+            std::string name = loadVariant<std::string>(cfg, "Name");
+            // Assigning replaces any sensor already stored under this name.
+            sensors[name] = std::make_shared<GpuTempSensor>(
+                dbusConnection, io, name, path, objectServer,
+                std::vector<thresholds::Threshold>{});
+
+            lg2::info(
+                "Added GPU Temperature Sensor {NAME} with chassis path: {PATH}.",
+                "NAME", name, "PATH", path);
+        }
+    }
+}
+
+void createSensors(
+    boost::asio::io_context& io, sdbusplus::asio::object_server& objectServer,
+    boost::container::flat_map<std::string, std::shared_ptr<GpuTempSensor>>&
+        sensors,
+    std::shared_ptr<sdbusplus::asio::connection>& dbusConnection)
+{
+    if (!dbusConnection) // nothing can be queried without a connection
+    {
+        lg2::error("Connection not created");
+        return;
+    }
+    dbusConnection->async_method_call( // callback captures args by reference
+        [&sensors, &dbusConnection, &io,
+         &objectServer](const boost::system::error_code& ec,
+                        const ManagedObjectType& resp) {
+            if (ec) // GetManagedObjects call failed
+            {
+                lg2::error("Error contacting entity manager");
+                return;
+            }
+
+            processSensorConfigs(io, objectServer, sensors, dbusConnection,
+                                 resp);
+        },
+        entityManagerName, "/xyz/openbmc_project/inventory",
+        "org.freedesktop.DBus.ObjectManager", "GetManagedObjects");
+}
+
+void interfaceRemoved(
+    sdbusplus::message_t& message,
+    boost::container::flat_map<std::string, std::shared_ptr<GpuTempSensor>>&
+        sensors)
+{
+    if (message.is_method_error())
+    {
+        lg2::error("interfacesRemoved callback method error");
+        return;
+    }
+
+    sdbusplus::message::object_path removedPath;
+    std::vector<std::string> interfaces;
+
+    message.read(removedPath, interfaces); // InterfacesRemoved: path + names
+
+    // If the xyz.openbmc_project.Configuration.X interface was removed
+    // for one or more sensors, delete those sensor objects.
+    auto sensorIt = sensors.begin();
+    while (sensorIt != sensors.end())
+    {
+        if ((sensorIt->second->configurationPath == removedPath) &&
+            (std::find(interfaces.begin(), interfaces.end(),
+                       configInterfaceName(sensorType)) != interfaces.end()))
+        {
+            sensorIt = sensors.erase(sensorIt); // returns the next iterator
+        }
+        else
+        {
+            sensorIt++;
+        }
+    }
+}
diff --git a/src/gpu/GpuSensor.hpp b/src/gpu/GpuSensor.hpp
new file mode 100644
index 0000000..01eb23f
--- /dev/null
+++ b/src/gpu/GpuSensor.hpp
@@ -0,0 +1,154 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION &
+ * AFFILIATES. All rights reserved. SPDX-License-Identifier: Apache-2.0
+ */
+
+#pragma once
+
+#include "Thresholds.hpp"
+#include "sensor.hpp"
+
+#include <boost/asio/io_context.hpp>
+#include <boost/asio/steady_timer.hpp>
+#include <boost/container/flat_map.hpp>
+#include <sdbusplus/asio/connection.hpp>
+#include <sdbusplus/asio/object_server.hpp>
+#include <sdbusplus/message.hpp>
+
+#include <cstdint>
+#include <map>
+#include <memory>
+#include <string>
+#include <utility>
+#include <variant>
+#include <vector>
+
+constexpr const char* sensorPathPrefix = "/xyz/openbmc_project/sensors/";
+constexpr const char* sensorType = "NvidiaMctpVdm"; // config type name
+// Reply of ObjectMapper GetSubTree: [(object path, [(service, [interface])])]
+using getSubTreeRet = std::vector<
+    std::pair<std::string,
+              std::vector<std::pair<std::string, std::vector<std::string>>>>>;
+using GpuSensorConfigMap = // property name -> value, as returned by GetAll
+    std::map<std::string, std::variant<std::string, bool, uint32_t, uint8_t,
+                                       int64_t, std::vector<uint8_t>>>;
+
+/**
+ * @struct DeviceInfo
+ * @brief Device type and instance id pair describing a device
+ */
+struct DeviceInfo
+{
+    uint8_t deviceType; // NOTE(review): no default member initializer
+    uint8_t instanceId; // NOTE(review): no default member initializer
+};
+
+/**
+ * @struct GpuTempSensor
+ * @brief Implements a GPU temperature sensor that monitors temperature values
+ * @details Inherits from Sensor base class and enables shared pointer
+ * management via std::enable_shared_from_this
+ */
+struct GpuTempSensor :
+    public Sensor,
+    public std::enable_shared_from_this<GpuTempSensor>
+{
+  public:
+    /**
+     * @brief Constructor for GpuTempSensor
+     * @details Adds the Sensor.Value, threshold and association interfaces
+     * for this sensor to objectServer, then calls init()
+     * @param conn D-Bus connection
+     * @param io Boost ASIO I/O context for asynchronous operations
+     * @param name Name of the sensor
+     * @param sensorConfiguration D-Bus path of the sensor's configuration
+     * @param objectServer D-Bus object server
+     * @param thresholdData Vector of threshold configurations
+     * @note NOTE(review): init() starts async D-Bus calls whose callbacks
+     * capture `this`; confirm the object outlives them
+     */
+    GpuTempSensor(std::shared_ptr<sdbusplus::asio::connection>& conn,
+                  boost::asio::io_context& io, const std::string& name,
+                  const std::string& sensorConfiguration,
+                  sdbusplus::asio::object_server& objectServer,
+                  std::vector<thresholds::Threshold>&& thresholdData);
+
+    /**
+     * @brief Destructor; cancels the timer and removes the D-Bus interfaces
+     */
+    ~GpuTempSensor() override;
+
+    /**
+     * @brief Check if any thresholds have been crossed
+     * @details Overrides the base class method and forwards to
+     * thresholds::checkThresholds()
+     */
+    void checkThresholds() override;
+
+  private:
+    /**
+     * @brief Initialize the sensor (currently only calls discoverGpus())
+     */
+    void init();
+
+    /**
+     * @brief Query the ObjectMapper for MCTP endpoints to find GPUs
+     */
+    void discoverGpus();
+
+    /**
+     * @brief Process MCTP endpoints discovered on the system
+     *
+     * @param[in] ec Error code from the D-Bus method call
+     * @param[in] ret Object tree results containing MCTP endpoint information
+     */
+    void processMctpEndpoints(const boost::system::error_code& ec,
+                              const getSubTreeRet& ret);
+
+    /**
+     * @brief Process configuration properties for MCTP endpoints
+     *
+     * @param[in] ec Error code from the D-Bus properties method call
+     * @param[in] configs Map of configuration properties for the endpoint
+     */
+    void processEndpointConfigs(const boost::system::error_code& ec,
+                                const GpuSensorConfigMap& configs);
+
+    /**
+     * @brief Timer for scheduling sensor reads
+     */
+    boost::asio::steady_timer waitTimer;
+
+    /**
+     * @brief D-Bus connection
+     */
+    std::shared_ptr<sdbusplus::asio::connection> conn;
+
+    /**
+     * @brief D-Bus object server
+     */
+    sdbusplus::asio::object_server& objectServer;
+};
+
+/**
+ * @brief Create GPU temperature sensors from entity-manager configuration
+ * @param io Boost ASIO I/O context
+ * @param objectServer D-Bus object server
+ * @param sensors Map to store created sensors, keyed by sensor name
+ * @param dbusConnection D-Bus connection; nothing is done when it is null
+ */
+void createSensors(
+    boost::asio::io_context& io, sdbusplus::asio::object_server& objectServer,
+    boost::container::flat_map<std::string, std::shared_ptr<GpuTempSensor>>&
+        sensors,
+    std::shared_ptr<sdbusplus::asio::connection>& dbusConnection);
+
+/**
+ * @brief Handle D-Bus InterfacesRemoved signals for configuration objects
+ * @param message D-Bus message containing interface removal information
+ * @param sensors Map of GPU temperature sensors; matching entries are erased
+ */
+void interfaceRemoved(
+    sdbusplus::message_t& message,
+    boost::container::flat_map<std::string, std::shared_ptr<GpuTempSensor>>&
+        sensors);
diff --git a/src/gpu/GpuSensorMain.cpp b/src/gpu/GpuSensorMain.cpp
index 85fec45..a7b1d7f 100644
--- a/src/gpu/GpuSensorMain.cpp
+++ b/src/gpu/GpuSensorMain.cpp
@@ -3,12 +3,52 @@
  * AFFILIATES. All rights reserved. SPDX-License-Identifier: Apache-2.0
  */
 
+#include "GpuSensor.hpp"
+#include "Utils.hpp"
+
+#include <boost/asio/error.hpp>
 #include <boost/asio/io_context.hpp>
+#include <boost/asio/post.hpp>
+#include <boost/asio/steady_timer.hpp>
+#include <boost/container/flat_map.hpp>
+#include <phosphor-logging/lg2.hpp>
 #include <sdbusplus/asio/connection.hpp>
 #include <sdbusplus/asio/object_server.hpp>
+#include <sdbusplus/bus.hpp>
+#include <sdbusplus/bus/match.hpp>
+#include <sdbusplus/message.hpp>
 
+#include <array>
+#include <chrono>
+#include <functional>
 #include <memory>
 #include <string>
+#include <vector>
+
+boost::container::flat_map<std::string, std::shared_ptr<GpuTempSensor>> sensors;
+
+/**
+ * @brief Config timer expiry callback: re-runs createSensors() on `sensors`
+ * @param io Boost ASIO I/O context
+ * @param objectServer D-Bus object server
+ * @param dbusConnection D-Bus connection
+ * @param ec Boost ASIO error code; operation_aborted is silently ignored
+ */
+void configTimerExpiryCallback(
+    boost::asio::io_context& io, sdbusplus::asio::object_server& objectServer,
+    std::shared_ptr<sdbusplus::asio::connection>& dbusConnection,
+    const boost::system::error_code& ec)
+{
+    if (ec == boost::asio::error::operation_aborted)
+    {
+        return; // we're being canceled
+    }
+    createSensors(io, objectServer, sensors, dbusConnection);
+    if (sensors.empty()) // NOTE(review): runs before the async reply arrives
+    {
+        lg2::info("Configuration not detected");
+    }
+}
 
 int main()
 {
@@ -18,6 +58,33 @@
     objectServer.add_manager("/xyz/openbmc_project/sensors");
     systemBus->request_name("xyz.openbmc_project.GpuSensor");
 
+    boost::asio::post(io, [&]() {
+        createSensors(io, objectServer, sensors, systemBus);
+    });
+
+    boost::asio::steady_timer configTimer(io);
+
+    std::function<void(sdbusplus::message_t&)> eventHandler =
+        [&configTimer, &io, &objectServer, &systemBus](sdbusplus::message_t&) {
+            configTimer.expires_after(std::chrono::seconds(1));
+            // create a timer because normally multiple properties change
+            configTimer.async_wait(
+                std::bind_front(configTimerExpiryCallback, std::ref(io),
+                                std::ref(objectServer), std::ref(systemBus)));
+        };
+
+    std::vector<std::unique_ptr<sdbusplus::bus::match_t>> matches =
+        setupPropertiesChangedMatches(
+            *systemBus, std::to_array<const char*>({sensorType}), eventHandler);
+
+    // Watch for entity-manager to remove configuration interfaces
+    // so the corresponding sensors can be removed.
+    auto ifaceRemovedMatch = std::make_shared<sdbusplus::bus::match_t>(
+        static_cast<sdbusplus::bus_t&>(*systemBus),
+        "type='signal',member='InterfacesRemoved',arg0path='" +
+            std::string(inventoryPath) + "/'",
+        [](sdbusplus::message_t& msg) { interfaceRemoved(msg, sensors); });
+
     io.run();
     return 0;
 }
diff --git a/src/gpu/meson.build b/src/gpu/meson.build
index ec3a29b..df682e5 100644
--- a/src/gpu/meson.build
+++ b/src/gpu/meson.build
@@ -1,4 +1,4 @@
-gpusensor_sources = files('GpuSensorMain.cpp')
+gpusensor_sources = files('GpuSensor.cpp', 'GpuSensorMain.cpp')
 
 gpusensor_include_dir = include_directories('.', is_system: true)
 sensor_include_dir = include_directories('../..')