nvidia-gpu: add SMA Temperature Sensor

Add support for device type SMA (System Management Agent) and its
temperature sensor. It is typically an MCU device.

Tested: Build an image for gb200nvl-obmc machine with the following
patches cherry-picked. These are needed to enable the MCTP stack.

https://gerrit.openbmc.org/c/openbmc/openbmc/+/79422

```
$ curl -s -k -u 'root:0penBmc' https://10.137.203.193/redfish/v1/Chassis/NVIDIA_GB200_1/Sensors/temperature_NVIDIA_GB200_GPU_SMA_255_TEMP_0
{
  "@odata.id": "/redfish/v1/Chassis/NVIDIA_GB200_1/Sensors/temperature_NVIDIA_GB200_GPU_SMA_255_TEMP_0",
  "@odata.type": "#Sensor.v1_2_0.Sensor",
  "Id": "temperature_NVIDIA_GB200_GPU_SMA_255_TEMP_0",
  "Name": "NVIDIA GB200 GPU SMA 255 TEMP 0",
  "Reading": 34.0,
  "ReadingRangeMax": 127.0,
  "ReadingRangeMin": -128.0,
  "ReadingType": "Temperature",
  "ReadingUnits": "Cel",
  "Status": {
    "Health": "OK",
    "State": "Enabled"
  }
}%
```

Change-Id: I560864758036a5b6ea6c1745145736c7bfa0a1c5
Signed-off-by: Harshit Aghera <haghera@nvidia.com>
diff --git a/src/nvidia-gpu/NvidiaDeviceDiscovery.cpp b/src/nvidia-gpu/NvidiaDeviceDiscovery.cpp
index adb21ea..a9ca525 100644
--- a/src/nvidia-gpu/NvidiaDeviceDiscovery.cpp
+++ b/src/nvidia-gpu/NvidiaDeviceDiscovery.cpp
@@ -7,6 +7,7 @@
 #include "NvidiaDeviceDiscovery.hpp"
 
 #include "NvidiaGpuDevice.hpp"
+#include "NvidiaSmaDevice.hpp"
 #include "Utils.hpp"
 
 #include <bits/basic_string.h>
@@ -36,6 +37,8 @@
     boost::asio::io_context& io, sdbusplus::asio::object_server& objectServer,
     boost::container::flat_map<std::string, std::shared_ptr<GpuDevice>>&
         gpuDevices,
+    boost::container::flat_map<std::string, std::shared_ptr<SmaDevice>>&
+        smaDevices,
     const std::shared_ptr<sdbusplus::asio::connection>& conn,
     mctp::MctpRequester& mctpRequester, const SensorConfigs& configs,
     const std::string& path, uint8_t eid, int sendRecvMsgResult,
@@ -66,18 +69,39 @@
         return;
     }
 
-    if (responseDeviceType ==
-        static_cast<uint8_t>(gpu::DeviceIdentification::DEVICE_GPU))
+    switch (static_cast<gpu::DeviceIdentification>(responseDeviceType))
     {
-        lg2::info(
-            "Found the GPU with EID {EID}, DeviceType {DEVTYPE}, InstanceId {IID}.",
-            "EID", eid, "DEVTYPE", responseDeviceType, "IID",
-            responseInstanceId);
+        case gpu::DeviceIdentification::DEVICE_GPU:
+        {
+            lg2::info(
+                "Found the GPU with EID {EID}, DeviceType {DEVTYPE}, InstanceId {IID}.",
+                "EID", eid, "DEVTYPE", responseDeviceType, "IID",
+                responseInstanceId);
 
-        auto gpuName = configs.name + '_' + std::to_string(responseInstanceId);
+            auto gpuName = configs.name + '_' +
+                           std::to_string(responseInstanceId);
 
-        gpuDevices[gpuName] = std::make_shared<GpuDevice>(
-            configs, gpuName, path, conn, eid, io, mctpRequester, objectServer);
+            gpuDevices[gpuName] =
+                std::make_shared<GpuDevice>(configs, gpuName, path, conn, eid,
+                                            io, mctpRequester, objectServer);
+            break;
+        }
+
+        case gpu::DeviceIdentification::DEVICE_SMA:
+        {
+            lg2::info(
+                "Found the SMA Device with EID {EID}, DeviceType {DEVTYPE}, InstanceId {IID}.",
+                "EID", eid, "DEVTYPE", responseDeviceType, "IID",
+                responseInstanceId);
+
+            auto smaName = configs.name + "_SMA_" +
+                           std::to_string(responseInstanceId);
+
+            smaDevices[smaName] =
+                std::make_shared<SmaDevice>(configs, smaName, path, conn, eid,
+                                            io, mctpRequester, objectServer);
+            break;
+        }
     }
 }
 
@@ -85,6 +109,8 @@
     boost::asio::io_context& io, sdbusplus::asio::object_server& objectServer,
     boost::container::flat_map<std::string, std::shared_ptr<GpuDevice>>&
         gpuDevices,
+    boost::container::flat_map<std::string, std::shared_ptr<SmaDevice>>&
+        smaDevices,
     const std::shared_ptr<sdbusplus::asio::connection>& conn,
     mctp::MctpRequester& mctpRequester, const SensorConfigs& configs,
     const std::string& path, uint8_t eid)
@@ -108,12 +134,12 @@
     mctpRequester.sendRecvMsg(
         eid, *queryDeviceIdentificationRequest,
         *queryDeviceIdentificationResponse,
-        [&io, &objectServer, &gpuDevices, conn, &mctpRequester, configs, path,
-         eid, queryDeviceIdentificationRequest,
+        [&io, &objectServer, &gpuDevices, &smaDevices, conn, &mctpRequester,
+         configs, path, eid, queryDeviceIdentificationRequest,
          queryDeviceIdentificationResponse](int sendRecvMsgResult) {
             processQueryDeviceIdResponse(
-                io, objectServer, gpuDevices, conn, mctpRequester, configs,
-                path, eid, sendRecvMsgResult,
+                io, objectServer, gpuDevices, smaDevices, conn, mctpRequester,
+                configs, path, eid, sendRecvMsgResult,
                 *queryDeviceIdentificationResponse);
         });
 }
@@ -122,6 +148,8 @@
     boost::asio::io_context& io, sdbusplus::asio::object_server& objectServer,
     boost::container::flat_map<std::string, std::shared_ptr<GpuDevice>>&
         gpuDevices,
+    boost::container::flat_map<std::string, std::shared_ptr<SmaDevice>>&
+        smaDevices,
     const std::shared_ptr<sdbusplus::asio::connection>& conn,
     mctp::MctpRequester& mctpRequester, const SensorConfigs& configs,
     const std::string& path, const boost::system::error_code& ec,
@@ -189,8 +217,8 @@
                   ocp::accelerator_management::messageType) != mctpTypes.end())
     {
         lg2::info("Found OCP MCTP VDM Endpoint with ID {EID}", "EID", eid);
-        queryDeviceIdentification(io, objectServer, gpuDevices, conn,
-                                  mctpRequester, configs, path, eid);
+        queryDeviceIdentification(io, objectServer, gpuDevices, smaDevices,
+                                  conn, mctpRequester, configs, path, eid);
     }
 }
 
@@ -198,6 +226,8 @@
     boost::asio::io_context& io, sdbusplus::asio::object_server& objectServer,
     boost::container::flat_map<std::string, std::shared_ptr<GpuDevice>>&
         gpuDevices,
+    boost::container::flat_map<std::string, std::shared_ptr<SmaDevice>>&
+        smaDevices,
     const std::shared_ptr<sdbusplus::asio::connection>& conn,
     mctp::MctpRequester& mctpRequester, const SensorConfigs& configs,
     const std::string& path, const boost::system::error_code& ec,
@@ -224,12 +254,13 @@
                 if (iface == "xyz.openbmc_project.MCTP.Endpoint")
                 {
                     conn->async_method_call(
-                        [&io, &objectServer, &gpuDevices, conn, &mctpRequester,
-                         configs, path](const boost::system::error_code& ec,
-                                        const SensorBaseConfigMap& endpoint) {
-                            processEndpoint(io, objectServer, gpuDevices, conn,
-                                            mctpRequester, configs, path, ec,
-                                            endpoint);
+                        [&io, &objectServer, &gpuDevices, &smaDevices, conn,
+                         &mctpRequester, configs,
+                         path](const boost::system::error_code& ec,
+                               const SensorBaseConfigMap& endpoint) {
+                            processEndpoint(io, objectServer, gpuDevices,
+                                            smaDevices, conn, mctpRequester,
+                                            configs, path, ec, endpoint);
                         },
                         service, objPath, "org.freedesktop.DBus.Properties",
                         "GetAll", iface);
@@ -243,6 +274,8 @@
     boost::asio::io_context& io, sdbusplus::asio::object_server& objectServer,
     boost::container::flat_map<std::string, std::shared_ptr<GpuDevice>>&
         gpuDevices,
+    boost::container::flat_map<std::string, std::shared_ptr<SmaDevice>>&
+        smaDevices,
     const std::shared_ptr<sdbusplus::asio::connection>& conn,
     mctp::MctpRequester& mctpRequester, const SensorConfigs& configs,
     const std::string& path)
@@ -251,10 +284,11 @@
     std::vector<std::string> ifaceList{{"xyz.openbmc_project.MCTP.Endpoint"}};
 
     conn->async_method_call(
-        [&io, &objectServer, &gpuDevices, conn, &mctpRequester, configs,
+        [&io, &objectServer, &gpuDevices, &smaDevices, conn, &mctpRequester,
+         configs,
          path](const boost::system::error_code& ec, const GetSubTreeType& ret) {
-            queryEndpoints(io, objectServer, gpuDevices, conn, mctpRequester,
-                           configs, path, ec, ret);
+            queryEndpoints(io, objectServer, gpuDevices, smaDevices, conn,
+                           mctpRequester, configs, path, ec, ret);
         },
         "xyz.openbmc_project.ObjectMapper",
         "/xyz/openbmc_project/object_mapper",
@@ -266,6 +300,8 @@
     boost::asio::io_context& io, sdbusplus::asio::object_server& objectServer,
     boost::container::flat_map<std::string, std::shared_ptr<GpuDevice>>&
         gpuDevices,
+    boost::container::flat_map<std::string, std::shared_ptr<SmaDevice>>&
+        smaDevices,
     const std::shared_ptr<sdbusplus::asio::connection>& dbusConnection,
     mctp::MctpRequester& mctpRequester, const ManagedObjectType& resp)
 {
@@ -284,8 +320,8 @@
 
             configs.pollRate = loadVariant<uint64_t>(cfg, "PollRate");
 
-            discoverDevices(io, objectServer, gpuDevices, dbusConnection,
-                            mctpRequester, configs, path);
+            discoverDevices(io, objectServer, gpuDevices, smaDevices,
+                            dbusConnection, mctpRequester, configs, path);
 
             lg2::info(
                 "Detected configuration {NAME} of type {TYPE} at path: {PATH}.",
@@ -298,6 +334,8 @@
     boost::asio::io_context& io, sdbusplus::asio::object_server& objectServer,
     boost::container::flat_map<std::string, std::shared_ptr<GpuDevice>>&
         gpuDevices,
+    boost::container::flat_map<std::string, std::shared_ptr<SmaDevice>>&
+        smaDevices,
     const std::shared_ptr<sdbusplus::asio::connection>& dbusConnection,
     mctp::MctpRequester& mctpRequester)
 {
@@ -307,16 +345,17 @@
         return;
     }
     dbusConnection->async_method_call(
-        [&gpuDevices, &mctpRequester, dbusConnection, &io, &objectServer](
-            boost::system::error_code ec, const ManagedObjectType& resp) {
+        [&gpuDevices, &smaDevices, &mctpRequester, dbusConnection, &io,
+         &objectServer](boost::system::error_code ec,
+                        const ManagedObjectType& resp) {
             if (ec)
             {
                 lg2::error("Error contacting entity manager");
                 return;
             }
 
-            processSensorConfigs(io, objectServer, gpuDevices, dbusConnection,
-                                 mctpRequester, resp);
+            processSensorConfigs(io, objectServer, gpuDevices, smaDevices,
+                                 dbusConnection, mctpRequester, resp);
         },
         entityManagerName, "/xyz/openbmc_project/inventory",
         "org.freedesktop.DBus.ObjectManager", "GetManagedObjects");
@@ -325,7 +364,9 @@
 void interfaceRemoved(
     sdbusplus::message_t& message,
     boost::container::flat_map<std::string, std::shared_ptr<GpuDevice>>&
-        gpuDevices)
+        gpuDevices,
+    boost::container::flat_map<std::string, std::shared_ptr<SmaDevice>>&
+        smaDevices)
 {
     if (message.is_method_error())
     {
@@ -354,4 +395,19 @@
             sensorIt++;
         }
     }
+
+    auto smaSensorIt = smaDevices.begin();
+    while (smaSensorIt != smaDevices.end())
+    {
+        if ((smaSensorIt->second->getPath() == removedPath) &&
+            (std::find(interfaces.begin(), interfaces.end(),
+                       configInterfaceName(deviceType)) != interfaces.end()))
+        {
+            smaSensorIt = smaDevices.erase(smaSensorIt);
+        }
+        else
+        {
+            smaSensorIt++;
+        }
+    }
 }
diff --git a/src/nvidia-gpu/NvidiaDeviceDiscovery.hpp b/src/nvidia-gpu/NvidiaDeviceDiscovery.hpp
index 86211a9..8b73453 100644
--- a/src/nvidia-gpu/NvidiaDeviceDiscovery.hpp
+++ b/src/nvidia-gpu/NvidiaDeviceDiscovery.hpp
@@ -28,15 +28,20 @@
 };
 
 class GpuDevice;
+class SmaDevice;
 
 void createSensors(
     boost::asio::io_context& io, sdbusplus::asio::object_server& objectServer,
     boost::container::flat_map<std::string, std::shared_ptr<GpuDevice>>&
         gpuDevices,
+    boost::container::flat_map<std::string, std::shared_ptr<SmaDevice>>&
+        smaDevices,
     const std::shared_ptr<sdbusplus::asio::connection>& dbusConnection,
     mctp::MctpRequester& mctpRequester);
 
 void interfaceRemoved(
     sdbusplus::message_t& message,
     boost::container::flat_map<std::string, std::shared_ptr<GpuDevice>>&
-        gpuDevices);
+        gpuDevices,
+    boost::container::flat_map<std::string, std::shared_ptr<SmaDevice>>&
+        smaDevices);
diff --git a/src/nvidia-gpu/NvidiaGpuMctpVdm.hpp b/src/nvidia-gpu/NvidiaGpuMctpVdm.hpp
index e378cfc..09df4e6 100644
--- a/src/nvidia-gpu/NvidiaGpuMctpVdm.hpp
+++ b/src/nvidia-gpu/NvidiaGpuMctpVdm.hpp
@@ -38,7 +38,8 @@
 
 enum class DeviceIdentification : uint8_t
 {
-    DEVICE_GPU = 0
+    DEVICE_GPU = 0,
+    DEVICE_SMA = 5
 };
 
 struct QueryDeviceIdentificationRequest
diff --git a/src/nvidia-gpu/NvidiaGpuSensor.hpp b/src/nvidia-gpu/NvidiaGpuSensor.hpp
index 1e21f4d..e2ae89a 100644
--- a/src/nvidia-gpu/NvidiaGpuSensor.hpp
+++ b/src/nvidia-gpu/NvidiaGpuSensor.hpp
@@ -22,6 +22,7 @@
 
 constexpr uint8_t gpuTempSensorId{0};
 constexpr uint8_t gpuTLimitSensorId{2};
+constexpr uint8_t smaTempSensorId{5};
 
 struct NvidiaGpuTempSensor :
     public Sensor,
diff --git a/src/nvidia-gpu/NvidiaGpuSensorMain.cpp b/src/nvidia-gpu/NvidiaGpuSensorMain.cpp
index 6ccbb05..7e404f5 100644
--- a/src/nvidia-gpu/NvidiaGpuSensorMain.cpp
+++ b/src/nvidia-gpu/NvidiaGpuSensorMain.cpp
@@ -8,6 +8,7 @@
 #include "Utils.hpp"
 
 #include <NvidiaDeviceDiscovery.hpp>
+#include <NvidiaSmaDevice.hpp>
 #include <boost/asio/error.hpp>
 #include <boost/asio/io_context.hpp>
 #include <boost/asio/post.hpp>
@@ -27,6 +28,7 @@
 #include <vector>
 
 boost::container::flat_map<std::string, std::shared_ptr<GpuDevice>> gpuDevices;
+boost::container::flat_map<std::string, std::shared_ptr<SmaDevice>> smaDevices;
 
 void configTimerExpiryCallback(
     boost::asio::io_context& io, sdbusplus::asio::object_server& objectServer,
@@ -37,7 +39,8 @@
     {
         return; // we're being canceled
     }
-    createSensors(io, objectServer, gpuDevices, dbusConnection, mctpRequester);
+    createSensors(io, objectServer, gpuDevices, smaDevices, dbusConnection,
+                  mctpRequester);
 }
 
 int main()
@@ -51,7 +54,8 @@
     mctp::MctpRequester mctpRequester(io);
 
     boost::asio::post(io, [&]() {
-        createSensors(io, objectServer, gpuDevices, systemBus, mctpRequester);
+        createSensors(io, objectServer, gpuDevices, smaDevices, systemBus,
+                      mctpRequester);
     });
 
     boost::asio::steady_timer configTimer(io);
@@ -76,7 +80,9 @@
         static_cast<sdbusplus::bus_t&>(*systemBus),
         sdbusplus::bus::match::rules::interfacesRemovedAtPath(
             std::string(inventoryPath)),
-        [](sdbusplus::message_t& msg) { interfaceRemoved(msg, gpuDevices); });
+        [](sdbusplus::message_t& msg) {
+            interfaceRemoved(msg, gpuDevices, smaDevices);
+        });
 
     io.run();
     return 0;
diff --git a/src/nvidia-gpu/NvidiaSmaDevice.cpp b/src/nvidia-gpu/NvidiaSmaDevice.cpp
new file mode 100644
index 0000000..755e1f8
--- /dev/null
+++ b/src/nvidia-gpu/NvidiaSmaDevice.cpp
@@ -0,0 +1,66 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION &
+ * AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include "NvidiaSmaDevice.hpp"
+
+#include "NvidiaDeviceDiscovery.hpp"
+#include "NvidiaGpuSensor.hpp"
+#include "Thresholds.hpp"
+#include "Utils.hpp"
+
+#include <bits/basic_string.h>
+
+#include <MctpRequester.hpp>
+#include <boost/asio/io_context.hpp>
+#include <phosphor-logging/lg2.hpp>
+#include <sdbusplus/asio/connection.hpp>
+#include <sdbusplus/asio/object_server.hpp>
+
+#include <chrono>
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <vector>
+
+SmaDevice::SmaDevice(const SensorConfigs& configs, const std::string& name,
+                     const std::string& path,
+                     const std::shared_ptr<sdbusplus::asio::connection>& conn,
+                     uint8_t eid, boost::asio::io_context& io,
+                     mctp::MctpRequester& mctpRequester,
+                     sdbusplus::asio::object_server& objectServer) :
+    eid(eid), sensorPollMs(std::chrono::milliseconds{configs.pollRate}),
+    waitTimer(io, std::chrono::steady_clock::duration(0)),
+    mctpRequester(mctpRequester), conn(conn), objectServer(objectServer),
+    configs(configs), name(escapeName(name)), path(path)
+{
+    makeSensors(); // creates the sensor and starts polling via read()
+}
+
+void SmaDevice::makeSensors()
+{
+    tempSensor = std::make_shared<NvidiaGpuTempSensor>(
+        conn, mctpRequester, name + "_TEMP_0", path, eid, smaTempSensorId,
+        objectServer, std::vector<thresholds::Threshold>{});
+
+    lg2::info("Added SMA {NAME} Sensors with chassis path: {PATH}.", "NAME",
+              name, "PATH", path);
+    // Take the first reading now; read() re-arms waitTimer for later polls.
+    read();
+}
+
+void SmaDevice::read()
+{
+    tempSensor->update();
+    // Re-arm the timer; an error in the wait callback ends the polling loop.
+    waitTimer.expires_after(std::chrono::milliseconds(sensorPollMs));
+    waitTimer.async_wait([this](const boost::system::error_code& ec) {
+        if (ec)
+        {
+            return;
+        }
+        read(); // NOTE(review): assumes this object is still alive
+    });
+}
diff --git a/src/nvidia-gpu/NvidiaSmaDevice.hpp b/src/nvidia-gpu/NvidiaSmaDevice.hpp
new file mode 100644
index 0000000..fc78de2
--- /dev/null
+++ b/src/nvidia-gpu/NvidiaSmaDevice.hpp
@@ -0,0 +1,62 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION &
+ * AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#pragma once
+
+#include "MctpRequester.hpp"
+#include "NvidiaDeviceDiscovery.hpp"
+#include "NvidiaGpuSensor.hpp"
+
+#include <boost/asio/io_context.hpp>
+#include <boost/asio/steady_timer.hpp>
+#include <sdbusplus/asio/connection.hpp>
+#include <sdbusplus/asio/object_server.hpp>
+
+#include <chrono>
+#include <cstdint>
+#include <memory>
+#include <string>
+
+class SmaDevice
+{
+  public:
+    SmaDevice(const SensorConfigs& configs, const std::string& name,
+              const std::string& path,
+              const std::shared_ptr<sdbusplus::asio::connection>& conn,
+              uint8_t eid, boost::asio::io_context& io,
+              mctp::MctpRequester& mctpRequester,
+              sdbusplus::asio::object_server& objectServer);
+    // Returns the path string that was passed to the constructor.
+    const std::string& getPath() const
+    {
+        return path;
+    }
+
+  private:
+    void makeSensors();
+    // Updates tempSensor and re-arms waitTimer for the next poll.
+    void read();
+    // Endpoint ID handed to the temperature sensor.
+    uint8_t eid{};
+    // Delay between successive read() calls, from configs.pollRate.
+    std::chrono::milliseconds sensorPollMs;
+    // Timer that schedules the next read().
+    boost::asio::steady_timer waitTimer;
+
+    mctp::MctpRequester& mctpRequester;
+
+    std::shared_ptr<sdbusplus::asio::connection> conn;
+
+    sdbusplus::asio::object_server& objectServer;
+    // Created by makeSensors(); updated from read().
+    std::shared_ptr<NvidiaGpuTempSensor> tempSensor;
+
+    SensorConfigs configs;
+    // Device name after escapeName(); used as the sensor name prefix.
+    std::string name;
+
+    std::string path;
+};
diff --git a/src/nvidia-gpu/meson.build b/src/nvidia-gpu/meson.build
index 39ce871..fcf15fa 100644
--- a/src/nvidia-gpu/meson.build
+++ b/src/nvidia-gpu/meson.build
@@ -9,6 +9,7 @@
     'NvidiaGpuSensorMain.cpp',
     'NvidiaGpuThresholds.cpp',
     'NvidiaGpuVoltageSensor.cpp',
+    'NvidiaSmaDevice.cpp',
     'OcpMctpVdm.cpp',
 )