nvidia-gpu: add SMA Temperature Sensor
Add support for device type SMA (System Management Agent) and its
temperature sensor. It is typically an MCU device.
Tested: Built an image for the gb200nvl-obmc machine with the following
patches cherry-picked. These patches are needed to enable the MCTP stack.
https://gerrit.openbmc.org/c/openbmc/openbmc/+/79422
```
$ curl -s -k -u 'root:0penBmc' https://10.137.203.193/redfish/v1/Chassis/NVIDIA_GB200_1/Sensors/temperature_NVIDIA_GB200_GPU_SMA_255_TEMP_0
{
"@odata.id": "/redfish/v1/Chassis/NVIDIA_GB200_1/Sensors/temperature_NVIDIA_GB200_GPU_SMA_255_TEMP_0",
"@odata.type": "#Sensor.v1_2_0.Sensor",
"Id": "temperature_NVIDIA_GB200_GPU_SMA_255_TEMP_0",
"Name": "NVIDIA GB200 GPU SMA 255 TEMP 0",
"Reading": 34.0,
"ReadingRangeMax": 127.0,
"ReadingRangeMin": -128.0,
"ReadingType": "Temperature",
"ReadingUnits": "Cel",
"Status": {
"Health": "OK",
"State": "Enabled"
}
}%
```
Change-Id: I560864758036a5b6ea6c1745145736c7bfa0a1c5
Signed-off-by: Harshit Aghera <haghera@nvidia.com>
diff --git a/src/nvidia-gpu/NvidiaDeviceDiscovery.cpp b/src/nvidia-gpu/NvidiaDeviceDiscovery.cpp
index adb21ea..a9ca525 100644
--- a/src/nvidia-gpu/NvidiaDeviceDiscovery.cpp
+++ b/src/nvidia-gpu/NvidiaDeviceDiscovery.cpp
@@ -7,6 +7,7 @@
#include "NvidiaDeviceDiscovery.hpp"
#include "NvidiaGpuDevice.hpp"
+#include "NvidiaSmaDevice.hpp"
#include "Utils.hpp"
#include <bits/basic_string.h>
@@ -36,6 +37,8 @@
boost::asio::io_context& io, sdbusplus::asio::object_server& objectServer,
boost::container::flat_map<std::string, std::shared_ptr<GpuDevice>>&
gpuDevices,
+ boost::container::flat_map<std::string, std::shared_ptr<SmaDevice>>&
+ smaDevices,
const std::shared_ptr<sdbusplus::asio::connection>& conn,
mctp::MctpRequester& mctpRequester, const SensorConfigs& configs,
const std::string& path, uint8_t eid, int sendRecvMsgResult,
@@ -66,18 +69,39 @@
return;
}
- if (responseDeviceType ==
- static_cast<uint8_t>(gpu::DeviceIdentification::DEVICE_GPU))
+ switch (static_cast<gpu::DeviceIdentification>(responseDeviceType))
{
- lg2::info(
- "Found the GPU with EID {EID}, DeviceType {DEVTYPE}, InstanceId {IID}.",
- "EID", eid, "DEVTYPE", responseDeviceType, "IID",
- responseInstanceId);
+ case gpu::DeviceIdentification::DEVICE_GPU:
+ {
+ lg2::info(
+ "Found the GPU with EID {EID}, DeviceType {DEVTYPE}, InstanceId {IID}.",
+ "EID", eid, "DEVTYPE", responseDeviceType, "IID",
+ responseInstanceId);
- auto gpuName = configs.name + '_' + std::to_string(responseInstanceId);
+ auto gpuName = configs.name + '_' +
+ std::to_string(responseInstanceId);
- gpuDevices[gpuName] = std::make_shared<GpuDevice>(
- configs, gpuName, path, conn, eid, io, mctpRequester, objectServer);
+ gpuDevices[gpuName] =
+ std::make_shared<GpuDevice>(configs, gpuName, path, conn, eid,
+ io, mctpRequester, objectServer);
+ break;
+ }
+
+ case gpu::DeviceIdentification::DEVICE_SMA:
+ {
+ lg2::info(
+ "Found the SMA Device with EID {EID}, DeviceType {DEVTYPE}, InstanceId {IID}.",
+ "EID", eid, "DEVTYPE", responseDeviceType, "IID",
+ responseInstanceId);
+
+ auto smaName = configs.name + "_SMA_" +
+ std::to_string(responseInstanceId);
+
+ smaDevices[smaName] =
+ std::make_shared<SmaDevice>(configs, smaName, path, conn, eid,
+ io, mctpRequester, objectServer);
+ break;
+ }
}
}
@@ -85,6 +109,8 @@
boost::asio::io_context& io, sdbusplus::asio::object_server& objectServer,
boost::container::flat_map<std::string, std::shared_ptr<GpuDevice>>&
gpuDevices,
+ boost::container::flat_map<std::string, std::shared_ptr<SmaDevice>>&
+ smaDevices,
const std::shared_ptr<sdbusplus::asio::connection>& conn,
mctp::MctpRequester& mctpRequester, const SensorConfigs& configs,
const std::string& path, uint8_t eid)
@@ -108,12 +134,12 @@
mctpRequester.sendRecvMsg(
eid, *queryDeviceIdentificationRequest,
*queryDeviceIdentificationResponse,
- [&io, &objectServer, &gpuDevices, conn, &mctpRequester, configs, path,
- eid, queryDeviceIdentificationRequest,
+ [&io, &objectServer, &gpuDevices, &smaDevices, conn, &mctpRequester,
+ configs, path, eid, queryDeviceIdentificationRequest,
queryDeviceIdentificationResponse](int sendRecvMsgResult) {
processQueryDeviceIdResponse(
- io, objectServer, gpuDevices, conn, mctpRequester, configs,
- path, eid, sendRecvMsgResult,
+ io, objectServer, gpuDevices, smaDevices, conn, mctpRequester,
+ configs, path, eid, sendRecvMsgResult,
*queryDeviceIdentificationResponse);
});
}
@@ -122,6 +148,8 @@
boost::asio::io_context& io, sdbusplus::asio::object_server& objectServer,
boost::container::flat_map<std::string, std::shared_ptr<GpuDevice>>&
gpuDevices,
+ boost::container::flat_map<std::string, std::shared_ptr<SmaDevice>>&
+ smaDevices,
const std::shared_ptr<sdbusplus::asio::connection>& conn,
mctp::MctpRequester& mctpRequester, const SensorConfigs& configs,
const std::string& path, const boost::system::error_code& ec,
@@ -189,8 +217,8 @@
ocp::accelerator_management::messageType) != mctpTypes.end())
{
lg2::info("Found OCP MCTP VDM Endpoint with ID {EID}", "EID", eid);
- queryDeviceIdentification(io, objectServer, gpuDevices, conn,
- mctpRequester, configs, path, eid);
+ queryDeviceIdentification(io, objectServer, gpuDevices, smaDevices,
+ conn, mctpRequester, configs, path, eid);
}
}
@@ -198,6 +226,8 @@
boost::asio::io_context& io, sdbusplus::asio::object_server& objectServer,
boost::container::flat_map<std::string, std::shared_ptr<GpuDevice>>&
gpuDevices,
+ boost::container::flat_map<std::string, std::shared_ptr<SmaDevice>>&
+ smaDevices,
const std::shared_ptr<sdbusplus::asio::connection>& conn,
mctp::MctpRequester& mctpRequester, const SensorConfigs& configs,
const std::string& path, const boost::system::error_code& ec,
@@ -224,12 +254,13 @@
if (iface == "xyz.openbmc_project.MCTP.Endpoint")
{
conn->async_method_call(
- [&io, &objectServer, &gpuDevices, conn, &mctpRequester,
- configs, path](const boost::system::error_code& ec,
- const SensorBaseConfigMap& endpoint) {
- processEndpoint(io, objectServer, gpuDevices, conn,
- mctpRequester, configs, path, ec,
- endpoint);
+ [&io, &objectServer, &gpuDevices, &smaDevices, conn,
+ &mctpRequester, configs,
+ path](const boost::system::error_code& ec,
+ const SensorBaseConfigMap& endpoint) {
+ processEndpoint(io, objectServer, gpuDevices,
+ smaDevices, conn, mctpRequester,
+ configs, path, ec, endpoint);
},
service, objPath, "org.freedesktop.DBus.Properties",
"GetAll", iface);
@@ -243,6 +274,8 @@
boost::asio::io_context& io, sdbusplus::asio::object_server& objectServer,
boost::container::flat_map<std::string, std::shared_ptr<GpuDevice>>&
gpuDevices,
+ boost::container::flat_map<std::string, std::shared_ptr<SmaDevice>>&
+ smaDevices,
const std::shared_ptr<sdbusplus::asio::connection>& conn,
mctp::MctpRequester& mctpRequester, const SensorConfigs& configs,
const std::string& path)
@@ -251,10 +284,11 @@
std::vector<std::string> ifaceList{{"xyz.openbmc_project.MCTP.Endpoint"}};
conn->async_method_call(
- [&io, &objectServer, &gpuDevices, conn, &mctpRequester, configs,
+ [&io, &objectServer, &gpuDevices, &smaDevices, conn, &mctpRequester,
+ configs,
path](const boost::system::error_code& ec, const GetSubTreeType& ret) {
- queryEndpoints(io, objectServer, gpuDevices, conn, mctpRequester,
- configs, path, ec, ret);
+ queryEndpoints(io, objectServer, gpuDevices, smaDevices, conn,
+ mctpRequester, configs, path, ec, ret);
},
"xyz.openbmc_project.ObjectMapper",
"/xyz/openbmc_project/object_mapper",
@@ -266,6 +300,8 @@
boost::asio::io_context& io, sdbusplus::asio::object_server& objectServer,
boost::container::flat_map<std::string, std::shared_ptr<GpuDevice>>&
gpuDevices,
+ boost::container::flat_map<std::string, std::shared_ptr<SmaDevice>>&
+ smaDevices,
const std::shared_ptr<sdbusplus::asio::connection>& dbusConnection,
mctp::MctpRequester& mctpRequester, const ManagedObjectType& resp)
{
@@ -284,8 +320,8 @@
configs.pollRate = loadVariant<uint64_t>(cfg, "PollRate");
- discoverDevices(io, objectServer, gpuDevices, dbusConnection,
- mctpRequester, configs, path);
+ discoverDevices(io, objectServer, gpuDevices, smaDevices,
+ dbusConnection, mctpRequester, configs, path);
lg2::info(
"Detected configuration {NAME} of type {TYPE} at path: {PATH}.",
@@ -298,6 +334,8 @@
boost::asio::io_context& io, sdbusplus::asio::object_server& objectServer,
boost::container::flat_map<std::string, std::shared_ptr<GpuDevice>>&
gpuDevices,
+ boost::container::flat_map<std::string, std::shared_ptr<SmaDevice>>&
+ smaDevices,
const std::shared_ptr<sdbusplus::asio::connection>& dbusConnection,
mctp::MctpRequester& mctpRequester)
{
@@ -307,16 +345,17 @@
return;
}
dbusConnection->async_method_call(
- [&gpuDevices, &mctpRequester, dbusConnection, &io, &objectServer](
- boost::system::error_code ec, const ManagedObjectType& resp) {
+ [&gpuDevices, &smaDevices, &mctpRequester, dbusConnection, &io,
+ &objectServer](boost::system::error_code ec,
+ const ManagedObjectType& resp) {
if (ec)
{
lg2::error("Error contacting entity manager");
return;
}
- processSensorConfigs(io, objectServer, gpuDevices, dbusConnection,
- mctpRequester, resp);
+ processSensorConfigs(io, objectServer, gpuDevices, smaDevices,
+ dbusConnection, mctpRequester, resp);
},
entityManagerName, "/xyz/openbmc_project/inventory",
"org.freedesktop.DBus.ObjectManager", "GetManagedObjects");
@@ -325,7 +364,9 @@
void interfaceRemoved(
sdbusplus::message_t& message,
boost::container::flat_map<std::string, std::shared_ptr<GpuDevice>>&
- gpuDevices)
+ gpuDevices,
+ boost::container::flat_map<std::string, std::shared_ptr<SmaDevice>>&
+ smaDevices)
{
if (message.is_method_error())
{
@@ -354,4 +395,19 @@
sensorIt++;
}
}
+
+ auto smaSensorIt = smaDevices.begin();
+ while (smaSensorIt != smaDevices.end())
+ {
+ if ((smaSensorIt->second->getPath() == removedPath) &&
+ (std::find(interfaces.begin(), interfaces.end(),
+ configInterfaceName(deviceType)) != interfaces.end()))
+ {
+ smaSensorIt = smaDevices.erase(smaSensorIt);
+ }
+ else
+ {
+ smaSensorIt++;
+ }
+ }
}
diff --git a/src/nvidia-gpu/NvidiaDeviceDiscovery.hpp b/src/nvidia-gpu/NvidiaDeviceDiscovery.hpp
index 86211a9..8b73453 100644
--- a/src/nvidia-gpu/NvidiaDeviceDiscovery.hpp
+++ b/src/nvidia-gpu/NvidiaDeviceDiscovery.hpp
@@ -28,15 +28,20 @@
};
class GpuDevice;
+class SmaDevice;
void createSensors(
boost::asio::io_context& io, sdbusplus::asio::object_server& objectServer,
boost::container::flat_map<std::string, std::shared_ptr<GpuDevice>>&
gpuDevices,
+ boost::container::flat_map<std::string, std::shared_ptr<SmaDevice>>&
+ smaDevices,
const std::shared_ptr<sdbusplus::asio::connection>& dbusConnection,
mctp::MctpRequester& mctpRequester);
void interfaceRemoved(
sdbusplus::message_t& message,
boost::container::flat_map<std::string, std::shared_ptr<GpuDevice>>&
- gpuDevices);
+ gpuDevices,
+ boost::container::flat_map<std::string, std::shared_ptr<SmaDevice>>&
+ smaDevices);
diff --git a/src/nvidia-gpu/NvidiaGpuMctpVdm.hpp b/src/nvidia-gpu/NvidiaGpuMctpVdm.hpp
index e378cfc..09df4e6 100644
--- a/src/nvidia-gpu/NvidiaGpuMctpVdm.hpp
+++ b/src/nvidia-gpu/NvidiaGpuMctpVdm.hpp
@@ -38,7 +38,8 @@
enum class DeviceIdentification : uint8_t
{
- DEVICE_GPU = 0
+ DEVICE_GPU = 0,
+ DEVICE_SMA = 5
};
struct QueryDeviceIdentificationRequest
diff --git a/src/nvidia-gpu/NvidiaGpuSensor.hpp b/src/nvidia-gpu/NvidiaGpuSensor.hpp
index 1e21f4d..e2ae89a 100644
--- a/src/nvidia-gpu/NvidiaGpuSensor.hpp
+++ b/src/nvidia-gpu/NvidiaGpuSensor.hpp
@@ -22,6 +22,7 @@
constexpr uint8_t gpuTempSensorId{0};
constexpr uint8_t gpuTLimitSensorId{2};
+constexpr uint8_t smaTempSensorId{5};
struct NvidiaGpuTempSensor :
public Sensor,
diff --git a/src/nvidia-gpu/NvidiaGpuSensorMain.cpp b/src/nvidia-gpu/NvidiaGpuSensorMain.cpp
index 6ccbb05..7e404f5 100644
--- a/src/nvidia-gpu/NvidiaGpuSensorMain.cpp
+++ b/src/nvidia-gpu/NvidiaGpuSensorMain.cpp
@@ -8,6 +8,7 @@
#include "Utils.hpp"
#include <NvidiaDeviceDiscovery.hpp>
+#include <NvidiaSmaDevice.hpp>
#include <boost/asio/error.hpp>
#include <boost/asio/io_context.hpp>
#include <boost/asio/post.hpp>
@@ -27,6 +28,7 @@
#include <vector>
boost::container::flat_map<std::string, std::shared_ptr<GpuDevice>> gpuDevices;
+boost::container::flat_map<std::string, std::shared_ptr<SmaDevice>> smaDevices;
void configTimerExpiryCallback(
boost::asio::io_context& io, sdbusplus::asio::object_server& objectServer,
@@ -37,7 +39,8 @@
{
return; // we're being canceled
}
- createSensors(io, objectServer, gpuDevices, dbusConnection, mctpRequester);
+ createSensors(io, objectServer, gpuDevices, smaDevices, dbusConnection,
+ mctpRequester);
}
int main()
@@ -51,7 +54,8 @@
mctp::MctpRequester mctpRequester(io);
boost::asio::post(io, [&]() {
- createSensors(io, objectServer, gpuDevices, systemBus, mctpRequester);
+ createSensors(io, objectServer, gpuDevices, smaDevices, systemBus,
+ mctpRequester);
});
boost::asio::steady_timer configTimer(io);
@@ -76,7 +80,9 @@
static_cast<sdbusplus::bus_t&>(*systemBus),
sdbusplus::bus::match::rules::interfacesRemovedAtPath(
std::string(inventoryPath)),
- [](sdbusplus::message_t& msg) { interfaceRemoved(msg, gpuDevices); });
+ [](sdbusplus::message_t& msg) {
+ interfaceRemoved(msg, gpuDevices, smaDevices);
+ });
io.run();
return 0;
diff --git a/src/nvidia-gpu/NvidiaSmaDevice.cpp b/src/nvidia-gpu/NvidiaSmaDevice.cpp
new file mode 100644
index 0000000..755e1f8
--- /dev/null
+++ b/src/nvidia-gpu/NvidiaSmaDevice.cpp
@@ -0,0 +1,66 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION &
+ * AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include "NvidiaSmaDevice.hpp"
+
+#include "NvidiaDeviceDiscovery.hpp"
+#include "NvidiaGpuSensor.hpp"
+#include "Thresholds.hpp"
+#include "Utils.hpp"
+
+#include <bits/basic_string.h>
+
+#include <MctpRequester.hpp>
+#include <boost/asio/io_context.hpp>
+#include <phosphor-logging/lg2.hpp>
+#include <sdbusplus/asio/connection.hpp>
+#include <sdbusplus/asio/object_server.hpp>
+
+#include <chrono>
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <vector>
+
+SmaDevice::SmaDevice(const SensorConfigs& configs, const std::string& name,
+ const std::string& path,
+ const std::shared_ptr<sdbusplus::asio::connection>& conn,
+ uint8_t eid, boost::asio::io_context& io,
+ mctp::MctpRequester& mctpRequester,
+ sdbusplus::asio::object_server& objectServer) :
+ eid(eid), sensorPollMs(std::chrono::milliseconds{configs.pollRate}), // pollRate is taken as a millisecond count
+ waitTimer(io, std::chrono::steady_clock::duration(0)), // timer bound to io with a zero-length initial expiry
+ mctpRequester(mctpRequester), conn(conn), objectServer(objectServer),
+ configs(configs), name(escapeName(name)), path(path) // name is stored after escapeName(); path is stored as given
+{ // Every member is set by the initializer list above; the body only creates the sensor.
+ makeSensors(); // builds the temperature sensor and, through read(), starts the polling loop
+}
+
+void SmaDevice::makeSensors()
+{
+    // Create the SMA temperature sensor (named "<name>_TEMP_0", with an empty
+    // threshold list), log the addition, and start polling through read().
+    tempSensor = std::make_shared<NvidiaGpuTempSensor>(
+        conn, mctpRequester, name + "_TEMP_0", path, eid, smaTempSensorId,
+        objectServer, std::vector<thresholds::Threshold>{});
+    lg2::info("Added SMA {NAME} Sensors with chassis path: {PATH}.", "NAME",
+              name, "PATH", path);
+    read();
+}
+
+void SmaDevice::read()
+{ // One polling step: update the sensor, then schedule the next step on waitTimer.
+ tempSensor->update(); // refresh the temperature sensor (update() is defined outside this file)
+
+ waitTimer.expires_after(std::chrono::milliseconds(sensorPollMs)); // next expiry is one poll interval from now
+ waitTimer.async_wait([this](const boost::system::error_code& ec) { // NOTE(review): captures raw `this` - confirm the handler cannot run after this SmaDevice is destroyed
+ if (ec)
+ {
+ return; // any error (presumably timer cancellation) ends the polling chain
+ }
+ read(); // timer expired without error: run the next polling step
+ });
+}
diff --git a/src/nvidia-gpu/NvidiaSmaDevice.hpp b/src/nvidia-gpu/NvidiaSmaDevice.hpp
new file mode 100644
index 0000000..fc78de2
--- /dev/null
+++ b/src/nvidia-gpu/NvidiaSmaDevice.hpp
@@ -0,0 +1,62 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION &
+ * AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#pragma once
+
+#include "MctpRequester.hpp"
+#include "NvidiaDeviceDiscovery.hpp"
+#include "NvidiaGpuSensor.hpp"
+
+#include <boost/asio/io_context.hpp>
+#include <boost/asio/steady_timer.hpp>
+#include <sdbusplus/asio/connection.hpp>
+#include <sdbusplus/asio/object_server.hpp>
+
+#include <chrono>
+#include <cstdint>
+#include <memory>
+#include <string>
+
+class SmaDevice // One discovered SMA endpoint: owns its temperature sensor and the timer that polls it.
+{
+ public:
+ SmaDevice(const SensorConfigs& configs, const std::string& name, // constructs the device and immediately creates its sensor
+ const std::string& path,
+ const std::shared_ptr<sdbusplus::asio::connection>& conn,
+ uint8_t eid, boost::asio::io_context& io,
+ mctp::MctpRequester& mctpRequester,
+ sdbusplus::asio::object_server& objectServer);
+
+ const std::string& getPath() const // path string supplied at construction
+ {
+ return path;
+ }
+
+ private:
+ void makeSensors(); // creates tempSensor and starts polling
+
+ void read(); // one poll step; re-arms waitTimer to call itself again
+
+ uint8_t eid{}; // endpoint ID passed to the constructor
+
+ std::chrono::milliseconds sensorPollMs; // interval between polls, taken from configs.pollRate
+
+ boost::asio::steady_timer waitTimer; // schedules the next read()
+
+ mctp::MctpRequester& mctpRequester; // held by reference; must outlive this object
+
+ std::shared_ptr<sdbusplus::asio::connection> conn; // shared D-Bus connection handed to the sensor
+
+ sdbusplus::asio::object_server& objectServer; // held by reference; must outlive this object
+
+ std::shared_ptr<NvidiaGpuTempSensor> tempSensor; // created in makeSensors()
+
+ SensorConfigs configs; // copy of the configuration passed to the constructor
+
+ std::string name; // device name after escapeName()
+
+ std::string path; // value returned by getPath()
+};
diff --git a/src/nvidia-gpu/meson.build b/src/nvidia-gpu/meson.build
index 39ce871..fcf15fa 100644
--- a/src/nvidia-gpu/meson.build
+++ b/src/nvidia-gpu/meson.build
@@ -9,6 +9,7 @@
'NvidiaGpuSensorMain.cpp',
'NvidiaGpuThresholds.cpp',
'NvidiaGpuVoltageSensor.cpp',
+ 'NvidiaSmaDevice.cpp',
'OcpMctpVdm.cpp',
)