nvidia-gpu: introduce notion of a device
Perform device discovery tasks only once per device to prepare for
introducing additional gpu sensors.
In the current implementation, sensor updates and device discovery via
MCTP are managed within a single class for simplicity. However, since a
GPU device typically includes multiple sensors, performing device
discovery for each individual sensor is inefficient. Instead, it would
be more effective to execute device discovery once per device.
Tested: Built an image for the gb200nvl-obmc machine with the following
patches cherry-picked. These patches are needed to enable the MCTP stack.
https://gerrit.openbmc.org/c/openbmc/openbmc/+/79422
```
$ curl -k -u 'root:0penBmc' https://10.137.203.137/redfish/v1/Chassis/NVIDIA_GB200_1/Sensors/temperature_NVIDIA_GB200_GPU_0_TEMP_0
{
"@odata.id": "/redfish/v1/Chassis/NVIDIA_GB200_1/Sensors/temperature_NVIDIA_GB200_GPU_0_TEMP_0",
"@odata.type": "#Sensor.v1_2_0.Sensor",
"Id": "temperature_NVIDIA_GB200_GPU_0_TEMP_0",
"Name": "NVIDIA GB200 GPU 0 TEMP 0",
"Reading": 37.6875,
"ReadingRangeMax": 127.0,
"ReadingRangeMin": -128.0,
"ReadingType": "Temperature",
"ReadingUnits": "Cel",
"Status": {
"Health": "OK",
"State": "Enabled"
}
}%
```
Change-Id: Ie3dcd43caa031b4aaa61d8be3f5d71aefd53bc9a
Signed-off-by: Harshit Aghera <haghera@nvidia.com>
diff --git a/src/nvidia-gpu/NvidiaDeviceDiscovery.cpp b/src/nvidia-gpu/NvidiaDeviceDiscovery.cpp
new file mode 100644
index 0000000..adb21ea
--- /dev/null
+++ b/src/nvidia-gpu/NvidiaDeviceDiscovery.cpp
@@ -0,0 +1,357 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION &
+ * AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include "NvidiaDeviceDiscovery.hpp"
+
+#include "NvidiaGpuDevice.hpp"
+#include "Utils.hpp"
+
+#include <bits/basic_string.h>
+
+#include <MctpRequester.hpp>
+#include <NvidiaGpuMctpVdm.hpp>
+#include <OcpMctpVdm.hpp>
+#include <boost/asio/io_context.hpp>
+#include <boost/container/flat_map.hpp>
+#include <phosphor-logging/lg2.hpp>
+#include <sdbusplus/asio/connection.hpp>
+#include <sdbusplus/asio/object_server.hpp>
+#include <sdbusplus/message.hpp>
+#include <sdbusplus/message/native_types.hpp>
+
+#include <algorithm>
+#include <array>
+#include <cstdint>
+#include <memory>
+#include <span>
+#include <string>
+#include <utility>
+#include <variant>
+#include <vector>
+
+// Handles the reply to a Query Device Identification request sent to `eid`.
+// A transport failure, a decode failure or a non-success completion code is
+// logged and ends processing. When the endpoint reports the GPU device type,
+// a GpuDevice named "<configs.name>_<instance id>" is created and stored in
+// gpuDevices under that name.
+void processQueryDeviceIdResponse(
+    boost::asio::io_context& io, sdbusplus::asio::object_server& objectServer,
+    boost::container::flat_map<std::string, std::shared_ptr<GpuDevice>>&
+        gpuDevices,
+    const std::shared_ptr<sdbusplus::asio::connection>& conn,
+    mctp::MctpRequester& mctpRequester, const SensorConfigs& configs,
+    const std::string& path, uint8_t eid, int sendRecvMsgResult,
+    std::span<uint8_t> queryDeviceIdentificationResponse)
+{
+    if (sendRecvMsgResult != 0)
+    {
+        lg2::error(
+            "Error processing MCTP endpoint with eid {EID} : sending message over MCTP failed, rc={RC}",
+            "EID", eid, "RC", sendRecvMsgResult);
+        return;
+    }
+
+    ocp::accelerator_management::CompletionCode completionCode{};
+    uint16_t reason = 0;
+    uint8_t reportedType = 0;
+    uint8_t reportedInstance = 0;
+
+    auto decodeRc = gpu::decodeQueryDeviceIdentificationResponse(
+        queryDeviceIdentificationResponse, completionCode, reason,
+        reportedType, reportedInstance);
+
+    const bool decoded =
+        (decodeRc == 0) &&
+        (completionCode ==
+         ocp::accelerator_management::CompletionCode::SUCCESS);
+    if (!decoded)
+    {
+        lg2::error(
+            "Error processing MCTP endpoint with eid {EID} : decode failed, rc={RC}, cc={CC}, reasonCode={RESC}",
+            "EID", eid, "RC", decodeRc, "CC", completionCode, "RESC", reason);
+        return;
+    }
+
+    // Endpoints that are not GPUs are silently ignored.
+    if (reportedType !=
+        static_cast<uint8_t>(gpu::DeviceIdentification::DEVICE_GPU))
+    {
+        return;
+    }
+
+    lg2::info(
+        "Found the GPU with EID {EID}, DeviceType {DEVTYPE}, InstanceId {IID}.",
+        "EID", eid, "DEVTYPE", reportedType, "IID", reportedInstance);
+
+    std::string gpuName = configs.name;
+    gpuName += '_';
+    gpuName += std::to_string(reportedInstance);
+
+    gpuDevices[gpuName] = std::make_shared<GpuDevice>(
+        configs, gpuName, path, conn, eid, io, mctpRequester, objectServer);
+}
+
+// Sends a Query Device Identification request to the MCTP endpoint `eid` and
+// passes the reply to processQueryDeviceIdResponse(). Nothing is sent when
+// the request cannot be encoded.
+void queryDeviceIdentification(
+    boost::asio::io_context& io, sdbusplus::asio::object_server& objectServer,
+    boost::container::flat_map<std::string, std::shared_ptr<GpuDevice>>&
+        gpuDevices,
+    const std::shared_ptr<sdbusplus::asio::connection>& conn,
+    mctp::MctpRequester& mctpRequester, const SensorConfigs& configs,
+    const std::string& path, uint8_t eid)
+{
+    using RequestBuffer =
+        std::array<uint8_t, sizeof(gpu::QueryDeviceIdentificationRequest)>;
+    using ResponseBuffer =
+        std::array<uint8_t, sizeof(gpu::QueryDeviceIdentificationResponse)>;
+
+    // Both buffers are shared with the completion handler below, which holds
+    // a reference to each for as long as the handler itself exists.
+    auto requestBuf = std::make_shared<RequestBuffer>();
+    auto responseBuf = std::make_shared<ResponseBuffer>();
+
+    auto encodeRc =
+        gpu::encodeQueryDeviceIdentificationRequest(0, *requestBuf);
+    if (encodeRc != 0)
+    {
+        lg2::error(
+            "Error processing MCTP endpoint with eid {EID} : encode failed, rc={RC}",
+            "EID", eid, "RC", encodeRc);
+        return;
+    }
+
+    auto onReply = [&io, &objectServer, &gpuDevices, conn, &mctpRequester,
+                    configs, path, eid, requestBuf,
+                    responseBuf](int sendRecvMsgResult) {
+        processQueryDeviceIdResponse(io, objectServer, gpuDevices, conn,
+                                     mctpRequester, configs, path, eid,
+                                     sendRecvMsgResult, *responseBuf);
+    };
+
+    mctpRequester.sendRecvMsg(eid, *requestBuf, *responseBuf,
+                              std::move(onReply));
+}
+
+// Examines the properties of one MCTP endpoint object. The "EID" property
+// (uint8_t) and the "SupportedMessageTypes" property (vector of uint8_t) are
+// both required; a missing or wrongly typed property is logged and the
+// endpoint is skipped. If the endpoint lists the OCP MCTP VDM message type, a
+// device identification query is sent to it.
+void processEndpoint(
+    boost::asio::io_context& io, sdbusplus::asio::object_server& objectServer,
+    boost::container::flat_map<std::string, std::shared_ptr<GpuDevice>>&
+        gpuDevices,
+    const std::shared_ptr<sdbusplus::asio::connection>& conn,
+    mctp::MctpRequester& mctpRequester, const SensorConfigs& configs,
+    const std::string& path, const boost::system::error_code& ec,
+    const SensorBaseConfigMap& endpoint)
+{
+    if (ec)
+    {
+        lg2::error("Error processing MCTP endpoint: Error:{ERROR}", "ERROR",
+                   ec.message());
+        return;
+    }
+
+    const auto eidEntry = endpoint.find("EID");
+    if (eidEntry == endpoint.end())
+    {
+        lg2::error(
+            "Error processing MCTP endpoint: Property EID not found in the configuration.");
+        return;
+    }
+
+    const auto* eidValue = std::get_if<uint8_t>(&eidEntry->second);
+    if (eidValue == nullptr)
+    {
+        lg2::error(
+            "Error processing MCTP endpoint: Property EID does not have valid type.");
+        return;
+    }
+    uint8_t eid = *eidValue;
+
+    const auto typesEntry = endpoint.find("SupportedMessageTypes");
+    if (typesEntry == endpoint.end())
+    {
+        lg2::error(
+            "Error processing MCTP endpoint with eid {EID} : Property SupportedMessageTypes not found in the configuration.",
+            "EID", eid);
+        return;
+    }
+
+    const auto* typesValue =
+        std::get_if<std::vector<uint8_t>>(&typesEntry->second);
+    if (typesValue == nullptr)
+    {
+        lg2::error(
+            "Error processing MCTP endpoint with eid {EID} : Property SupportedMessageTypes does not have valid type.",
+            "EID", eid);
+        return;
+    }
+
+    const bool supportsOcpVdm =
+        std::find(typesValue->begin(), typesValue->end(),
+                  ocp::accelerator_management::messageType) !=
+        typesValue->end();
+    if (!supportsOcpVdm)
+    {
+        return;
+    }
+
+    lg2::info("Found OCP MCTP VDM Endpoint with ID {EID}", "EID", eid);
+    queryDeviceIdentification(io, objectServer, gpuDevices, conn,
+                              mctpRequester, configs, path, eid);
+}
+
+// Walks the object mapper GetSubTree result and, for every
+// xyz.openbmc_project.MCTP.Endpoint interface found, asynchronously reads all
+// properties of that interface and passes them to processEndpoint().
+void queryEndpoints(
+    boost::asio::io_context& io, sdbusplus::asio::object_server& objectServer,
+    boost::container::flat_map<std::string, std::shared_ptr<GpuDevice>>&
+        gpuDevices,
+    const std::shared_ptr<sdbusplus::asio::connection>& conn,
+    mctp::MctpRequester& mctpRequester, const SensorConfigs& configs,
+    const std::string& path, const boost::system::error_code& ec,
+    const GetSubTreeType& ret)
+{
+    if (ec)
+    {
+        lg2::error("Error processing MCTP endpoints: {ERROR}", "ERROR",
+                   ec.message());
+        return;
+    }
+
+    // An empty result simply yields no iterations below.
+    for (const auto& [objPath, services] : ret)
+    {
+        for (const auto& [service, ifaces] : services)
+        {
+            for (const auto& iface : ifaces)
+            {
+                if (iface != "xyz.openbmc_project.MCTP.Endpoint")
+                {
+                    continue;
+                }
+
+                auto onProperties =
+                    [&io, &objectServer, &gpuDevices, conn, &mctpRequester,
+                     configs, path](const boost::system::error_code& getAllEc,
+                                    const SensorBaseConfigMap& endpoint) {
+                        processEndpoint(io, objectServer, gpuDevices, conn,
+                                        mctpRequester, configs, path, getAllEc,
+                                        endpoint);
+                    };
+
+                conn->async_method_call(
+                    std::move(onProperties), service, objPath,
+                    "org.freedesktop.DBus.Properties", "GetAll", iface);
+            }
+        }
+    }
+}
+
+// Starts device discovery for one configuration: asks the object mapper for
+// every object below /au/com/codeconstruct/ that implements
+// xyz.openbmc_project.MCTP.Endpoint and passes the result to queryEndpoints().
+void discoverDevices(
+    boost::asio::io_context& io, sdbusplus::asio::object_server& objectServer,
+    boost::container::flat_map<std::string, std::shared_ptr<GpuDevice>>&
+        gpuDevices,
+    const std::shared_ptr<sdbusplus::asio::connection>& conn,
+    mctp::MctpRequester& mctpRequester, const SensorConfigs& configs,
+    const std::string& path)
+{
+    std::string subtreeRoot{"/au/com/codeconstruct/"};
+    std::vector<std::string> wantedIfaces{
+        {"xyz.openbmc_project.MCTP.Endpoint"}};
+
+    auto onSubTree = [&io, &objectServer, &gpuDevices, conn, &mctpRequester,
+                      configs, path](const boost::system::error_code& ec,
+                                     const GetSubTreeType& ret) {
+        queryEndpoints(io, objectServer, gpuDevices, conn, mctpRequester,
+                       configs, path, ec, ret);
+    };
+
+    // The depth argument 0 is passed through to GetSubTree unchanged.
+    conn->async_method_call(
+        std::move(onSubTree), "xyz.openbmc_project.ObjectMapper",
+        "/xyz/openbmc_project/object_mapper",
+        "xyz.openbmc_project.ObjectMapper", "GetSubTree", subtreeRoot, 0,
+        wantedIfaces);
+}
+
+// Scans the managed objects returned by entity-manager. For every interface
+// matching configInterfaceName(deviceType), the "Name" and "PollRate"
+// properties are loaded into a SensorConfigs and device discovery is started
+// for that configuration path.
+void processSensorConfigs(
+    boost::asio::io_context& io, sdbusplus::asio::object_server& objectServer,
+    boost::container::flat_map<std::string, std::shared_ptr<GpuDevice>>&
+        gpuDevices,
+    const std::shared_ptr<sdbusplus::asio::connection>& dbusConnection,
+    mctp::MctpRequester& mctpRequester, const ManagedObjectType& resp)
+{
+    const auto wantedInterface = configInterfaceName(deviceType);
+
+    for (const auto& [path, interfaces] : resp)
+    {
+        for (const auto& [intf, cfg] : interfaces)
+        {
+            if (intf == wantedInterface)
+            {
+                SensorConfigs configs{
+                    .name = loadVariant<std::string>(cfg, "Name"),
+                    .pollRate = loadVariant<uint64_t>(cfg, "PollRate")};
+
+                discoverDevices(io, objectServer, gpuDevices, dbusConnection,
+                                mctpRequester, configs, path);
+
+                lg2::info(
+                    "Detected configuration {NAME} of type {TYPE} at path: {PATH}.",
+                    "NAME", configs.name, "TYPE", deviceType, "PATH", path);
+            }
+        }
+    }
+}
+
+// Entry point of device creation. Requests all managed objects below
+// /xyz/openbmc_project/inventory from entity-manager and passes the reply to
+// processSensorConfigs(). Logs an error and does nothing when dbusConnection
+// is null or when the request fails.
+void createSensors(
+    boost::asio::io_context& io, sdbusplus::asio::object_server& objectServer,
+    boost::container::flat_map<std::string, std::shared_ptr<GpuDevice>>&
+        gpuDevices,
+    const std::shared_ptr<sdbusplus::asio::connection>& dbusConnection,
+    mctp::MctpRequester& mctpRequester)
+{
+    if (dbusConnection == nullptr)
+    {
+        lg2::error("Connection not created");
+        return;
+    }
+
+    // The connection is captured by value so the handler holds its own
+    // reference to it.
+    auto onManagedObjects =
+        [&gpuDevices, &mctpRequester, dbusConnection, &io, &objectServer](
+            boost::system::error_code ec, const ManagedObjectType& resp) {
+            if (ec)
+            {
+                lg2::error("Error contacting entity manager");
+                return;
+            }
+
+            processSensorConfigs(io, objectServer, gpuDevices, dbusConnection,
+                                 mctpRequester, resp);
+        };
+
+    dbusConnection->async_method_call(
+        std::move(onManagedObjects), entityManagerName,
+        "/xyz/openbmc_project/inventory", "org.freedesktop.DBus.ObjectManager",
+        "GetManagedObjects");
+}
+
+// Handles an InterfacesRemoved message. When the removed interfaces include
+// the configuration interface for deviceType, every GpuDevice whose
+// configuration path equals the removed object path is erased from
+// gpuDevices.
+void interfaceRemoved(
+    sdbusplus::message_t& message,
+    boost::container::flat_map<std::string, std::shared_ptr<GpuDevice>>&
+        gpuDevices)
+{
+    if (message.is_method_error())
+    {
+        lg2::error("interfacesRemoved callback method error");
+        return;
+    }
+
+    sdbusplus::message::object_path removedPath;
+    std::vector<std::string> interfaces;
+
+    message.read(removedPath, interfaces);
+
+    // The interface test does not depend on the device, so evaluate it once.
+    const bool configRemoved =
+        std::find(interfaces.begin(), interfaces.end(),
+                  configInterfaceName(deviceType)) != interfaces.end();
+    if (!configRemoved)
+    {
+        return;
+    }
+
+    for (auto deviceIt = gpuDevices.begin(); deviceIt != gpuDevices.end();)
+    {
+        if (deviceIt->second->getPath() == removedPath)
+        {
+            // erase() returns the iterator following the removed element.
+            deviceIt = gpuDevices.erase(deviceIt);
+        }
+        else
+        {
+            ++deviceIt;
+        }
+    }
+}
diff --git a/src/nvidia-gpu/NvidiaDeviceDiscovery.hpp b/src/nvidia-gpu/NvidiaDeviceDiscovery.hpp
new file mode 100644
index 0000000..86211a9
--- /dev/null
+++ b/src/nvidia-gpu/NvidiaDeviceDiscovery.hpp
@@ -0,0 +1,42 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION &
+ * AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#pragma once
+
+#include "MctpRequester.hpp"
+
+#include <boost/asio/io_context.hpp>
+#include <boost/container/flat_map.hpp>
+#include <sdbusplus/asio/connection.hpp>
+#include <sdbusplus/asio/object_server.hpp>
+#include <sdbusplus/message.hpp>
+
+#include <cstdint>
+#include <memory>
+#include <string>
+
+// Prefix of the D-Bus object paths under which sensor objects are created.
+constexpr const char* sensorPathPrefix = "/xyz/openbmc_project/sensors/";
+// Configuration type name; passed to configInterfaceName() to identify the
+// configuration interface this daemon reacts to.
+constexpr const char* deviceType = "NvidiaMctpVdm";
+
+// Settings read from one device configuration.
+struct SensorConfigs
+{
+ // Value of the "Name" configuration property; used as the prefix of the
+ // names given to discovered GPU devices.
+ std::string name;
+ // Value of the "PollRate" configuration property; GpuDevice interprets it
+ // as a number of milliseconds between sensor reads.
+ uint64_t pollRate{};
+};
+
+// Forward declaration; the functions below only refer to GpuDevice through
+// std::shared_ptr.
+class GpuDevice;
+
+// Asynchronously reads the configurations published by entity-manager and,
+// for each configuration of type deviceType, discovers GPUs over MCTP and
+// stores a GpuDevice for each of them in gpuDevices. Logs an error and does
+// nothing when dbusConnection is null.
+void createSensors(
+ boost::asio::io_context& io, sdbusplus::asio::object_server& objectServer,
+ boost::container::flat_map<std::string, std::shared_ptr<GpuDevice>>&
+ gpuDevices,
+ const std::shared_ptr<sdbusplus::asio::connection>& dbusConnection,
+ mctp::MctpRequester& mctpRequester);
+
+// Handler for InterfacesRemoved messages: erases from gpuDevices every device
+// whose configuration path equals the removed object path, provided the
+// removed interfaces include the configuration interface for deviceType.
+void interfaceRemoved(
+ sdbusplus::message_t& message,
+ boost::container::flat_map<std::string, std::shared_ptr<GpuDevice>>&
+ gpuDevices);
diff --git a/src/nvidia-gpu/NvidiaGpuDevice.cpp b/src/nvidia-gpu/NvidiaGpuDevice.cpp
new file mode 100644
index 0000000..cd39b56
--- /dev/null
+++ b/src/nvidia-gpu/NvidiaGpuDevice.cpp
@@ -0,0 +1,66 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION &
+ * AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include "NvidiaGpuDevice.hpp"
+
+#include "NvidiaDeviceDiscovery.hpp"
+#include "NvidiaGpuSensor.hpp"
+#include "Thresholds.hpp"
+#include "Utils.hpp"
+
+#include <bits/basic_string.h>
+
+#include <MctpRequester.hpp>
+#include <boost/asio/io_context.hpp>
+#include <phosphor-logging/lg2.hpp>
+#include <sdbusplus/asio/connection.hpp>
+#include <sdbusplus/asio/object_server.hpp>
+
+#include <chrono>
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <vector>
+
+// Stores the connection details for the GPU at MCTP endpoint `eid`, converts
+// configs.pollRate into the polling interval (milliseconds), and immediately
+// creates the device's sensors via makeSensors(). The device name is passed
+// through escapeName() before being stored.
+GpuDevice::GpuDevice(const SensorConfigs& configs, const std::string& name,
+ const std::string& path,
+ const std::shared_ptr<sdbusplus::asio::connection>& conn,
+ uint8_t eid, boost::asio::io_context& io,
+ mctp::MctpRequester& mctpRequester,
+ sdbusplus::asio::object_server& objectServer) :
+ eid(eid), sensorPollMs(std::chrono::milliseconds{configs.pollRate}),
+ waitTimer(io, std::chrono::steady_clock::duration(0)),
+ mctpRequester(mctpRequester), conn(conn), objectServer(objectServer),
+ configs(configs), name(escapeName(name)), path(path)
+{
+ // makeSensors() also starts the periodic read loop.
+ makeSensors();
+}
+
+// Creates the device's temperature sensor, named "<name>_TEMP_0" with no
+// thresholds, logs the addition and starts the polling loop.
+void GpuDevice::makeSensors()
+{
+    const std::string tempSensorName = name + "_TEMP_0";
+    std::vector<thresholds::Threshold> noThresholds{};
+
+    tempSensor = std::make_shared<NvidiaGpuTempSensor>(
+        conn, mctpRequester, tempSensorName, path, eid, objectServer,
+        std::move(noThresholds));
+
+    lg2::info("Added GPU {NAME} Sensors with chassis path: {PATH}.", "NAME",
+              name, "PATH", path);
+
+    read();
+}
+
+// Triggers one update of the temperature sensor and re-arms waitTimer so that
+// read() runs again once sensorPollMs has elapsed.
+// NOTE(review): the wait handler captures a raw `this`; confirm the device
+// cannot be destroyed while a completed wait is still queued for execution.
+void GpuDevice::read()
+{
+    tempSensor->update();
+
+    waitTimer.expires_after(sensorPollMs);
+    waitTimer.async_wait([this](const boost::system::error_code& waitError) {
+        // Any error reported for the wait ends the polling loop.
+        if (!waitError)
+        {
+            read();
+        }
+    });
+}
diff --git a/src/nvidia-gpu/NvidiaGpuDevice.hpp b/src/nvidia-gpu/NvidiaGpuDevice.hpp
new file mode 100644
index 0000000..3653928
--- /dev/null
+++ b/src/nvidia-gpu/NvidiaGpuDevice.hpp
@@ -0,0 +1,62 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION &
+ * AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#pragma once
+
+#include "MctpRequester.hpp"
+#include "NvidiaDeviceDiscovery.hpp"
+#include "NvidiaGpuSensor.hpp"
+
+#include <boost/asio/io_context.hpp>
+#include <boost/asio/steady_timer.hpp>
+#include <sdbusplus/asio/connection.hpp>
+#include <sdbusplus/asio/object_server.hpp>
+
+#include <chrono>
+#include <cstdint>
+#include <memory>
+#include <string>
+
+// Represents one GPU reachable over MCTP. Owns the sensors created for that
+// GPU and polls them periodically using an asio timer.
+class GpuDevice
+{
+ public:
+ // Creates the device and its sensors and starts polling them.
+ // `path` is the configuration object path returned by getPath().
+ GpuDevice(const SensorConfigs& configs, const std::string& name,
+ const std::string& path,
+ const std::shared_ptr<sdbusplus::asio::connection>& conn,
+ uint8_t eid, boost::asio::io_context& io,
+ mctp::MctpRequester& mctpRequester,
+ sdbusplus::asio::object_server& objectServer);
+
+ // Returns the configuration path this device was created from.
+ const std::string& getPath() const
+ {
+ return path;
+ }
+
+ private:
+ // Creates the sensors belonging to this device and starts the read loop.
+ void makeSensors();
+
+ // Updates the sensors once and schedules the next read.
+ void read();
+
+ // MCTP endpoint ID of the GPU.
+ uint8_t eid{};
+
+ // Interval between two consecutive sensor reads.
+ std::chrono::milliseconds sensorPollMs;
+
+ // Timer driving the periodic read loop.
+ boost::asio::steady_timer waitTimer;
+
+ // Requester handed to the sensors for sending MCTP messages.
+ mctp::MctpRequester& mctpRequester;
+
+ // D-Bus connection handed to the sensors.
+ std::shared_ptr<sdbusplus::asio::connection> conn;
+
+ // Object server handed to the sensors.
+ sdbusplus::asio::object_server& objectServer;
+
+ // Temperature sensor of this GPU; created in makeSensors().
+ std::shared_ptr<NvidiaGpuTempSensor> tempSensor;
+
+ // Copy of the configuration this device was created from.
+ SensorConfigs configs;
+
+ // Device name after escapeName(); used as the prefix of sensor names.
+ std::string name;
+
+ // Configuration object path; compared against removed paths in
+ // interfaceRemoved().
+ std::string path;
+};
diff --git a/src/nvidia-gpu/NvidiaGpuSensor.cpp b/src/nvidia-gpu/NvidiaGpuSensor.cpp
index 86b356b..1626545 100644
--- a/src/nvidia-gpu/NvidiaGpuSensor.cpp
+++ b/src/nvidia-gpu/NvidiaGpuSensor.cpp
@@ -14,25 +14,19 @@
#include <bits/basic_string.h>
#include <MctpRequester.hpp>
+#include <NvidiaDeviceDiscovery.hpp>
#include <NvidiaGpuMctpVdm.hpp>
#include <OcpMctpVdm.hpp>
-#include <boost/asio/io_context.hpp>
-#include <boost/container/flat_map.hpp>
#include <phosphor-logging/lg2.hpp>
#include <sdbusplus/asio/connection.hpp>
#include <sdbusplus/asio/object_server.hpp>
-#include <sdbusplus/message.hpp>
-#include <sdbusplus/message/native_types.hpp>
-#include <algorithm>
-#include <chrono>
#include <cstddef>
#include <cstdint>
#include <functional>
#include <memory>
#include <string>
#include <utility>
-#include <variant>
#include <vector>
using namespace std::literals;
@@ -41,19 +35,17 @@
static constexpr double gpuTempSensorMaxReading = 127;
static constexpr double gpuTempSensorMinReading = -128;
-GpuTempSensor::GpuTempSensor(
+NvidiaGpuTempSensor::NvidiaGpuTempSensor(
std::shared_ptr<sdbusplus::asio::connection>& conn,
- boost::asio::io_context& io, mctp::MctpRequester& mctpRequester,
- const std::string& name, const std::string& sensorConfiguration,
+ mctp::MctpRequester& mctpRequester, const std::string& name,
+ const std::string& sensorConfiguration, const uint8_t eid,
sdbusplus::asio::object_server& objectServer,
- std::vector<thresholds::Threshold>&& thresholdData,
- std::chrono::milliseconds pollRate) :
+ std::vector<thresholds::Threshold>&& thresholdData) :
Sensor(escapeName(name), std::move(thresholdData), sensorConfiguration,
"temperature", false, true, gpuTempSensorMaxReading,
gpuTempSensorMinReading, conn),
- sensorId{gpuTempSensorId}, sensorPollMs(pollRate),
- waitTimer(io, std::chrono::steady_clock::duration(0)),
- mctpRequester(mctpRequester), conn(conn), objectServer(objectServer)
+ eid(eid), sensorId{gpuTempSensorId}, mctpRequester(mctpRequester),
+ objectServer(objectServer)
{
std::string dbusPath =
sensorPathPrefix + "temperature/"s + escapeName(name);
@@ -70,12 +62,11 @@
association = objectServer.add_interface(dbusPath, association::interface);
- discoverGpus();
+ setInitialProperties(sensor_paths::unitDegreesC);
}
-GpuTempSensor::~GpuTempSensor()
+NvidiaGpuTempSensor::~NvidiaGpuTempSensor()
{
- waitTimer.cancel();
for (const auto& iface : thresholdInterfaces)
{
objectServer.remove_interface(iface);
@@ -84,72 +75,18 @@
objectServer.remove_interface(sensorInterface);
}
-void GpuTempSensor::checkThresholds()
+void NvidiaGpuTempSensor::checkThresholds()
{
thresholds::checkThresholds(this);
}
-void GpuTempSensor::queryEndpoints(const boost::system::error_code& ec,
- const GetSubTreeType& ret)
-{
- if (ec)
- {
- lg2::error("Error querying endoints :{ERROR}", "ERROR", ec.message());
- return;
- }
-
- if (ret.empty())
- {
- return;
- }
-
- for (const auto& [objPath, services] : ret)
- {
- for (const auto& [service, ifaces] : services)
- {
- for (const auto& iface : ifaces)
- {
- if (iface == "xyz.openbmc_project.MCTP.Endpoint")
- {
- conn->async_method_call(
- [this](const boost::system::error_code& ec,
- const SensorBaseConfigMap& configs) {
- this->processEndpoint(ec, configs);
- },
- service, objPath, "org.freedesktop.DBus.Properties",
- "GetAll", iface);
- }
- }
- }
- }
-}
-
-void GpuTempSensor::read()
-{
- update();
-
- waitTimer.expires_after(std::chrono::milliseconds(sensorPollMs));
- waitTimer.async_wait(
- [weakPtrToThis = std::weak_ptr<GpuTempSensor>{shared_from_this()}](
- const boost::system::error_code& ec) {
- if (ec)
- {
- return;
- }
- if (auto ptr = weakPtrToThis.lock())
- {
- ptr->read();
- }
- });
-}
-
-void GpuTempSensor::processResponse(int sendRecvMsgResult)
+void NvidiaGpuTempSensor::processResponse(int sendRecvMsgResult)
{
if (sendRecvMsgResult != 0)
{
lg2::error(
- "Error updating Temperature Sensor: sending message over MCTP failed, rc={RC}",
- "RC", sendRecvMsgResult);
+ "Error updating Temperature Sensor for eid {EID} and sensor id {SID} : sending message over MCTP failed, rc={RC}",
+ "EID", eid, "SID", sensorId, "RC", sendRecvMsgResult);
return;
}
@@ -163,264 +100,29 @@
if (rc != 0 || cc != ocp::accelerator_management::CompletionCode::SUCCESS)
{
lg2::error(
- "Error updating Temperature Sensor: decode failed, rc={RC}, cc={CC}, reasonCode={RESC}",
- "RC", rc, "CC", cc, "RESC", reasonCode);
+ "Error updating Temperature Sensor for eid {EID} and sensor id {SID} : decode failed. "
+ "rc={RC}, cc={CC}, reasonCode={RESC}",
+ "EID", eid, "SID", sensorId, "RC", rc, "CC", cc, "RESC",
+ reasonCode);
return;
}
updateValue(tempValue);
}
-void GpuTempSensor::update()
+// Encodes a Get Temperature Reading request for sensorId and sends it to the
+// MCTP endpoint `eid`; the reply is handled by processResponse(). Nothing is
+// sent when the request cannot be encoded.
+void NvidiaGpuTempSensor::update()
{
auto rc = gpu::encodeGetTemperatureReadingRequest(
0, sensorId, getTemperatureReadingRequest);
+
if (rc != 0)
{
- lg2::error("Error updating Temperature Sensor: encode failed, rc={RC}",
- "RC", rc);
+ lg2::error(
+ "Error updating Temperature Sensor for eid {EID} and sensor id {SID} : encode failed, rc={RC}",
+ "EID", eid, "SID", sensorId, "RC", rc);
+ // Keep the early return: a request buffer that failed to encode must
+ // not be transmitted.
return;
}
mctpRequester.sendRecvMsg(
eid, getTemperatureReadingRequest, getTemperatureReadingResponse,
[this](int sendRecvMsgResult) { processResponse(sendRecvMsgResult); });
}
-
-void GpuTempSensor::processQueryDeviceIdResponse(uint8_t eid,
- int sendRecvMsgResult)
-{
- if (sendRecvMsgResult != 0)
- {
- lg2::error(
- "Error processing GPU endpoint: sending message over MCTP failed, rc={RC}",
- "RC", sendRecvMsgResult);
- return;
- }
-
- ocp::accelerator_management::CompletionCode cc{};
- uint16_t reasonCode = 0;
- uint8_t responseDeviceType = 0;
- uint8_t responseInstanceId = 0;
-
- auto rc = gpu::decodeQueryDeviceIdentificationResponse(
- queryDeviceIdentificationResponse, cc, reasonCode, responseDeviceType,
- responseInstanceId);
-
- if (rc != 0 || cc != ocp::accelerator_management::CompletionCode::SUCCESS)
- {
- lg2::error(
- "Error processing GPU endpoint: decode failed, rc={RC}, cc={CC}, reasonCode={RESC}",
- "RC", rc, "CC", cc, "RESC", reasonCode);
- return;
- }
-
- if (responseDeviceType ==
- static_cast<uint8_t>(gpu::DeviceIdentification::DEVICE_GPU))
- {
- lg2::info(
- "Found the GPU with EID {EID}, DeviceType {DEVTYPE}, InstanceId {IID}.",
- "EID", eid, "DEVTYPE", responseDeviceType, "IID",
- responseInstanceId);
-
- this->eid = eid;
- setInitialProperties(sensor_paths::unitDegreesC);
- read();
- }
-}
-
-void GpuTempSensor::processGpuEndpoint(uint8_t eid)
-{
- auto rc = gpu::encodeQueryDeviceIdentificationRequest(
- 0, queryDeviceIdentificationRequest);
- if (rc != 0)
- {
- lg2::error("Error processing GPU endpoint: encode failed, rc={RC}",
- "RC", rc);
- return;
- }
-
- mctpRequester.sendRecvMsg(
- eid, queryDeviceIdentificationRequest,
- queryDeviceIdentificationResponse, [this, eid](int sendRecvMsgResult) {
- processQueryDeviceIdResponse(eid, sendRecvMsgResult);
- });
-}
-
-void GpuTempSensor::processEndpoint(const boost::system::error_code& ec,
- const SensorBaseConfigMap& endpoint)
-{
- if (ec)
- {
- lg2::error("Error processing MCTP endpoint: {ERROR}", "ERROR",
- ec.message());
- return;
- }
-
- uint8_t eid{};
- std::vector<uint8_t> mctpTypes{};
-
- auto hasEid = endpoint.find("EID");
- if (hasEid != endpoint.end())
- {
- const auto* eidPtr = std::get_if<uint8_t>(&hasEid->second);
- if (eidPtr != nullptr)
- {
- eid = *eidPtr;
- }
- else
- {
- lg2::error(
- "Error processing MCTP endpoint: Property EID does not have valid type.");
- return;
- }
- }
- else
- {
- lg2::error(
- "Error processing MCTP endpoint: Property EID not found in the configuration.");
- return;
- }
-
- auto hasMctpTypes = endpoint.find("SupportedMessageTypes");
- if (hasMctpTypes != endpoint.end())
- {
- const auto* mctpTypePtr =
- std::get_if<std::vector<uint8_t>>(&hasMctpTypes->second);
- if (mctpTypePtr != nullptr)
- {
- mctpTypes = *mctpTypePtr;
- }
- else
- {
- lg2::error(
- "Error processing MCTP endpoint: Property SupportedMessageTypes does not have valid type.");
- return;
- }
- }
- else
- {
- lg2::error(
- "Error processing MCTP endpoint: Property SupportedMessageTypes not found in the configuration.");
- return;
- }
-
- if (std::find(mctpTypes.begin(), mctpTypes.end(),
- ocp::accelerator_management::messageType) != mctpTypes.end())
- {
- lg2::info(
- "GpuTempSensor::discoverGpus(): Found OCP MCTP VDM Endpoint with ID {EID}",
- "EID", eid);
- this->processGpuEndpoint(eid);
- }
-}
-
-void GpuTempSensor::discoverGpus()
-{
- std::string searchPath{"/au/com/codeconstruct/"};
- std::vector<std::string> ifaceList{{"xyz.openbmc_project.MCTP.Endpoint"}};
-
- conn->async_method_call(
- [this](const boost::system::error_code& ec, const GetSubTreeType& ret) {
- queryEndpoints(ec, ret);
- },
- "xyz.openbmc_project.ObjectMapper",
- "/xyz/openbmc_project/object_mapper",
- "xyz.openbmc_project.ObjectMapper", "GetSubTree", searchPath, 0,
- ifaceList);
-}
-
-void processSensorConfigs(
- boost::asio::io_context& io, sdbusplus::asio::object_server& objectServer,
- boost::container::flat_map<std::string, std::shared_ptr<GpuTempSensor>>&
- sensors,
- std::shared_ptr<sdbusplus::asio::connection>& dbusConnection,
- mctp::MctpRequester& mctpRequester, const ManagedObjectType& resp)
-{
- for (const auto& [path, interfaces] : resp)
- {
- for (const auto& [intf, cfg] : interfaces)
- {
- if (intf != configInterfaceName(sensorType))
- {
- continue;
- }
-
- std::string name = loadVariant<std::string>(cfg, "Name");
-
- uint64_t pollRate = loadVariant<uint64_t>(cfg, "PollRate");
-
- sensors[name] = std::make_shared<GpuTempSensor>(
- dbusConnection, io, mctpRequester, name, path, objectServer,
- std::vector<thresholds::Threshold>{},
- std::chrono::milliseconds{pollRate});
-
- lg2::info(
- "Added GPU Temperature Sensor {NAME} with chassis path: {PATH}.",
- "NAME", name, "PATH", path);
- }
- }
-}
-
-void createSensors(
- boost::asio::io_context& io, sdbusplus::asio::object_server& objectServer,
- boost::container::flat_map<std::string, std::shared_ptr<GpuTempSensor>>&
- sensors,
- std::shared_ptr<sdbusplus::asio::connection>& dbusConnection,
- mctp::MctpRequester& mctpRequester)
-{
- if (!dbusConnection)
- {
- lg2::error("Connection not created");
- return;
- }
- dbusConnection->async_method_call(
- [&sensors, &mctpRequester, &dbusConnection, &io,
- &objectServer](const boost::system::error_code& ec,
- const ManagedObjectType& resp) {
- if (ec)
- {
- lg2::error("Error contacting entity manager");
- return;
- }
-
- processSensorConfigs(io, objectServer, sensors, dbusConnection,
- mctpRequester, resp);
- },
- entityManagerName, "/xyz/openbmc_project/inventory",
- "org.freedesktop.DBus.ObjectManager", "GetManagedObjects");
-}
-
-void interfaceRemoved(
- sdbusplus::message_t& message,
- boost::container::flat_map<std::string, std::shared_ptr<GpuTempSensor>>&
- sensors)
-{
- if (message.is_method_error())
- {
- lg2::error("interfacesRemoved callback method error");
- return;
- }
-
- sdbusplus::message::object_path removedPath;
- std::vector<std::string> interfaces;
-
- message.read(removedPath, interfaces);
-
- // If the xyz.openbmc_project.Confguration.X interface was removed
- // for one or more sensors, delete those sensor objects.
- auto sensorIt = sensors.begin();
- while (sensorIt != sensors.end())
- {
- if ((sensorIt->second->configurationPath == removedPath) &&
- (std::find(interfaces.begin(), interfaces.end(),
- configInterfaceName(sensorType)) != interfaces.end()))
- {
- sensorIt = sensors.erase(sensorIt);
- }
- else
- {
- sensorIt++;
- }
- }
-}
diff --git a/src/nvidia-gpu/NvidiaGpuSensor.hpp b/src/nvidia-gpu/NvidiaGpuSensor.hpp
index 158dc41..25fe069 100644
--- a/src/nvidia-gpu/NvidiaGpuSensor.hpp
+++ b/src/nvidia-gpu/NvidiaGpuSensor.hpp
@@ -8,102 +8,52 @@
#include "MctpRequester.hpp"
#include "Thresholds.hpp"
-#include "Utils.hpp"
#include "sensor.hpp"
#include <NvidiaGpuMctpVdm.hpp>
-#include <OcpMctpVdm.hpp>
-#include <boost/asio/io_context.hpp>
-#include <boost/asio/steady_timer.hpp>
-#include <boost/container/flat_map.hpp>
#include <sdbusplus/asio/connection.hpp>
#include <sdbusplus/asio/object_server.hpp>
-#include <sdbusplus/message.hpp>
#include <array>
-#include <chrono>
#include <cstdint>
#include <memory>
#include <string>
#include <vector>
-constexpr const char* sensorPathPrefix = "/xyz/openbmc_project/sensors/";
-constexpr const char* sensorType = "NvidiaMctpVdm";
-
-struct GpuTempSensor :
+struct NvidiaGpuTempSensor :
public Sensor,
- public std::enable_shared_from_this<GpuTempSensor>
+ public std::enable_shared_from_this<NvidiaGpuTempSensor>
{
public:
- GpuTempSensor(std::shared_ptr<sdbusplus::asio::connection>& conn,
- boost::asio::io_context& io,
- mctp::MctpRequester& mctpRequester, const std::string& name,
- const std::string& sensorConfiguration,
- sdbusplus::asio::object_server& objectServer,
- std::vector<thresholds::Threshold>&& thresholdData,
- std::chrono::milliseconds pollRate);
+ NvidiaGpuTempSensor(std::shared_ptr<sdbusplus::asio::connection>& conn,
+ mctp::MctpRequester& mctpRequester,
+ const std::string& name,
+ const std::string& sensorConfiguration, uint8_t eid,
+ sdbusplus::asio::object_server& objectServer,
+ std::vector<thresholds::Threshold>&& thresholdData);
- ~GpuTempSensor() override;
+ ~NvidiaGpuTempSensor() override;
void checkThresholds() override;
- private:
- void read();
-
void update();
- void discoverGpus();
-
+ private:
void processResponse(int sendRecvMsgResult);
- void processQueryDeviceIdResponse(uint8_t eid, int sendRecvMsgResult);
-
- void queryEndpoints(const boost::system::error_code& ec,
- const GetSubTreeType& ret);
-
- void processEndpoint(const boost::system::error_code& ec,
- const SensorBaseConfigMap& endpoint);
- void processGpuEndpoint(uint8_t eid);
-
uint8_t eid{};
uint8_t sensorId;
- std::chrono::milliseconds sensorPollMs;
-
- boost::asio::steady_timer waitTimer;
+ std::shared_ptr<sdbusplus::asio::connection> conn;
mctp::MctpRequester& mctpRequester;
- std::shared_ptr<sdbusplus::asio::connection> conn;
-
sdbusplus::asio::object_server& objectServer;
- std::array<uint8_t, sizeof(ocp::accelerator_management::Message) +
- sizeof(gpu::GetTemperatureReadingRequest)>
+ std::array<uint8_t, sizeof(gpu::GetTemperatureReadingRequest)>
getTemperatureReadingRequest{};
- std::array<uint8_t, sizeof(ocp::accelerator_management::Message) +
- sizeof(gpu::GetTemperatureReadingResponse)>
+ std::array<uint8_t, sizeof(gpu::GetTemperatureReadingResponse)>
getTemperatureReadingResponse{};
-
- std::array<uint8_t, sizeof(ocp::accelerator_management::Message) +
- sizeof(gpu::QueryDeviceIdentificationRequest)>
- queryDeviceIdentificationRequest{};
-
- std::array<uint8_t, sizeof(ocp::accelerator_management::Message) +
- sizeof(gpu::QueryDeviceIdentificationResponse)>
- queryDeviceIdentificationResponse{};
};
-
-void createSensors(
- boost::asio::io_context& io, sdbusplus::asio::object_server& objectServer,
- boost::container::flat_map<std::string, std::shared_ptr<GpuTempSensor>>&
- sensors,
- std::shared_ptr<sdbusplus::asio::connection>& dbusConnection,
- mctp::MctpRequester& mctpRequester);
-
-void interfaceRemoved(
- sdbusplus::message_t& message,
- boost::container::flat_map<std::string, std::shared_ptr<GpuTempSensor>>&
- sensors);
diff --git a/src/nvidia-gpu/NvidiaGpuSensorMain.cpp b/src/nvidia-gpu/NvidiaGpuSensorMain.cpp
index cf764d3..6ccbb05 100644
--- a/src/nvidia-gpu/NvidiaGpuSensorMain.cpp
+++ b/src/nvidia-gpu/NvidiaGpuSensorMain.cpp
@@ -5,15 +5,14 @@
*/
#include "MctpRequester.hpp"
-#include "NvidiaGpuSensor.hpp"
#include "Utils.hpp"
+#include <NvidiaDeviceDiscovery.hpp>
#include <boost/asio/error.hpp>
#include <boost/asio/io_context.hpp>
#include <boost/asio/post.hpp>
#include <boost/asio/steady_timer.hpp>
#include <boost/container/flat_map.hpp>
-#include <phosphor-logging/lg2.hpp>
#include <sdbusplus/asio/connection.hpp>
#include <sdbusplus/asio/object_server.hpp>
#include <sdbusplus/bus.hpp>
@@ -27,7 +26,7 @@
#include <string>
#include <vector>
-boost::container::flat_map<std::string, std::shared_ptr<GpuTempSensor>> sensors;
+boost::container::flat_map<std::string, std::shared_ptr<GpuDevice>> gpuDevices;
void configTimerExpiryCallback(
boost::asio::io_context& io, sdbusplus::asio::object_server& objectServer,
@@ -38,11 +37,7 @@
{
return; // we're being canceled
}
- createSensors(io, objectServer, sensors, dbusConnection, mctpRequester);
- if (sensors.empty())
- {
- lg2::info("Configuration not detected");
- }
+ createSensors(io, objectServer, gpuDevices, dbusConnection, mctpRequester);
}
int main()
@@ -56,7 +51,7 @@
mctp::MctpRequester mctpRequester(io);
boost::asio::post(io, [&]() {
- createSensors(io, objectServer, sensors, systemBus, mctpRequester);
+ createSensors(io, objectServer, gpuDevices, systemBus, mctpRequester);
});
boost::asio::steady_timer configTimer(io);
@@ -73,7 +68,7 @@
std::vector<std::unique_ptr<sdbusplus::bus::match_t>> matches =
setupPropertiesChangedMatches(
- *systemBus, std::to_array<const char*>({sensorType}), eventHandler);
+ *systemBus, std::to_array<const char*>({deviceType}), eventHandler);
// Watch for entity-manager to remove configuration interfaces
// so the corresponding sensors can be removed.
@@ -81,7 +76,7 @@
static_cast<sdbusplus::bus_t&>(*systemBus),
sdbusplus::bus::match::rules::interfacesRemovedAtPath(
std::string(inventoryPath)),
- [](sdbusplus::message_t& msg) { interfaceRemoved(msg, sensors); });
+ [](sdbusplus::message_t& msg) { interfaceRemoved(msg, gpuDevices); });
io.run();
return 0;
diff --git a/src/nvidia-gpu/meson.build b/src/nvidia-gpu/meson.build
index cf7c8c5..57bab75 100644
--- a/src/nvidia-gpu/meson.build
+++ b/src/nvidia-gpu/meson.build
@@ -1,5 +1,7 @@
gpusensor_sources = files(
'MctpRequester.cpp',
+ 'NvidiaDeviceDiscovery.cpp',
+ 'NvidiaGpuDevice.cpp',
'NvidiaGpuMctpVdm.cpp',
'NvidiaGpuSensor.cpp',
'NvidiaGpuSensorMain.cpp',