nvidia-gpu: add support for communication to the endpoint
This commit uses the MCTP VDM protocol to read the temperature sensor
value from the GPU.
The MCTP VDM protocol is an extension of the OCP Accelerator Management
Interface specification. [1]
Tested: Build an image for the gb200nvl-obmc machine with the following
patches cherry-picked. These patches are needed to enable the MCTP
stack.
https://gerrit.openbmc.org/c/openbmc/openbmc/+/79422
Start the nvidiagpusensor service.
```
root@gb200nvl-obmc:~# systemctl start xyz.openbmc_project.nvidiagpusensor.service
```
The app detects the entity-manager configuration on the gb200nvl-obmc
machine. The app is also able to detect all the endpoints from the MCTP
service D-Bus tree. The app reads the temperature sensor value from the
GPU correctly, and the temperature sensor is also present on Redfish.
```
$ curl -k -u 'root:0penBmc' https://10.137.203.137/redfish/v1/Chassis/NVIDIA_GB200_1/Sensors/temperature_NVIDIA_GB200_GPU
{
"@odata.id": "/redfish/v1/Chassis/NVIDIA_GB200_1/Sensors/temperature_NVIDIA_GB200_GPU",
"@odata.type": "#Sensor.v1_2_0.Sensor",
"Id": "temperature_NVIDIA_GB200_GPU",
"Name": "NVIDIA GB200 GPU",
"Reading": 36.4375,
"ReadingRangeMax": 127.0,
"ReadingRangeMin": -128.0,
"ReadingType": "Temperature",
"ReadingUnits": "Cel",
"Status": {
"Health": "OK",
"State": "Enabled"
}
}%
root@gb200nvl-obmc:~# busctl tree xyz.openbmc_project.GpuSensor
└─ /xyz
└─ /xyz/openbmc_project
└─ /xyz/openbmc_project/sensors
└─ /xyz/openbmc_project/sensors/temperature
└─ /xyz/openbmc_project/sensors/temperature/NVIDIA_GB200_GPU
root@gb200nvl-obmc:~# busctl introspect xyz.openbmc_project.GpuSensor /xyz/openbmc_project/sensors/temperature/NVIDIA_GB200_GPU
NAME TYPE SIGNATURE RESULT/VALUE FLAGS
org.freedesktop.DBus.Introspectable interface - - -
.Introspect method - s -
org.freedesktop.DBus.Peer interface - - -
.GetMachineId method - s -
.Ping method - - -
org.freedesktop.DBus.Properties interface - - -
.Get method ss v -
.GetAll method s a{sv} -
.Set method ssv - -
.PropertiesChanged signal sa{sv}as - -
xyz.openbmc_project.Association.Definitions interface - - -
.Associations property a(sss) 1 "chassis" "all_sensors" "/xyz/openbmc… emits-change
xyz.openbmc_project.Sensor.Value interface - - -
.MaxValue property d 127 emits-change
.MinValue property d -128 emits-change
.Unit property s "xyz.openbmc_project.Sensor.Value.Unit.… emits-change
.Value property d 36.3125 emits-change writable
xyz.openbmc_project.Sensor.ValueMutability interface - - -
.Mutable property b true emits-change
xyz.openbmc_project.State.Decorator.Availability interface - - -
.Available property b true emits-change writable
xyz.openbmc_project.State.Decorator.OperationalStatus interface - - -
.Functional property b true emits-change
```
[1] https://www.opencompute.org/documents/ocp-gpu-accelerator-management-interfaces-v1-pdf
Change-Id: Ied938b9e5c19751ee283b4b948e16c905c78fb48
Signed-off-by: Harshit Aghera <haghera@nvidia.com>
diff --git a/src/nvidia-gpu/NvidiaGpuSensor.hpp b/src/nvidia-gpu/NvidiaGpuSensor.hpp
index 14627fc..158dc41 100644
--- a/src/nvidia-gpu/NvidiaGpuSensor.hpp
+++ b/src/nvidia-gpu/NvidiaGpuSensor.hpp
@@ -1,14 +1,18 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION &
- * AFFILIATES. All rights reserved. SPDX-License-Identifier: Apache-2.0
+ * AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
*/
#pragma once
+#include "MctpRequester.hpp"
#include "Thresholds.hpp"
#include "Utils.hpp"
#include "sensor.hpp"
+#include <NvidiaGpuMctpVdm.hpp>
+#include <OcpMctpVdm.hpp>
#include <boost/asio/io_context.hpp>
#include <boost/asio/steady_timer.hpp>
#include <boost/container/flat_map.hpp>
@@ -16,6 +20,9 @@
#include <sdbusplus/asio/object_server.hpp>
#include <sdbusplus/message.hpp>
+#include <array>
+#include <chrono>
+#include <cstdint>
#include <memory>
#include <string>
#include <vector>
@@ -28,95 +35,74 @@
public std::enable_shared_from_this<GpuTempSensor>
{
public:
- /**
- * @brief Constructor for GpuTempSensor
- * @param conn D-Bus connection
- * @param io Boost ASIO I/O context for asynchronous operations
- * @param mctpRequester MCTP protocol requester for GPU communication
- * @param name Name of the sensor
- * @param sensorConfiguration Configuration string for the sensor
- * @param objectServer D-Bus object server
- * @param thresholdData Vector of threshold configurations
- * @param pollRate How often to poll for new readings
- * @param deviceInfo Information about the GPU device
- * @param verbose Whether to enable verbose logging
- */
GpuTempSensor(std::shared_ptr<sdbusplus::asio::connection>& conn,
- boost::asio::io_context& io, const std::string& name,
+ boost::asio::io_context& io,
+ mctp::MctpRequester& mctpRequester, const std::string& name,
const std::string& sensorConfiguration,
sdbusplus::asio::object_server& objectServer,
- std::vector<thresholds::Threshold>&& thresholdData);
+ std::vector<thresholds::Threshold>&& thresholdData,
+ std::chrono::milliseconds pollRate);
- /**
- * @brief Destructor
- */
~GpuTempSensor() override;
- /**
- * @brief Check if any thresholds have been crossed
- * @details Overrides the base class method to implement GPU-specific
- * threshold checking
- */
void checkThresholds() override;
private:
- /**
- * @brief Discover available GPUs on the system
- */
+ void read();
+
+ void update();
+
void discoverGpus();
- /**
- * @brief Process MCTP endpoints discovered on the system
- *
- * @param[in] ec Error code from the D-Bus method call
- * @param[in] ret Object tree results containing MCTP endpoint information
- */
+ void processResponse(int sendRecvMsgResult);
+
+ void processQueryDeviceIdResponse(uint8_t eid, int sendRecvMsgResult);
+
void queryEndpoints(const boost::system::error_code& ec,
const GetSubTreeType& ret);
- /**
- * @brief Process configuration properties for MCTP endpoints
- *
- * @param[in] ec Error code from the D-Bus properties method call
- * @param[in] configs Map of configuration properties for the endpoint
- */
void processEndpoint(const boost::system::error_code& ec,
const SensorBaseConfigMap& endpoint);
+ void processGpuEndpoint(uint8_t eid);
- /**
- * @brief Timer for scheduling sensor reads
- */
+ uint8_t eid{};
+
+ uint8_t sensorId;
+
+ std::chrono::milliseconds sensorPollMs;
+
boost::asio::steady_timer waitTimer;
- /**
- * @brief D-Bus connection
- */
+ mctp::MctpRequester& mctpRequester;
+
std::shared_ptr<sdbusplus::asio::connection> conn;
- /**
- * @brief D-Bus object server
- */
sdbusplus::asio::object_server& objectServer;
+
+ std::array<uint8_t, sizeof(ocp::accelerator_management::Message) +
+ sizeof(gpu::GetTemperatureReadingRequest)>
+ getTemperatureReadingRequest{};
+
+ std::array<uint8_t, sizeof(ocp::accelerator_management::Message) +
+ sizeof(gpu::GetTemperatureReadingResponse)>
+ getTemperatureReadingResponse{};
+
+ std::array<uint8_t, sizeof(ocp::accelerator_management::Message) +
+ sizeof(gpu::QueryDeviceIdentificationRequest)>
+ queryDeviceIdentificationRequest{};
+
+ std::array<uint8_t, sizeof(ocp::accelerator_management::Message) +
+ sizeof(gpu::QueryDeviceIdentificationResponse)>
+ queryDeviceIdentificationResponse{};
};
-/**
- * @brief Create GPU temperature sensors
- * @param io Boost ASIO I/O context
- * @param objectServer D-Bus object server
- * @param sensors Map to store created sensors
- * @param dbusConnection D-Bus connection
- */
void createSensors(
boost::asio::io_context& io, sdbusplus::asio::object_server& objectServer,
boost::container::flat_map<std::string, std::shared_ptr<GpuTempSensor>>&
sensors,
- std::shared_ptr<sdbusplus::asio::connection>& dbusConnection);
+ std::shared_ptr<sdbusplus::asio::connection>& dbusConnection,
+ mctp::MctpRequester& mctpRequester);
-/**
- * @brief Handle D-Bus interface removal events
- * @param message D-Bus message containing interface removal information
- * @param sensors Map of GPU temperature sensors to check for removal
- */
void interfaceRemoved(
sdbusplus::message_t& message,
boost::container::flat_map<std::string, std::shared_ptr<GpuTempSensor>>&