nvidia-gpu: add support for communication to the endpoint
This commit uses the MCTP VDM protocol to read the temperature sensor
value from the GPU.
The MCTP VDM protocol is an extension of the OCP Accelerator Management
Interface specification. [1]
Tested: Built an image for the gb200nvl-obmc machine with the following
patches cherry-picked. These patches are needed to enable the MCTP stack.
https://gerrit.openbmc.org/c/openbmc/openbmc/+/79422
Restart the nvidiagpusensor service.
```
root@gb200nvl-obmc:~# systemctl start xyz.openbmc_project.nvidiagpusensor.service
```
The app detects the entity-manager configuration on the gb200nvl-obmc
machine and discovers all the endpoints from the MCTP service D-Bus
tree. It reads the temperature sensor value from the GPU correctly, and
the temperature sensor is also present on Redfish.
```
$ curl -k -u 'root:0penBmc' https://10.137.203.137/redfish/v1/Chassis/NVIDIA_GB200_1/Sensors/temperature_NVIDIA_GB200_GPU
{
"@odata.id": "/redfish/v1/Chassis/NVIDIA_GB200_1/Sensors/temperature_NVIDIA_GB200_GPU",
"@odata.type": "#Sensor.v1_2_0.Sensor",
"Id": "temperature_NVIDIA_GB200_GPU",
"Name": "NVIDIA GB200 GPU",
"Reading": 36.4375,
"ReadingRangeMax": 127.0,
"ReadingRangeMin": -128.0,
"ReadingType": "Temperature",
"ReadingUnits": "Cel",
"Status": {
"Health": "OK",
"State": "Enabled"
}
}%
root@gb200nvl-obmc:~# busctl tree xyz.openbmc_project.GpuSensor
└─ /xyz
└─ /xyz/openbmc_project
└─ /xyz/openbmc_project/sensors
└─ /xyz/openbmc_project/sensors/temperature
└─ /xyz/openbmc_project/sensors/temperature/NVIDIA_GB200_GPU
root@gb200nvl-obmc:~# busctl introspect xyz.openbmc_project.GpuSensor /xyz/openbmc_project/sensors/temperature/NVIDIA_GB200_GPU
NAME TYPE SIGNATURE RESULT/VALUE FLAGS
org.freedesktop.DBus.Introspectable interface - - -
.Introspect method - s -
org.freedesktop.DBus.Peer interface - - -
.GetMachineId method - s -
.Ping method - - -
org.freedesktop.DBus.Properties interface - - -
.Get method ss v -
.GetAll method s a{sv} -
.Set method ssv - -
.PropertiesChanged signal sa{sv}as - -
xyz.openbmc_project.Association.Definitions interface - - -
.Associations property a(sss) 1 "chassis" "all_sensors" "/xyz/openbmc… emits-change
xyz.openbmc_project.Sensor.Value interface - - -
.MaxValue property d 127 emits-change
.MinValue property d -128 emits-change
.Unit property s "xyz.openbmc_project.Sensor.Value.Unit.… emits-change
.Value property d 36.3125 emits-change writable
xyz.openbmc_project.Sensor.ValueMutability interface - - -
.Mutable property b true emits-change
xyz.openbmc_project.State.Decorator.Availability interface - - -
.Available property b true emits-change writable
xyz.openbmc_project.State.Decorator.OperationalStatus interface - - -
.Functional property b true emits-change
```
[1] https://www.opencompute.org/documents/ocp-gpu-accelerator-management-interfaces-v1-pdf
Change-Id: Ied938b9e5c19751ee283b4b948e16c905c78fb48
Signed-off-by: Harshit Aghera <haghera@nvidia.com>
diff --git a/src/nvidia-gpu/NvidiaGpuSensor.cpp b/src/nvidia-gpu/NvidiaGpuSensor.cpp
index 3594c29..86b356b 100644
--- a/src/nvidia-gpu/NvidiaGpuSensor.cpp
+++ b/src/nvidia-gpu/NvidiaGpuSensor.cpp
@@ -1,16 +1,21 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION &
- * AFFILIATES. All rights reserved. SPDX-License-Identifier: Apache-2.0
+ * AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
*/
#include "NvidiaGpuSensor.hpp"
+#include "SensorPaths.hpp"
#include "Thresholds.hpp"
#include "Utils.hpp"
#include "sensor.hpp"
#include <bits/basic_string.h>
+#include <MctpRequester.hpp>
+#include <NvidiaGpuMctpVdm.hpp>
+#include <OcpMctpVdm.hpp>
#include <boost/asio/io_context.hpp>
#include <boost/container/flat_map.hpp>
#include <phosphor-logging/lg2.hpp>
@@ -23,6 +28,7 @@
#include <chrono>
#include <cstddef>
#include <cstdint>
+#include <functional>
#include <memory>
#include <string>
#include <utility>
@@ -31,20 +37,23 @@
using namespace std::literals;
+constexpr uint8_t gpuTempSensorId{0};
static constexpr double gpuTempSensorMaxReading = 127;
static constexpr double gpuTempSensorMinReading = -128;
GpuTempSensor::GpuTempSensor(
std::shared_ptr<sdbusplus::asio::connection>& conn,
- boost::asio::io_context& io, const std::string& name,
- const std::string& sensorConfiguration,
+ boost::asio::io_context& io, mctp::MctpRequester& mctpRequester,
+ const std::string& name, const std::string& sensorConfiguration,
sdbusplus::asio::object_server& objectServer,
- std::vector<thresholds::Threshold>&& thresholdData) :
+ std::vector<thresholds::Threshold>&& thresholdData,
+ std::chrono::milliseconds pollRate) :
Sensor(escapeName(name), std::move(thresholdData), sensorConfiguration,
"temperature", false, true, gpuTempSensorMaxReading,
gpuTempSensorMinReading, conn),
- waitTimer(io, std::chrono::steady_clock::duration(0)), conn(conn),
- objectServer(objectServer)
+ sensorId{gpuTempSensorId}, sensorPollMs(pollRate),
+ waitTimer(io, std::chrono::steady_clock::duration(0)),
+ mctpRequester(mctpRequester), conn(conn), objectServer(objectServer)
{
std::string dbusPath =
sensorPathPrefix + "temperature/"s + escapeName(name);
@@ -115,6 +124,129 @@
}
}
+// Issue one temperature request via update(), then re-arm waitTimer so that
+// read() runs again once sensorPollMs has elapsed.
+void GpuTempSensor::read()
+{
+    update();
+
+    waitTimer.expires_after(std::chrono::milliseconds(sensorPollMs));
+
+    // The timer handler holds only a weak reference, so a pending wait does
+    // not keep the sensor alive.
+    std::weak_ptr<GpuTempSensor> weakSelf{shared_from_this()};
+    waitTimer.async_wait([weakSelf](const boost::system::error_code& ec) {
+        // Any error reported by the timer ends the polling chain.
+        if (ec)
+        {
+            return;
+        }
+
+        auto self = weakSelf.lock();
+        if (!self)
+        {
+            // The sensor was destroyed while the wait was pending.
+            return;
+        }
+
+        self->read();
+    });
+}
+
+// Completion handler for the request sent by update(): decodes
+// getTemperatureReadingResponse and, on success, publishes the reading
+// through updateValue().
+//
+// sendRecvMsgResult is the result reported by the MCTP requester; any
+// non-zero value is treated as a transport failure.
+void GpuTempSensor::processResponse(int sendRecvMsgResult)
+{
+    // Nothing to decode if the exchange itself failed.
+    if (sendRecvMsgResult != 0)
+    {
+        lg2::error(
+            "Error updating Temperature Sensor: sending message over MCTP failed, rc={RC}",
+            "RC", sendRecvMsgResult);
+        return;
+    }
+
+    using ocp::accelerator_management::CompletionCode;
+
+    CompletionCode cc{};
+    uint16_t reasonCode{0};
+    double tempValue{0.0};
+
+    auto rc = gpu::decodeGetTemperatureReadingResponse(
+        getTemperatureReadingResponse, cc, reasonCode, tempValue);
+
+    // Both the decoder return code and the completion code must report
+    // success before the value is trusted.
+    const bool decoded = (rc == 0) && (cc == CompletionCode::SUCCESS);
+    if (!decoded)
+    {
+        lg2::error(
+            "Error updating Temperature Sensor: decode failed, rc={RC}, cc={CC}, reasonCode={RESC}",
+            "RC", rc, "CC", cc, "RESC", reasonCode);
+        return;
+    }
+
+    updateValue(tempValue);
+}
+
+// Encode a temperature-reading request for sensorId into
+// getTemperatureReadingRequest and send it to the endpoint `eid`. The reply
+// is delivered asynchronously to processResponse().
+void GpuTempSensor::update()
+{
+    auto rc = gpu::encodeGetTemperatureReadingRequest(
+        0, sensorId, getTemperatureReadingRequest);
+    if (rc != 0)
+    {
+        lg2::error("Error updating Temperature Sensor: encode failed, rc={RC}",
+                   "RC", rc);
+        return;
+    }
+
+    // Capture a weak reference instead of a raw `this`: the completion
+    // callback runs later, and the sensor may have been destroyed by then.
+    // This mirrors how read() guards its timer callback.
+    // NOTE(review): the request/response buffers are members handed to
+    // sendRecvMsg(); whether the requester can still touch them after the
+    // sensor is gone is not visible here - confirm against MctpRequester.
+    mctpRequester.sendRecvMsg(
+        eid, getTemperatureReadingRequest, getTemperatureReadingResponse,
+        [weakSelf = weak_from_this()](int sendRecvMsgResult) {
+            if (auto self = weakSelf.lock())
+            {
+                self->processResponse(sendRecvMsgResult);
+            }
+        });
+}
+
+// Completion handler for the device-identification query sent by
+// processGpuEndpoint(). When the endpoint identifies itself as a GPU, its
+// EID is stored, the D-Bus properties are initialised and polling starts.
+//
+// eid               - endpoint ID the query was sent to.
+// sendRecvMsgResult - result reported by the MCTP requester; non-zero is
+//                     treated as a transport failure.
+void GpuTempSensor::processQueryDeviceIdResponse(uint8_t eid,
+                                                 int sendRecvMsgResult)
+{
+    // Nothing to decode if the exchange itself failed.
+    if (sendRecvMsgResult != 0)
+    {
+        lg2::error(
+            "Error processing GPU endpoint: sending message over MCTP failed, rc={RC}",
+            "RC", sendRecvMsgResult);
+        return;
+    }
+
+    using ocp::accelerator_management::CompletionCode;
+
+    CompletionCode cc{};
+    uint16_t reasonCode{0};
+    uint8_t responseDeviceType{0};
+    uint8_t responseInstanceId{0};
+
+    auto rc = gpu::decodeQueryDeviceIdentificationResponse(
+        queryDeviceIdentificationResponse, cc, reasonCode, responseDeviceType,
+        responseInstanceId);
+
+    const bool decoded = (rc == 0) && (cc == CompletionCode::SUCCESS);
+    if (!decoded)
+    {
+        lg2::error(
+            "Error processing GPU endpoint: decode failed, rc={RC}, cc={CC}, reasonCode={RESC}",
+            "RC", rc, "CC", cc, "RESC", reasonCode);
+        return;
+    }
+
+    // Endpoints that are not GPUs are ignored without logging.
+    const bool isGpu =
+        responseDeviceType ==
+        static_cast<uint8_t>(gpu::DeviceIdentification::DEVICE_GPU);
+    if (!isGpu)
+    {
+        return;
+    }
+
+    lg2::info(
+        "Found the GPU with EID {EID}, DeviceType {DEVTYPE}, InstanceId {IID}.",
+        "EID", eid, "DEVTYPE", responseDeviceType, "IID",
+        responseInstanceId);
+
+    // The parameter shadows the member of the same name, hence `this->`.
+    this->eid = eid;
+    setInitialProperties(sensor_paths::unitDegreesC);
+    read();
+}
+
+// Send a device-identification query to the MCTP endpoint `eid`. The reply
+// is delivered asynchronously to processQueryDeviceIdResponse().
+void GpuTempSensor::processGpuEndpoint(uint8_t eid)
+{
+    auto rc = gpu::encodeQueryDeviceIdentificationRequest(
+        0, queryDeviceIdentificationRequest);
+    if (rc != 0)
+    {
+        lg2::error("Error processing GPU endpoint: encode failed, rc={RC}",
+                   "RC", rc);
+        return;
+    }
+
+    // Capture a weak reference instead of a raw `this`: the completion
+    // callback runs later, and the sensor may have been destroyed by then.
+    // NOTE(review): this relies on the sensor already being owned by a
+    // std::shared_ptr when this function runs (read() has the same
+    // requirement through shared_from_this()) - confirm for every caller.
+    mctpRequester.sendRecvMsg(
+        eid, queryDeviceIdentificationRequest,
+        queryDeviceIdentificationResponse,
+        [weakSelf = weak_from_this(), eid](int sendRecvMsgResult) {
+            if (auto self = weakSelf.lock())
+            {
+                self->processQueryDeviceIdResponse(eid, sendRecvMsgResult);
+            }
+        });
+}
+
void GpuTempSensor::processEndpoint(const boost::system::error_code& ec,
const SensorBaseConfigMap& endpoint)
{
@@ -125,7 +257,7 @@
return;
}
- [[maybe_unused]] uint8_t eid{};
+ uint8_t eid{};
std::vector<uint8_t> mctpTypes{};
auto hasEid = endpoint.find("EID");
@@ -173,9 +305,14 @@
return;
}
- // if the OCP MCTP VDM Message type (0x7E) is found in mctpTypes
- // process the endpoint further.
- (void)this;
+ if (std::find(mctpTypes.begin(), mctpTypes.end(),
+ ocp::accelerator_management::messageType) != mctpTypes.end())
+ {
+ lg2::info(
+ "GpuTempSensor::discoverGpus(): Found OCP MCTP VDM Endpoint with ID {EID}",
+ "EID", eid);
+ this->processGpuEndpoint(eid);
+ }
}
void GpuTempSensor::discoverGpus()
@@ -198,7 +335,7 @@
boost::container::flat_map<std::string, std::shared_ptr<GpuTempSensor>>&
sensors,
std::shared_ptr<sdbusplus::asio::connection>& dbusConnection,
- const ManagedObjectType& resp)
+ mctp::MctpRequester& mctpRequester, const ManagedObjectType& resp)
{
for (const auto& [path, interfaces] : resp)
{
@@ -211,9 +348,12 @@
std::string name = loadVariant<std::string>(cfg, "Name");
+ uint64_t pollRate = loadVariant<uint64_t>(cfg, "PollRate");
+
sensors[name] = std::make_shared<GpuTempSensor>(
- dbusConnection, io, name, path, objectServer,
- std::vector<thresholds::Threshold>{});
+ dbusConnection, io, mctpRequester, name, path, objectServer,
+ std::vector<thresholds::Threshold>{},
+ std::chrono::milliseconds{pollRate});
lg2::info(
"Added GPU Temperature Sensor {NAME} with chassis path: {PATH}.",
@@ -226,7 +366,8 @@
boost::asio::io_context& io, sdbusplus::asio::object_server& objectServer,
boost::container::flat_map<std::string, std::shared_ptr<GpuTempSensor>>&
sensors,
- std::shared_ptr<sdbusplus::asio::connection>& dbusConnection)
+ std::shared_ptr<sdbusplus::asio::connection>& dbusConnection,
+ mctp::MctpRequester& mctpRequester)
{
if (!dbusConnection)
{
@@ -234,7 +375,7 @@
return;
}
dbusConnection->async_method_call(
- [&sensors, &dbusConnection, &io,
+ [&sensors, &mctpRequester, &dbusConnection, &io,
&objectServer](const boost::system::error_code& ec,
const ManagedObjectType& resp) {
if (ec)
@@ -244,7 +385,7 @@
}
processSensorConfigs(io, objectServer, sensors, dbusConnection,
- resp);
+ mctpRequester, resp);
},
entityManagerName, "/xyz/openbmc_project/inventory",
"org.freedesktop.DBus.ObjectManager", "GetManagedObjects");