nvidia-gpu: add Power Sensor PeakReading Property
Add support for Sensor Properties PeakReading and PeakReadingTime.
Current Limitation -
The ResetMetrics action is currently not supported for Redfish URIs in
bmcweb. As a result, the ability to clear PeakReading values for GPU
Power Sensors has not been implemented.
Future Consideration -
If ResetMetrics action support is added to bmcweb in the future, the
corresponding functionality will also need to be implemented in the
dbus-sensor application to ensure full compatibility.
Tested: Build an image for gb200nvl-obmc machine with the following
patches cherry-picked. These patches are needed to enable the MCTP stack.
https://gerrit.openbmc.org/c/openbmc/openbmc/+/79422
```
root@gb200nvl-obmc:~# busctl introspect xyz.openbmc_project.GpuSensor /xyz/openbmc_project/sensors/power/NVIDIA_GB200_GPU_0_Power_0
NAME TYPE SIGNATURE RESULT/VALUE FLAGS
org.freedesktop.DBus.Introspectable interface - - -
.Introspect method - s -
org.freedesktop.DBus.Peer interface - - -
.GetMachineId method - s -
.Ping method - - -
org.freedesktop.DBus.Properties interface - - -
.Get method ss v -
.GetAll method s a{sv} -
.Set method ssv - -
.PropertiesChanged signal sa{sv}as - -
xyz.openbmc_project.Association.Definitions interface - - -
.Associations property a(sss) 1 "chassis" "all_sensors" "/xyz/openb... emits-change
xyz.openbmc_project.Sensor.Value interface - - -
.MaxValue property d 5000 emits-change
.MinValue property d 0 emits-change
.Unit property s "xyz.openbmc_project.Sensor.Value.Uni... emits-change
.Value property d 29.194 emits-change writable
xyz.openbmc_project.Sensor.ValueMutability interface - - -
.Mutable property b true emits-change
xyz.openbmc_project.State.Decorator.Availability interface - - -
.Available property b true emits-change writable
xyz.openbmc_project.State.Decorator.OperationalStatus interface - - -
.Functional property b true emits-change
xyz.openbmc_project.Telemetry.Report interface - - -
.Readings property (ta(ssdt)) 0 1 "PeakReading" "" 80.933 0 emits-change
```
Change-Id: I0a4f7eb0a5db688f32bf80954839140da9bb7e2a
Signed-off-by: Harshit Aghera <haghera@nvidia.com>
diff --git a/src/nvidia-gpu/NvidiaGpuDevice.cpp b/src/nvidia-gpu/NvidiaGpuDevice.cpp
index a13bcfe..9560220 100644
--- a/src/nvidia-gpu/NvidiaGpuDevice.cpp
+++ b/src/nvidia-gpu/NvidiaGpuDevice.cpp
@@ -17,6 +17,7 @@
#include <MctpRequester.hpp>
#include <NvidiaGpuEnergySensor.hpp>
#include <NvidiaGpuMctpVdm.hpp>
+#include <NvidiaGpuPowerPeakReading.hpp>
#include <NvidiaGpuPowerSensor.hpp>
#include <NvidiaGpuThresholds.hpp>
#include <NvidiaGpuVoltageSensor.hpp>
@@ -74,6 +75,10 @@
conn, mctpRequester, name + "_Power_0", path, eid, gpuPowerSensorId,
objectServer, std::vector<thresholds::Threshold>{});
+ peakPower = std::make_shared<NvidiaGpuPowerPeakReading>(
+ mctpRequester, name + "_Power_0", eid, gpuPeakPowerSensorId,
+ objectServer);
+
energySensor = std::make_shared<NvidiaGpuEnergySensor>(
conn, mctpRequester, name + "_Energy_0", path, eid, gpuEnergySensorId,
objectServer, std::vector<thresholds::Threshold>{});
@@ -120,6 +125,7 @@
}
dramTempSensor->update();
powerSensor->update();
+ peakPower->update();
energySensor->update();
voltageSensor->update();
diff --git a/src/nvidia-gpu/NvidiaGpuDevice.hpp b/src/nvidia-gpu/NvidiaGpuDevice.hpp
index 2937d1f..2f5ee76 100644
--- a/src/nvidia-gpu/NvidiaGpuDevice.hpp
+++ b/src/nvidia-gpu/NvidiaGpuDevice.hpp
@@ -13,6 +13,7 @@
#include "NvidiaGpuSensor.hpp"
#include <NvidiaGpuEnergySensor.hpp>
+#include <NvidiaGpuPowerPeakReading.hpp>
#include <NvidiaGpuVoltageSensor.hpp>
#include <boost/asio/io_context.hpp>
#include <boost/asio/steady_timer.hpp>
@@ -64,6 +65,7 @@
std::shared_ptr<NvidiaGpuTempSensor> tLimitSensor;
std::shared_ptr<NvidiaGpuTempSensor> dramTempSensor;
std::shared_ptr<NvidiaGpuPowerSensor> powerSensor;
+ std::shared_ptr<NvidiaGpuPowerPeakReading> peakPower;
std::shared_ptr<NvidiaGpuEnergySensor> energySensor;
std::shared_ptr<NvidiaGpuVoltageSensor> voltageSensor;
diff --git a/src/nvidia-gpu/NvidiaGpuMctpVdm.cpp b/src/nvidia-gpu/NvidiaGpuMctpVdm.cpp
index ebfa76b..51a6755 100644
--- a/src/nvidia-gpu/NvidiaGpuMctpVdm.cpp
+++ b/src/nvidia-gpu/NvidiaGpuMctpVdm.cpp
@@ -218,16 +218,16 @@
return 0;
}
-int encodeGetCurrentPowerDrawRequest(uint8_t instanceId, uint8_t sensorId,
- uint8_t averagingInterval,
- std::span<uint8_t> buf)
+int encodeGetPowerDrawRequest(PlatformEnvironmentalCommands commandCode,
+ uint8_t instanceId, uint8_t sensorId,
+ uint8_t averagingInterval, std::span<uint8_t> buf)
{
- if (buf.size() < sizeof(GetCurrentPowerDrawRequest))
+ if (buf.size() < sizeof(GetPowerDrawRequest))
{
return EINVAL;
}
- auto* msg = reinterpret_cast<GetCurrentPowerDrawRequest*>(buf.data());
+ auto* msg = reinterpret_cast<GetPowerDrawRequest*>(buf.data());
ocp::accelerator_management::BindingPciVidInfo header{};
header.ocp_accelerator_management_msg_type =
@@ -243,8 +243,7 @@
return rc;
}
- msg->hdr.command = static_cast<uint8_t>(
- PlatformEnvironmentalCommands::GET_CURRENT_POWER_DRAW);
+ msg->hdr.command = static_cast<uint8_t>(commandCode);
msg->hdr.data_size = sizeof(sensorId) + sizeof(averagingInterval);
msg->sensorId = sensorId;
msg->averagingInterval = averagingInterval;
@@ -252,10 +251,9 @@
return 0;
}
-int decodeGetCurrentPowerDrawResponse(
- std::span<const uint8_t> buf,
- ocp::accelerator_management::CompletionCode& cc, uint16_t& reasonCode,
- uint32_t& power)
+int decodeGetPowerDrawResponse(std::span<const uint8_t> buf,
+ ocp::accelerator_management::CompletionCode& cc,
+ uint16_t& reasonCode, uint32_t& power)
{
auto rc =
ocp::accelerator_management::decodeReasonCodeAndCC(buf, cc, reasonCode);
@@ -265,13 +263,13 @@
return rc;
}
- if (buf.size() < sizeof(GetCurrentPowerDrawResponse))
+ if (buf.size() < sizeof(GetPowerDrawResponse))
{
return EINVAL;
}
const auto* response =
- reinterpret_cast<const GetCurrentPowerDrawResponse*>(buf.data());
+ reinterpret_cast<const GetPowerDrawResponse*>(buf.data());
const uint16_t dataSize = le16toh(response->hdr.data_size);
@@ -330,7 +328,7 @@
return rc;
}
- if (buf.size() < sizeof(GetCurrentPowerDrawResponse))
+ if (buf.size() < sizeof(GetPowerDrawResponse))
{
return EINVAL;
}
diff --git a/src/nvidia-gpu/NvidiaGpuMctpVdm.hpp b/src/nvidia-gpu/NvidiaGpuMctpVdm.hpp
index 19b9929..1e8e986 100644
--- a/src/nvidia-gpu/NvidiaGpuMctpVdm.hpp
+++ b/src/nvidia-gpu/NvidiaGpuMctpVdm.hpp
@@ -40,6 +40,7 @@
GET_TEMPERATURE_READING = 0x00,
READ_THERMAL_PARAMETERS = 0x02,
GET_CURRENT_POWER_DRAW = 0x03,
+ GET_MAX_OBSERVED_POWER = 0x04,
GET_CURRENT_ENERGY_COUNTER = 0x06,
GET_INVENTORY_INFORMATION = 0x0C,
GET_VOLTAGE = 0x0F,
@@ -114,7 +115,7 @@
using ReadThermalParametersRequest = GetNumericSensorReadingRequest;
-struct GetCurrentPowerDrawRequest
+struct GetPowerDrawRequest
{
ocp::accelerator_management::CommonRequest hdr;
uint8_t sensorId;
@@ -137,7 +138,7 @@
int32_t threshold;
} __attribute__((packed));
-struct GetCurrentPowerDrawResponse
+struct GetPowerDrawResponse
{
ocp::accelerator_management::CommonResponse hdr;
uint32_t power;
@@ -194,14 +195,13 @@
ocp::accelerator_management::CompletionCode& cc, uint16_t& reasonCode,
int32_t& threshold);
-int encodeGetCurrentPowerDrawRequest(uint8_t instanceId, uint8_t sensorId,
- uint8_t averagingInterval,
- std::span<uint8_t> buf);
+int encodeGetPowerDrawRequest(
+ PlatformEnvironmentalCommands commandCode, uint8_t instanceId,
+ uint8_t sensorId, uint8_t averagingInterval, std::span<uint8_t> buf);
-int decodeGetCurrentPowerDrawResponse(
- std::span<const uint8_t> buf,
- ocp::accelerator_management::CompletionCode& cc, uint16_t& reasonCode,
- uint32_t& power);
+int decodeGetPowerDrawResponse(std::span<const uint8_t> buf,
+ ocp::accelerator_management::CompletionCode& cc,
+ uint16_t& reasonCode, uint32_t& power);
int encodeGetCurrentEnergyCounterRequest(uint8_t instanceId, uint8_t sensorId,
std::span<uint8_t> buf);
diff --git a/src/nvidia-gpu/NvidiaGpuPowerPeakReading.cpp b/src/nvidia-gpu/NvidiaGpuPowerPeakReading.cpp
new file mode 100644
index 0000000..06693e6
--- /dev/null
+++ b/src/nvidia-gpu/NvidiaGpuPowerPeakReading.cpp
@@ -0,0 +1,102 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION &
+ * AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include "NvidiaGpuPowerPeakReading.hpp"
+
+#include "MctpRequester.hpp"
+#include "Utils.hpp"
+
+#include <bits/basic_string.h>
+
+#include <NvidiaDeviceDiscovery.hpp>
+#include <NvidiaGpuMctpVdm.hpp>
+#include <OcpMctpVdm.hpp>
+#include <phosphor-logging/lg2.hpp>
+#include <sdbusplus/asio/object_server.hpp>
+
+#include <cstdint>
+#include <functional>
+#include <memory>
+#include <string>
+
+using namespace std::literals;
+
+NvidiaGpuPowerPeakReading::NvidiaGpuPowerPeakReading(
+    mctp::MctpRequester& mctpRequester, const std::string& name, uint8_t eid,
+    uint8_t sensorId, sdbusplus::asio::object_server& objectServer) :
+    eid(eid), sensorId{sensorId}, mctpRequester(mctpRequester),
+    objectServer(objectServer)
+{
+    // Seed the property value: leading field 0 plus a single "PeakReading"
+    // entry whose value stays 0.0 until processResponse() overwrites it.
+    auto& [leadField, entries] = readings;
+    leadField = 0;
+    entries.emplace_back("PeakReading", "", 0.0, 0);
+
+    // Expose the value on the power sensor path built from the escaped name.
+    const std::string objectPath =
+        sensorPathPrefix + "power/"s + escapeName(name);
+    telemetryReportInterface = objectServer.add_interface(
+        objectPath, "xyz.openbmc_project.Telemetry.Report");
+    telemetryReportInterface->register_property("Readings", readings);
+    telemetryReportInterface->initialize();
+}
+
+NvidiaGpuPowerPeakReading::~NvidiaGpuPowerPeakReading()
+{
+ objectServer.remove_interface(telemetryReportInterface); // undo add_interface()
+}
+
+void NvidiaGpuPowerPeakReading::processResponse(int sendRecvMsgResult)
+{
+    // Transport failure: log it and leave the published reading untouched.
+    if (sendRecvMsgResult != 0)
+    {
+        lg2::error(
+            "Error updating Peak Power Sensor for eid {EID} and sensor id {SID} : sending message over MCTP failed, rc={RC}",
+            "EID", eid, "SID", sensorId, "RC", sendRecvMsgResult);
+        return;
+    }
+
+    uint32_t milliwatts{0};
+    uint16_t reason{0};
+    ocp::accelerator_management::CompletionCode completion{};
+    const int decodeRc = gpu::decodeGetPowerDrawResponse(
+        response, completion, reason, milliwatts);
+    if (!(decodeRc == 0 &&
+          completion == ocp::accelerator_management::CompletionCode::SUCCESS))
+    {
+        lg2::error(
+            "Error updating Peak Power Sensor eid {EID} and sensor id {SID} : decode failed, rc={RC}, cc={CC}, reasonCode={RESC}",
+            "EID", eid, "SID", sensorId, "RC", decodeRc, "CC", completion,
+            "RESC", reason);
+        return;
+    }
+
+    // The device reports milliwatts; the value put on D-Bus is in watts.
+    auto& peakEntry = std::get<1>(readings).front();
+    std::get<2>(peakEntry) = milliwatts / 1000.0;
+
+    telemetryReportInterface->set_property("Readings", readings);
+}
+
+// Send a GET_MAX_OBSERVED_POWER request; processResponse() gets the result.
+void NvidiaGpuPowerPeakReading::update()
+{
+    const int rc = gpu::encodeGetPowerDrawRequest(
+        gpu::PlatformEnvironmentalCommands::GET_MAX_OBSERVED_POWER, 0, sensorId,
+        averagingInterval, request);
+    if (rc != 0)
+    {
+        lg2::error(
+            "Error updating Peak Power Sensor for eid {EID} and sensor id {SID} : encode failed, rc={RC}",
+            "EID", eid, "SID", sensorId, "RC", rc);
+        return; // never transmit a request that failed to encode
+    }
+    mctpRequester.sendRecvMsg(
+        eid, request, response,
+        [this](int sendRecvMsgResult) { processResponse(sendRecvMsgResult); });
+}
diff --git a/src/nvidia-gpu/NvidiaGpuPowerPeakReading.hpp b/src/nvidia-gpu/NvidiaGpuPowerPeakReading.hpp
new file mode 100644
index 0000000..02ba104
--- /dev/null
+++ b/src/nvidia-gpu/NvidiaGpuPowerPeakReading.hpp
@@ -0,0 +1,61 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION &
+ * AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#pragma once
+
+#include "MctpRequester.hpp"
+
+#include <NvidiaGpuMctpVdm.hpp>
+#include <sdbusplus/asio/object_server.hpp>
+
+#include <array>
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <tuple>
+#include <vector>
+
+constexpr uint8_t gpuPeakPowerSensorId{0}; // sensorId NvidiaGpuDevice passes in
+
+// GPU Power Sensor Averaging Interval in seconds, 0 implies default
+constexpr uint8_t gpuPowerAveragingIntervalInSec{0};
+
+// Queries a GPU's maximum observed power and publishes it as "PeakReading".
+struct NvidiaGpuPowerPeakReading
+{
+ public:
+ NvidiaGpuPowerPeakReading(mctp::MctpRequester& mctpRequester,
+ const std::string& name, uint8_t eid,
+ uint8_t sensorId,
+ sdbusplus::asio::object_server& objectServer);
+ ~NvidiaGpuPowerPeakReading();
+
+ // Sends one request; the result is passed to processResponse().
+ void update();
+
+ private:
+ // Decodes `response` and, on success, republishes the "Readings" property.
+ void processResponse(int sendRecvMsgResult);
+
+ uint8_t eid{};
+ uint8_t sensorId;
+ uint8_t averagingInterval{gpuPowerAveragingIntervalInSec};
+
+ // Value of the "Readings" property; processResponse() stores the reading
+ // converted to watts in field 2 of the first entry.
+ std::tuple<
+ uint64_t,
+ std::vector<std::tuple<std::string, std::string, double, uint64_t>>>
+ readings;
+
+ mctp::MctpRequester& mctpRequester;
+ sdbusplus::asio::object_server& objectServer;
+
+ // Request/response buffers handed to mctpRequester.sendRecvMsg().
+ std::array<uint8_t, sizeof(gpu::GetPowerDrawRequest)> request{};
+ std::array<uint8_t, sizeof(gpu::GetPowerDrawResponse)> response{};
+ std::shared_ptr<sdbusplus::asio::dbus_interface> telemetryReportInterface;
+};
diff --git a/src/nvidia-gpu/NvidiaGpuPowerSensor.cpp b/src/nvidia-gpu/NvidiaGpuPowerSensor.cpp
index 997ce3d..bac935d 100644
--- a/src/nvidia-gpu/NvidiaGpuPowerSensor.cpp
+++ b/src/nvidia-gpu/NvidiaGpuPowerSensor.cpp
@@ -97,7 +97,7 @@
uint32_t power = 0;
const int rc =
- gpu::decodeGetCurrentPowerDrawResponse(response, cc, reasonCode, power);
+ gpu::decodeGetPowerDrawResponse(response, cc, reasonCode, power);
if (rc != 0 || cc != ocp::accelerator_management::CompletionCode::SUCCESS)
{
@@ -115,8 +115,9 @@
void NvidiaGpuPowerSensor::update()
{
- const int rc = gpu::encodeGetCurrentPowerDrawRequest(
- 0, sensorId, averagingInterval, request);
+ const int rc = gpu::encodeGetPowerDrawRequest(
+ gpu::PlatformEnvironmentalCommands::GET_CURRENT_POWER_DRAW, 0, sensorId,
+ averagingInterval, request);
if (rc != 0)
{
diff --git a/src/nvidia-gpu/NvidiaGpuPowerSensor.hpp b/src/nvidia-gpu/NvidiaGpuPowerSensor.hpp
index 7933f75..c217200 100644
--- a/src/nvidia-gpu/NvidiaGpuPowerSensor.hpp
+++ b/src/nvidia-gpu/NvidiaGpuPowerSensor.hpp
@@ -53,7 +53,7 @@
sdbusplus::asio::object_server& objectServer;
- std::array<uint8_t, sizeof(gpu::GetCurrentPowerDrawRequest)> request{};
+ std::array<uint8_t, sizeof(gpu::GetPowerDrawRequest)> request{};
- std::array<uint8_t, sizeof(gpu::GetCurrentPowerDrawResponse)> response{};
+ std::array<uint8_t, sizeof(gpu::GetPowerDrawResponse)> response{};
};
diff --git a/src/nvidia-gpu/meson.build b/src/nvidia-gpu/meson.build
index dbd5b27..9d592f3 100644
--- a/src/nvidia-gpu/meson.build
+++ b/src/nvidia-gpu/meson.build
@@ -5,6 +5,7 @@
'NvidiaGpuDevice.cpp',
'NvidiaGpuEnergySensor.cpp',
'NvidiaGpuMctpVdm.cpp',
+ 'NvidiaGpuPowerPeakReading.cpp',
'NvidiaGpuPowerSensor.cpp',
'NvidiaGpuSensor.cpp',
'NvidiaGpuThresholds.cpp',
diff --git a/src/tests/test_NvidiaGpuSensorTest.cpp b/src/tests/test_NvidiaGpuSensorTest.cpp
index ce197f3..4ebe9d5 100644
--- a/src/tests/test_NvidiaGpuSensorTest.cpp
+++ b/src/tests/test_NvidiaGpuSensorTest.cpp
@@ -648,14 +648,16 @@
const uint8_t instanceId = 6;
const uint8_t sensorId = 2;
const uint8_t averagingInterval = 10;
- std::array<uint8_t, sizeof(gpu::GetCurrentPowerDrawRequest)> buf{};
+ gpu::PlatformEnvironmentalCommands commandCode =
+ gpu::PlatformEnvironmentalCommands::GET_CURRENT_POWER_DRAW;
+ std::array<uint8_t, sizeof(gpu::GetPowerDrawRequest)> buf{};
- int result = gpu::encodeGetCurrentPowerDrawRequest(instanceId, sensorId,
- averagingInterval, buf);
+ int result = gpu::encodeGetPowerDrawRequest(
+ commandCode, instanceId, sensorId, averagingInterval, buf);
EXPECT_EQ(result, 0);
- gpu::GetCurrentPowerDrawRequest request{};
+ gpu::GetPowerDrawRequest request{};
std::memcpy(&request, buf.data(), sizeof(request));
EXPECT_EQ(request.hdr.msgHdr.hdr.pci_vendor_id,
@@ -670,9 +672,7 @@
static_cast<uint8_t>(gpu::MessageType::PLATFORM_ENVIRONMENTAL));
// Verify request data
- EXPECT_EQ(request.hdr.command,
- static_cast<uint8_t>(
- gpu::PlatformEnvironmentalCommands::GET_CURRENT_POWER_DRAW));
+ EXPECT_EQ(request.hdr.command, static_cast<uint8_t>(commandCode));
EXPECT_EQ(request.hdr.data_size,
sizeof(sensorId) + sizeof(averagingInterval));
EXPECT_EQ(request.sensorId, sensorId);
@@ -683,9 +683,9 @@
TEST_F(GpuMctpVdmTests, DecodeGetCurrentPowerDrawResponseSuccess)
{
// Create a mock successful response
- std::array<uint8_t, sizeof(gpu::GetCurrentPowerDrawResponse)> buf{};
+ std::array<uint8_t, sizeof(gpu::GetPowerDrawResponse)> buf{};
- gpu::GetCurrentPowerDrawResponse response{};
+ gpu::GetPowerDrawResponse response{};
ocp::accelerator_management::BindingPciVidInfo headerInfo{};
headerInfo.ocp_accelerator_management_msg_type = static_cast<uint8_t>(
ocp::accelerator_management::MessageType::RESPONSE);
@@ -713,8 +713,7 @@
uint16_t reasonCode{};
uint32_t power{};
- int result =
- gpu::decodeGetCurrentPowerDrawResponse(buf, cc, reasonCode, power);
+ int result = gpu::decodeGetPowerDrawResponse(buf, cc, reasonCode, power);
EXPECT_EQ(result, 0);
EXPECT_EQ(cc, ocp::accelerator_management::CompletionCode::SUCCESS);
@@ -752,8 +751,7 @@
uint16_t reasonCode{};
uint32_t power{};
- int result =
- gpu::decodeGetCurrentPowerDrawResponse(buf, cc, reasonCode, power);
+ int result = gpu::decodeGetPowerDrawResponse(buf, cc, reasonCode, power);
EXPECT_EQ(result, 0);
EXPECT_EQ(cc, ocp::accelerator_management::CompletionCode::ERR_NOT_READY);
@@ -763,9 +761,9 @@
TEST_F(GpuMctpVdmTests, DecodeGetCurrentPowerDrawResponseInvalidSize)
{
// Create a mock response with invalid data_size
- std::array<uint8_t, sizeof(gpu::GetCurrentPowerDrawResponse)> buf{};
+ std::array<uint8_t, sizeof(gpu::GetPowerDrawResponse)> buf{};
- gpu::GetCurrentPowerDrawResponse response{};
+ gpu::GetPowerDrawResponse response{};
ocp::accelerator_management::BindingPciVidInfo headerInfo{};
headerInfo.ocp_accelerator_management_msg_type = static_cast<uint8_t>(
ocp::accelerator_management::MessageType::RESPONSE);
@@ -790,8 +788,7 @@
uint16_t reasonCode{};
uint32_t power{};
- int result =
- gpu::decodeGetCurrentPowerDrawResponse(buf, cc, reasonCode, power);
+ int result = gpu::decodeGetPowerDrawResponse(buf, cc, reasonCode, power);
EXPECT_EQ(result, EINVAL); // Should indicate error for invalid data size
}