blob: 9030ff44ce164c0d290e0479b6e0b7e17c81cdfa [file] [log] [blame]
/*
 * SPDX-FileCopyrightText: Copyright OpenBMC Authors
 * SPDX-License-Identifier: Apache-2.0
 */
5
6#include "NvidiaGpuPowerSensor.hpp"
7
8#include "MctpRequester.hpp"
9#include "SensorPaths.hpp"
10#include "Thresholds.hpp"
11#include "Utils.hpp"
12#include "sensor.hpp"
13
14#include <bits/basic_string.h>
15
16#include <NvidiaDeviceDiscovery.hpp>
17#include <NvidiaGpuMctpVdm.hpp>
18#include <OcpMctpVdm.hpp>
19#include <phosphor-logging/lg2.hpp>
20#include <sdbusplus/asio/connection.hpp>
21#include <sdbusplus/asio/object_server.hpp>
22
23#include <cstddef>
24#include <cstdint>
25#include <functional>
26#include <limits>
27#include <memory>
Marc Olberdingd0125c92025-10-08 14:37:19 -070028#include <span>
Harshit Aghera902c6492025-05-08 15:57:42 +053029#include <string>
Marc Olberdingd0125c92025-10-08 14:37:19 -070030#include <system_error>
Harshit Aghera902c6492025-05-08 15:57:42 +053031#include <utility>
32#include <vector>
33
34using namespace std::literals;
35
// Reading range handed to the Sensor base class by the constructor below.
// NOTE(review): presumably expressed in watts, matching the unit the sensor
// publishes on D-Bus - confirm against the Sensor base class.
static constexpr double gpuPowerSensorMinReading =
    static_cast<double>(std::numeric_limits<uint32_t>::min());
static constexpr double gpuPowerSensorMaxReading = 5000.0;
39
40NvidiaGpuPowerSensor::NvidiaGpuPowerSensor(
41 std::shared_ptr<sdbusplus::asio::connection>& conn,
42 mctp::MctpRequester& mctpRequester, const std::string& name,
43 const std::string& sensorConfiguration, uint8_t eid, uint8_t sensorId,
44 sdbusplus::asio::object_server& objectServer,
45 std::vector<thresholds::Threshold>&& thresholdData) :
46 Sensor(escapeName(name), std::move(thresholdData), sensorConfiguration,
47 "power", false, true, gpuPowerSensorMaxReading,
48 gpuPowerSensorMinReading, conn),
49 eid(eid), sensorId{sensorId},
Ed Tanousaba6fca2025-09-29 13:53:20 -070050
Harshit Aghera902c6492025-05-08 15:57:42 +053051 mctpRequester(mctpRequester), objectServer(objectServer)
52
53{
54 std::string dbusPath = sensorPathPrefix + "power/"s + escapeName(name);
55
56 sensorInterface = objectServer.add_interface(
57 dbusPath, "xyz.openbmc_project.Sensor.Value");
58
59 for (const auto& threshold : thresholds)
60 {
61 std::string interface = thresholds::getInterface(threshold.level);
62 thresholdInterfaces[static_cast<size_t>(threshold.level)] =
63 objectServer.add_interface(dbusPath, interface);
64 }
65
66 association = objectServer.add_interface(dbusPath, association::interface);
67
68 setInitialProperties(sensor_paths::unitWatts);
69}
70
71NvidiaGpuPowerSensor::~NvidiaGpuPowerSensor()
72{
73 for (const auto& iface : thresholdInterfaces)
74 {
75 objectServer.remove_interface(iface);
76 }
77 objectServer.remove_interface(association);
78 objectServer.remove_interface(sensorInterface);
79}
80
81void NvidiaGpuPowerSensor::checkThresholds()
82{
83 thresholds::checkThresholds(this);
84}
85
Marc Olberdingd0125c92025-10-08 14:37:19 -070086void NvidiaGpuPowerSensor::processResponse(const std::error_code& ec,
87 std::span<const uint8_t> buffer)
Harshit Aghera902c6492025-05-08 15:57:42 +053088{
Marc Olberdingd0125c92025-10-08 14:37:19 -070089 if (ec)
Harshit Aghera902c6492025-05-08 15:57:42 +053090 {
91 lg2::error(
92 "Error updating Power Sensor for eid {EID} and sensor id {SID} : sending message over MCTP failed, rc={RC}",
Marc Olberdingd0125c92025-10-08 14:37:19 -070093 "EID", eid, "SID", sensorId, "RC", ec.message());
Harshit Aghera902c6492025-05-08 15:57:42 +053094 return;
95 }
96
97 ocp::accelerator_management::CompletionCode cc{};
98 uint16_t reasonCode = 0;
99 uint32_t power = 0;
100
101 const int rc =
Marc Olberdingd0125c92025-10-08 14:37:19 -0700102 gpu::decodeGetPowerDrawResponse(buffer, cc, reasonCode, power);
Harshit Aghera902c6492025-05-08 15:57:42 +0530103
104 if (rc != 0 || cc != ocp::accelerator_management::CompletionCode::SUCCESS)
105 {
106 lg2::error(
107 "Error updating Power Sensor eid {EID} and sensor id {SID} : decode failed, rc={RC}, cc={CC}, reasonCode={RESC}",
108 "EID", eid, "SID", sensorId, "RC", rc, "CC", cc, "RESC",
109 reasonCode);
110 return;
111 }
112
113 // Reading from the device is in milliwatts and unit set on the dbus
114 // is watts.
115 updateValue(power / 1000.0);
116}
117
118void NvidiaGpuPowerSensor::update()
119{
Harshit Aghera6b712322025-07-31 19:25:12 +0530120 const int rc = gpu::encodeGetPowerDrawRequest(
121 gpu::PlatformEnvironmentalCommands::GET_CURRENT_POWER_DRAW, 0, sensorId,
122 averagingInterval, request);
Harshit Aghera902c6492025-05-08 15:57:42 +0530123
124 if (rc != 0)
125 {
126 lg2::error(
127 "Error updating Temperature Sensor for eid {EID} and sensor id {SID} : encode failed, rc={RC}",
128 "EID", eid, "SID", sensorId, "RC", rc);
129 }
130
131 mctpRequester.sendRecvMsg(
Marc Olberdingd0125c92025-10-08 14:37:19 -0700132 eid, request,
Marc Olberdingfd4a3772025-09-24 16:31:02 -0700133 [weak{weak_from_this()}](const std::error_code& ec,
134 std::span<const uint8_t> buffer) {
135 std::shared_ptr<NvidiaGpuPowerSensor> self = weak.lock();
136 if (!self)
137 {
138 lg2::error("Invalid reference to NvidiaGpuPowerSensor");
139 return;
140 }
141 self->processResponse(ec, buffer);
Marc Olberdingd0125c92025-10-08 14:37:19 -0700142 });
Harshit Aghera902c6492025-05-08 15:57:42 +0530143}