blob: d5cf9986f80aab787216bd41df1a38c18c7ab2c6 [file] [log] [blame]
/*
 * SPDX-FileCopyrightText: Copyright OpenBMC Authors
 * SPDX-License-Identifier: Apache-2.0
 */
5
6#include "NvidiaGpuEnergySensor.hpp"
7
8#include "SensorPaths.hpp"
9#include "Thresholds.hpp"
10#include "Utils.hpp"
11#include "sensor.hpp"
12
13#include <bits/basic_string.h>
14
15#include <MctpRequester.hpp>
16#include <NvidiaDeviceDiscovery.hpp>
17#include <NvidiaGpuMctpVdm.hpp>
18#include <OcpMctpVdm.hpp>
19#include <phosphor-logging/lg2.hpp>
20#include <sdbusplus/asio/connection.hpp>
21#include <sdbusplus/asio/object_server.hpp>
22
23#include <cstddef>
24#include <cstdint>
25#include <limits>
26#include <memory>
Marc Olberdingd0125c92025-10-08 14:37:19 -070027#include <span>
Harshit Aghera775199d2025-05-27 14:20:24 +053028#include <string>
Marc Olberdingd0125c92025-10-08 14:37:19 -070029#include <system_error>
Harshit Aghera775199d2025-05-27 14:20:24 +053030#include <utility>
31#include <vector>
32
33using namespace std::literals;
34
// The device reports energy in millijoules while the unit published on D-Bus
// is Joules, so the largest raw 64-bit reading is scaled by 1/1000 to obtain
// the sensor's maximum. The integer-to-double conversion is spelled out
// because 2^64 - 1 has no exact double representation and is rounded.
static constexpr double gpuEnergySensorMaxReading =
    static_cast<double>(std::numeric_limits<uint64_t>::max()) / 1000.0;
// Lower bound of the reading range handed to the Sensor base class.
static constexpr double gpuEnergySensorMinReading = 0.0;
39
40NvidiaGpuEnergySensor::NvidiaGpuEnergySensor(
41 std::shared_ptr<sdbusplus::asio::connection>& conn,
42 mctp::MctpRequester& mctpRequester, const std::string& name,
43 const std::string& sensorConfiguration, const uint8_t eid, uint8_t sensorId,
44 sdbusplus::asio::object_server& objectServer,
45 std::vector<thresholds::Threshold>&& thresholdData) :
46 Sensor(escapeName(name), std::move(thresholdData), sensorConfiguration,
47 "energy", false, true, gpuEnergySensorMaxReading,
48 gpuEnergySensorMinReading, conn),
49 eid(eid), sensorId{sensorId}, mctpRequester(mctpRequester),
50 objectServer(objectServer)
51{
52 std::string dbusPath = sensorPathPrefix + "energy/"s + escapeName(name);
53
54 sensorInterface = objectServer.add_interface(
55 dbusPath, "xyz.openbmc_project.Sensor.Value");
56
57 for (const auto& threshold : thresholds)
58 {
59 std::string interface = thresholds::getInterface(threshold.level);
60 thresholdInterfaces[static_cast<size_t>(threshold.level)] =
61 objectServer.add_interface(dbusPath, interface);
62 }
63
64 association = objectServer.add_interface(dbusPath, association::interface);
65
66 setInitialProperties(sensor_paths::unitJoules);
67}
68
69NvidiaGpuEnergySensor::~NvidiaGpuEnergySensor()
70{
71 for (const auto& iface : thresholdInterfaces)
72 {
73 objectServer.remove_interface(iface);
74 }
75 objectServer.remove_interface(sensorInterface);
76 objectServer.remove_interface(association);
77}
78
79void NvidiaGpuEnergySensor::checkThresholds()
80{
81 thresholds::checkThresholds(this);
82}
83
Marc Olberdingd0125c92025-10-08 14:37:19 -070084void NvidiaGpuEnergySensor::processResponse(const std::error_code& ec,
85 std::span<const uint8_t> buffer)
Harshit Aghera775199d2025-05-27 14:20:24 +053086{
Marc Olberdingd0125c92025-10-08 14:37:19 -070087 if (ec)
Harshit Aghera775199d2025-05-27 14:20:24 +053088 {
89 lg2::error(
90 "Error updating Energy Sensor for eid {EID} and sensor id {SID} : sending message over MCTP failed, rc={RC}",
Marc Olberdingd0125c92025-10-08 14:37:19 -070091 "EID", eid, "SID", sensorId, "RC", ec.message());
Harshit Aghera775199d2025-05-27 14:20:24 +053092 return;
93 }
94
95 ocp::accelerator_management::CompletionCode cc{};
96 uint16_t reasonCode = 0;
97 uint64_t energyValue = 0;
98
Marc Olberdingd0125c92025-10-08 14:37:19 -070099 auto rc = gpu::decodeGetCurrentEnergyCounterResponse(buffer, cc, reasonCode,
100 energyValue);
Harshit Aghera775199d2025-05-27 14:20:24 +0530101
102 if (rc != 0 || cc != ocp::accelerator_management::CompletionCode::SUCCESS)
103 {
104 lg2::error(
105 "Error updating Energy Sensor for eid {EID} and sensor id {SID} : decode failed, rc={RC}, cc={CC}, reasonCode={RESC}",
106 "EID", eid, "SID", sensorId, "RC", rc, "CC", cc, "RESC",
107 reasonCode);
108 return;
109 }
110
111 // Reading from the device is in millijoules and unit set on the dbus
112 // is Joules.
113 updateValue(energyValue / 1000.0);
114}
115
116void NvidiaGpuEnergySensor::update()
117{
118 auto rc = gpu::encodeGetCurrentEnergyCounterRequest(0, sensorId, request);
119
120 if (rc != 0)
121 {
122 lg2::error(
123 "Error updating Energy Sensor for eid {EID} and sensor id {SID} : encode failed, rc={RC}",
124 "EID", eid, "SID", sensorId, "RC", rc);
125 return;
126 }
127
128 mctpRequester.sendRecvMsg(
Marc Olberdingd0125c92025-10-08 14:37:19 -0700129 eid, request,
Marc Olberdingfd4a3772025-09-24 16:31:02 -0700130 [weak{weak_from_this()}](const std::error_code& ec,
131 std::span<const uint8_t> buffer) {
132 std::shared_ptr<NvidiaGpuEnergySensor> self = weak.lock();
133 if (!self)
134 {
135 lg2::error("invalid reference to NvidiaGpuEnergySensor");
136 return;
137 }
138 self->processResponse(ec, buffer);
Marc Olberdingd0125c92025-10-08 14:37:19 -0700139 });
Harshit Aghera775199d2025-05-27 14:20:24 +0530140}