blob: 5d2b9af5209110748e2d3485d09ce8a5768fa14f [file] [log] [blame]
Harshit Aghera775199d2025-05-27 14:20:24 +05301/*
2 * SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION &
3 * AFFILIATES. All rights reserved.
4 * SPDX-License-Identifier: Apache-2.0
5 */
6
7#include "NvidiaGpuEnergySensor.hpp"
8
9#include "SensorPaths.hpp"
10#include "Thresholds.hpp"
11#include "Utils.hpp"
12#include "sensor.hpp"
13
14#include <bits/basic_string.h>
15
16#include <MctpRequester.hpp>
17#include <NvidiaDeviceDiscovery.hpp>
18#include <NvidiaGpuMctpVdm.hpp>
19#include <OcpMctpVdm.hpp>
20#include <phosphor-logging/lg2.hpp>
21#include <sdbusplus/asio/connection.hpp>
22#include <sdbusplus/asio/object_server.hpp>
23
24#include <cstddef>
25#include <cstdint>
26#include <limits>
27#include <memory>
Marc Olberdingd0125c92025-10-08 14:37:19 -070028#include <span>
Harshit Aghera775199d2025-05-27 14:20:24 +053029#include <string>
Marc Olberdingd0125c92025-10-08 14:37:19 -070030#include <system_error>
Harshit Aghera775199d2025-05-27 14:20:24 +053031#include <utility>
32#include <vector>
33
34using namespace std::literals;
35
// The device reports energy in millijoules while the value published on
// D-Bus is in Joules, so the largest possible reading is the uint64_t maximum
// scaled down by 1000. The conversion to double is spelled out because that
// maximum is not exactly representable as a double (it rounds to 2^64).
static constexpr double gpuEnergySensorMaxReading =
    static_cast<double>(std::numeric_limits<uint64_t>::max()) / 1000.0;
static constexpr double gpuEnergySensorMinReading = 0.0;
40
41NvidiaGpuEnergySensor::NvidiaGpuEnergySensor(
42 std::shared_ptr<sdbusplus::asio::connection>& conn,
43 mctp::MctpRequester& mctpRequester, const std::string& name,
44 const std::string& sensorConfiguration, const uint8_t eid, uint8_t sensorId,
45 sdbusplus::asio::object_server& objectServer,
46 std::vector<thresholds::Threshold>&& thresholdData) :
47 Sensor(escapeName(name), std::move(thresholdData), sensorConfiguration,
48 "energy", false, true, gpuEnergySensorMaxReading,
49 gpuEnergySensorMinReading, conn),
50 eid(eid), sensorId{sensorId}, mctpRequester(mctpRequester),
51 objectServer(objectServer)
52{
53 std::string dbusPath = sensorPathPrefix + "energy/"s + escapeName(name);
54
55 sensorInterface = objectServer.add_interface(
56 dbusPath, "xyz.openbmc_project.Sensor.Value");
57
58 for (const auto& threshold : thresholds)
59 {
60 std::string interface = thresholds::getInterface(threshold.level);
61 thresholdInterfaces[static_cast<size_t>(threshold.level)] =
62 objectServer.add_interface(dbusPath, interface);
63 }
64
65 association = objectServer.add_interface(dbusPath, association::interface);
66
67 setInitialProperties(sensor_paths::unitJoules);
68}
69
70NvidiaGpuEnergySensor::~NvidiaGpuEnergySensor()
71{
72 for (const auto& iface : thresholdInterfaces)
73 {
74 objectServer.remove_interface(iface);
75 }
76 objectServer.remove_interface(sensorInterface);
77 objectServer.remove_interface(association);
78}
79
80void NvidiaGpuEnergySensor::checkThresholds()
81{
82 thresholds::checkThresholds(this);
83}
84
Marc Olberdingd0125c92025-10-08 14:37:19 -070085void NvidiaGpuEnergySensor::processResponse(const std::error_code& ec,
86 std::span<const uint8_t> buffer)
Harshit Aghera775199d2025-05-27 14:20:24 +053087{
Marc Olberdingd0125c92025-10-08 14:37:19 -070088 if (ec)
Harshit Aghera775199d2025-05-27 14:20:24 +053089 {
90 lg2::error(
91 "Error updating Energy Sensor for eid {EID} and sensor id {SID} : sending message over MCTP failed, rc={RC}",
Marc Olberdingd0125c92025-10-08 14:37:19 -070092 "EID", eid, "SID", sensorId, "RC", ec.message());
Harshit Aghera775199d2025-05-27 14:20:24 +053093 return;
94 }
95
96 ocp::accelerator_management::CompletionCode cc{};
97 uint16_t reasonCode = 0;
98 uint64_t energyValue = 0;
99
Marc Olberdingd0125c92025-10-08 14:37:19 -0700100 auto rc = gpu::decodeGetCurrentEnergyCounterResponse(buffer, cc, reasonCode,
101 energyValue);
Harshit Aghera775199d2025-05-27 14:20:24 +0530102
103 if (rc != 0 || cc != ocp::accelerator_management::CompletionCode::SUCCESS)
104 {
105 lg2::error(
106 "Error updating Energy Sensor for eid {EID} and sensor id {SID} : decode failed, rc={RC}, cc={CC}, reasonCode={RESC}",
107 "EID", eid, "SID", sensorId, "RC", rc, "CC", cc, "RESC",
108 reasonCode);
109 return;
110 }
111
112 // Reading from the device is in millijoules and unit set on the dbus
113 // is Joules.
114 updateValue(energyValue / 1000.0);
115}
116
117void NvidiaGpuEnergySensor::update()
118{
119 auto rc = gpu::encodeGetCurrentEnergyCounterRequest(0, sensorId, request);
120
121 if (rc != 0)
122 {
123 lg2::error(
124 "Error updating Energy Sensor for eid {EID} and sensor id {SID} : encode failed, rc={RC}",
125 "EID", eid, "SID", sensorId, "RC", rc);
126 return;
127 }
128
129 mctpRequester.sendRecvMsg(
Marc Olberdingd0125c92025-10-08 14:37:19 -0700130 eid, request,
131 [this](const std::error_code& ec, std::span<const uint8_t> buffer) {
132 processResponse(ec, buffer);
133 });
Harshit Aghera775199d2025-05-27 14:20:24 +0530134}