blob: 9f7c7141b816bfc52ac06b042379d1bd22e8dfd1 [file] [log] [blame]
Harshit Agherac8dab722025-05-08 15:57:42 +05301/*
2 * SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION &
3 * AFFILIATES. All rights reserved. SPDX-License-Identifier: Apache-2.0
4 */
5
6#include "NvidiaGpuPowerSensor.hpp"
7
8#include "MctpRequester.hpp"
9#include "SensorPaths.hpp"
10#include "Thresholds.hpp"
11#include "Utils.hpp"
12#include "sensor.hpp"
13
14#include <bits/basic_string.h>
15
16#include <NvidiaDeviceDiscovery.hpp>
17#include <NvidiaGpuMctpVdm.hpp>
18#include <OcpMctpVdm.hpp>
19#include <phosphor-logging/lg2.hpp>
20#include <sdbusplus/asio/connection.hpp>
21#include <sdbusplus/asio/object_server.hpp>
22
23#include <cstddef>
24#include <cstdint>
25#include <functional>
26#include <limits>
27#include <memory>
28#include <string>
29#include <utility>
30#include <vector>
31
32using namespace std::literals;
33
// Averaging interval, in seconds, passed along with every power draw request.
// A value of 0 selects the default interval.
constexpr uint8_t gpuPowerAveragingIntervalInSec = 0;

// Reading bounds handed to the Sensor base class: the full range of a
// uint32_t, expressed as doubles.
static constexpr double gpuPowerSensorMinReading =
    static_cast<double>(std::numeric_limits<uint32_t>::min());
static constexpr double gpuPowerSensorMaxReading =
    static_cast<double>(std::numeric_limits<uint32_t>::max());
41
42NvidiaGpuPowerSensor::NvidiaGpuPowerSensor(
43 std::shared_ptr<sdbusplus::asio::connection>& conn,
44 mctp::MctpRequester& mctpRequester, const std::string& name,
45 const std::string& sensorConfiguration, uint8_t eid, uint8_t sensorId,
46 sdbusplus::asio::object_server& objectServer,
47 std::vector<thresholds::Threshold>&& thresholdData) :
48 Sensor(escapeName(name), std::move(thresholdData), sensorConfiguration,
49 "power", false, true, gpuPowerSensorMaxReading,
50 gpuPowerSensorMinReading, conn),
51 eid(eid), sensorId{sensorId},
52 averagingInterval{gpuPowerAveragingIntervalInSec},
53 mctpRequester(mctpRequester), objectServer(objectServer)
54
55{
56 std::string dbusPath = sensorPathPrefix + "power/"s + escapeName(name);
57
58 sensorInterface = objectServer.add_interface(
59 dbusPath, "xyz.openbmc_project.Sensor.Value");
60
61 for (const auto& threshold : thresholds)
62 {
63 std::string interface = thresholds::getInterface(threshold.level);
64 thresholdInterfaces[static_cast<size_t>(threshold.level)] =
65 objectServer.add_interface(dbusPath, interface);
66 }
67
68 association = objectServer.add_interface(dbusPath, association::interface);
69
70 // Sensor values are only updated when the difference between the new and
71 // previous value exceeds the hysteresisPublish threshold. This threshold
72 // defaults to ((max - min) * 0.0001). Since this sensor lacks defined
73 // min/max values, theoretical limits are used instead, creating a large
74 // hysteresisPublish value that blocks D-Bus updates. Setting
75 // hysteresisPublish to 0 forces all sensor value changes to be published
76 // to D-Bus.
77 hysteresisPublish = 0;
78
79 setInitialProperties(sensor_paths::unitWatts);
80}
81
82NvidiaGpuPowerSensor::~NvidiaGpuPowerSensor()
83{
84 for (const auto& iface : thresholdInterfaces)
85 {
86 objectServer.remove_interface(iface);
87 }
88 objectServer.remove_interface(association);
89 objectServer.remove_interface(sensorInterface);
90}
91
92void NvidiaGpuPowerSensor::checkThresholds()
93{
94 thresholds::checkThresholds(this);
95}
96
97void NvidiaGpuPowerSensor::processResponse(int sendRecvMsgResult)
98{
99 if (sendRecvMsgResult != 0)
100 {
101 lg2::error(
102 "Error updating Power Sensor for eid {EID} and sensor id {SID} : sending message over MCTP failed, rc={RC}",
103 "EID", eid, "SID", sensorId, "RC", sendRecvMsgResult);
104 return;
105 }
106
107 ocp::accelerator_management::CompletionCode cc{};
108 uint16_t reasonCode = 0;
109 uint32_t power = 0;
110
111 auto rc =
112 gpu::decodeGetCurrentPowerDrawResponse(response, cc, reasonCode, power);
113
114 if (rc != 0 || cc != ocp::accelerator_management::CompletionCode::SUCCESS)
115 {
116 lg2::error(
117 "Error updating Power Sensor eid {EID} and sensor id {SID} : decode failed, rc={RC}, cc={CC}, reasonCode={RESC}",
118 "EID", eid, "SID", sensorId, "RC", rc, "CC", cc, "RESC",
119 reasonCode);
120 return;
121 }
122
123 // Reading from the device is in milliwatts and unit set on the dbus
124 // is watts.
125 updateValue(power / 1000.0);
126}
127
128void NvidiaGpuPowerSensor::update()
129{
130 auto rc = gpu::encodeGetCurrentPowerDrawRequest(0, sensorId,
131 averagingInterval, request);
132
133 if (rc != 0)
134 {
135 lg2::error(
136 "Error updating Temperature Sensor for eid {EID} and sensor id {SID} : encode failed, rc={RC}",
137 "EID", eid, "SID", sensorId, "RC", rc);
138 }
139
140 mctpRequester.sendRecvMsg(
141 eid, request, response,
142 [this](int sendRecvMsgResult) { processResponse(sendRecvMsgResult); });
143}