blob: b21ce0e4077be5006b73ede4d63854929205b719 [file] [log] [blame]
/*
 * SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION &
 * AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: Apache-2.0
 */
6
7#include "NvidiaGpuPowerSensor.hpp"
8
9#include "MctpRequester.hpp"
10#include "SensorPaths.hpp"
11#include "Thresholds.hpp"
12#include "Utils.hpp"
13#include "sensor.hpp"
14
15#include <bits/basic_string.h>
16
17#include <NvidiaDeviceDiscovery.hpp>
18#include <NvidiaGpuMctpVdm.hpp>
19#include <OcpMctpVdm.hpp>
20#include <phosphor-logging/lg2.hpp>
21#include <sdbusplus/asio/connection.hpp>
22#include <sdbusplus/asio/object_server.hpp>
23
24#include <cstddef>
25#include <cstdint>
26#include <functional>
27#include <limits>
28#include <memory>
Marc Olberdingd0125c92025-10-08 14:37:19 -070029#include <span>
Harshit Aghera902c6492025-05-08 15:57:42 +053030#include <string>
Marc Olberdingd0125c92025-10-08 14:37:19 -070031#include <system_error>
Harshit Aghera902c6492025-05-08 15:57:42 +053032#include <utility>
33#include <vector>
34
35using namespace std::literals;
36
// Upper bound passed to the Sensor base class as the maximum reading.
static constexpr double gpuPowerSensorMaxReading = 5000.0;

// Lower bound passed to the Sensor base class as the minimum reading; this is
// the smallest value a uint32_t can hold (zero).
static constexpr double gpuPowerSensorMinReading =
    std::numeric_limits<uint32_t>::min();
40
41NvidiaGpuPowerSensor::NvidiaGpuPowerSensor(
42 std::shared_ptr<sdbusplus::asio::connection>& conn,
43 mctp::MctpRequester& mctpRequester, const std::string& name,
44 const std::string& sensorConfiguration, uint8_t eid, uint8_t sensorId,
45 sdbusplus::asio::object_server& objectServer,
46 std::vector<thresholds::Threshold>&& thresholdData) :
47 Sensor(escapeName(name), std::move(thresholdData), sensorConfiguration,
48 "power", false, true, gpuPowerSensorMaxReading,
49 gpuPowerSensorMinReading, conn),
50 eid(eid), sensorId{sensorId},
Ed Tanousaba6fca2025-09-29 13:53:20 -070051
Harshit Aghera902c6492025-05-08 15:57:42 +053052 mctpRequester(mctpRequester), objectServer(objectServer)
53
54{
55 std::string dbusPath = sensorPathPrefix + "power/"s + escapeName(name);
56
57 sensorInterface = objectServer.add_interface(
58 dbusPath, "xyz.openbmc_project.Sensor.Value");
59
60 for (const auto& threshold : thresholds)
61 {
62 std::string interface = thresholds::getInterface(threshold.level);
63 thresholdInterfaces[static_cast<size_t>(threshold.level)] =
64 objectServer.add_interface(dbusPath, interface);
65 }
66
67 association = objectServer.add_interface(dbusPath, association::interface);
68
69 setInitialProperties(sensor_paths::unitWatts);
70}
71
72NvidiaGpuPowerSensor::~NvidiaGpuPowerSensor()
73{
74 for (const auto& iface : thresholdInterfaces)
75 {
76 objectServer.remove_interface(iface);
77 }
78 objectServer.remove_interface(association);
79 objectServer.remove_interface(sensorInterface);
80}
81
82void NvidiaGpuPowerSensor::checkThresholds()
83{
84 thresholds::checkThresholds(this);
85}
86
Marc Olberdingd0125c92025-10-08 14:37:19 -070087void NvidiaGpuPowerSensor::processResponse(const std::error_code& ec,
88 std::span<const uint8_t> buffer)
Harshit Aghera902c6492025-05-08 15:57:42 +053089{
Marc Olberdingd0125c92025-10-08 14:37:19 -070090 if (ec)
Harshit Aghera902c6492025-05-08 15:57:42 +053091 {
92 lg2::error(
93 "Error updating Power Sensor for eid {EID} and sensor id {SID} : sending message over MCTP failed, rc={RC}",
Marc Olberdingd0125c92025-10-08 14:37:19 -070094 "EID", eid, "SID", sensorId, "RC", ec.message());
Harshit Aghera902c6492025-05-08 15:57:42 +053095 return;
96 }
97
98 ocp::accelerator_management::CompletionCode cc{};
99 uint16_t reasonCode = 0;
100 uint32_t power = 0;
101
102 const int rc =
Marc Olberdingd0125c92025-10-08 14:37:19 -0700103 gpu::decodeGetPowerDrawResponse(buffer, cc, reasonCode, power);
Harshit Aghera902c6492025-05-08 15:57:42 +0530104
105 if (rc != 0 || cc != ocp::accelerator_management::CompletionCode::SUCCESS)
106 {
107 lg2::error(
108 "Error updating Power Sensor eid {EID} and sensor id {SID} : decode failed, rc={RC}, cc={CC}, reasonCode={RESC}",
109 "EID", eid, "SID", sensorId, "RC", rc, "CC", cc, "RESC",
110 reasonCode);
111 return;
112 }
113
114 // Reading from the device is in milliwatts and unit set on the dbus
115 // is watts.
116 updateValue(power / 1000.0);
117}
118
119void NvidiaGpuPowerSensor::update()
120{
Harshit Aghera6b712322025-07-31 19:25:12 +0530121 const int rc = gpu::encodeGetPowerDrawRequest(
122 gpu::PlatformEnvironmentalCommands::GET_CURRENT_POWER_DRAW, 0, sensorId,
123 averagingInterval, request);
Harshit Aghera902c6492025-05-08 15:57:42 +0530124
125 if (rc != 0)
126 {
127 lg2::error(
128 "Error updating Temperature Sensor for eid {EID} and sensor id {SID} : encode failed, rc={RC}",
129 "EID", eid, "SID", sensorId, "RC", rc);
130 }
131
132 mctpRequester.sendRecvMsg(
Marc Olberdingd0125c92025-10-08 14:37:19 -0700133 eid, request,
134 [this](const std::error_code& ec, std::span<const uint8_t> buffer) {
135 processResponse(ec, buffer);
136 });
Harshit Aghera902c6492025-05-08 15:57:42 +0530137}