blob: 5cf9128fd09f3638fd4451d290c0ffc502f2dee6 [file] [log] [blame]
/*
 * SPDX-FileCopyrightText: Copyright OpenBMC Authors
 * SPDX-License-Identifier: Apache-2.0
 */
5
6#include "NvidiaGpuDevice.hpp"
7
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +05308#include "Thresholds.hpp"
9#include "Utils.hpp"
10
Marc Olberdingd0125c92025-10-08 14:37:19 -070011#include <Inventory.hpp>
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +053012#include <MctpRequester.hpp>
Marc Olberdingd0125c92025-10-08 14:37:19 -070013#include <NvidiaDeviceDiscovery.hpp>
Harshit Aghera775199d2025-05-27 14:20:24 +053014#include <NvidiaGpuEnergySensor.hpp>
Rohit PAI0a888262025-06-11 08:52:29 +053015#include <NvidiaGpuMctpVdm.hpp>
Harshit Aghera6b712322025-07-31 19:25:12 +053016#include <NvidiaGpuPowerPeakReading.hpp>
Harshit Aghera902c6492025-05-08 15:57:42 +053017#include <NvidiaGpuPowerSensor.hpp>
Marc Olberdingd0125c92025-10-08 14:37:19 -070018#include <NvidiaGpuSensor.hpp>
Harshit Agherabef4d412025-05-27 14:53:56 +053019#include <NvidiaGpuVoltageSensor.hpp>
Marc Olberding6282a452025-09-28 22:00:09 -070020#include <OcpMctpVdm.hpp>
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +053021#include <boost/asio/io_context.hpp>
22#include <phosphor-logging/lg2.hpp>
23#include <sdbusplus/asio/connection.hpp>
24#include <sdbusplus/asio/object_server.hpp>
25
Marc Olberding6282a452025-09-28 22:00:09 -070026#include <array>
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +053027#include <chrono>
28#include <cstdint>
Harshit Aghera5e7decc2025-05-07 16:20:16 +053029#include <functional>
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +053030#include <memory>
Marc Olberding6282a452025-09-28 22:00:09 -070031#include <span>
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +053032#include <string>
Marc Olberding6282a452025-09-28 22:00:09 -070033#include <system_error>
Harshit Aghera5e7decc2025-05-07 16:20:16 +053034#include <utility>
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +053035#include <vector>
36
// Parameter ids passed to the "read thermal parameters" request, one per
// TLimit threshold.
static constexpr uint8_t gpuTLimitCriticalThresholdId = 1;
static constexpr uint8_t gpuTLimitWarningThresholdId = 2;
static constexpr uint8_t gpuTLimitHardshutDownThresholdId = 4;

// Ids in the order they are queried: warning, critical, hard shutdown.
// N.B. processTLimitThresholds reads the results by position, so this order
// has to match the order used there.
static constexpr std::array<uint8_t, 3> thresholdIds = {
    gpuTLimitWarningThresholdId, gpuTLimitCriticalThresholdId,
    gpuTLimitHardshutDownThresholdId};
45
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +053046GpuDevice::GpuDevice(const SensorConfigs& configs, const std::string& name,
47 const std::string& path,
48 const std::shared_ptr<sdbusplus::asio::connection>& conn,
49 uint8_t eid, boost::asio::io_context& io,
50 mctp::MctpRequester& mctpRequester,
51 sdbusplus::asio::object_server& objectServer) :
52 eid(eid), sensorPollMs(std::chrono::milliseconds{configs.pollRate}),
53 waitTimer(io, std::chrono::steady_clock::duration(0)),
54 mctpRequester(mctpRequester), conn(conn), objectServer(objectServer),
55 configs(configs), name(escapeName(name)), path(path)
56{
Rohit PAI0a888262025-06-11 08:52:29 +053057 inventory = std::make_shared<Inventory>(
Rohit PAIada6baa2025-07-01 18:26:19 +053058 conn, objectServer, name, mctpRequester,
59 gpu::DeviceIdentification::DEVICE_GPU, eid, io);
Marc Olberdingac920732025-09-28 21:56:54 -070060}
61
62void GpuDevice::init()
63{
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +053064 makeSensors();
Marc Olberdingac920732025-09-28 21:56:54 -070065 inventory->init();
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +053066}
67
68void GpuDevice::makeSensors()
69{
70 tempSensor = std::make_shared<NvidiaGpuTempSensor>(
Harshit Agheraba138da2025-05-05 12:26:35 +053071 conn, mctpRequester, name + "_TEMP_0", path, eid, gpuTempSensorId,
72 objectServer, std::vector<thresholds::Threshold>{});
73
Harshit Agherab10a67b2025-05-27 12:19:29 +053074 dramTempSensor = std::make_shared<NvidiaGpuTempSensor>(
75 conn, mctpRequester, name + "_DRAM_0_TEMP_0", path, eid,
76 gpuDramTempSensorId, objectServer,
77 std::vector<thresholds::Threshold>{thresholds::Threshold{
78 thresholds::Level::CRITICAL, thresholds::Direction::HIGH, 95.0}});
79
Harshit Aghera902c6492025-05-08 15:57:42 +053080 powerSensor = std::make_shared<NvidiaGpuPowerSensor>(
81 conn, mctpRequester, name + "_Power_0", path, eid, gpuPowerSensorId,
82 objectServer, std::vector<thresholds::Threshold>{});
83
Harshit Aghera6b712322025-07-31 19:25:12 +053084 peakPower = std::make_shared<NvidiaGpuPowerPeakReading>(
85 mctpRequester, name + "_Power_0", eid, gpuPeakPowerSensorId,
86 objectServer);
87
Harshit Aghera775199d2025-05-27 14:20:24 +053088 energySensor = std::make_shared<NvidiaGpuEnergySensor>(
89 conn, mctpRequester, name + "_Energy_0", path, eid, gpuEnergySensorId,
90 objectServer, std::vector<thresholds::Threshold>{});
91
Harshit Agherabef4d412025-05-27 14:53:56 +053092 voltageSensor = std::make_shared<NvidiaGpuVoltageSensor>(
93 conn, mctpRequester, name + "_Voltage_0", path, eid, gpuVoltageSensorId,
94 objectServer, std::vector<thresholds::Threshold>{});
95
Marc Olberding6282a452025-09-28 22:00:09 -070096 getTLimitThresholds();
97
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +053098 lg2::info("Added GPU {NAME} Sensors with chassis path: {PATH}.", "NAME",
99 name, "PATH", path);
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +0530100 read();
101}
102
Marc Olberding6282a452025-09-28 22:00:09 -0700103void GpuDevice::getTLimitThresholds()
104{
105 thresholds = {};
106 current_threshold_index = 0;
107 getNextThermalParameter();
108}
109
110void GpuDevice::readThermalParameterCallback(const std::error_code& ec,
111 std::span<const uint8_t> buffer)
112{
113 if (ec)
114 {
115 lg2::error(
116 "Error reading thermal parameter: sending message over MCTP failed, rc={RC}",
117 "RC", ec.message());
118 processTLimitThresholds(ec);
119 return;
120 }
121
122 ocp::accelerator_management::CompletionCode cc{};
123 uint16_t reasonCode = 0;
124 int32_t threshold = 0;
125
126 int rc = gpu::decodeReadThermalParametersResponse(buffer, cc, reasonCode,
127 threshold);
128
129 if (rc != 0 || cc != ocp::accelerator_management::CompletionCode::SUCCESS)
130 {
131 lg2::error(
132 "Error reading thermal parameter: decode failed, rc={RC}, cc={CC}, reasonCode={RESC}",
133 "RC", rc, "CC", cc, "RESC", reasonCode);
134 processTLimitThresholds(ec);
135 return;
136 }
137
138 thresholds[current_threshold_index] = threshold;
139
140 current_threshold_index++;
141
142 if (current_threshold_index < thresholdIds.size())
143 {
144 getNextThermalParameter();
145 return;
146 }
147 processTLimitThresholds(std::error_code{});
148}
149
150void GpuDevice::getNextThermalParameter()
151{
152 uint8_t id = thresholdIds[current_threshold_index];
153 auto rc =
154 gpu::encodeReadThermalParametersRequest(0, id, thermalParamReqMsg);
155 if (rc != 0)
156 {
157 lg2::error(
158 "Error reading thermal parameter for eid {EID} and parameter id {PID} : encode failed. rc={RC}",
159 "EID", eid, "PID", id, "RC", rc);
160 processTLimitThresholds(
161 std::make_error_code(static_cast<std::errc>(rc)));
162 return;
163 }
164
165 mctpRequester.sendRecvMsg(
166 eid, thermalParamReqMsg,
167 [weak{weak_from_this()}](const std::error_code& ec,
168 std::span<const uint8_t> buffer) {
169 std::shared_ptr<GpuDevice> self = weak.lock();
170 if (!self)
171 {
172 lg2::error("Failed to get lock on GpuDevice");
173 return;
174 }
175 self->readThermalParameterCallback(ec, buffer);
176 });
177}
178
179void GpuDevice::processTLimitThresholds(const std::error_code& ec)
Harshit Aghera5e7decc2025-05-07 16:20:16 +0530180{
181 std::vector<thresholds::Threshold> tLimitThresholds{};
Marc Olberding6282a452025-09-28 22:00:09 -0700182 if (!ec)
Harshit Aghera5e7decc2025-05-07 16:20:16 +0530183 {
184 tLimitThresholds = {
185 thresholds::Threshold{thresholds::Level::WARNING,
186 thresholds::Direction::LOW,
187 static_cast<double>(thresholds[0])},
188 thresholds::Threshold{thresholds::Level::CRITICAL,
189 thresholds::Direction::LOW,
190 static_cast<double>(thresholds[1])},
191 thresholds::Threshold{thresholds::Level::HARDSHUTDOWN,
192 thresholds::Direction::LOW,
193 static_cast<double>(thresholds[2])}};
194 }
195
196 tLimitSensor = std::make_shared<NvidiaGpuTempSensor>(
197 conn, mctpRequester, name + "_TEMP_1", path, eid, gpuTLimitSensorId,
198 objectServer, std::move(tLimitThresholds));
199}
200
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +0530201void GpuDevice::read()
202{
203 tempSensor->update();
Harshit Aghera5e7decc2025-05-07 16:20:16 +0530204 if (tLimitSensor)
205 {
206 tLimitSensor->update();
207 }
Harshit Agherab10a67b2025-05-27 12:19:29 +0530208 dramTempSensor->update();
Harshit Aghera902c6492025-05-08 15:57:42 +0530209 powerSensor->update();
Harshit Aghera6b712322025-07-31 19:25:12 +0530210 peakPower->update();
Harshit Aghera775199d2025-05-27 14:20:24 +0530211 energySensor->update();
Harshit Agherabef4d412025-05-27 14:53:56 +0530212 voltageSensor->update();
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +0530213
214 waitTimer.expires_after(std::chrono::milliseconds(sensorPollMs));
Marc Olberdingfd4a3772025-09-24 16:31:02 -0700215 waitTimer.async_wait(
216 [weak{weak_from_this()}](const boost::system::error_code& ec) {
217 std::shared_ptr<GpuDevice> self = weak.lock();
218 if (!self)
219 {
220 lg2::error("Invalid reference to GpuDevice");
221 return;
222 }
223 if (ec)
224 {
225 return;
226 }
227 self->read();
228 });
229}