blob: 7ec47fffce7dfc8d9e55ec8b9d18b416c4f90f73 [file] [log] [blame]
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +05301/*
2 * SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION &
3 * AFFILIATES. All rights reserved.
4 * SPDX-License-Identifier: Apache-2.0
5 */
6
7#include "NvidiaGpuDevice.hpp"
8
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +05309#include "Thresholds.hpp"
10#include "Utils.hpp"
11
Marc Olberdingd0125c92025-10-08 14:37:19 -070012#include <Inventory.hpp>
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +053013#include <MctpRequester.hpp>
Marc Olberdingd0125c92025-10-08 14:37:19 -070014#include <NvidiaDeviceDiscovery.hpp>
Harshit Aghera775199d2025-05-27 14:20:24 +053015#include <NvidiaGpuEnergySensor.hpp>
Rohit PAI0a888262025-06-11 08:52:29 +053016#include <NvidiaGpuMctpVdm.hpp>
Harshit Aghera6b712322025-07-31 19:25:12 +053017#include <NvidiaGpuPowerPeakReading.hpp>
Harshit Aghera902c6492025-05-08 15:57:42 +053018#include <NvidiaGpuPowerSensor.hpp>
Marc Olberdingd0125c92025-10-08 14:37:19 -070019#include <NvidiaGpuSensor.hpp>
Harshit Agherabef4d412025-05-27 14:53:56 +053020#include <NvidiaGpuVoltageSensor.hpp>
Marc Olberding6282a452025-09-28 22:00:09 -070021#include <OcpMctpVdm.hpp>
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +053022#include <boost/asio/io_context.hpp>
23#include <phosphor-logging/lg2.hpp>
24#include <sdbusplus/asio/connection.hpp>
25#include <sdbusplus/asio/object_server.hpp>
26
Marc Olberding6282a452025-09-28 22:00:09 -070027#include <array>
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +053028#include <chrono>
29#include <cstdint>
Harshit Aghera5e7decc2025-05-07 16:20:16 +053030#include <functional>
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +053031#include <memory>
Marc Olberding6282a452025-09-28 22:00:09 -070032#include <span>
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +053033#include <string>
Marc Olberding6282a452025-09-28 22:00:09 -070034#include <system_error>
Harshit Aghera5e7decc2025-05-07 16:20:16 +053035#include <utility>
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +053036#include <vector>
37
namespace
{

// Thermal parameter ids sent in the "read thermal parameters" request, one
// per TLimit threshold.
constexpr uint8_t gpuTLimitCriticalThresholdId{1};
constexpr uint8_t gpuTLimitWarningThresholdId{2};
constexpr uint8_t gpuTLimitHardshutDownThresholdId{4};

// Ids are queried in this order and each result is stored at the same index,
// so the sequence (warning, critical, hard shutdown) must stay in sync with
// the indices consumed by GpuDevice::processTLimitThresholds.
constexpr std::array<uint8_t, 3> thresholdIds{
    gpuTLimitWarningThresholdId,
    gpuTLimitCriticalThresholdId,
    gpuTLimitHardshutDownThresholdId,
};

} // namespace
46
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +053047GpuDevice::GpuDevice(const SensorConfigs& configs, const std::string& name,
48 const std::string& path,
49 const std::shared_ptr<sdbusplus::asio::connection>& conn,
50 uint8_t eid, boost::asio::io_context& io,
51 mctp::MctpRequester& mctpRequester,
52 sdbusplus::asio::object_server& objectServer) :
53 eid(eid), sensorPollMs(std::chrono::milliseconds{configs.pollRate}),
54 waitTimer(io, std::chrono::steady_clock::duration(0)),
55 mctpRequester(mctpRequester), conn(conn), objectServer(objectServer),
56 configs(configs), name(escapeName(name)), path(path)
57{
Rohit PAI0a888262025-06-11 08:52:29 +053058 inventory = std::make_shared<Inventory>(
Rohit PAIada6baa2025-07-01 18:26:19 +053059 conn, objectServer, name, mctpRequester,
60 gpu::DeviceIdentification::DEVICE_GPU, eid, io);
Marc Olberdingac920732025-09-28 21:56:54 -070061}
62
63void GpuDevice::init()
64{
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +053065 makeSensors();
Marc Olberdingac920732025-09-28 21:56:54 -070066 inventory->init();
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +053067}
68
69void GpuDevice::makeSensors()
70{
71 tempSensor = std::make_shared<NvidiaGpuTempSensor>(
Harshit Agheraba138da2025-05-05 12:26:35 +053072 conn, mctpRequester, name + "_TEMP_0", path, eid, gpuTempSensorId,
73 objectServer, std::vector<thresholds::Threshold>{});
74
Harshit Agherab10a67b2025-05-27 12:19:29 +053075 dramTempSensor = std::make_shared<NvidiaGpuTempSensor>(
76 conn, mctpRequester, name + "_DRAM_0_TEMP_0", path, eid,
77 gpuDramTempSensorId, objectServer,
78 std::vector<thresholds::Threshold>{thresholds::Threshold{
79 thresholds::Level::CRITICAL, thresholds::Direction::HIGH, 95.0}});
80
Harshit Aghera902c6492025-05-08 15:57:42 +053081 powerSensor = std::make_shared<NvidiaGpuPowerSensor>(
82 conn, mctpRequester, name + "_Power_0", path, eid, gpuPowerSensorId,
83 objectServer, std::vector<thresholds::Threshold>{});
84
Harshit Aghera6b712322025-07-31 19:25:12 +053085 peakPower = std::make_shared<NvidiaGpuPowerPeakReading>(
86 mctpRequester, name + "_Power_0", eid, gpuPeakPowerSensorId,
87 objectServer);
88
Harshit Aghera775199d2025-05-27 14:20:24 +053089 energySensor = std::make_shared<NvidiaGpuEnergySensor>(
90 conn, mctpRequester, name + "_Energy_0", path, eid, gpuEnergySensorId,
91 objectServer, std::vector<thresholds::Threshold>{});
92
Harshit Agherabef4d412025-05-27 14:53:56 +053093 voltageSensor = std::make_shared<NvidiaGpuVoltageSensor>(
94 conn, mctpRequester, name + "_Voltage_0", path, eid, gpuVoltageSensorId,
95 objectServer, std::vector<thresholds::Threshold>{});
96
Marc Olberding6282a452025-09-28 22:00:09 -070097 getTLimitThresholds();
98
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +053099 lg2::info("Added GPU {NAME} Sensors with chassis path: {PATH}.", "NAME",
100 name, "PATH", path);
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +0530101 read();
102}
103
Marc Olberding6282a452025-09-28 22:00:09 -0700104void GpuDevice::getTLimitThresholds()
105{
106 thresholds = {};
107 current_threshold_index = 0;
108 getNextThermalParameter();
109}
110
111void GpuDevice::readThermalParameterCallback(const std::error_code& ec,
112 std::span<const uint8_t> buffer)
113{
114 if (ec)
115 {
116 lg2::error(
117 "Error reading thermal parameter: sending message over MCTP failed, rc={RC}",
118 "RC", ec.message());
119 processTLimitThresholds(ec);
120 return;
121 }
122
123 ocp::accelerator_management::CompletionCode cc{};
124 uint16_t reasonCode = 0;
125 int32_t threshold = 0;
126
127 int rc = gpu::decodeReadThermalParametersResponse(buffer, cc, reasonCode,
128 threshold);
129
130 if (rc != 0 || cc != ocp::accelerator_management::CompletionCode::SUCCESS)
131 {
132 lg2::error(
133 "Error reading thermal parameter: decode failed, rc={RC}, cc={CC}, reasonCode={RESC}",
134 "RC", rc, "CC", cc, "RESC", reasonCode);
135 processTLimitThresholds(ec);
136 return;
137 }
138
139 thresholds[current_threshold_index] = threshold;
140
141 current_threshold_index++;
142
143 if (current_threshold_index < thresholdIds.size())
144 {
145 getNextThermalParameter();
146 return;
147 }
148 processTLimitThresholds(std::error_code{});
149}
150
151void GpuDevice::getNextThermalParameter()
152{
153 uint8_t id = thresholdIds[current_threshold_index];
154 auto rc =
155 gpu::encodeReadThermalParametersRequest(0, id, thermalParamReqMsg);
156 if (rc != 0)
157 {
158 lg2::error(
159 "Error reading thermal parameter for eid {EID} and parameter id {PID} : encode failed. rc={RC}",
160 "EID", eid, "PID", id, "RC", rc);
161 processTLimitThresholds(
162 std::make_error_code(static_cast<std::errc>(rc)));
163 return;
164 }
165
166 mctpRequester.sendRecvMsg(
167 eid, thermalParamReqMsg,
168 [weak{weak_from_this()}](const std::error_code& ec,
169 std::span<const uint8_t> buffer) {
170 std::shared_ptr<GpuDevice> self = weak.lock();
171 if (!self)
172 {
173 lg2::error("Failed to get lock on GpuDevice");
174 return;
175 }
176 self->readThermalParameterCallback(ec, buffer);
177 });
178}
179
180void GpuDevice::processTLimitThresholds(const std::error_code& ec)
Harshit Aghera5e7decc2025-05-07 16:20:16 +0530181{
182 std::vector<thresholds::Threshold> tLimitThresholds{};
Marc Olberding6282a452025-09-28 22:00:09 -0700183 if (!ec)
Harshit Aghera5e7decc2025-05-07 16:20:16 +0530184 {
185 tLimitThresholds = {
186 thresholds::Threshold{thresholds::Level::WARNING,
187 thresholds::Direction::LOW,
188 static_cast<double>(thresholds[0])},
189 thresholds::Threshold{thresholds::Level::CRITICAL,
190 thresholds::Direction::LOW,
191 static_cast<double>(thresholds[1])},
192 thresholds::Threshold{thresholds::Level::HARDSHUTDOWN,
193 thresholds::Direction::LOW,
194 static_cast<double>(thresholds[2])}};
195 }
196
197 tLimitSensor = std::make_shared<NvidiaGpuTempSensor>(
198 conn, mctpRequester, name + "_TEMP_1", path, eid, gpuTLimitSensorId,
199 objectServer, std::move(tLimitThresholds));
200}
201
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +0530202void GpuDevice::read()
203{
204 tempSensor->update();
Harshit Aghera5e7decc2025-05-07 16:20:16 +0530205 if (tLimitSensor)
206 {
207 tLimitSensor->update();
208 }
Harshit Agherab10a67b2025-05-27 12:19:29 +0530209 dramTempSensor->update();
Harshit Aghera902c6492025-05-08 15:57:42 +0530210 powerSensor->update();
Harshit Aghera6b712322025-07-31 19:25:12 +0530211 peakPower->update();
Harshit Aghera775199d2025-05-27 14:20:24 +0530212 energySensor->update();
Harshit Agherabef4d412025-05-27 14:53:56 +0530213 voltageSensor->update();
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +0530214
215 waitTimer.expires_after(std::chrono::milliseconds(sensorPollMs));
216 waitTimer.async_wait([this](const boost::system::error_code& ec) {
217 if (ec)
218 {
219 return;
220 }
221 read();
222 });
Marc Olberdingd0125c92025-10-08 14:37:19 -0700223};