blob: 54b6df23a7f1ca5e3fc6644f8c714b3bd51c5208 [file] [log] [blame]
Harshit Aghera11b9c1a2025-04-29 17:34:25 +05301/*
2 * SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION &
3 * AFFILIATES. All rights reserved. SPDX-License-Identifier: Apache-2.0
4 */
5
6#include "GpuDevice.hpp"
7
8#include "GpuSensor.hpp"
Harshit Aghera32e3b2b2025-05-05 12:26:35 +05309#include "GpuTLimitSensor.hpp"
Harshit Aghera11b9c1a2025-04-29 17:34:25 +053010#include "Thresholds.hpp"
11#include "Utils.hpp"
12
13#include <bits/basic_string.h>
14
15#include <GpuMctpVdm.hpp>
Harshit Aghera09f6f2c2025-05-07 16:20:16 +053016#include <GpuThresholds.hpp>
Harshit Aghera11b9c1a2025-04-29 17:34:25 +053017#include <MctpRequester.hpp>
18#include <OcpMctpVdm.hpp>
19#include <boost/asio/io_context.hpp>
20#include <boost/container/flat_map.hpp>
21#include <phosphor-logging/lg2.hpp>
22#include <sdbusplus/asio/connection.hpp>
23#include <sdbusplus/asio/object_server.hpp>
24#include <sdbusplus/message.hpp>
25#include <sdbusplus/message/native_types.hpp>
26
27#include <algorithm>
28#include <chrono>
29#include <cstdint>
30#include <functional>
31#include <map>
32#include <memory>
33#include <string>
34#include <utility>
35#include <variant>
36#include <vector>
37
38using namespace std::chrono_literals;
39
40constexpr std::chrono::milliseconds samplingInterval{1000ms};
41
42std::unique_ptr<GpuDevice> gpuDevice;
43
44GpuDevice::GpuDevice(const std::string& name, const std::string& path,
45 std::shared_ptr<sdbusplus::asio::connection>& conn,
46 boost::asio::io_context& io,
47 mctp::MctpRequester& mctpRequester,
48 sdbusplus::asio::object_server& objectServer) :
49 sensorPollMs(samplingInterval),
50 waitTimer(io, std::chrono::steady_clock::duration(0)),
51 mctpRequester(mctpRequester), conn(conn), objectServer(objectServer),
52 name(escapeName(name)), path(path)
53{
54 discoverGpus();
55}
56
57void GpuDevice::createSensors()
58{
59 sensors.push_back(std::make_shared<GpuTempSensor>(
60 conn, mctpRequester, name + "_TEMP_0", path, eid, objectServer,
61 std::vector<thresholds::Threshold>{}));
62
Harshit Aghera32e3b2b2025-05-05 12:26:35 +053063 lg2::info("Added GPU {NAME} Sensors with chassis path: {PATH}.", "NAME",
64 name, "PATH", path);
Harshit Aghera09f6f2c2025-05-07 16:20:16 +053065
66 readThermalParametersBatched(
67 eid,
68 std::make_shared<std::vector<uint8_t>>(std::vector<uint8_t>{1, 2, 4}),
69 mctpRequester, [this](uint8_t rc, std::vector<int32_t> thresholds) {
70 if (rc)
71 {
72 return;
73 }
74
75 std::vector<thresholds::Threshold> tLimitThresholds{
76 thresholds::Threshold{thresholds::Level::CRITICAL,
77 thresholds::Direction::LOW,
78 static_cast<double>(thresholds[0])},
79 thresholds::Threshold{thresholds::Level::WARNING,
80 thresholds::Direction::LOW,
81 static_cast<double>(thresholds[1])},
82 thresholds::Threshold{thresholds::Level::HARDSHUTDOWN,
83 thresholds::Direction::LOW,
84 static_cast<double>(thresholds[2])}};
85
86 sensors.push_back(std::make_shared<GpuTLimitSensor>(
87 conn, mctpRequester, name + "_TEMP_1", path, eid, objectServer,
88 std::move(tLimitThresholds)));
89 });
Harshit Aghera11b9c1a2025-04-29 17:34:25 +053090}
91
92void GpuDevice::read()
93{
Harshit Aghera32e3b2b2025-05-05 12:26:35 +053094 for (const auto& sensor : sensors)
Harshit Aghera11b9c1a2025-04-29 17:34:25 +053095 {
96 sensor->update();
97 }
98
99 waitTimer.expires_after(std::chrono::milliseconds(sensorPollMs));
100 waitTimer.async_wait([this](const boost::system::error_code& ec) {
101 if (ec)
102 {
103 return;
104 }
105 read();
106 });
107}
108
109void GpuDevice::processGpuEndpoint(uint8_t eid)
110{
111 std::vector<uint8_t> reqMsg(
112 sizeof(ocp::accelerator_management::BindingPciVid) +
113 sizeof(gpu::QueryDeviceIdentificationRequest));
114
115 auto* msg = new (reqMsg.data()) ocp::accelerator_management::Message;
116
117 auto rc = gpu::encodeQueryDeviceIdentificationRequest(0, *msg);
118 if (rc != ocp::accelerator_management::CompletionCode::SUCCESS)
119 {
120 lg2::error(
121 "GpuDevice::processGpuEndPoint(): gpuEncodeQueryDeviceIdentificationRequest failed, rc={RC}",
122 "RC", static_cast<int>(rc));
123 return;
124 }
125
126 mctpRequester.sendRecvMsg(
127 eid, reqMsg,
128 [this, eid](int sendRecvMsgResult, std::vector<uint8_t> respMsg) {
129 if (sendRecvMsgResult != 0)
130 {
131 lg2::error(
132 "GpuDevice::processGpuEndPoint(): MctpRequester::sendRecvMsg() failed, rc={RC}",
133 "RC", sendRecvMsgResult);
134 return;
135 }
136
137 if (respMsg.empty())
138 {
139 lg2::error(
140 "GpuDevice::processGpuEndPoint(): MctpRequester::sendRecvMsg() failed, respMsgLen=0");
141 return;
142 }
143
144 uint8_t cc = 0;
145 uint16_t reasonCode = 0;
146 uint8_t responseDeviceType = 0;
147 uint8_t responseInstanceId = 0;
148
149 auto rc = gpu::decodeQueryDeviceIdentificationResponse(
150 *new (respMsg.data()) ocp::accelerator_management::Message,
151 respMsg.size(), cc, reasonCode, responseDeviceType,
152 responseInstanceId);
153
154 if (rc != ocp::accelerator_management::CompletionCode::SUCCESS ||
155 cc != static_cast<uint8_t>(
156 ocp::accelerator_management::CompletionCode::SUCCESS))
157 {
158 lg2::error(
159 "GpuDevice::processGpuEndPoint(): gpuDecodeQueryDeviceIdentificationResponse() failed, rc={RC} cc={CC} reasonCode={RESC}",
160 "RC", static_cast<int>(rc), "CC", cc, "RESC", reasonCode);
161 return;
162 }
163
164 if (responseDeviceType ==
165 static_cast<uint8_t>(gpu::DeviceIdentification::DEVICE_GPU))
166 {
167 lg2::info(
168 "GpuDevice::processGpuEndPoint(): found the GPU with EID {EID}, DeviceType {DEVTYPE}, InstanceId {IID}.",
169 "EID", eid, "DEVTYPE", responseDeviceType, "IID",
170 responseInstanceId);
171
172 this->eid = eid;
173 this->createSensors();
174 this->read();
175 }
176 });
177}
178
179void GpuDevice::processMctpEndpoints(const boost::system::error_code& ec,
180 const getSubTreeRet& ret)
181{
182 if (ec)
183 {
184 lg2::error("GpuDevice::discoverGpus(): Error:{ERROR}", "ERROR",
185 ec.message());
186 return;
187 }
188
189 if (ret.empty())
190 {
191 return;
192 }
193
194 for (const auto& [objPath, services] : ret)
195 {
196 for (const auto& [service, ifaces] : services)
197 {
198 for (const auto& iface : ifaces)
199 {
200 if (iface == "xyz.openbmc_project.MCTP.Endpoint")
201 {
202 conn->async_method_call(
203 [this](const boost::system::error_code& ec,
204 const GpuSensorConfigMap& configs) {
205 this->processEndpointConfigs(ec, configs);
206 },
207 service, objPath, "org.freedesktop.DBus.Properties",
208 "GetAll", iface);
209 }
210 }
211 }
212 }
213}
214
215void GpuDevice::processEndpointConfigs(const boost::system::error_code& ec,
216 const GpuSensorConfigMap& configs)
217{
218 if (ec)
219 {
220 lg2::error("GpuDevice::discoverGpus(): Error:{ERROR}", "ERROR",
221 ec.message());
222 return;
223 }
224
225 uint8_t eid{};
226 std::vector<uint8_t> mctpTypes{};
227
228 auto hasEid = configs.find("EID");
229 if (hasEid != configs.end())
230 {
231 const auto* eidPtr = std::get_if<uint8_t>(&hasEid->second);
232 if (eidPtr != nullptr)
233 {
234 eid = *eidPtr;
235 }
236 else
237 {
238 lg2::error(
239 "GpuDevice::discoverGpus(): Property EID does not have valid type.");
240 return;
241 }
242 }
243 else
244 {
245 lg2::error(
246 "GpuDevice::discoverGpus(): Property EID not found in the configuration.");
247 return;
248 }
249
250 auto hasMctpTypes = configs.find("SupportedMessageTypes");
251 if (hasMctpTypes != configs.end())
252 {
253 const auto* mctpTypePtr =
254 std::get_if<std::vector<uint8_t>>(&hasMctpTypes->second);
255 if (mctpTypePtr != nullptr)
256 {
257 mctpTypes = *mctpTypePtr;
258 }
259 else
260 {
261 lg2::error(
262 "GpuDevice::discoverGpus(): Property SupportedMessageTypes does not have valid type.");
263 return;
264 }
265 }
266 else
267 {
268 lg2::error(
269 "GpuDevice::discoverGpus(): Property SupportedMessageTypes not found in the configuration.");
270 return;
271 }
272
273 if (std::find(mctpTypes.begin(), mctpTypes.end(),
274 ocp::accelerator_management::messageType) != mctpTypes.end())
275 {
276 lg2::info(
277 "GpuDevice::discoverGpus(): Found OCP MCTP VDM Endpoint with ID {EID}",
278 "EID", eid);
279 this->processGpuEndpoint(eid);
280 }
281}
282
283void GpuDevice::discoverGpus()
284{
285 std::string searchPath{"/au/com/codeconstruct/"};
286 std::vector<std::string> ifaceList{{"xyz.openbmc_project.MCTP.Endpoint"}};
287
288 conn->async_method_call(
289 [this](const boost::system::error_code& ec, const getSubTreeRet& ret) {
290 processMctpEndpoints(ec, ret);
291 },
292 "xyz.openbmc_project.ObjectMapper",
293 "/xyz/openbmc_project/object_mapper",
294 "xyz.openbmc_project.ObjectMapper", "GetSubTree", searchPath, 0,
295 ifaceList);
296}
297
298void processSensorConfigs(
299 boost::asio::io_context& io, sdbusplus::asio::object_server& objectServer,
300 boost::container::flat_map<std::string, std::shared_ptr<GpuDevice>>&
301 gpuDevice,
302 std::shared_ptr<sdbusplus::asio::connection>& dbusConnection,
303 mctp::MctpRequester& mctpRequester, const ManagedObjectType& resp)
304{
305 for (const auto& [path, interfaces] : resp)
306 {
307 for (const auto& [intf, cfg] : interfaces)
308 {
309 if (intf != configInterfaceName(sensorType))
310 {
311 continue;
312 }
313
314 std::string name = loadVariant<std::string>(cfg, "Name");
315
316 gpuDevice[name] = std::make_shared<GpuDevice>(
317 name, path, dbusConnection, io, mctpRequester, objectServer);
318 }
319 }
320}
321
322void createSensors(
323 boost::asio::io_context& io, sdbusplus::asio::object_server& objectServer,
324 boost::container::flat_map<std::string, std::shared_ptr<GpuDevice>>&
325 gpuDevice,
326 std::shared_ptr<sdbusplus::asio::connection>& dbusConnection,
327 mctp::MctpRequester& mctpRequester)
328{
329 if (!dbusConnection)
330 {
331 lg2::error("Connection not created");
332 return;
333 }
334 dbusConnection->async_method_call(
335 [&gpuDevice, &mctpRequester, &dbusConnection, &io, &objectServer](
336 boost::system::error_code ec, const ManagedObjectType& resp) {
337 if (ec)
338 {
339 lg2::error("Error contacting entity manager");
340 return;
341 }
342
343 processSensorConfigs(io, objectServer, gpuDevice, dbusConnection,
344 mctpRequester, resp);
345 },
346 entityManagerName, "/xyz/openbmc_project/inventory",
347 "org.freedesktop.DBus.ObjectManager", "GetManagedObjects");
348}
349
350void interfaceRemoved(
351 sdbusplus::message_t& message,
352 boost::container::flat_map<std::string, std::shared_ptr<GpuDevice>>&
353 gpuDevice)
354{
355 if (message.is_method_error())
356 {
357 lg2::error("interfacesRemoved callback method error");
358 return;
359 }
360
361 sdbusplus::message::object_path removedPath;
362 std::vector<std::string> interfaces;
363
364 message.read(removedPath, interfaces);
365
366 // If the xyz.openbmc_project.Confguration.X interface was removed
367 // for one or more sensors, delete those sensor objects.
368 auto sensorIt = gpuDevice.begin();
369 while (sensorIt != gpuDevice.end())
370 {
371 if ((sensorIt->second->getPath() == removedPath) &&
372 (std::find(interfaces.begin(), interfaces.end(),
373 configInterfaceName(sensorType)) != interfaces.end()))
374 {
375 sensorIt = gpuDevice.erase(sensorIt);
376 }
377 else
378 {
379 sensorIt++;
380 }
381 }
382}