blob: acb6fecaa781a8753415b6824ec2de3a7eb617cc [file] [log] [blame]
/*
 * SPDX-FileCopyrightText: Copyright OpenBMC Authors
 * SPDX-License-Identifier: Apache-2.0
 */
5
6#include "NvidiaDeviceDiscovery.hpp"
7
8#include "NvidiaGpuDevice.hpp"
Harshit Aghera8951c872025-06-25 15:25:33 +05309#include "NvidiaSmaDevice.hpp"
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +053010#include "Utils.hpp"
11
12#include <bits/basic_string.h>
13
14#include <MctpRequester.hpp>
15#include <NvidiaGpuMctpVdm.hpp>
16#include <OcpMctpVdm.hpp>
17#include <boost/asio/io_context.hpp>
18#include <boost/container/flat_map.hpp>
19#include <phosphor-logging/lg2.hpp>
20#include <sdbusplus/asio/connection.hpp>
21#include <sdbusplus/asio/object_server.hpp>
22#include <sdbusplus/message.hpp>
23#include <sdbusplus/message/native_types.hpp>
24
25#include <algorithm>
26#include <array>
27#include <cstdint>
28#include <memory>
29#include <span>
Deepak Kodihalli0ad3a7e2025-06-26 12:24:18 +053030#include <stdexcept>
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +053031#include <string>
Marc Olberdingd0125c92025-10-08 14:37:19 -070032#include <system_error>
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +053033#include <utility>
34#include <variant>
35#include <vector>
36
namespace
{
// Fallback poll interval (milliseconds, going by the name) that is applied
// when a configuration does not provide the optional "PollRate" property.
constexpr auto sensorPollRateMs = 1000;
} // namespace
38
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +053039void processQueryDeviceIdResponse(
40 boost::asio::io_context& io, sdbusplus::asio::object_server& objectServer,
41 boost::container::flat_map<std::string, std::shared_ptr<GpuDevice>>&
42 gpuDevices,
Harshit Aghera8951c872025-06-25 15:25:33 +053043 boost::container::flat_map<std::string, std::shared_ptr<SmaDevice>>&
44 smaDevices,
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +053045 const std::shared_ptr<sdbusplus::asio::connection>& conn,
46 mctp::MctpRequester& mctpRequester, const SensorConfigs& configs,
Marc Olberdingd0125c92025-10-08 14:37:19 -070047 const std::string& path, uint8_t eid,
48 const std::error_code& sendRecvMsgResult,
49 std::span<const uint8_t> queryDeviceIdentificationResponse)
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +053050{
Marc Olberdingd0125c92025-10-08 14:37:19 -070051 if (sendRecvMsgResult)
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +053052 {
53 lg2::error(
54 "Error processing MCTP endpoint with eid {EID} : sending message over MCTP failed, rc={RC}",
Marc Olberdingd0125c92025-10-08 14:37:19 -070055 "EID", eid, "RC", sendRecvMsgResult.message());
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +053056 return;
57 }
58
59 ocp::accelerator_management::CompletionCode cc{};
60 uint16_t reasonCode = 0;
61 uint8_t responseDeviceType = 0;
62 uint8_t responseInstanceId = 0;
63
64 auto rc = gpu::decodeQueryDeviceIdentificationResponse(
65 queryDeviceIdentificationResponse, cc, reasonCode, responseDeviceType,
66 responseInstanceId);
67
68 if (rc != 0 || cc != ocp::accelerator_management::CompletionCode::SUCCESS)
69 {
70 lg2::error(
71 "Error processing MCTP endpoint with eid {EID} : decode failed, rc={RC}, cc={CC}, reasonCode={RESC}",
72 "EID", eid, "RC", rc, "CC", cc, "RESC", reasonCode);
73 return;
74 }
75
Harshit Aghera8951c872025-06-25 15:25:33 +053076 switch (static_cast<gpu::DeviceIdentification>(responseDeviceType))
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +053077 {
Harshit Aghera8951c872025-06-25 15:25:33 +053078 case gpu::DeviceIdentification::DEVICE_GPU:
79 {
80 lg2::info(
81 "Found the GPU with EID {EID}, DeviceType {DEVTYPE}, InstanceId {IID}.",
82 "EID", eid, "DEVTYPE", responseDeviceType, "IID",
83 responseInstanceId);
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +053084
Harshit Aghera8951c872025-06-25 15:25:33 +053085 auto gpuName = configs.name + '_' +
86 std::to_string(responseInstanceId);
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +053087
Marc Olberdingac920732025-09-28 21:56:54 -070088 auto gpu = gpuDevices
89 .insert(std::make_pair(
90 gpuName, std::make_shared<GpuDevice>(
91 configs, gpuName, path, conn, eid,
92 io, mctpRequester, objectServer)))
93 .first;
94 (*gpu).second->init();
Harshit Aghera8951c872025-06-25 15:25:33 +053095 break;
96 }
97
98 case gpu::DeviceIdentification::DEVICE_SMA:
99 {
100 lg2::info(
101 "Found the SMA Device with EID {EID}, DeviceType {DEVTYPE}, InstanceId {IID}.",
102 "EID", eid, "DEVTYPE", responseDeviceType, "IID",
103 responseInstanceId);
104
105 auto smaName = configs.name + "_SMA_" +
106 std::to_string(responseInstanceId);
107
Marc Olberdingfd4a3772025-09-24 16:31:02 -0700108 auto sma = smaDevices
109 .insert(std::make_pair(
110 smaName, std::make_shared<SmaDevice>(
111 configs, smaName, path, conn, eid,
112 io, mctpRequester, objectServer)))
113 .first;
114 (*sma).second->init();
Harshit Aghera8951c872025-06-25 15:25:33 +0530115 break;
116 }
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +0530117 }
118}
119
120void queryDeviceIdentification(
121 boost::asio::io_context& io, sdbusplus::asio::object_server& objectServer,
122 boost::container::flat_map<std::string, std::shared_ptr<GpuDevice>>&
123 gpuDevices,
Harshit Aghera8951c872025-06-25 15:25:33 +0530124 boost::container::flat_map<std::string, std::shared_ptr<SmaDevice>>&
125 smaDevices,
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +0530126 const std::shared_ptr<sdbusplus::asio::connection>& conn,
127 mctp::MctpRequester& mctpRequester, const SensorConfigs& configs,
128 const std::string& path, uint8_t eid)
129{
130 auto queryDeviceIdentificationRequest = std::make_shared<
131 std::array<uint8_t, sizeof(gpu::QueryDeviceIdentificationRequest)>>();
132
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +0530133 auto rc = gpu::encodeQueryDeviceIdentificationRequest(
134 0, *queryDeviceIdentificationRequest);
135 if (rc != 0)
136 {
137 lg2::error(
138 "Error processing MCTP endpoint with eid {EID} : encode failed, rc={RC}",
139 "EID", eid, "RC", rc);
140 return;
141 }
142
143 mctpRequester.sendRecvMsg(
144 eid, *queryDeviceIdentificationRequest,
Harshit Aghera8951c872025-06-25 15:25:33 +0530145 [&io, &objectServer, &gpuDevices, &smaDevices, conn, &mctpRequester,
Marc Olberdingd0125c92025-10-08 14:37:19 -0700146 configs, path, eid, queryDeviceIdentificationRequest](
147 const std::error_code& ec, std::span<const uint8_t> response) {
148 processQueryDeviceIdResponse(io, objectServer, gpuDevices,
149 smaDevices, conn, mctpRequester,
150 configs, path, eid, ec, response);
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +0530151 });
152}
153
154void processEndpoint(
155 boost::asio::io_context& io, sdbusplus::asio::object_server& objectServer,
156 boost::container::flat_map<std::string, std::shared_ptr<GpuDevice>>&
157 gpuDevices,
Harshit Aghera8951c872025-06-25 15:25:33 +0530158 boost::container::flat_map<std::string, std::shared_ptr<SmaDevice>>&
159 smaDevices,
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +0530160 const std::shared_ptr<sdbusplus::asio::connection>& conn,
161 mctp::MctpRequester& mctpRequester, const SensorConfigs& configs,
162 const std::string& path, const boost::system::error_code& ec,
163 const SensorBaseConfigMap& endpoint)
164{
165 if (ec)
166 {
167 lg2::error("Error processing MCTP endpoint: Error:{ERROR}", "ERROR",
168 ec.message());
169 return;
170 }
171
172 auto hasEid = endpoint.find("EID");
173 uint8_t eid{};
174
175 if (hasEid != endpoint.end())
176 {
177 const auto* eidPtr = std::get_if<uint8_t>(&hasEid->second);
178 if (eidPtr != nullptr)
179 {
180 eid = *eidPtr;
181 }
182 else
183 {
184 lg2::error(
185 "Error processing MCTP endpoint: Property EID does not have valid type.");
186 return;
187 }
188 }
189 else
190 {
191 lg2::error(
192 "Error processing MCTP endpoint: Property EID not found in the configuration.");
193 return;
194 }
195
196 auto hasMctpTypes = endpoint.find("SupportedMessageTypes");
197 std::vector<uint8_t> mctpTypes{};
198
199 if (hasMctpTypes != endpoint.end())
200 {
201 const auto* mctpTypePtr =
202 std::get_if<std::vector<uint8_t>>(&hasMctpTypes->second);
203 if (mctpTypePtr != nullptr)
204 {
205 mctpTypes = *mctpTypePtr;
206 }
207 else
208 {
209 lg2::error(
210 "Error processing MCTP endpoint with eid {EID} : Property SupportedMessageTypes does not have valid type.",
211 "EID", eid);
212 return;
213 }
214 }
215 else
216 {
217 lg2::error(
218 "Error processing MCTP endpoint with eid {EID} : Property SupportedMessageTypes not found in the configuration.",
219 "EID", eid);
220 return;
221 }
222
223 if (std::find(mctpTypes.begin(), mctpTypes.end(),
224 ocp::accelerator_management::messageType) != mctpTypes.end())
225 {
226 lg2::info("Found OCP MCTP VDM Endpoint with ID {EID}", "EID", eid);
Harshit Aghera8951c872025-06-25 15:25:33 +0530227 queryDeviceIdentification(io, objectServer, gpuDevices, smaDevices,
228 conn, mctpRequester, configs, path, eid);
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +0530229 }
230}
231
232void queryEndpoints(
233 boost::asio::io_context& io, sdbusplus::asio::object_server& objectServer,
234 boost::container::flat_map<std::string, std::shared_ptr<GpuDevice>>&
235 gpuDevices,
Harshit Aghera8951c872025-06-25 15:25:33 +0530236 boost::container::flat_map<std::string, std::shared_ptr<SmaDevice>>&
237 smaDevices,
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +0530238 const std::shared_ptr<sdbusplus::asio::connection>& conn,
239 mctp::MctpRequester& mctpRequester, const SensorConfigs& configs,
240 const std::string& path, const boost::system::error_code& ec,
241 const GetSubTreeType& ret)
242{
243 if (ec)
244 {
245 lg2::error("Error processing MCTP endpoints: {ERROR}", "ERROR",
246 ec.message());
247 return;
248 }
249
250 if (ret.empty())
251 {
252 return;
253 }
254
255 for (const auto& [objPath, services] : ret)
256 {
257 for (const auto& [service, ifaces] : services)
258 {
259 for (const auto& iface : ifaces)
260 {
261 if (iface == "xyz.openbmc_project.MCTP.Endpoint")
262 {
263 conn->async_method_call(
Harshit Aghera8951c872025-06-25 15:25:33 +0530264 [&io, &objectServer, &gpuDevices, &smaDevices, conn,
265 &mctpRequester, configs,
266 path](const boost::system::error_code& ec,
267 const SensorBaseConfigMap& endpoint) {
268 processEndpoint(io, objectServer, gpuDevices,
269 smaDevices, conn, mctpRequester,
270 configs, path, ec, endpoint);
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +0530271 },
272 service, objPath, "org.freedesktop.DBus.Properties",
273 "GetAll", iface);
274 }
275 }
276 }
277 }
278}
279
280void discoverDevices(
281 boost::asio::io_context& io, sdbusplus::asio::object_server& objectServer,
282 boost::container::flat_map<std::string, std::shared_ptr<GpuDevice>>&
283 gpuDevices,
Harshit Aghera8951c872025-06-25 15:25:33 +0530284 boost::container::flat_map<std::string, std::shared_ptr<SmaDevice>>&
285 smaDevices,
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +0530286 const std::shared_ptr<sdbusplus::asio::connection>& conn,
287 mctp::MctpRequester& mctpRequester, const SensorConfigs& configs,
288 const std::string& path)
289{
290 std::string searchPath{"/au/com/codeconstruct/"};
291 std::vector<std::string> ifaceList{{"xyz.openbmc_project.MCTP.Endpoint"}};
292
293 conn->async_method_call(
Harshit Aghera8951c872025-06-25 15:25:33 +0530294 [&io, &objectServer, &gpuDevices, &smaDevices, conn, &mctpRequester,
295 configs,
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +0530296 path](const boost::system::error_code& ec, const GetSubTreeType& ret) {
Harshit Aghera8951c872025-06-25 15:25:33 +0530297 queryEndpoints(io, objectServer, gpuDevices, smaDevices, conn,
298 mctpRequester, configs, path, ec, ret);
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +0530299 },
300 "xyz.openbmc_project.ObjectMapper",
301 "/xyz/openbmc_project/object_mapper",
302 "xyz.openbmc_project.ObjectMapper", "GetSubTree", searchPath, 0,
303 ifaceList);
304}
305
306void processSensorConfigs(
307 boost::asio::io_context& io, sdbusplus::asio::object_server& objectServer,
308 boost::container::flat_map<std::string, std::shared_ptr<GpuDevice>>&
309 gpuDevices,
Harshit Aghera8951c872025-06-25 15:25:33 +0530310 boost::container::flat_map<std::string, std::shared_ptr<SmaDevice>>&
311 smaDevices,
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +0530312 const std::shared_ptr<sdbusplus::asio::connection>& dbusConnection,
313 mctp::MctpRequester& mctpRequester, const ManagedObjectType& resp)
314{
315 for (const auto& [path, interfaces] : resp)
316 {
317 for (const auto& [intf, cfg] : interfaces)
318 {
319 if (intf != configInterfaceName(deviceType))
320 {
321 continue;
322 }
323
324 SensorConfigs configs;
325
326 configs.name = loadVariant<std::string>(cfg, "Name");
327
Deepak Kodihalli0ad3a7e2025-06-26 12:24:18 +0530328 try
329 {
330 configs.pollRate = loadVariant<uint64_t>(cfg, "PollRate");
331 }
332 catch (const std::invalid_argument&)
333 {
334 // PollRate is an optional config
335 configs.pollRate = sensorPollRateMs;
336 }
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +0530337
Harshit Aghera8951c872025-06-25 15:25:33 +0530338 discoverDevices(io, objectServer, gpuDevices, smaDevices,
339 dbusConnection, mctpRequester, configs, path);
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +0530340
341 lg2::info(
342 "Detected configuration {NAME} of type {TYPE} at path: {PATH}.",
343 "NAME", configs.name, "TYPE", deviceType, "PATH", path);
344 }
345 }
346}
347
348void createSensors(
349 boost::asio::io_context& io, sdbusplus::asio::object_server& objectServer,
350 boost::container::flat_map<std::string, std::shared_ptr<GpuDevice>>&
351 gpuDevices,
Harshit Aghera8951c872025-06-25 15:25:33 +0530352 boost::container::flat_map<std::string, std::shared_ptr<SmaDevice>>&
353 smaDevices,
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +0530354 const std::shared_ptr<sdbusplus::asio::connection>& dbusConnection,
355 mctp::MctpRequester& mctpRequester)
356{
357 if (!dbusConnection)
358 {
359 lg2::error("Connection not created");
360 return;
361 }
362 dbusConnection->async_method_call(
Harshit Aghera8951c872025-06-25 15:25:33 +0530363 [&gpuDevices, &smaDevices, &mctpRequester, dbusConnection, &io,
364 &objectServer](boost::system::error_code ec,
365 const ManagedObjectType& resp) {
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +0530366 if (ec)
367 {
368 lg2::error("Error contacting entity manager");
369 return;
370 }
371
Harshit Aghera8951c872025-06-25 15:25:33 +0530372 processSensorConfigs(io, objectServer, gpuDevices, smaDevices,
373 dbusConnection, mctpRequester, resp);
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +0530374 },
375 entityManagerName, "/xyz/openbmc_project/inventory",
376 "org.freedesktop.DBus.ObjectManager", "GetManagedObjects");
377}
378
379void interfaceRemoved(
380 sdbusplus::message_t& message,
381 boost::container::flat_map<std::string, std::shared_ptr<GpuDevice>>&
Harshit Aghera8951c872025-06-25 15:25:33 +0530382 gpuDevices,
383 boost::container::flat_map<std::string, std::shared_ptr<SmaDevice>>&
384 smaDevices)
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +0530385{
386 if (message.is_method_error())
387 {
388 lg2::error("interfacesRemoved callback method error");
389 return;
390 }
391
392 sdbusplus::message::object_path removedPath;
393 std::vector<std::string> interfaces;
394
395 message.read(removedPath, interfaces);
396
397 // If the xyz.openbmc_project.Confguration.X interface was removed
398 // for one or more sensors, delete those sensor objects.
399 auto sensorIt = gpuDevices.begin();
400 while (sensorIt != gpuDevices.end())
401 {
402 if ((sensorIt->second->getPath() == removedPath) &&
403 (std::find(interfaces.begin(), interfaces.end(),
404 configInterfaceName(deviceType)) != interfaces.end()))
405 {
406 sensorIt = gpuDevices.erase(sensorIt);
407 }
408 else
409 {
410 sensorIt++;
411 }
412 }
Harshit Aghera8951c872025-06-25 15:25:33 +0530413
414 auto smaSensorIt = smaDevices.begin();
415 while (smaSensorIt != smaDevices.end())
416 {
417 if ((smaSensorIt->second->getPath() == removedPath) &&
418 (std::find(interfaces.begin(), interfaces.end(),
419 configInterfaceName(deviceType)) != interfaces.end()))
420 {
421 smaSensorIt = smaDevices.erase(smaSensorIt);
422 }
423 else
424 {
425 smaSensorIt++;
426 }
427 }
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +0530428}