blob: 47ec061355dbad3226fc8d9dec0d72dd7b087106 [file] [log] [blame]
/*
 * SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION &
 * AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: Apache-2.0
 */
6
7#include "NvidiaDeviceDiscovery.hpp"
8
9#include "NvidiaGpuDevice.hpp"
Harshit Aghera8951c872025-06-25 15:25:33 +053010#include "NvidiaSmaDevice.hpp"
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +053011#include "Utils.hpp"
12
13#include <bits/basic_string.h>
14
15#include <MctpRequester.hpp>
16#include <NvidiaGpuMctpVdm.hpp>
17#include <OcpMctpVdm.hpp>
18#include <boost/asio/io_context.hpp>
19#include <boost/container/flat_map.hpp>
20#include <phosphor-logging/lg2.hpp>
21#include <sdbusplus/asio/connection.hpp>
22#include <sdbusplus/asio/object_server.hpp>
23#include <sdbusplus/message.hpp>
24#include <sdbusplus/message/native_types.hpp>
25
26#include <algorithm>
27#include <array>
28#include <cstdint>
29#include <memory>
30#include <span>
Deepak Kodihalli0ad3a7e2025-06-26 12:24:18 +053031#include <stdexcept>
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +053032#include <string>
Marc Olberdingd0125c92025-10-08 14:37:19 -070033#include <system_error>
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +053034#include <utility>
35#include <variant>
36#include <vector>
37
// Fallback poll rate applied when a configuration carries no "PollRate"
// property (milliseconds, going by the name).
static constexpr int sensorPollRateMs = 1000;
39
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +053040void processQueryDeviceIdResponse(
41 boost::asio::io_context& io, sdbusplus::asio::object_server& objectServer,
42 boost::container::flat_map<std::string, std::shared_ptr<GpuDevice>>&
43 gpuDevices,
Harshit Aghera8951c872025-06-25 15:25:33 +053044 boost::container::flat_map<std::string, std::shared_ptr<SmaDevice>>&
45 smaDevices,
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +053046 const std::shared_ptr<sdbusplus::asio::connection>& conn,
47 mctp::MctpRequester& mctpRequester, const SensorConfigs& configs,
Marc Olberdingd0125c92025-10-08 14:37:19 -070048 const std::string& path, uint8_t eid,
49 const std::error_code& sendRecvMsgResult,
50 std::span<const uint8_t> queryDeviceIdentificationResponse)
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +053051{
Marc Olberdingd0125c92025-10-08 14:37:19 -070052 if (sendRecvMsgResult)
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +053053 {
54 lg2::error(
55 "Error processing MCTP endpoint with eid {EID} : sending message over MCTP failed, rc={RC}",
Marc Olberdingd0125c92025-10-08 14:37:19 -070056 "EID", eid, "RC", sendRecvMsgResult.message());
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +053057 return;
58 }
59
60 ocp::accelerator_management::CompletionCode cc{};
61 uint16_t reasonCode = 0;
62 uint8_t responseDeviceType = 0;
63 uint8_t responseInstanceId = 0;
64
65 auto rc = gpu::decodeQueryDeviceIdentificationResponse(
66 queryDeviceIdentificationResponse, cc, reasonCode, responseDeviceType,
67 responseInstanceId);
68
69 if (rc != 0 || cc != ocp::accelerator_management::CompletionCode::SUCCESS)
70 {
71 lg2::error(
72 "Error processing MCTP endpoint with eid {EID} : decode failed, rc={RC}, cc={CC}, reasonCode={RESC}",
73 "EID", eid, "RC", rc, "CC", cc, "RESC", reasonCode);
74 return;
75 }
76
Harshit Aghera8951c872025-06-25 15:25:33 +053077 switch (static_cast<gpu::DeviceIdentification>(responseDeviceType))
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +053078 {
Harshit Aghera8951c872025-06-25 15:25:33 +053079 case gpu::DeviceIdentification::DEVICE_GPU:
80 {
81 lg2::info(
82 "Found the GPU with EID {EID}, DeviceType {DEVTYPE}, InstanceId {IID}.",
83 "EID", eid, "DEVTYPE", responseDeviceType, "IID",
84 responseInstanceId);
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +053085
Harshit Aghera8951c872025-06-25 15:25:33 +053086 auto gpuName = configs.name + '_' +
87 std::to_string(responseInstanceId);
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +053088
Harshit Aghera8951c872025-06-25 15:25:33 +053089 gpuDevices[gpuName] =
90 std::make_shared<GpuDevice>(configs, gpuName, path, conn, eid,
91 io, mctpRequester, objectServer);
92 break;
93 }
94
95 case gpu::DeviceIdentification::DEVICE_SMA:
96 {
97 lg2::info(
98 "Found the SMA Device with EID {EID}, DeviceType {DEVTYPE}, InstanceId {IID}.",
99 "EID", eid, "DEVTYPE", responseDeviceType, "IID",
100 responseInstanceId);
101
102 auto smaName = configs.name + "_SMA_" +
103 std::to_string(responseInstanceId);
104
105 smaDevices[smaName] =
106 std::make_shared<SmaDevice>(configs, smaName, path, conn, eid,
107 io, mctpRequester, objectServer);
108 break;
109 }
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +0530110 }
111}
112
113void queryDeviceIdentification(
114 boost::asio::io_context& io, sdbusplus::asio::object_server& objectServer,
115 boost::container::flat_map<std::string, std::shared_ptr<GpuDevice>>&
116 gpuDevices,
Harshit Aghera8951c872025-06-25 15:25:33 +0530117 boost::container::flat_map<std::string, std::shared_ptr<SmaDevice>>&
118 smaDevices,
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +0530119 const std::shared_ptr<sdbusplus::asio::connection>& conn,
120 mctp::MctpRequester& mctpRequester, const SensorConfigs& configs,
121 const std::string& path, uint8_t eid)
122{
123 auto queryDeviceIdentificationRequest = std::make_shared<
124 std::array<uint8_t, sizeof(gpu::QueryDeviceIdentificationRequest)>>();
125
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +0530126 auto rc = gpu::encodeQueryDeviceIdentificationRequest(
127 0, *queryDeviceIdentificationRequest);
128 if (rc != 0)
129 {
130 lg2::error(
131 "Error processing MCTP endpoint with eid {EID} : encode failed, rc={RC}",
132 "EID", eid, "RC", rc);
133 return;
134 }
135
136 mctpRequester.sendRecvMsg(
137 eid, *queryDeviceIdentificationRequest,
Harshit Aghera8951c872025-06-25 15:25:33 +0530138 [&io, &objectServer, &gpuDevices, &smaDevices, conn, &mctpRequester,
Marc Olberdingd0125c92025-10-08 14:37:19 -0700139 configs, path, eid, queryDeviceIdentificationRequest](
140 const std::error_code& ec, std::span<const uint8_t> response) {
141 processQueryDeviceIdResponse(io, objectServer, gpuDevices,
142 smaDevices, conn, mctpRequester,
143 configs, path, eid, ec, response);
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +0530144 });
145}
146
147void processEndpoint(
148 boost::asio::io_context& io, sdbusplus::asio::object_server& objectServer,
149 boost::container::flat_map<std::string, std::shared_ptr<GpuDevice>>&
150 gpuDevices,
Harshit Aghera8951c872025-06-25 15:25:33 +0530151 boost::container::flat_map<std::string, std::shared_ptr<SmaDevice>>&
152 smaDevices,
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +0530153 const std::shared_ptr<sdbusplus::asio::connection>& conn,
154 mctp::MctpRequester& mctpRequester, const SensorConfigs& configs,
155 const std::string& path, const boost::system::error_code& ec,
156 const SensorBaseConfigMap& endpoint)
157{
158 if (ec)
159 {
160 lg2::error("Error processing MCTP endpoint: Error:{ERROR}", "ERROR",
161 ec.message());
162 return;
163 }
164
165 auto hasEid = endpoint.find("EID");
166 uint8_t eid{};
167
168 if (hasEid != endpoint.end())
169 {
170 const auto* eidPtr = std::get_if<uint8_t>(&hasEid->second);
171 if (eidPtr != nullptr)
172 {
173 eid = *eidPtr;
174 }
175 else
176 {
177 lg2::error(
178 "Error processing MCTP endpoint: Property EID does not have valid type.");
179 return;
180 }
181 }
182 else
183 {
184 lg2::error(
185 "Error processing MCTP endpoint: Property EID not found in the configuration.");
186 return;
187 }
188
189 auto hasMctpTypes = endpoint.find("SupportedMessageTypes");
190 std::vector<uint8_t> mctpTypes{};
191
192 if (hasMctpTypes != endpoint.end())
193 {
194 const auto* mctpTypePtr =
195 std::get_if<std::vector<uint8_t>>(&hasMctpTypes->second);
196 if (mctpTypePtr != nullptr)
197 {
198 mctpTypes = *mctpTypePtr;
199 }
200 else
201 {
202 lg2::error(
203 "Error processing MCTP endpoint with eid {EID} : Property SupportedMessageTypes does not have valid type.",
204 "EID", eid);
205 return;
206 }
207 }
208 else
209 {
210 lg2::error(
211 "Error processing MCTP endpoint with eid {EID} : Property SupportedMessageTypes not found in the configuration.",
212 "EID", eid);
213 return;
214 }
215
216 if (std::find(mctpTypes.begin(), mctpTypes.end(),
217 ocp::accelerator_management::messageType) != mctpTypes.end())
218 {
219 lg2::info("Found OCP MCTP VDM Endpoint with ID {EID}", "EID", eid);
Harshit Aghera8951c872025-06-25 15:25:33 +0530220 queryDeviceIdentification(io, objectServer, gpuDevices, smaDevices,
221 conn, mctpRequester, configs, path, eid);
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +0530222 }
223}
224
225void queryEndpoints(
226 boost::asio::io_context& io, sdbusplus::asio::object_server& objectServer,
227 boost::container::flat_map<std::string, std::shared_ptr<GpuDevice>>&
228 gpuDevices,
Harshit Aghera8951c872025-06-25 15:25:33 +0530229 boost::container::flat_map<std::string, std::shared_ptr<SmaDevice>>&
230 smaDevices,
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +0530231 const std::shared_ptr<sdbusplus::asio::connection>& conn,
232 mctp::MctpRequester& mctpRequester, const SensorConfigs& configs,
233 const std::string& path, const boost::system::error_code& ec,
234 const GetSubTreeType& ret)
235{
236 if (ec)
237 {
238 lg2::error("Error processing MCTP endpoints: {ERROR}", "ERROR",
239 ec.message());
240 return;
241 }
242
243 if (ret.empty())
244 {
245 return;
246 }
247
248 for (const auto& [objPath, services] : ret)
249 {
250 for (const auto& [service, ifaces] : services)
251 {
252 for (const auto& iface : ifaces)
253 {
254 if (iface == "xyz.openbmc_project.MCTP.Endpoint")
255 {
256 conn->async_method_call(
Harshit Aghera8951c872025-06-25 15:25:33 +0530257 [&io, &objectServer, &gpuDevices, &smaDevices, conn,
258 &mctpRequester, configs,
259 path](const boost::system::error_code& ec,
260 const SensorBaseConfigMap& endpoint) {
261 processEndpoint(io, objectServer, gpuDevices,
262 smaDevices, conn, mctpRequester,
263 configs, path, ec, endpoint);
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +0530264 },
265 service, objPath, "org.freedesktop.DBus.Properties",
266 "GetAll", iface);
267 }
268 }
269 }
270 }
271}
272
273void discoverDevices(
274 boost::asio::io_context& io, sdbusplus::asio::object_server& objectServer,
275 boost::container::flat_map<std::string, std::shared_ptr<GpuDevice>>&
276 gpuDevices,
Harshit Aghera8951c872025-06-25 15:25:33 +0530277 boost::container::flat_map<std::string, std::shared_ptr<SmaDevice>>&
278 smaDevices,
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +0530279 const std::shared_ptr<sdbusplus::asio::connection>& conn,
280 mctp::MctpRequester& mctpRequester, const SensorConfigs& configs,
281 const std::string& path)
282{
283 std::string searchPath{"/au/com/codeconstruct/"};
284 std::vector<std::string> ifaceList{{"xyz.openbmc_project.MCTP.Endpoint"}};
285
286 conn->async_method_call(
Harshit Aghera8951c872025-06-25 15:25:33 +0530287 [&io, &objectServer, &gpuDevices, &smaDevices, conn, &mctpRequester,
288 configs,
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +0530289 path](const boost::system::error_code& ec, const GetSubTreeType& ret) {
Harshit Aghera8951c872025-06-25 15:25:33 +0530290 queryEndpoints(io, objectServer, gpuDevices, smaDevices, conn,
291 mctpRequester, configs, path, ec, ret);
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +0530292 },
293 "xyz.openbmc_project.ObjectMapper",
294 "/xyz/openbmc_project/object_mapper",
295 "xyz.openbmc_project.ObjectMapper", "GetSubTree", searchPath, 0,
296 ifaceList);
297}
298
299void processSensorConfigs(
300 boost::asio::io_context& io, sdbusplus::asio::object_server& objectServer,
301 boost::container::flat_map<std::string, std::shared_ptr<GpuDevice>>&
302 gpuDevices,
Harshit Aghera8951c872025-06-25 15:25:33 +0530303 boost::container::flat_map<std::string, std::shared_ptr<SmaDevice>>&
304 smaDevices,
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +0530305 const std::shared_ptr<sdbusplus::asio::connection>& dbusConnection,
306 mctp::MctpRequester& mctpRequester, const ManagedObjectType& resp)
307{
308 for (const auto& [path, interfaces] : resp)
309 {
310 for (const auto& [intf, cfg] : interfaces)
311 {
312 if (intf != configInterfaceName(deviceType))
313 {
314 continue;
315 }
316
317 SensorConfigs configs;
318
319 configs.name = loadVariant<std::string>(cfg, "Name");
320
Deepak Kodihalli0ad3a7e2025-06-26 12:24:18 +0530321 try
322 {
323 configs.pollRate = loadVariant<uint64_t>(cfg, "PollRate");
324 }
325 catch (const std::invalid_argument&)
326 {
327 // PollRate is an optional config
328 configs.pollRate = sensorPollRateMs;
329 }
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +0530330
Harshit Aghera8951c872025-06-25 15:25:33 +0530331 discoverDevices(io, objectServer, gpuDevices, smaDevices,
332 dbusConnection, mctpRequester, configs, path);
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +0530333
334 lg2::info(
335 "Detected configuration {NAME} of type {TYPE} at path: {PATH}.",
336 "NAME", configs.name, "TYPE", deviceType, "PATH", path);
337 }
338 }
339}
340
341void createSensors(
342 boost::asio::io_context& io, sdbusplus::asio::object_server& objectServer,
343 boost::container::flat_map<std::string, std::shared_ptr<GpuDevice>>&
344 gpuDevices,
Harshit Aghera8951c872025-06-25 15:25:33 +0530345 boost::container::flat_map<std::string, std::shared_ptr<SmaDevice>>&
346 smaDevices,
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +0530347 const std::shared_ptr<sdbusplus::asio::connection>& dbusConnection,
348 mctp::MctpRequester& mctpRequester)
349{
350 if (!dbusConnection)
351 {
352 lg2::error("Connection not created");
353 return;
354 }
355 dbusConnection->async_method_call(
Harshit Aghera8951c872025-06-25 15:25:33 +0530356 [&gpuDevices, &smaDevices, &mctpRequester, dbusConnection, &io,
357 &objectServer](boost::system::error_code ec,
358 const ManagedObjectType& resp) {
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +0530359 if (ec)
360 {
361 lg2::error("Error contacting entity manager");
362 return;
363 }
364
Harshit Aghera8951c872025-06-25 15:25:33 +0530365 processSensorConfigs(io, objectServer, gpuDevices, smaDevices,
366 dbusConnection, mctpRequester, resp);
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +0530367 },
368 entityManagerName, "/xyz/openbmc_project/inventory",
369 "org.freedesktop.DBus.ObjectManager", "GetManagedObjects");
370}
371
372void interfaceRemoved(
373 sdbusplus::message_t& message,
374 boost::container::flat_map<std::string, std::shared_ptr<GpuDevice>>&
Harshit Aghera8951c872025-06-25 15:25:33 +0530375 gpuDevices,
376 boost::container::flat_map<std::string, std::shared_ptr<SmaDevice>>&
377 smaDevices)
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +0530378{
379 if (message.is_method_error())
380 {
381 lg2::error("interfacesRemoved callback method error");
382 return;
383 }
384
385 sdbusplus::message::object_path removedPath;
386 std::vector<std::string> interfaces;
387
388 message.read(removedPath, interfaces);
389
390 // If the xyz.openbmc_project.Confguration.X interface was removed
391 // for one or more sensors, delete those sensor objects.
392 auto sensorIt = gpuDevices.begin();
393 while (sensorIt != gpuDevices.end())
394 {
395 if ((sensorIt->second->getPath() == removedPath) &&
396 (std::find(interfaces.begin(), interfaces.end(),
397 configInterfaceName(deviceType)) != interfaces.end()))
398 {
399 sensorIt = gpuDevices.erase(sensorIt);
400 }
401 else
402 {
403 sensorIt++;
404 }
405 }
Harshit Aghera8951c872025-06-25 15:25:33 +0530406
407 auto smaSensorIt = smaDevices.begin();
408 while (smaSensorIt != smaDevices.end())
409 {
410 if ((smaSensorIt->second->getPath() == removedPath) &&
411 (std::find(interfaces.begin(), interfaces.end(),
412 configInterfaceName(deviceType)) != interfaces.end()))
413 {
414 smaSensorIt = smaDevices.erase(smaSensorIt);
415 }
416 else
417 {
418 smaSensorIt++;
419 }
420 }
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +0530421}