blob: 9afb000715298c8892a6dfb8dfd9914e8054b4cc [file] [log] [blame]
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +05301/*
2 * SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION &
3 * AFFILIATES. All rights reserved.
4 * SPDX-License-Identifier: Apache-2.0
5 */
6
7#include "NvidiaDeviceDiscovery.hpp"
8
9#include "NvidiaGpuDevice.hpp"
Harshit Aghera8951c872025-06-25 15:25:33 +053010#include "NvidiaSmaDevice.hpp"
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +053011#include "Utils.hpp"
12
13#include <bits/basic_string.h>
14
15#include <MctpRequester.hpp>
16#include <NvidiaGpuMctpVdm.hpp>
17#include <OcpMctpVdm.hpp>
18#include <boost/asio/io_context.hpp>
19#include <boost/container/flat_map.hpp>
20#include <phosphor-logging/lg2.hpp>
21#include <sdbusplus/asio/connection.hpp>
22#include <sdbusplus/asio/object_server.hpp>
23#include <sdbusplus/message.hpp>
24#include <sdbusplus/message/native_types.hpp>
25
26#include <algorithm>
27#include <array>
28#include <cstdint>
29#include <memory>
30#include <span>
Deepak Kodihalli0ad3a7e2025-06-26 12:24:18 +053031#include <stdexcept>
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +053032#include <string>
Marc Olberdingd0125c92025-10-08 14:37:19 -070033#include <system_error>
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +053034#include <utility>
35#include <variant>
36#include <vector>
37
// Poll rate applied when a configuration does not provide the optional
// "PollRate" property (milliseconds, going by the name).
static constexpr int sensorPollRateMs{1000};
39
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +053040void processQueryDeviceIdResponse(
41 boost::asio::io_context& io, sdbusplus::asio::object_server& objectServer,
42 boost::container::flat_map<std::string, std::shared_ptr<GpuDevice>>&
43 gpuDevices,
Harshit Aghera8951c872025-06-25 15:25:33 +053044 boost::container::flat_map<std::string, std::shared_ptr<SmaDevice>>&
45 smaDevices,
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +053046 const std::shared_ptr<sdbusplus::asio::connection>& conn,
47 mctp::MctpRequester& mctpRequester, const SensorConfigs& configs,
Marc Olberdingd0125c92025-10-08 14:37:19 -070048 const std::string& path, uint8_t eid,
49 const std::error_code& sendRecvMsgResult,
50 std::span<const uint8_t> queryDeviceIdentificationResponse)
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +053051{
Marc Olberdingd0125c92025-10-08 14:37:19 -070052 if (sendRecvMsgResult)
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +053053 {
54 lg2::error(
55 "Error processing MCTP endpoint with eid {EID} : sending message over MCTP failed, rc={RC}",
Marc Olberdingd0125c92025-10-08 14:37:19 -070056 "EID", eid, "RC", sendRecvMsgResult.message());
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +053057 return;
58 }
59
60 ocp::accelerator_management::CompletionCode cc{};
61 uint16_t reasonCode = 0;
62 uint8_t responseDeviceType = 0;
63 uint8_t responseInstanceId = 0;
64
65 auto rc = gpu::decodeQueryDeviceIdentificationResponse(
66 queryDeviceIdentificationResponse, cc, reasonCode, responseDeviceType,
67 responseInstanceId);
68
69 if (rc != 0 || cc != ocp::accelerator_management::CompletionCode::SUCCESS)
70 {
71 lg2::error(
72 "Error processing MCTP endpoint with eid {EID} : decode failed, rc={RC}, cc={CC}, reasonCode={RESC}",
73 "EID", eid, "RC", rc, "CC", cc, "RESC", reasonCode);
74 return;
75 }
76
Harshit Aghera8951c872025-06-25 15:25:33 +053077 switch (static_cast<gpu::DeviceIdentification>(responseDeviceType))
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +053078 {
Harshit Aghera8951c872025-06-25 15:25:33 +053079 case gpu::DeviceIdentification::DEVICE_GPU:
80 {
81 lg2::info(
82 "Found the GPU with EID {EID}, DeviceType {DEVTYPE}, InstanceId {IID}.",
83 "EID", eid, "DEVTYPE", responseDeviceType, "IID",
84 responseInstanceId);
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +053085
Harshit Aghera8951c872025-06-25 15:25:33 +053086 auto gpuName = configs.name + '_' +
87 std::to_string(responseInstanceId);
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +053088
Marc Olberdingac920732025-09-28 21:56:54 -070089 auto gpu = gpuDevices
90 .insert(std::make_pair(
91 gpuName, std::make_shared<GpuDevice>(
92 configs, gpuName, path, conn, eid,
93 io, mctpRequester, objectServer)))
94 .first;
95 (*gpu).second->init();
Harshit Aghera8951c872025-06-25 15:25:33 +053096 break;
97 }
98
99 case gpu::DeviceIdentification::DEVICE_SMA:
100 {
101 lg2::info(
102 "Found the SMA Device with EID {EID}, DeviceType {DEVTYPE}, InstanceId {IID}.",
103 "EID", eid, "DEVTYPE", responseDeviceType, "IID",
104 responseInstanceId);
105
106 auto smaName = configs.name + "_SMA_" +
107 std::to_string(responseInstanceId);
108
109 smaDevices[smaName] =
110 std::make_shared<SmaDevice>(configs, smaName, path, conn, eid,
111 io, mctpRequester, objectServer);
112 break;
113 }
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +0530114 }
115}
116
117void queryDeviceIdentification(
118 boost::asio::io_context& io, sdbusplus::asio::object_server& objectServer,
119 boost::container::flat_map<std::string, std::shared_ptr<GpuDevice>>&
120 gpuDevices,
Harshit Aghera8951c872025-06-25 15:25:33 +0530121 boost::container::flat_map<std::string, std::shared_ptr<SmaDevice>>&
122 smaDevices,
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +0530123 const std::shared_ptr<sdbusplus::asio::connection>& conn,
124 mctp::MctpRequester& mctpRequester, const SensorConfigs& configs,
125 const std::string& path, uint8_t eid)
126{
127 auto queryDeviceIdentificationRequest = std::make_shared<
128 std::array<uint8_t, sizeof(gpu::QueryDeviceIdentificationRequest)>>();
129
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +0530130 auto rc = gpu::encodeQueryDeviceIdentificationRequest(
131 0, *queryDeviceIdentificationRequest);
132 if (rc != 0)
133 {
134 lg2::error(
135 "Error processing MCTP endpoint with eid {EID} : encode failed, rc={RC}",
136 "EID", eid, "RC", rc);
137 return;
138 }
139
140 mctpRequester.sendRecvMsg(
141 eid, *queryDeviceIdentificationRequest,
Harshit Aghera8951c872025-06-25 15:25:33 +0530142 [&io, &objectServer, &gpuDevices, &smaDevices, conn, &mctpRequester,
Marc Olberdingd0125c92025-10-08 14:37:19 -0700143 configs, path, eid, queryDeviceIdentificationRequest](
144 const std::error_code& ec, std::span<const uint8_t> response) {
145 processQueryDeviceIdResponse(io, objectServer, gpuDevices,
146 smaDevices, conn, mctpRequester,
147 configs, path, eid, ec, response);
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +0530148 });
149}
150
151void processEndpoint(
152 boost::asio::io_context& io, sdbusplus::asio::object_server& objectServer,
153 boost::container::flat_map<std::string, std::shared_ptr<GpuDevice>>&
154 gpuDevices,
Harshit Aghera8951c872025-06-25 15:25:33 +0530155 boost::container::flat_map<std::string, std::shared_ptr<SmaDevice>>&
156 smaDevices,
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +0530157 const std::shared_ptr<sdbusplus::asio::connection>& conn,
158 mctp::MctpRequester& mctpRequester, const SensorConfigs& configs,
159 const std::string& path, const boost::system::error_code& ec,
160 const SensorBaseConfigMap& endpoint)
161{
162 if (ec)
163 {
164 lg2::error("Error processing MCTP endpoint: Error:{ERROR}", "ERROR",
165 ec.message());
166 return;
167 }
168
169 auto hasEid = endpoint.find("EID");
170 uint8_t eid{};
171
172 if (hasEid != endpoint.end())
173 {
174 const auto* eidPtr = std::get_if<uint8_t>(&hasEid->second);
175 if (eidPtr != nullptr)
176 {
177 eid = *eidPtr;
178 }
179 else
180 {
181 lg2::error(
182 "Error processing MCTP endpoint: Property EID does not have valid type.");
183 return;
184 }
185 }
186 else
187 {
188 lg2::error(
189 "Error processing MCTP endpoint: Property EID not found in the configuration.");
190 return;
191 }
192
193 auto hasMctpTypes = endpoint.find("SupportedMessageTypes");
194 std::vector<uint8_t> mctpTypes{};
195
196 if (hasMctpTypes != endpoint.end())
197 {
198 const auto* mctpTypePtr =
199 std::get_if<std::vector<uint8_t>>(&hasMctpTypes->second);
200 if (mctpTypePtr != nullptr)
201 {
202 mctpTypes = *mctpTypePtr;
203 }
204 else
205 {
206 lg2::error(
207 "Error processing MCTP endpoint with eid {EID} : Property SupportedMessageTypes does not have valid type.",
208 "EID", eid);
209 return;
210 }
211 }
212 else
213 {
214 lg2::error(
215 "Error processing MCTP endpoint with eid {EID} : Property SupportedMessageTypes not found in the configuration.",
216 "EID", eid);
217 return;
218 }
219
220 if (std::find(mctpTypes.begin(), mctpTypes.end(),
221 ocp::accelerator_management::messageType) != mctpTypes.end())
222 {
223 lg2::info("Found OCP MCTP VDM Endpoint with ID {EID}", "EID", eid);
Harshit Aghera8951c872025-06-25 15:25:33 +0530224 queryDeviceIdentification(io, objectServer, gpuDevices, smaDevices,
225 conn, mctpRequester, configs, path, eid);
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +0530226 }
227}
228
229void queryEndpoints(
230 boost::asio::io_context& io, sdbusplus::asio::object_server& objectServer,
231 boost::container::flat_map<std::string, std::shared_ptr<GpuDevice>>&
232 gpuDevices,
Harshit Aghera8951c872025-06-25 15:25:33 +0530233 boost::container::flat_map<std::string, std::shared_ptr<SmaDevice>>&
234 smaDevices,
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +0530235 const std::shared_ptr<sdbusplus::asio::connection>& conn,
236 mctp::MctpRequester& mctpRequester, const SensorConfigs& configs,
237 const std::string& path, const boost::system::error_code& ec,
238 const GetSubTreeType& ret)
239{
240 if (ec)
241 {
242 lg2::error("Error processing MCTP endpoints: {ERROR}", "ERROR",
243 ec.message());
244 return;
245 }
246
247 if (ret.empty())
248 {
249 return;
250 }
251
252 for (const auto& [objPath, services] : ret)
253 {
254 for (const auto& [service, ifaces] : services)
255 {
256 for (const auto& iface : ifaces)
257 {
258 if (iface == "xyz.openbmc_project.MCTP.Endpoint")
259 {
260 conn->async_method_call(
Harshit Aghera8951c872025-06-25 15:25:33 +0530261 [&io, &objectServer, &gpuDevices, &smaDevices, conn,
262 &mctpRequester, configs,
263 path](const boost::system::error_code& ec,
264 const SensorBaseConfigMap& endpoint) {
265 processEndpoint(io, objectServer, gpuDevices,
266 smaDevices, conn, mctpRequester,
267 configs, path, ec, endpoint);
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +0530268 },
269 service, objPath, "org.freedesktop.DBus.Properties",
270 "GetAll", iface);
271 }
272 }
273 }
274 }
275}
276
277void discoverDevices(
278 boost::asio::io_context& io, sdbusplus::asio::object_server& objectServer,
279 boost::container::flat_map<std::string, std::shared_ptr<GpuDevice>>&
280 gpuDevices,
Harshit Aghera8951c872025-06-25 15:25:33 +0530281 boost::container::flat_map<std::string, std::shared_ptr<SmaDevice>>&
282 smaDevices,
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +0530283 const std::shared_ptr<sdbusplus::asio::connection>& conn,
284 mctp::MctpRequester& mctpRequester, const SensorConfigs& configs,
285 const std::string& path)
286{
287 std::string searchPath{"/au/com/codeconstruct/"};
288 std::vector<std::string> ifaceList{{"xyz.openbmc_project.MCTP.Endpoint"}};
289
290 conn->async_method_call(
Harshit Aghera8951c872025-06-25 15:25:33 +0530291 [&io, &objectServer, &gpuDevices, &smaDevices, conn, &mctpRequester,
292 configs,
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +0530293 path](const boost::system::error_code& ec, const GetSubTreeType& ret) {
Harshit Aghera8951c872025-06-25 15:25:33 +0530294 queryEndpoints(io, objectServer, gpuDevices, smaDevices, conn,
295 mctpRequester, configs, path, ec, ret);
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +0530296 },
297 "xyz.openbmc_project.ObjectMapper",
298 "/xyz/openbmc_project/object_mapper",
299 "xyz.openbmc_project.ObjectMapper", "GetSubTree", searchPath, 0,
300 ifaceList);
301}
302
303void processSensorConfigs(
304 boost::asio::io_context& io, sdbusplus::asio::object_server& objectServer,
305 boost::container::flat_map<std::string, std::shared_ptr<GpuDevice>>&
306 gpuDevices,
Harshit Aghera8951c872025-06-25 15:25:33 +0530307 boost::container::flat_map<std::string, std::shared_ptr<SmaDevice>>&
308 smaDevices,
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +0530309 const std::shared_ptr<sdbusplus::asio::connection>& dbusConnection,
310 mctp::MctpRequester& mctpRequester, const ManagedObjectType& resp)
311{
312 for (const auto& [path, interfaces] : resp)
313 {
314 for (const auto& [intf, cfg] : interfaces)
315 {
316 if (intf != configInterfaceName(deviceType))
317 {
318 continue;
319 }
320
321 SensorConfigs configs;
322
323 configs.name = loadVariant<std::string>(cfg, "Name");
324
Deepak Kodihalli0ad3a7e2025-06-26 12:24:18 +0530325 try
326 {
327 configs.pollRate = loadVariant<uint64_t>(cfg, "PollRate");
328 }
329 catch (const std::invalid_argument&)
330 {
331 // PollRate is an optional config
332 configs.pollRate = sensorPollRateMs;
333 }
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +0530334
Harshit Aghera8951c872025-06-25 15:25:33 +0530335 discoverDevices(io, objectServer, gpuDevices, smaDevices,
336 dbusConnection, mctpRequester, configs, path);
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +0530337
338 lg2::info(
339 "Detected configuration {NAME} of type {TYPE} at path: {PATH}.",
340 "NAME", configs.name, "TYPE", deviceType, "PATH", path);
341 }
342 }
343}
344
345void createSensors(
346 boost::asio::io_context& io, sdbusplus::asio::object_server& objectServer,
347 boost::container::flat_map<std::string, std::shared_ptr<GpuDevice>>&
348 gpuDevices,
Harshit Aghera8951c872025-06-25 15:25:33 +0530349 boost::container::flat_map<std::string, std::shared_ptr<SmaDevice>>&
350 smaDevices,
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +0530351 const std::shared_ptr<sdbusplus::asio::connection>& dbusConnection,
352 mctp::MctpRequester& mctpRequester)
353{
354 if (!dbusConnection)
355 {
356 lg2::error("Connection not created");
357 return;
358 }
359 dbusConnection->async_method_call(
Harshit Aghera8951c872025-06-25 15:25:33 +0530360 [&gpuDevices, &smaDevices, &mctpRequester, dbusConnection, &io,
361 &objectServer](boost::system::error_code ec,
362 const ManagedObjectType& resp) {
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +0530363 if (ec)
364 {
365 lg2::error("Error contacting entity manager");
366 return;
367 }
368
Harshit Aghera8951c872025-06-25 15:25:33 +0530369 processSensorConfigs(io, objectServer, gpuDevices, smaDevices,
370 dbusConnection, mctpRequester, resp);
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +0530371 },
372 entityManagerName, "/xyz/openbmc_project/inventory",
373 "org.freedesktop.DBus.ObjectManager", "GetManagedObjects");
374}
375
376void interfaceRemoved(
377 sdbusplus::message_t& message,
378 boost::container::flat_map<std::string, std::shared_ptr<GpuDevice>>&
Harshit Aghera8951c872025-06-25 15:25:33 +0530379 gpuDevices,
380 boost::container::flat_map<std::string, std::shared_ptr<SmaDevice>>&
381 smaDevices)
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +0530382{
383 if (message.is_method_error())
384 {
385 lg2::error("interfacesRemoved callback method error");
386 return;
387 }
388
389 sdbusplus::message::object_path removedPath;
390 std::vector<std::string> interfaces;
391
392 message.read(removedPath, interfaces);
393
394 // If the xyz.openbmc_project.Confguration.X interface was removed
395 // for one or more sensors, delete those sensor objects.
396 auto sensorIt = gpuDevices.begin();
397 while (sensorIt != gpuDevices.end())
398 {
399 if ((sensorIt->second->getPath() == removedPath) &&
400 (std::find(interfaces.begin(), interfaces.end(),
401 configInterfaceName(deviceType)) != interfaces.end()))
402 {
403 sensorIt = gpuDevices.erase(sensorIt);
404 }
405 else
406 {
407 sensorIt++;
408 }
409 }
Harshit Aghera8951c872025-06-25 15:25:33 +0530410
411 auto smaSensorIt = smaDevices.begin();
412 while (smaSensorIt != smaDevices.end())
413 {
414 if ((smaSensorIt->second->getPath() == removedPath) &&
415 (std::find(interfaces.begin(), interfaces.end(),
416 configInterfaceName(deviceType)) != interfaces.end()))
417 {
418 smaSensorIt = smaDevices.erase(smaSensorIt);
419 }
420 else
421 {
422 smaSensorIt++;
423 }
424 }
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +0530425}