blob: dd682cfa25f36daa0af85dca47f484fbd102c08a [file] [log] [blame]
/*
 * SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION &
 * AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: Apache-2.0
 */
6
7#include "NvidiaDeviceDiscovery.hpp"
8
9#include "NvidiaGpuDevice.hpp"
Harshit Aghera8951c872025-06-25 15:25:33 +053010#include "NvidiaSmaDevice.hpp"
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +053011#include "Utils.hpp"
12
13#include <bits/basic_string.h>
14
15#include <MctpRequester.hpp>
16#include <NvidiaGpuMctpVdm.hpp>
17#include <OcpMctpVdm.hpp>
18#include <boost/asio/io_context.hpp>
19#include <boost/container/flat_map.hpp>
20#include <phosphor-logging/lg2.hpp>
21#include <sdbusplus/asio/connection.hpp>
22#include <sdbusplus/asio/object_server.hpp>
23#include <sdbusplus/message.hpp>
24#include <sdbusplus/message/native_types.hpp>
25
26#include <algorithm>
27#include <array>
28#include <cstdint>
29#include <memory>
30#include <span>
Deepak Kodihalli0ad3a7e2025-06-26 12:24:18 +053031#include <stdexcept>
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +053032#include <string>
33#include <utility>
34#include <variant>
35#include <vector>
36
// Poll rate (in milliseconds, per the name) applied when a configuration
// entry does not supply a usable "PollRate" property.
static constexpr int sensorPollRateMs{1000};
38
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +053039void processQueryDeviceIdResponse(
40 boost::asio::io_context& io, sdbusplus::asio::object_server& objectServer,
41 boost::container::flat_map<std::string, std::shared_ptr<GpuDevice>>&
42 gpuDevices,
Harshit Aghera8951c872025-06-25 15:25:33 +053043 boost::container::flat_map<std::string, std::shared_ptr<SmaDevice>>&
44 smaDevices,
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +053045 const std::shared_ptr<sdbusplus::asio::connection>& conn,
46 mctp::MctpRequester& mctpRequester, const SensorConfigs& configs,
47 const std::string& path, uint8_t eid, int sendRecvMsgResult,
48 std::span<uint8_t> queryDeviceIdentificationResponse)
49{
50 if (sendRecvMsgResult != 0)
51 {
52 lg2::error(
53 "Error processing MCTP endpoint with eid {EID} : sending message over MCTP failed, rc={RC}",
54 "EID", eid, "RC", sendRecvMsgResult);
55 return;
56 }
57
58 ocp::accelerator_management::CompletionCode cc{};
59 uint16_t reasonCode = 0;
60 uint8_t responseDeviceType = 0;
61 uint8_t responseInstanceId = 0;
62
63 auto rc = gpu::decodeQueryDeviceIdentificationResponse(
64 queryDeviceIdentificationResponse, cc, reasonCode, responseDeviceType,
65 responseInstanceId);
66
67 if (rc != 0 || cc != ocp::accelerator_management::CompletionCode::SUCCESS)
68 {
69 lg2::error(
70 "Error processing MCTP endpoint with eid {EID} : decode failed, rc={RC}, cc={CC}, reasonCode={RESC}",
71 "EID", eid, "RC", rc, "CC", cc, "RESC", reasonCode);
72 return;
73 }
74
Harshit Aghera8951c872025-06-25 15:25:33 +053075 switch (static_cast<gpu::DeviceIdentification>(responseDeviceType))
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +053076 {
Harshit Aghera8951c872025-06-25 15:25:33 +053077 case gpu::DeviceIdentification::DEVICE_GPU:
78 {
79 lg2::info(
80 "Found the GPU with EID {EID}, DeviceType {DEVTYPE}, InstanceId {IID}.",
81 "EID", eid, "DEVTYPE", responseDeviceType, "IID",
82 responseInstanceId);
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +053083
Harshit Aghera8951c872025-06-25 15:25:33 +053084 auto gpuName = configs.name + '_' +
85 std::to_string(responseInstanceId);
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +053086
Harshit Aghera8951c872025-06-25 15:25:33 +053087 gpuDevices[gpuName] =
88 std::make_shared<GpuDevice>(configs, gpuName, path, conn, eid,
89 io, mctpRequester, objectServer);
90 break;
91 }
92
93 case gpu::DeviceIdentification::DEVICE_SMA:
94 {
95 lg2::info(
96 "Found the SMA Device with EID {EID}, DeviceType {DEVTYPE}, InstanceId {IID}.",
97 "EID", eid, "DEVTYPE", responseDeviceType, "IID",
98 responseInstanceId);
99
100 auto smaName = configs.name + "_SMA_" +
101 std::to_string(responseInstanceId);
102
103 smaDevices[smaName] =
104 std::make_shared<SmaDevice>(configs, smaName, path, conn, eid,
105 io, mctpRequester, objectServer);
106 break;
107 }
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +0530108 }
109}
110
111void queryDeviceIdentification(
112 boost::asio::io_context& io, sdbusplus::asio::object_server& objectServer,
113 boost::container::flat_map<std::string, std::shared_ptr<GpuDevice>>&
114 gpuDevices,
Harshit Aghera8951c872025-06-25 15:25:33 +0530115 boost::container::flat_map<std::string, std::shared_ptr<SmaDevice>>&
116 smaDevices,
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +0530117 const std::shared_ptr<sdbusplus::asio::connection>& conn,
118 mctp::MctpRequester& mctpRequester, const SensorConfigs& configs,
119 const std::string& path, uint8_t eid)
120{
121 auto queryDeviceIdentificationRequest = std::make_shared<
122 std::array<uint8_t, sizeof(gpu::QueryDeviceIdentificationRequest)>>();
123
124 auto queryDeviceIdentificationResponse = std::make_shared<
125 std::array<uint8_t, sizeof(gpu::QueryDeviceIdentificationResponse)>>();
126
127 auto rc = gpu::encodeQueryDeviceIdentificationRequest(
128 0, *queryDeviceIdentificationRequest);
129 if (rc != 0)
130 {
131 lg2::error(
132 "Error processing MCTP endpoint with eid {EID} : encode failed, rc={RC}",
133 "EID", eid, "RC", rc);
134 return;
135 }
136
137 mctpRequester.sendRecvMsg(
138 eid, *queryDeviceIdentificationRequest,
139 *queryDeviceIdentificationResponse,
Harshit Aghera8951c872025-06-25 15:25:33 +0530140 [&io, &objectServer, &gpuDevices, &smaDevices, conn, &mctpRequester,
141 configs, path, eid, queryDeviceIdentificationRequest,
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +0530142 queryDeviceIdentificationResponse](int sendRecvMsgResult) {
143 processQueryDeviceIdResponse(
Harshit Aghera8951c872025-06-25 15:25:33 +0530144 io, objectServer, gpuDevices, smaDevices, conn, mctpRequester,
145 configs, path, eid, sendRecvMsgResult,
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +0530146 *queryDeviceIdentificationResponse);
147 });
148}
149
150void processEndpoint(
151 boost::asio::io_context& io, sdbusplus::asio::object_server& objectServer,
152 boost::container::flat_map<std::string, std::shared_ptr<GpuDevice>>&
153 gpuDevices,
Harshit Aghera8951c872025-06-25 15:25:33 +0530154 boost::container::flat_map<std::string, std::shared_ptr<SmaDevice>>&
155 smaDevices,
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +0530156 const std::shared_ptr<sdbusplus::asio::connection>& conn,
157 mctp::MctpRequester& mctpRequester, const SensorConfigs& configs,
158 const std::string& path, const boost::system::error_code& ec,
159 const SensorBaseConfigMap& endpoint)
160{
161 if (ec)
162 {
163 lg2::error("Error processing MCTP endpoint: Error:{ERROR}", "ERROR",
164 ec.message());
165 return;
166 }
167
168 auto hasEid = endpoint.find("EID");
169 uint8_t eid{};
170
171 if (hasEid != endpoint.end())
172 {
173 const auto* eidPtr = std::get_if<uint8_t>(&hasEid->second);
174 if (eidPtr != nullptr)
175 {
176 eid = *eidPtr;
177 }
178 else
179 {
180 lg2::error(
181 "Error processing MCTP endpoint: Property EID does not have valid type.");
182 return;
183 }
184 }
185 else
186 {
187 lg2::error(
188 "Error processing MCTP endpoint: Property EID not found in the configuration.");
189 return;
190 }
191
192 auto hasMctpTypes = endpoint.find("SupportedMessageTypes");
193 std::vector<uint8_t> mctpTypes{};
194
195 if (hasMctpTypes != endpoint.end())
196 {
197 const auto* mctpTypePtr =
198 std::get_if<std::vector<uint8_t>>(&hasMctpTypes->second);
199 if (mctpTypePtr != nullptr)
200 {
201 mctpTypes = *mctpTypePtr;
202 }
203 else
204 {
205 lg2::error(
206 "Error processing MCTP endpoint with eid {EID} : Property SupportedMessageTypes does not have valid type.",
207 "EID", eid);
208 return;
209 }
210 }
211 else
212 {
213 lg2::error(
214 "Error processing MCTP endpoint with eid {EID} : Property SupportedMessageTypes not found in the configuration.",
215 "EID", eid);
216 return;
217 }
218
219 if (std::find(mctpTypes.begin(), mctpTypes.end(),
220 ocp::accelerator_management::messageType) != mctpTypes.end())
221 {
222 lg2::info("Found OCP MCTP VDM Endpoint with ID {EID}", "EID", eid);
Harshit Aghera8951c872025-06-25 15:25:33 +0530223 queryDeviceIdentification(io, objectServer, gpuDevices, smaDevices,
224 conn, mctpRequester, configs, path, eid);
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +0530225 }
226}
227
228void queryEndpoints(
229 boost::asio::io_context& io, sdbusplus::asio::object_server& objectServer,
230 boost::container::flat_map<std::string, std::shared_ptr<GpuDevice>>&
231 gpuDevices,
Harshit Aghera8951c872025-06-25 15:25:33 +0530232 boost::container::flat_map<std::string, std::shared_ptr<SmaDevice>>&
233 smaDevices,
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +0530234 const std::shared_ptr<sdbusplus::asio::connection>& conn,
235 mctp::MctpRequester& mctpRequester, const SensorConfigs& configs,
236 const std::string& path, const boost::system::error_code& ec,
237 const GetSubTreeType& ret)
238{
239 if (ec)
240 {
241 lg2::error("Error processing MCTP endpoints: {ERROR}", "ERROR",
242 ec.message());
243 return;
244 }
245
246 if (ret.empty())
247 {
248 return;
249 }
250
251 for (const auto& [objPath, services] : ret)
252 {
253 for (const auto& [service, ifaces] : services)
254 {
255 for (const auto& iface : ifaces)
256 {
257 if (iface == "xyz.openbmc_project.MCTP.Endpoint")
258 {
259 conn->async_method_call(
Harshit Aghera8951c872025-06-25 15:25:33 +0530260 [&io, &objectServer, &gpuDevices, &smaDevices, conn,
261 &mctpRequester, configs,
262 path](const boost::system::error_code& ec,
263 const SensorBaseConfigMap& endpoint) {
264 processEndpoint(io, objectServer, gpuDevices,
265 smaDevices, conn, mctpRequester,
266 configs, path, ec, endpoint);
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +0530267 },
268 service, objPath, "org.freedesktop.DBus.Properties",
269 "GetAll", iface);
270 }
271 }
272 }
273 }
274}
275
276void discoverDevices(
277 boost::asio::io_context& io, sdbusplus::asio::object_server& objectServer,
278 boost::container::flat_map<std::string, std::shared_ptr<GpuDevice>>&
279 gpuDevices,
Harshit Aghera8951c872025-06-25 15:25:33 +0530280 boost::container::flat_map<std::string, std::shared_ptr<SmaDevice>>&
281 smaDevices,
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +0530282 const std::shared_ptr<sdbusplus::asio::connection>& conn,
283 mctp::MctpRequester& mctpRequester, const SensorConfigs& configs,
284 const std::string& path)
285{
286 std::string searchPath{"/au/com/codeconstruct/"};
287 std::vector<std::string> ifaceList{{"xyz.openbmc_project.MCTP.Endpoint"}};
288
289 conn->async_method_call(
Harshit Aghera8951c872025-06-25 15:25:33 +0530290 [&io, &objectServer, &gpuDevices, &smaDevices, conn, &mctpRequester,
291 configs,
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +0530292 path](const boost::system::error_code& ec, const GetSubTreeType& ret) {
Harshit Aghera8951c872025-06-25 15:25:33 +0530293 queryEndpoints(io, objectServer, gpuDevices, smaDevices, conn,
294 mctpRequester, configs, path, ec, ret);
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +0530295 },
296 "xyz.openbmc_project.ObjectMapper",
297 "/xyz/openbmc_project/object_mapper",
298 "xyz.openbmc_project.ObjectMapper", "GetSubTree", searchPath, 0,
299 ifaceList);
300}
301
302void processSensorConfigs(
303 boost::asio::io_context& io, sdbusplus::asio::object_server& objectServer,
304 boost::container::flat_map<std::string, std::shared_ptr<GpuDevice>>&
305 gpuDevices,
Harshit Aghera8951c872025-06-25 15:25:33 +0530306 boost::container::flat_map<std::string, std::shared_ptr<SmaDevice>>&
307 smaDevices,
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +0530308 const std::shared_ptr<sdbusplus::asio::connection>& dbusConnection,
309 mctp::MctpRequester& mctpRequester, const ManagedObjectType& resp)
310{
311 for (const auto& [path, interfaces] : resp)
312 {
313 for (const auto& [intf, cfg] : interfaces)
314 {
315 if (intf != configInterfaceName(deviceType))
316 {
317 continue;
318 }
319
320 SensorConfigs configs;
321
322 configs.name = loadVariant<std::string>(cfg, "Name");
323
Deepak Kodihalli0ad3a7e2025-06-26 12:24:18 +0530324 try
325 {
326 configs.pollRate = loadVariant<uint64_t>(cfg, "PollRate");
327 }
328 catch (const std::invalid_argument&)
329 {
330 // PollRate is an optional config
331 configs.pollRate = sensorPollRateMs;
332 }
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +0530333
Harshit Aghera8951c872025-06-25 15:25:33 +0530334 discoverDevices(io, objectServer, gpuDevices, smaDevices,
335 dbusConnection, mctpRequester, configs, path);
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +0530336
337 lg2::info(
338 "Detected configuration {NAME} of type {TYPE} at path: {PATH}.",
339 "NAME", configs.name, "TYPE", deviceType, "PATH", path);
340 }
341 }
342}
343
344void createSensors(
345 boost::asio::io_context& io, sdbusplus::asio::object_server& objectServer,
346 boost::container::flat_map<std::string, std::shared_ptr<GpuDevice>>&
347 gpuDevices,
Harshit Aghera8951c872025-06-25 15:25:33 +0530348 boost::container::flat_map<std::string, std::shared_ptr<SmaDevice>>&
349 smaDevices,
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +0530350 const std::shared_ptr<sdbusplus::asio::connection>& dbusConnection,
351 mctp::MctpRequester& mctpRequester)
352{
353 if (!dbusConnection)
354 {
355 lg2::error("Connection not created");
356 return;
357 }
358 dbusConnection->async_method_call(
Harshit Aghera8951c872025-06-25 15:25:33 +0530359 [&gpuDevices, &smaDevices, &mctpRequester, dbusConnection, &io,
360 &objectServer](boost::system::error_code ec,
361 const ManagedObjectType& resp) {
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +0530362 if (ec)
363 {
364 lg2::error("Error contacting entity manager");
365 return;
366 }
367
Harshit Aghera8951c872025-06-25 15:25:33 +0530368 processSensorConfigs(io, objectServer, gpuDevices, smaDevices,
369 dbusConnection, mctpRequester, resp);
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +0530370 },
371 entityManagerName, "/xyz/openbmc_project/inventory",
372 "org.freedesktop.DBus.ObjectManager", "GetManagedObjects");
373}
374
375void interfaceRemoved(
376 sdbusplus::message_t& message,
377 boost::container::flat_map<std::string, std::shared_ptr<GpuDevice>>&
Harshit Aghera8951c872025-06-25 15:25:33 +0530378 gpuDevices,
379 boost::container::flat_map<std::string, std::shared_ptr<SmaDevice>>&
380 smaDevices)
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +0530381{
382 if (message.is_method_error())
383 {
384 lg2::error("interfacesRemoved callback method error");
385 return;
386 }
387
388 sdbusplus::message::object_path removedPath;
389 std::vector<std::string> interfaces;
390
391 message.read(removedPath, interfaces);
392
393 // If the xyz.openbmc_project.Confguration.X interface was removed
394 // for one or more sensors, delete those sensor objects.
395 auto sensorIt = gpuDevices.begin();
396 while (sensorIt != gpuDevices.end())
397 {
398 if ((sensorIt->second->getPath() == removedPath) &&
399 (std::find(interfaces.begin(), interfaces.end(),
400 configInterfaceName(deviceType)) != interfaces.end()))
401 {
402 sensorIt = gpuDevices.erase(sensorIt);
403 }
404 else
405 {
406 sensorIt++;
407 }
408 }
Harshit Aghera8951c872025-06-25 15:25:33 +0530409
410 auto smaSensorIt = smaDevices.begin();
411 while (smaSensorIt != smaDevices.end())
412 {
413 if ((smaSensorIt->second->getPath() == removedPath) &&
414 (std::find(interfaces.begin(), interfaces.end(),
415 configInterfaceName(deviceType)) != interfaces.end()))
416 {
417 smaSensorIt = smaDevices.erase(smaSensorIt);
418 }
419 else
420 {
421 smaSensorIt++;
422 }
423 }
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +0530424}