blob: adb21eadfb8d47641d11dcc0dbe3320fe24fd216 [file] [log] [blame]
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +05301/*
2 * SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION &
3 * AFFILIATES. All rights reserved.
4 * SPDX-License-Identifier: Apache-2.0
5 */
6
7#include "NvidiaDeviceDiscovery.hpp"
8
9#include "NvidiaGpuDevice.hpp"
10#include "Utils.hpp"
11
12#include <bits/basic_string.h>
13
14#include <MctpRequester.hpp>
15#include <NvidiaGpuMctpVdm.hpp>
16#include <OcpMctpVdm.hpp>
17#include <boost/asio/io_context.hpp>
18#include <boost/container/flat_map.hpp>
19#include <phosphor-logging/lg2.hpp>
20#include <sdbusplus/asio/connection.hpp>
21#include <sdbusplus/asio/object_server.hpp>
22#include <sdbusplus/message.hpp>
23#include <sdbusplus/message/native_types.hpp>
24
25#include <algorithm>
26#include <array>
27#include <cstdint>
28#include <memory>
29#include <span>
30#include <string>
31#include <utility>
32#include <variant>
33#include <vector>
34
35void processQueryDeviceIdResponse(
36 boost::asio::io_context& io, sdbusplus::asio::object_server& objectServer,
37 boost::container::flat_map<std::string, std::shared_ptr<GpuDevice>>&
38 gpuDevices,
39 const std::shared_ptr<sdbusplus::asio::connection>& conn,
40 mctp::MctpRequester& mctpRequester, const SensorConfigs& configs,
41 const std::string& path, uint8_t eid, int sendRecvMsgResult,
42 std::span<uint8_t> queryDeviceIdentificationResponse)
43{
44 if (sendRecvMsgResult != 0)
45 {
46 lg2::error(
47 "Error processing MCTP endpoint with eid {EID} : sending message over MCTP failed, rc={RC}",
48 "EID", eid, "RC", sendRecvMsgResult);
49 return;
50 }
51
52 ocp::accelerator_management::CompletionCode cc{};
53 uint16_t reasonCode = 0;
54 uint8_t responseDeviceType = 0;
55 uint8_t responseInstanceId = 0;
56
57 auto rc = gpu::decodeQueryDeviceIdentificationResponse(
58 queryDeviceIdentificationResponse, cc, reasonCode, responseDeviceType,
59 responseInstanceId);
60
61 if (rc != 0 || cc != ocp::accelerator_management::CompletionCode::SUCCESS)
62 {
63 lg2::error(
64 "Error processing MCTP endpoint with eid {EID} : decode failed, rc={RC}, cc={CC}, reasonCode={RESC}",
65 "EID", eid, "RC", rc, "CC", cc, "RESC", reasonCode);
66 return;
67 }
68
69 if (responseDeviceType ==
70 static_cast<uint8_t>(gpu::DeviceIdentification::DEVICE_GPU))
71 {
72 lg2::info(
73 "Found the GPU with EID {EID}, DeviceType {DEVTYPE}, InstanceId {IID}.",
74 "EID", eid, "DEVTYPE", responseDeviceType, "IID",
75 responseInstanceId);
76
77 auto gpuName = configs.name + '_' + std::to_string(responseInstanceId);
78
79 gpuDevices[gpuName] = std::make_shared<GpuDevice>(
80 configs, gpuName, path, conn, eid, io, mctpRequester, objectServer);
81 }
82}
83
84void queryDeviceIdentification(
85 boost::asio::io_context& io, sdbusplus::asio::object_server& objectServer,
86 boost::container::flat_map<std::string, std::shared_ptr<GpuDevice>>&
87 gpuDevices,
88 const std::shared_ptr<sdbusplus::asio::connection>& conn,
89 mctp::MctpRequester& mctpRequester, const SensorConfigs& configs,
90 const std::string& path, uint8_t eid)
91{
92 auto queryDeviceIdentificationRequest = std::make_shared<
93 std::array<uint8_t, sizeof(gpu::QueryDeviceIdentificationRequest)>>();
94
95 auto queryDeviceIdentificationResponse = std::make_shared<
96 std::array<uint8_t, sizeof(gpu::QueryDeviceIdentificationResponse)>>();
97
98 auto rc = gpu::encodeQueryDeviceIdentificationRequest(
99 0, *queryDeviceIdentificationRequest);
100 if (rc != 0)
101 {
102 lg2::error(
103 "Error processing MCTP endpoint with eid {EID} : encode failed, rc={RC}",
104 "EID", eid, "RC", rc);
105 return;
106 }
107
108 mctpRequester.sendRecvMsg(
109 eid, *queryDeviceIdentificationRequest,
110 *queryDeviceIdentificationResponse,
111 [&io, &objectServer, &gpuDevices, conn, &mctpRequester, configs, path,
112 eid, queryDeviceIdentificationRequest,
113 queryDeviceIdentificationResponse](int sendRecvMsgResult) {
114 processQueryDeviceIdResponse(
115 io, objectServer, gpuDevices, conn, mctpRequester, configs,
116 path, eid, sendRecvMsgResult,
117 *queryDeviceIdentificationResponse);
118 });
119}
120
121void processEndpoint(
122 boost::asio::io_context& io, sdbusplus::asio::object_server& objectServer,
123 boost::container::flat_map<std::string, std::shared_ptr<GpuDevice>>&
124 gpuDevices,
125 const std::shared_ptr<sdbusplus::asio::connection>& conn,
126 mctp::MctpRequester& mctpRequester, const SensorConfigs& configs,
127 const std::string& path, const boost::system::error_code& ec,
128 const SensorBaseConfigMap& endpoint)
129{
130 if (ec)
131 {
132 lg2::error("Error processing MCTP endpoint: Error:{ERROR}", "ERROR",
133 ec.message());
134 return;
135 }
136
137 auto hasEid = endpoint.find("EID");
138 uint8_t eid{};
139
140 if (hasEid != endpoint.end())
141 {
142 const auto* eidPtr = std::get_if<uint8_t>(&hasEid->second);
143 if (eidPtr != nullptr)
144 {
145 eid = *eidPtr;
146 }
147 else
148 {
149 lg2::error(
150 "Error processing MCTP endpoint: Property EID does not have valid type.");
151 return;
152 }
153 }
154 else
155 {
156 lg2::error(
157 "Error processing MCTP endpoint: Property EID not found in the configuration.");
158 return;
159 }
160
161 auto hasMctpTypes = endpoint.find("SupportedMessageTypes");
162 std::vector<uint8_t> mctpTypes{};
163
164 if (hasMctpTypes != endpoint.end())
165 {
166 const auto* mctpTypePtr =
167 std::get_if<std::vector<uint8_t>>(&hasMctpTypes->second);
168 if (mctpTypePtr != nullptr)
169 {
170 mctpTypes = *mctpTypePtr;
171 }
172 else
173 {
174 lg2::error(
175 "Error processing MCTP endpoint with eid {EID} : Property SupportedMessageTypes does not have valid type.",
176 "EID", eid);
177 return;
178 }
179 }
180 else
181 {
182 lg2::error(
183 "Error processing MCTP endpoint with eid {EID} : Property SupportedMessageTypes not found in the configuration.",
184 "EID", eid);
185 return;
186 }
187
188 if (std::find(mctpTypes.begin(), mctpTypes.end(),
189 ocp::accelerator_management::messageType) != mctpTypes.end())
190 {
191 lg2::info("Found OCP MCTP VDM Endpoint with ID {EID}", "EID", eid);
192 queryDeviceIdentification(io, objectServer, gpuDevices, conn,
193 mctpRequester, configs, path, eid);
194 }
195}
196
197void queryEndpoints(
198 boost::asio::io_context& io, sdbusplus::asio::object_server& objectServer,
199 boost::container::flat_map<std::string, std::shared_ptr<GpuDevice>>&
200 gpuDevices,
201 const std::shared_ptr<sdbusplus::asio::connection>& conn,
202 mctp::MctpRequester& mctpRequester, const SensorConfigs& configs,
203 const std::string& path, const boost::system::error_code& ec,
204 const GetSubTreeType& ret)
205{
206 if (ec)
207 {
208 lg2::error("Error processing MCTP endpoints: {ERROR}", "ERROR",
209 ec.message());
210 return;
211 }
212
213 if (ret.empty())
214 {
215 return;
216 }
217
218 for (const auto& [objPath, services] : ret)
219 {
220 for (const auto& [service, ifaces] : services)
221 {
222 for (const auto& iface : ifaces)
223 {
224 if (iface == "xyz.openbmc_project.MCTP.Endpoint")
225 {
226 conn->async_method_call(
227 [&io, &objectServer, &gpuDevices, conn, &mctpRequester,
228 configs, path](const boost::system::error_code& ec,
229 const SensorBaseConfigMap& endpoint) {
230 processEndpoint(io, objectServer, gpuDevices, conn,
231 mctpRequester, configs, path, ec,
232 endpoint);
233 },
234 service, objPath, "org.freedesktop.DBus.Properties",
235 "GetAll", iface);
236 }
237 }
238 }
239 }
240}
241
242void discoverDevices(
243 boost::asio::io_context& io, sdbusplus::asio::object_server& objectServer,
244 boost::container::flat_map<std::string, std::shared_ptr<GpuDevice>>&
245 gpuDevices,
246 const std::shared_ptr<sdbusplus::asio::connection>& conn,
247 mctp::MctpRequester& mctpRequester, const SensorConfigs& configs,
248 const std::string& path)
249{
250 std::string searchPath{"/au/com/codeconstruct/"};
251 std::vector<std::string> ifaceList{{"xyz.openbmc_project.MCTP.Endpoint"}};
252
253 conn->async_method_call(
254 [&io, &objectServer, &gpuDevices, conn, &mctpRequester, configs,
255 path](const boost::system::error_code& ec, const GetSubTreeType& ret) {
256 queryEndpoints(io, objectServer, gpuDevices, conn, mctpRequester,
257 configs, path, ec, ret);
258 },
259 "xyz.openbmc_project.ObjectMapper",
260 "/xyz/openbmc_project/object_mapper",
261 "xyz.openbmc_project.ObjectMapper", "GetSubTree", searchPath, 0,
262 ifaceList);
263}
264
265void processSensorConfigs(
266 boost::asio::io_context& io, sdbusplus::asio::object_server& objectServer,
267 boost::container::flat_map<std::string, std::shared_ptr<GpuDevice>>&
268 gpuDevices,
269 const std::shared_ptr<sdbusplus::asio::connection>& dbusConnection,
270 mctp::MctpRequester& mctpRequester, const ManagedObjectType& resp)
271{
272 for (const auto& [path, interfaces] : resp)
273 {
274 for (const auto& [intf, cfg] : interfaces)
275 {
276 if (intf != configInterfaceName(deviceType))
277 {
278 continue;
279 }
280
281 SensorConfigs configs;
282
283 configs.name = loadVariant<std::string>(cfg, "Name");
284
285 configs.pollRate = loadVariant<uint64_t>(cfg, "PollRate");
286
287 discoverDevices(io, objectServer, gpuDevices, dbusConnection,
288 mctpRequester, configs, path);
289
290 lg2::info(
291 "Detected configuration {NAME} of type {TYPE} at path: {PATH}.",
292 "NAME", configs.name, "TYPE", deviceType, "PATH", path);
293 }
294 }
295}
296
297void createSensors(
298 boost::asio::io_context& io, sdbusplus::asio::object_server& objectServer,
299 boost::container::flat_map<std::string, std::shared_ptr<GpuDevice>>&
300 gpuDevices,
301 const std::shared_ptr<sdbusplus::asio::connection>& dbusConnection,
302 mctp::MctpRequester& mctpRequester)
303{
304 if (!dbusConnection)
305 {
306 lg2::error("Connection not created");
307 return;
308 }
309 dbusConnection->async_method_call(
310 [&gpuDevices, &mctpRequester, dbusConnection, &io, &objectServer](
311 boost::system::error_code ec, const ManagedObjectType& resp) {
312 if (ec)
313 {
314 lg2::error("Error contacting entity manager");
315 return;
316 }
317
318 processSensorConfigs(io, objectServer, gpuDevices, dbusConnection,
319 mctpRequester, resp);
320 },
321 entityManagerName, "/xyz/openbmc_project/inventory",
322 "org.freedesktop.DBus.ObjectManager", "GetManagedObjects");
323}
324
325void interfaceRemoved(
326 sdbusplus::message_t& message,
327 boost::container::flat_map<std::string, std::shared_ptr<GpuDevice>>&
328 gpuDevices)
329{
330 if (message.is_method_error())
331 {
332 lg2::error("interfacesRemoved callback method error");
333 return;
334 }
335
336 sdbusplus::message::object_path removedPath;
337 std::vector<std::string> interfaces;
338
339 message.read(removedPath, interfaces);
340
341 // If the xyz.openbmc_project.Confguration.X interface was removed
342 // for one or more sensors, delete those sensor objects.
343 auto sensorIt = gpuDevices.begin();
344 while (sensorIt != gpuDevices.end())
345 {
346 if ((sensorIt->second->getPath() == removedPath) &&
347 (std::find(interfaces.begin(), interfaces.end(),
348 configInterfaceName(deviceType)) != interfaces.end()))
349 {
350 sensorIt = gpuDevices.erase(sensorIt);
351 }
352 else
353 {
354 sensorIt++;
355 }
356 }
357}