blob: dccd730f5dd323f7234c97d5a8f1630e0652a8ba [file] [log] [blame]
Harshit Aghera11b9c1a2025-04-29 17:34:25 +05301/*
2 * SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION &
3 * AFFILIATES. All rights reserved. SPDX-License-Identifier: Apache-2.0
4 */
5
6#include "GpuDevice.hpp"
7
8#include "GpuSensor.hpp"
9#include "Thresholds.hpp"
10#include "Utils.hpp"
11
12#include <bits/basic_string.h>
13
14#include <GpuMctpVdm.hpp>
15#include <MctpRequester.hpp>
16#include <OcpMctpVdm.hpp>
17#include <boost/asio/io_context.hpp>
18#include <boost/container/flat_map.hpp>
19#include <phosphor-logging/lg2.hpp>
20#include <sdbusplus/asio/connection.hpp>
21#include <sdbusplus/asio/object_server.hpp>
22#include <sdbusplus/message.hpp>
23#include <sdbusplus/message/native_types.hpp>
24
25#include <algorithm>
26#include <chrono>
27#include <cstdint>
28#include <functional>
29#include <map>
30#include <memory>
31#include <string>
32#include <utility>
33#include <variant>
34#include <vector>
35
36using namespace std::chrono_literals;
37
38constexpr std::chrono::milliseconds samplingInterval{1000ms};
39
40std::unique_ptr<GpuDevice> gpuDevice;
41
42GpuDevice::GpuDevice(const std::string& name, const std::string& path,
43 std::shared_ptr<sdbusplus::asio::connection>& conn,
44 boost::asio::io_context& io,
45 mctp::MctpRequester& mctpRequester,
46 sdbusplus::asio::object_server& objectServer) :
47 sensorPollMs(samplingInterval),
48 waitTimer(io, std::chrono::steady_clock::duration(0)),
49 mctpRequester(mctpRequester), conn(conn), objectServer(objectServer),
50 name(escapeName(name)), path(path)
51{
52 discoverGpus();
53}
54
55void GpuDevice::createSensors()
56{
57 sensors.push_back(std::make_shared<GpuTempSensor>(
58 conn, mctpRequester, name + "_TEMP_0", path, eid, objectServer,
59 std::vector<thresholds::Threshold>{}));
60
61 lg2::info("Added GPU Temperature Sensor {NAME} with chassis path: {PATH}.",
62 "NAME", name, "PATH", path);
63}
64
65void GpuDevice::read()
66{
67 for ([[maybe_unused]] const auto& sensor : sensors)
68 {
69 sensor->update();
70 }
71
72 waitTimer.expires_after(std::chrono::milliseconds(sensorPollMs));
73 waitTimer.async_wait([this](const boost::system::error_code& ec) {
74 if (ec)
75 {
76 return;
77 }
78 read();
79 });
80}
81
82void GpuDevice::processGpuEndpoint(uint8_t eid)
83{
84 std::vector<uint8_t> reqMsg(
85 sizeof(ocp::accelerator_management::BindingPciVid) +
86 sizeof(gpu::QueryDeviceIdentificationRequest));
87
88 auto* msg = new (reqMsg.data()) ocp::accelerator_management::Message;
89
90 auto rc = gpu::encodeQueryDeviceIdentificationRequest(0, *msg);
91 if (rc != ocp::accelerator_management::CompletionCode::SUCCESS)
92 {
93 lg2::error(
94 "GpuDevice::processGpuEndPoint(): gpuEncodeQueryDeviceIdentificationRequest failed, rc={RC}",
95 "RC", static_cast<int>(rc));
96 return;
97 }
98
99 mctpRequester.sendRecvMsg(
100 eid, reqMsg,
101 [this, eid](int sendRecvMsgResult, std::vector<uint8_t> respMsg) {
102 if (sendRecvMsgResult != 0)
103 {
104 lg2::error(
105 "GpuDevice::processGpuEndPoint(): MctpRequester::sendRecvMsg() failed, rc={RC}",
106 "RC", sendRecvMsgResult);
107 return;
108 }
109
110 if (respMsg.empty())
111 {
112 lg2::error(
113 "GpuDevice::processGpuEndPoint(): MctpRequester::sendRecvMsg() failed, respMsgLen=0");
114 return;
115 }
116
117 uint8_t cc = 0;
118 uint16_t reasonCode = 0;
119 uint8_t responseDeviceType = 0;
120 uint8_t responseInstanceId = 0;
121
122 auto rc = gpu::decodeQueryDeviceIdentificationResponse(
123 *new (respMsg.data()) ocp::accelerator_management::Message,
124 respMsg.size(), cc, reasonCode, responseDeviceType,
125 responseInstanceId);
126
127 if (rc != ocp::accelerator_management::CompletionCode::SUCCESS ||
128 cc != static_cast<uint8_t>(
129 ocp::accelerator_management::CompletionCode::SUCCESS))
130 {
131 lg2::error(
132 "GpuDevice::processGpuEndPoint(): gpuDecodeQueryDeviceIdentificationResponse() failed, rc={RC} cc={CC} reasonCode={RESC}",
133 "RC", static_cast<int>(rc), "CC", cc, "RESC", reasonCode);
134 return;
135 }
136
137 if (responseDeviceType ==
138 static_cast<uint8_t>(gpu::DeviceIdentification::DEVICE_GPU))
139 {
140 lg2::info(
141 "GpuDevice::processGpuEndPoint(): found the GPU with EID {EID}, DeviceType {DEVTYPE}, InstanceId {IID}.",
142 "EID", eid, "DEVTYPE", responseDeviceType, "IID",
143 responseInstanceId);
144
145 this->eid = eid;
146 this->createSensors();
147 this->read();
148 }
149 });
150}
151
152void GpuDevice::processMctpEndpoints(const boost::system::error_code& ec,
153 const getSubTreeRet& ret)
154{
155 if (ec)
156 {
157 lg2::error("GpuDevice::discoverGpus(): Error:{ERROR}", "ERROR",
158 ec.message());
159 return;
160 }
161
162 if (ret.empty())
163 {
164 return;
165 }
166
167 for (const auto& [objPath, services] : ret)
168 {
169 for (const auto& [service, ifaces] : services)
170 {
171 for (const auto& iface : ifaces)
172 {
173 if (iface == "xyz.openbmc_project.MCTP.Endpoint")
174 {
175 conn->async_method_call(
176 [this](const boost::system::error_code& ec,
177 const GpuSensorConfigMap& configs) {
178 this->processEndpointConfigs(ec, configs);
179 },
180 service, objPath, "org.freedesktop.DBus.Properties",
181 "GetAll", iface);
182 }
183 }
184 }
185 }
186}
187
188void GpuDevice::processEndpointConfigs(const boost::system::error_code& ec,
189 const GpuSensorConfigMap& configs)
190{
191 if (ec)
192 {
193 lg2::error("GpuDevice::discoverGpus(): Error:{ERROR}", "ERROR",
194 ec.message());
195 return;
196 }
197
198 uint8_t eid{};
199 std::vector<uint8_t> mctpTypes{};
200
201 auto hasEid = configs.find("EID");
202 if (hasEid != configs.end())
203 {
204 const auto* eidPtr = std::get_if<uint8_t>(&hasEid->second);
205 if (eidPtr != nullptr)
206 {
207 eid = *eidPtr;
208 }
209 else
210 {
211 lg2::error(
212 "GpuDevice::discoverGpus(): Property EID does not have valid type.");
213 return;
214 }
215 }
216 else
217 {
218 lg2::error(
219 "GpuDevice::discoverGpus(): Property EID not found in the configuration.");
220 return;
221 }
222
223 auto hasMctpTypes = configs.find("SupportedMessageTypes");
224 if (hasMctpTypes != configs.end())
225 {
226 const auto* mctpTypePtr =
227 std::get_if<std::vector<uint8_t>>(&hasMctpTypes->second);
228 if (mctpTypePtr != nullptr)
229 {
230 mctpTypes = *mctpTypePtr;
231 }
232 else
233 {
234 lg2::error(
235 "GpuDevice::discoverGpus(): Property SupportedMessageTypes does not have valid type.");
236 return;
237 }
238 }
239 else
240 {
241 lg2::error(
242 "GpuDevice::discoverGpus(): Property SupportedMessageTypes not found in the configuration.");
243 return;
244 }
245
246 if (std::find(mctpTypes.begin(), mctpTypes.end(),
247 ocp::accelerator_management::messageType) != mctpTypes.end())
248 {
249 lg2::info(
250 "GpuDevice::discoverGpus(): Found OCP MCTP VDM Endpoint with ID {EID}",
251 "EID", eid);
252 this->processGpuEndpoint(eid);
253 }
254}
255
256void GpuDevice::discoverGpus()
257{
258 std::string searchPath{"/au/com/codeconstruct/"};
259 std::vector<std::string> ifaceList{{"xyz.openbmc_project.MCTP.Endpoint"}};
260
261 conn->async_method_call(
262 [this](const boost::system::error_code& ec, const getSubTreeRet& ret) {
263 processMctpEndpoints(ec, ret);
264 },
265 "xyz.openbmc_project.ObjectMapper",
266 "/xyz/openbmc_project/object_mapper",
267 "xyz.openbmc_project.ObjectMapper", "GetSubTree", searchPath, 0,
268 ifaceList);
269}
270
271void processSensorConfigs(
272 boost::asio::io_context& io, sdbusplus::asio::object_server& objectServer,
273 boost::container::flat_map<std::string, std::shared_ptr<GpuDevice>>&
274 gpuDevice,
275 std::shared_ptr<sdbusplus::asio::connection>& dbusConnection,
276 mctp::MctpRequester& mctpRequester, const ManagedObjectType& resp)
277{
278 for (const auto& [path, interfaces] : resp)
279 {
280 for (const auto& [intf, cfg] : interfaces)
281 {
282 if (intf != configInterfaceName(sensorType))
283 {
284 continue;
285 }
286
287 std::string name = loadVariant<std::string>(cfg, "Name");
288
289 gpuDevice[name] = std::make_shared<GpuDevice>(
290 name, path, dbusConnection, io, mctpRequester, objectServer);
291 }
292 }
293}
294
295void createSensors(
296 boost::asio::io_context& io, sdbusplus::asio::object_server& objectServer,
297 boost::container::flat_map<std::string, std::shared_ptr<GpuDevice>>&
298 gpuDevice,
299 std::shared_ptr<sdbusplus::asio::connection>& dbusConnection,
300 mctp::MctpRequester& mctpRequester)
301{
302 if (!dbusConnection)
303 {
304 lg2::error("Connection not created");
305 return;
306 }
307 dbusConnection->async_method_call(
308 [&gpuDevice, &mctpRequester, &dbusConnection, &io, &objectServer](
309 boost::system::error_code ec, const ManagedObjectType& resp) {
310 if (ec)
311 {
312 lg2::error("Error contacting entity manager");
313 return;
314 }
315
316 processSensorConfigs(io, objectServer, gpuDevice, dbusConnection,
317 mctpRequester, resp);
318 },
319 entityManagerName, "/xyz/openbmc_project/inventory",
320 "org.freedesktop.DBus.ObjectManager", "GetManagedObjects");
321}
322
323void interfaceRemoved(
324 sdbusplus::message_t& message,
325 boost::container::flat_map<std::string, std::shared_ptr<GpuDevice>>&
326 gpuDevice)
327{
328 if (message.is_method_error())
329 {
330 lg2::error("interfacesRemoved callback method error");
331 return;
332 }
333
334 sdbusplus::message::object_path removedPath;
335 std::vector<std::string> interfaces;
336
337 message.read(removedPath, interfaces);
338
339 // If the xyz.openbmc_project.Confguration.X interface was removed
340 // for one or more sensors, delete those sensor objects.
341 auto sensorIt = gpuDevice.begin();
342 while (sensorIt != gpuDevice.end())
343 {
344 if ((sensorIt->second->getPath() == removedPath) &&
345 (std::find(interfaces.begin(), interfaces.end(),
346 configInterfaceName(sensorType)) != interfaces.end()))
347 {
348 sensorIt = gpuDevice.erase(sensorIt);
349 }
350 else
351 {
352 sensorIt++;
353 }
354 }
355}