blob: 11423dddf8b4359f1141c6212321f541ebf0d883 [file] [log] [blame]
Harshit Aghera11b9c1a2025-04-29 17:34:25 +05301/*
2 * SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION &
3 * AFFILIATES. All rights reserved. SPDX-License-Identifier: Apache-2.0
4 */
5
6#include "GpuDevice.hpp"
7
8#include "GpuSensor.hpp"
Harshit Aghera32e3b2b2025-05-05 12:26:35 +05309#include "GpuTLimitSensor.hpp"
Harshit Aghera11b9c1a2025-04-29 17:34:25 +053010#include "Thresholds.hpp"
11#include "Utils.hpp"
12
13#include <bits/basic_string.h>
14
15#include <GpuMctpVdm.hpp>
16#include <MctpRequester.hpp>
17#include <OcpMctpVdm.hpp>
18#include <boost/asio/io_context.hpp>
19#include <boost/container/flat_map.hpp>
20#include <phosphor-logging/lg2.hpp>
21#include <sdbusplus/asio/connection.hpp>
22#include <sdbusplus/asio/object_server.hpp>
23#include <sdbusplus/message.hpp>
24#include <sdbusplus/message/native_types.hpp>
25
26#include <algorithm>
27#include <chrono>
28#include <cstdint>
29#include <functional>
30#include <map>
31#include <memory>
32#include <string>
33#include <utility>
34#include <variant>
35#include <vector>
36
37using namespace std::chrono_literals;
38
39constexpr std::chrono::milliseconds samplingInterval{1000ms};
40
41std::unique_ptr<GpuDevice> gpuDevice;
42
43GpuDevice::GpuDevice(const std::string& name, const std::string& path,
44 std::shared_ptr<sdbusplus::asio::connection>& conn,
45 boost::asio::io_context& io,
46 mctp::MctpRequester& mctpRequester,
47 sdbusplus::asio::object_server& objectServer) :
48 sensorPollMs(samplingInterval),
49 waitTimer(io, std::chrono::steady_clock::duration(0)),
50 mctpRequester(mctpRequester), conn(conn), objectServer(objectServer),
51 name(escapeName(name)), path(path)
52{
53 discoverGpus();
54}
55
56void GpuDevice::createSensors()
57{
58 sensors.push_back(std::make_shared<GpuTempSensor>(
59 conn, mctpRequester, name + "_TEMP_0", path, eid, objectServer,
60 std::vector<thresholds::Threshold>{}));
61
Harshit Aghera32e3b2b2025-05-05 12:26:35 +053062 sensors.push_back(std::make_shared<GpuTLimitSensor>(
63 conn, mctpRequester, name + "_TEMP_1", path, eid, objectServer,
64 std::vector<thresholds::Threshold>{}));
65
66 lg2::info("Added GPU {NAME} Sensors with chassis path: {PATH}.", "NAME",
67 name, "PATH", path);
Harshit Aghera11b9c1a2025-04-29 17:34:25 +053068}
69
70void GpuDevice::read()
71{
Harshit Aghera32e3b2b2025-05-05 12:26:35 +053072 for (const auto& sensor : sensors)
Harshit Aghera11b9c1a2025-04-29 17:34:25 +053073 {
74 sensor->update();
75 }
76
77 waitTimer.expires_after(std::chrono::milliseconds(sensorPollMs));
78 waitTimer.async_wait([this](const boost::system::error_code& ec) {
79 if (ec)
80 {
81 return;
82 }
83 read();
84 });
85}
86
87void GpuDevice::processGpuEndpoint(uint8_t eid)
88{
89 std::vector<uint8_t> reqMsg(
90 sizeof(ocp::accelerator_management::BindingPciVid) +
91 sizeof(gpu::QueryDeviceIdentificationRequest));
92
93 auto* msg = new (reqMsg.data()) ocp::accelerator_management::Message;
94
95 auto rc = gpu::encodeQueryDeviceIdentificationRequest(0, *msg);
96 if (rc != ocp::accelerator_management::CompletionCode::SUCCESS)
97 {
98 lg2::error(
99 "GpuDevice::processGpuEndPoint(): gpuEncodeQueryDeviceIdentificationRequest failed, rc={RC}",
100 "RC", static_cast<int>(rc));
101 return;
102 }
103
104 mctpRequester.sendRecvMsg(
105 eid, reqMsg,
106 [this, eid](int sendRecvMsgResult, std::vector<uint8_t> respMsg) {
107 if (sendRecvMsgResult != 0)
108 {
109 lg2::error(
110 "GpuDevice::processGpuEndPoint(): MctpRequester::sendRecvMsg() failed, rc={RC}",
111 "RC", sendRecvMsgResult);
112 return;
113 }
114
115 if (respMsg.empty())
116 {
117 lg2::error(
118 "GpuDevice::processGpuEndPoint(): MctpRequester::sendRecvMsg() failed, respMsgLen=0");
119 return;
120 }
121
122 uint8_t cc = 0;
123 uint16_t reasonCode = 0;
124 uint8_t responseDeviceType = 0;
125 uint8_t responseInstanceId = 0;
126
127 auto rc = gpu::decodeQueryDeviceIdentificationResponse(
128 *new (respMsg.data()) ocp::accelerator_management::Message,
129 respMsg.size(), cc, reasonCode, responseDeviceType,
130 responseInstanceId);
131
132 if (rc != ocp::accelerator_management::CompletionCode::SUCCESS ||
133 cc != static_cast<uint8_t>(
134 ocp::accelerator_management::CompletionCode::SUCCESS))
135 {
136 lg2::error(
137 "GpuDevice::processGpuEndPoint(): gpuDecodeQueryDeviceIdentificationResponse() failed, rc={RC} cc={CC} reasonCode={RESC}",
138 "RC", static_cast<int>(rc), "CC", cc, "RESC", reasonCode);
139 return;
140 }
141
142 if (responseDeviceType ==
143 static_cast<uint8_t>(gpu::DeviceIdentification::DEVICE_GPU))
144 {
145 lg2::info(
146 "GpuDevice::processGpuEndPoint(): found the GPU with EID {EID}, DeviceType {DEVTYPE}, InstanceId {IID}.",
147 "EID", eid, "DEVTYPE", responseDeviceType, "IID",
148 responseInstanceId);
149
150 this->eid = eid;
151 this->createSensors();
152 this->read();
153 }
154 });
155}
156
157void GpuDevice::processMctpEndpoints(const boost::system::error_code& ec,
158 const getSubTreeRet& ret)
159{
160 if (ec)
161 {
162 lg2::error("GpuDevice::discoverGpus(): Error:{ERROR}", "ERROR",
163 ec.message());
164 return;
165 }
166
167 if (ret.empty())
168 {
169 return;
170 }
171
172 for (const auto& [objPath, services] : ret)
173 {
174 for (const auto& [service, ifaces] : services)
175 {
176 for (const auto& iface : ifaces)
177 {
178 if (iface == "xyz.openbmc_project.MCTP.Endpoint")
179 {
180 conn->async_method_call(
181 [this](const boost::system::error_code& ec,
182 const GpuSensorConfigMap& configs) {
183 this->processEndpointConfigs(ec, configs);
184 },
185 service, objPath, "org.freedesktop.DBus.Properties",
186 "GetAll", iface);
187 }
188 }
189 }
190 }
191}
192
193void GpuDevice::processEndpointConfigs(const boost::system::error_code& ec,
194 const GpuSensorConfigMap& configs)
195{
196 if (ec)
197 {
198 lg2::error("GpuDevice::discoverGpus(): Error:{ERROR}", "ERROR",
199 ec.message());
200 return;
201 }
202
203 uint8_t eid{};
204 std::vector<uint8_t> mctpTypes{};
205
206 auto hasEid = configs.find("EID");
207 if (hasEid != configs.end())
208 {
209 const auto* eidPtr = std::get_if<uint8_t>(&hasEid->second);
210 if (eidPtr != nullptr)
211 {
212 eid = *eidPtr;
213 }
214 else
215 {
216 lg2::error(
217 "GpuDevice::discoverGpus(): Property EID does not have valid type.");
218 return;
219 }
220 }
221 else
222 {
223 lg2::error(
224 "GpuDevice::discoverGpus(): Property EID not found in the configuration.");
225 return;
226 }
227
228 auto hasMctpTypes = configs.find("SupportedMessageTypes");
229 if (hasMctpTypes != configs.end())
230 {
231 const auto* mctpTypePtr =
232 std::get_if<std::vector<uint8_t>>(&hasMctpTypes->second);
233 if (mctpTypePtr != nullptr)
234 {
235 mctpTypes = *mctpTypePtr;
236 }
237 else
238 {
239 lg2::error(
240 "GpuDevice::discoverGpus(): Property SupportedMessageTypes does not have valid type.");
241 return;
242 }
243 }
244 else
245 {
246 lg2::error(
247 "GpuDevice::discoverGpus(): Property SupportedMessageTypes not found in the configuration.");
248 return;
249 }
250
251 if (std::find(mctpTypes.begin(), mctpTypes.end(),
252 ocp::accelerator_management::messageType) != mctpTypes.end())
253 {
254 lg2::info(
255 "GpuDevice::discoverGpus(): Found OCP MCTP VDM Endpoint with ID {EID}",
256 "EID", eid);
257 this->processGpuEndpoint(eid);
258 }
259}
260
261void GpuDevice::discoverGpus()
262{
263 std::string searchPath{"/au/com/codeconstruct/"};
264 std::vector<std::string> ifaceList{{"xyz.openbmc_project.MCTP.Endpoint"}};
265
266 conn->async_method_call(
267 [this](const boost::system::error_code& ec, const getSubTreeRet& ret) {
268 processMctpEndpoints(ec, ret);
269 },
270 "xyz.openbmc_project.ObjectMapper",
271 "/xyz/openbmc_project/object_mapper",
272 "xyz.openbmc_project.ObjectMapper", "GetSubTree", searchPath, 0,
273 ifaceList);
274}
275
276void processSensorConfigs(
277 boost::asio::io_context& io, sdbusplus::asio::object_server& objectServer,
278 boost::container::flat_map<std::string, std::shared_ptr<GpuDevice>>&
279 gpuDevice,
280 std::shared_ptr<sdbusplus::asio::connection>& dbusConnection,
281 mctp::MctpRequester& mctpRequester, const ManagedObjectType& resp)
282{
283 for (const auto& [path, interfaces] : resp)
284 {
285 for (const auto& [intf, cfg] : interfaces)
286 {
287 if (intf != configInterfaceName(sensorType))
288 {
289 continue;
290 }
291
292 std::string name = loadVariant<std::string>(cfg, "Name");
293
294 gpuDevice[name] = std::make_shared<GpuDevice>(
295 name, path, dbusConnection, io, mctpRequester, objectServer);
296 }
297 }
298}
299
300void createSensors(
301 boost::asio::io_context& io, sdbusplus::asio::object_server& objectServer,
302 boost::container::flat_map<std::string, std::shared_ptr<GpuDevice>>&
303 gpuDevice,
304 std::shared_ptr<sdbusplus::asio::connection>& dbusConnection,
305 mctp::MctpRequester& mctpRequester)
306{
307 if (!dbusConnection)
308 {
309 lg2::error("Connection not created");
310 return;
311 }
312 dbusConnection->async_method_call(
313 [&gpuDevice, &mctpRequester, &dbusConnection, &io, &objectServer](
314 boost::system::error_code ec, const ManagedObjectType& resp) {
315 if (ec)
316 {
317 lg2::error("Error contacting entity manager");
318 return;
319 }
320
321 processSensorConfigs(io, objectServer, gpuDevice, dbusConnection,
322 mctpRequester, resp);
323 },
324 entityManagerName, "/xyz/openbmc_project/inventory",
325 "org.freedesktop.DBus.ObjectManager", "GetManagedObjects");
326}
327
328void interfaceRemoved(
329 sdbusplus::message_t& message,
330 boost::container::flat_map<std::string, std::shared_ptr<GpuDevice>>&
331 gpuDevice)
332{
333 if (message.is_method_error())
334 {
335 lg2::error("interfacesRemoved callback method error");
336 return;
337 }
338
339 sdbusplus::message::object_path removedPath;
340 std::vector<std::string> interfaces;
341
342 message.read(removedPath, interfaces);
343
344 // If the xyz.openbmc_project.Confguration.X interface was removed
345 // for one or more sensors, delete those sensor objects.
346 auto sensorIt = gpuDevice.begin();
347 while (sensorIt != gpuDevice.end())
348 {
349 if ((sensorIt->second->getPath() == removedPath) &&
350 (std::find(interfaces.begin(), interfaces.end(),
351 configInterfaceName(sensorType)) != interfaces.end()))
352 {
353 sensorIt = gpuDevice.erase(sensorIt);
354 }
355 else
356 {
357 sensorIt++;
358 }
359 }
360}