blob: 119554d61f346e55687734debb0df0b2e6f7a984 [file] [log] [blame]
Harshit Agheraacd375a2025-04-21 19:50:10 +05301/*
2 * SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION &
3 * AFFILIATES. All rights reserved. SPDX-License-Identifier: Apache-2.0
4 */
5
6#include "GpuSensor.hpp"
7
Harshit Agheraa3f24f42025-04-21 20:04:56 +05308#include "SensorPaths.hpp"
Harshit Agheraacd375a2025-04-21 19:50:10 +05309#include "Thresholds.hpp"
10#include "Utils.hpp"
11#include "sensor.hpp"
12
13#include <bits/basic_string.h>
14
Harshit Agheraa3f24f42025-04-21 20:04:56 +053015#include <GpuMctpVdm.hpp>
16#include <MctpRequester.hpp>
17#include <OcpMctpVdm.hpp>
Harshit Agheraacd375a2025-04-21 19:50:10 +053018#include <boost/asio/io_context.hpp>
19#include <boost/container/flat_map.hpp>
20#include <phosphor-logging/lg2.hpp>
21#include <sdbusplus/asio/connection.hpp>
22#include <sdbusplus/asio/object_server.hpp>
23#include <sdbusplus/message.hpp>
24#include <sdbusplus/message/native_types.hpp>
25
26#include <algorithm>
27#include <chrono>
28#include <cstddef>
29#include <cstdint>
Harshit Agheraa3f24f42025-04-21 20:04:56 +053030#include <functional>
Harshit Agheraacd375a2025-04-21 19:50:10 +053031#include <map>
32#include <memory>
33#include <string>
34#include <utility>
35#include <variant>
36#include <vector>
37
38using namespace std::literals;
39
/// Sensor ID placed in every temperature-reading request (see
/// GpuTempSensor::update()).
static constexpr uint8_t gpuTempSensorId{0};

/// Poll interval handed to each sensor created by processSensorConfigs().
static constexpr std::chrono::milliseconds samplingInterval{1000};

/// Maximum reading passed to the Sensor base-class constructor.
static constexpr double gpuTempSensorMaxReading = 127;

/// Minimum reading passed to the Sensor base-class constructor.
static constexpr double gpuTempSensorMinReading = -128;
44
45GpuTempSensor::GpuTempSensor(
46 std::shared_ptr<sdbusplus::asio::connection>& conn,
Harshit Agheraa3f24f42025-04-21 20:04:56 +053047 boost::asio::io_context& io, mctp::MctpRequester& mctpRequester,
48 const std::string& name, const std::string& sensorConfiguration,
Harshit Agheraacd375a2025-04-21 19:50:10 +053049 sdbusplus::asio::object_server& objectServer,
Harshit Agheraa3f24f42025-04-21 20:04:56 +053050 std::vector<thresholds::Threshold>&& thresholdData,
51 std::chrono::milliseconds pollRate) :
Harshit Agheraacd375a2025-04-21 19:50:10 +053052 Sensor(escapeName(name), std::move(thresholdData), sensorConfiguration,
53 "temperature", false, true, gpuTempSensorMaxReading,
54 gpuTempSensorMinReading, conn),
Harshit Agheraa3f24f42025-04-21 20:04:56 +053055 sensorId{gpuTempSensorId}, sensorPollMs(pollRate),
56 waitTimer(io, std::chrono::steady_clock::duration(0)),
57 mctpRequester(mctpRequester), conn(conn), objectServer(objectServer)
Harshit Agheraacd375a2025-04-21 19:50:10 +053058{
59 std::string dbusPath =
60 sensorPathPrefix + "temperature/"s + escapeName(name);
61
62 sensorInterface = objectServer.add_interface(
63 dbusPath, "xyz.openbmc_project.Sensor.Value");
64
65 for (const auto& threshold : thresholds)
66 {
67 std::string interface = thresholds::getInterface(threshold.level);
68 thresholdInterfaces[static_cast<size_t>(threshold.level)] =
69 objectServer.add_interface(dbusPath, interface);
70 }
71
72 association = objectServer.add_interface(dbusPath, association::interface);
73
74 init();
75}
76
77GpuTempSensor::~GpuTempSensor()
78{
79 waitTimer.cancel();
80 for (const auto& iface : thresholdInterfaces)
81 {
82 objectServer.remove_interface(iface);
83 }
84 objectServer.remove_interface(sensorInterface);
85 objectServer.remove_interface(association);
86}
87
88void GpuTempSensor::checkThresholds()
89{
90 thresholds::checkThresholds(this);
91}
92
93void GpuTempSensor::init()
94{
95 discoverGpus();
96}
97
Harshit Agheraa3f24f42025-04-21 20:04:56 +053098void GpuTempSensor::read()
99{
100 update();
101
102 waitTimer.expires_after(std::chrono::milliseconds(sensorPollMs));
103 waitTimer.async_wait([this](const boost::system::error_code& ec) {
104 if (ec)
105 {
106 return;
107 }
108 read();
109 });
110}
111
112void GpuTempSensor::update()
113{
114 std::vector<uint8_t> reqMsg(
115 sizeof(ocp::accelerator_management::BindingPciVid) +
116 sizeof(gpu::GetTemperatureReadingRequest));
117
118 auto* msg = new (reqMsg.data()) ocp::accelerator_management::Message;
119
120 auto rc = gpu::encodeGetTemperatureReadingRequest(0, sensorId, *msg);
121 if (rc != ocp::accelerator_management::CompletionCode::SUCCESS)
122 {
123 lg2::error(
124 "GpuTempSensor::update(): gpuEncodeGetTemperatureReadingRequest failed, rc={RC}",
125 "RC", static_cast<int>(rc));
126 return;
127 }
128
129 mctpRequester.sendRecvMsg(
130 eid, reqMsg,
131 [this](int sendRecvMsgResult, std::vector<uint8_t> respMsg) {
132 if (sendRecvMsgResult != 0)
133 {
134 lg2::error(
135 "GpuTempSensor::update(): MctpRequester::sendRecvMsg() failed, rc={RC}",
136 "RC", sendRecvMsgResult);
137 return;
138 }
139
140 if (respMsg.empty())
141 {
142 lg2::error(
143 "GpuTempSensor::update(): MctpRequester::sendRecvMsg() failed, respMsgLen=0");
144 return;
145 }
146
147 uint8_t cc = 0;
148 uint16_t reasonCode = 0;
149 double tempValue = 0;
150
151 auto rc = gpu::decodeGetTemperatureReadingResponse(
152 *new (respMsg.data()) ocp::accelerator_management::Message,
153 respMsg.size(), cc, reasonCode, tempValue);
154
155 if (rc != ocp::accelerator_management::CompletionCode::SUCCESS ||
156 cc != static_cast<uint8_t>(
157 ocp::accelerator_management::CompletionCode::SUCCESS))
158 {
159 lg2::error(
160 "GpuTempSensor::update(): gpuDecodeGetTemperatureReadingResponse() failed, rc={RC} cc={CC} reasonCode={RESC}",
161 "RC", static_cast<int>(rc), "CC", cc, "RESC", reasonCode);
162 return;
163 }
164
165 updateValue(tempValue);
166 });
167}
168
169void GpuTempSensor::processGpuEndpoint(uint8_t eid)
170{
171 std::vector<uint8_t> reqMsg(
172 sizeof(ocp::accelerator_management::BindingPciVid) +
173 sizeof(gpu::QueryDeviceIdentificationRequest));
174
175 auto* msg = new (reqMsg.data()) ocp::accelerator_management::Message;
176
177 auto rc = gpu::encodeQueryDeviceIdentificationRequest(0, *msg);
178 if (rc != ocp::accelerator_management::CompletionCode::SUCCESS)
179 {
180 lg2::error(
181 "GpuTempSensor::processGpuEndPoint(): gpuEncodeQueryDeviceIdentificationRequest failed, rc={RC}",
182 "RC", static_cast<int>(rc));
183 return;
184 }
185
186 mctpRequester.sendRecvMsg(
187 eid, reqMsg,
188 [this, eid](int sendRecvMsgResult, std::vector<uint8_t> respMsg) {
189 if (sendRecvMsgResult != 0)
190 {
191 lg2::error(
192 "GpuTempSensor::processGpuEndPoint(): MctpRequester::sendRecvMsg() failed, rc={RC}",
193 "RC", sendRecvMsgResult);
194 return;
195 }
196
197 if (respMsg.empty())
198 {
199 lg2::error(
200 "GpuTempSensor::processGpuEndPoint(): MctpRequester::sendRecvMsg() failed, respMsgLen=0");
201 return;
202 }
203
204 uint8_t cc = 0;
205 uint16_t reasonCode = 0;
206 uint8_t responseDeviceType = 0;
207 uint8_t responseInstanceId = 0;
208
209 auto rc = gpu::decodeQueryDeviceIdentificationResponse(
210 *new (respMsg.data()) ocp::accelerator_management::Message,
211 respMsg.size(), cc, reasonCode, responseDeviceType,
212 responseInstanceId);
213
214 if (rc != ocp::accelerator_management::CompletionCode::SUCCESS ||
215 cc != static_cast<uint8_t>(
216 ocp::accelerator_management::CompletionCode::SUCCESS))
217 {
218 lg2::error(
219 "GpuTempSensor::processGpuEndPoint(): gpuDecodeQueryDeviceIdentificationResponse() failed, rc={RC} cc={CC} reasonCode={RESC}",
220 "RC", static_cast<int>(rc), "CC", cc, "RESC", reasonCode);
221 return;
222 }
223
224 if (responseDeviceType ==
225 static_cast<uint8_t>(gpu::DeviceIdentification::DEVICE_GPU))
226 {
227 lg2::info(
228 "GpuTempSensor::processGpuEndPoint(): found the GPU with EID {EID}, DeviceType {DEVTYPE}, InstanceId {IID}.",
229 "EID", eid, "DEVTYPE", responseDeviceType, "IID",
230 responseInstanceId);
231
232 this->eid = eid;
233 setInitialProperties(sensor_paths::unitDegreesC);
234 this->read();
235 }
236 });
237}
238
Harshit Agheraacd375a2025-04-21 19:50:10 +0530239void GpuTempSensor::processMctpEndpoints(const boost::system::error_code& ec,
240 const getSubTreeRet& ret)
241{
242 if (ec)
243 {
244 lg2::error("GpuTempSensor::discoverGpus(): Error:{ERROR}", "ERROR",
245 ec.message());
246 return;
247 }
248
249 if (ret.empty())
250 {
251 return;
252 }
253
254 for (const auto& [objPath, services] : ret)
255 {
256 for (const auto& [service, ifaces] : services)
257 {
258 for (const auto& iface : ifaces)
259 {
260 if (iface == "xyz.openbmc_project.MCTP.Endpoint")
261 {
262 conn->async_method_call(
263 [this](const boost::system::error_code& ec,
264 const GpuSensorConfigMap& configs) {
265 this->processEndpointConfigs(ec, configs);
266 },
267 service, objPath, "org.freedesktop.DBus.Properties",
268 "GetAll", iface);
269 }
270 }
271 }
272 }
273}
274
275void GpuTempSensor::processEndpointConfigs(const boost::system::error_code& ec,
276 const GpuSensorConfigMap& configs)
277{
278 if (ec)
279 {
280 lg2::error("GpuTempSensor::discoverGpus(): Error:{ERROR}", "ERROR",
281 ec.message());
282 return;
283 }
284
Harshit Agheraa3f24f42025-04-21 20:04:56 +0530285 uint8_t eid{};
Harshit Agheraacd375a2025-04-21 19:50:10 +0530286 std::vector<uint8_t> mctpTypes{};
287
288 auto hasEid = configs.find("EID");
289 if (hasEid != configs.end())
290 {
291 const auto* eidPtr = std::get_if<uint8_t>(&hasEid->second);
292 if (eidPtr != nullptr)
293 {
294 eid = *eidPtr;
295 }
296 else
297 {
298 lg2::error(
299 "GpuTempSensor::discoverGpus(): Property EID does not have valid type.");
300 return;
301 }
302 }
303 else
304 {
305 lg2::error(
306 "GpuTempSensor::discoverGpus(): Property EID not found in the configuration.");
307 return;
308 }
309
310 auto hasMctpTypes = configs.find("SupportedMessageTypes");
311 if (hasMctpTypes != configs.end())
312 {
313 const auto* mctpTypePtr =
314 std::get_if<std::vector<uint8_t>>(&hasMctpTypes->second);
315 if (mctpTypePtr != nullptr)
316 {
317 mctpTypes = *mctpTypePtr;
318 }
319 else
320 {
321 lg2::error(
322 "GpuTempSensor::discoverGpus(): Property SupportedMessageTypes does not have valid type.");
323 return;
324 }
325 }
326 else
327 {
328 lg2::error(
329 "GpuTempSensor::discoverGpus(): Property SupportedMessageTypes not found in the configuration.");
330 return;
331 }
332
Harshit Agheraa3f24f42025-04-21 20:04:56 +0530333 if (std::find(mctpTypes.begin(), mctpTypes.end(),
334 ocp::accelerator_management::messageType) != mctpTypes.end())
335 {
336 lg2::info(
337 "GpuTempSensor::discoverGpus(): Found OCP MCTP VDM Endpoint with ID {EID}",
338 "EID", eid);
339 this->processGpuEndpoint(eid);
340 }
Harshit Agheraacd375a2025-04-21 19:50:10 +0530341}
342
343void GpuTempSensor::discoverGpus()
344{
345 std::string searchPath{"/au/com/codeconstruct/"};
346 std::vector<std::string> ifaceList{{"xyz.openbmc_project.MCTP.Endpoint"}};
347
348 conn->async_method_call(
349 [this](const boost::system::error_code& ec, const getSubTreeRet& ret) {
350 processMctpEndpoints(ec, ret);
351 },
352 "xyz.openbmc_project.ObjectMapper",
353 "/xyz/openbmc_project/object_mapper",
354 "xyz.openbmc_project.ObjectMapper", "GetSubTree", searchPath, 0,
355 ifaceList);
356}
357
358void processSensorConfigs(
359 boost::asio::io_context& io, sdbusplus::asio::object_server& objectServer,
360 boost::container::flat_map<std::string, std::shared_ptr<GpuTempSensor>>&
361 sensors,
362 std::shared_ptr<sdbusplus::asio::connection>& dbusConnection,
Harshit Agheraa3f24f42025-04-21 20:04:56 +0530363 mctp::MctpRequester& mctpRequester, const ManagedObjectType& resp)
Harshit Agheraacd375a2025-04-21 19:50:10 +0530364{
365 for (const auto& [path, interfaces] : resp)
366 {
367 for (const auto& [intf, cfg] : interfaces)
368 {
369 if (intf != configInterfaceName(sensorType))
370 {
371 continue;
372 }
373
374 std::string name = loadVariant<std::string>(cfg, "Name");
375
376 sensors[name] = std::make_shared<GpuTempSensor>(
Harshit Agheraa3f24f42025-04-21 20:04:56 +0530377 dbusConnection, io, mctpRequester, name, path, objectServer,
378 std::vector<thresholds::Threshold>{}, samplingInterval);
Harshit Agheraacd375a2025-04-21 19:50:10 +0530379
380 lg2::info(
381 "Added GPU Temperature Sensor {NAME} with chassis path: {PATH}.",
382 "NAME", name, "PATH", path);
383 }
384 }
385}
386
387void createSensors(
388 boost::asio::io_context& io, sdbusplus::asio::object_server& objectServer,
389 boost::container::flat_map<std::string, std::shared_ptr<GpuTempSensor>>&
390 sensors,
Harshit Agheraa3f24f42025-04-21 20:04:56 +0530391 std::shared_ptr<sdbusplus::asio::connection>& dbusConnection,
392 mctp::MctpRequester& mctpRequester)
Harshit Agheraacd375a2025-04-21 19:50:10 +0530393{
394 if (!dbusConnection)
395 {
396 lg2::error("Connection not created");
397 return;
398 }
399 dbusConnection->async_method_call(
Harshit Agheraa3f24f42025-04-21 20:04:56 +0530400 [&sensors, &mctpRequester, &dbusConnection, &io, &objectServer](
401 boost::system::error_code ec, const ManagedObjectType& resp) {
Harshit Agheraacd375a2025-04-21 19:50:10 +0530402 if (ec)
403 {
404 lg2::error("Error contacting entity manager");
405 return;
406 }
407
408 processSensorConfigs(io, objectServer, sensors, dbusConnection,
Harshit Agheraa3f24f42025-04-21 20:04:56 +0530409 mctpRequester, resp);
Harshit Agheraacd375a2025-04-21 19:50:10 +0530410 },
411 entityManagerName, "/xyz/openbmc_project/inventory",
412 "org.freedesktop.DBus.ObjectManager", "GetManagedObjects");
413}
414
415void interfaceRemoved(
416 sdbusplus::message_t& message,
417 boost::container::flat_map<std::string, std::shared_ptr<GpuTempSensor>>&
418 sensors)
419{
420 if (message.is_method_error())
421 {
422 lg2::error("interfacesRemoved callback method error");
423 return;
424 }
425
426 sdbusplus::message::object_path removedPath;
427 std::vector<std::string> interfaces;
428
429 message.read(removedPath, interfaces);
430
431 // If the xyz.openbmc_project.Confguration.X interface was removed
432 // for one or more sensors, delete those sensor objects.
433 auto sensorIt = sensors.begin();
434 while (sensorIt != sensors.end())
435 {
436 if ((sensorIt->second->configurationPath == removedPath) &&
437 (std::find(interfaces.begin(), interfaces.end(),
438 configInterfaceName(sensorType)) != interfaces.end()))
439 {
440 sensorIt = sensors.erase(sensorIt);
441 }
442 else
443 {
444 sensorIt++;
445 }
446 }
447}