blob: 6bb8d7158bcbfa05cc6a09778a3f16e813e85c10 [file] [log] [blame]
/*
 * SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION &
 * AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: Apache-2.0
 */
6
7#include "NvidiaGpuDevice.hpp"
8
Rohit PAId87bf7f2025-06-11 08:52:29 +05309#include "Inventory.hpp"
Harshit Agherafa2a5b92025-05-22 11:35:39 +053010#include "NvidiaDeviceDiscovery.hpp"
11#include "NvidiaGpuSensor.hpp"
12#include "Thresholds.hpp"
13#include "Utils.hpp"
14
15#include <bits/basic_string.h>
16
17#include <MctpRequester.hpp>
Harshit Aghera128c91d2025-05-27 14:20:24 +053018#include <NvidiaGpuEnergySensor.hpp>
Harshit Agherac8dab722025-05-08 15:57:42 +053019#include <NvidiaGpuPowerSensor.hpp>
Harshit Agherac20108d2025-05-07 16:20:16 +053020#include <NvidiaGpuThresholds.hpp>
Harshit Agherab55847f2025-05-27 14:53:56 +053021#include <NvidiaGpuVoltageSensor.hpp>
Harshit Agherafa2a5b92025-05-22 11:35:39 +053022#include <boost/asio/io_context.hpp>
23#include <phosphor-logging/lg2.hpp>
24#include <sdbusplus/asio/connection.hpp>
25#include <sdbusplus/asio/object_server.hpp>
26
27#include <chrono>
28#include <cstdint>
29#include <memory>
30#include <string>
Harshit Agherac20108d2025-05-07 16:20:16 +053031#include <utility>
Harshit Agherafa2a5b92025-05-22 11:35:39 +053032#include <vector>
33
34GpuDevice::GpuDevice(const SensorConfigs& configs, const std::string& name,
35 const std::string& path,
36 const std::shared_ptr<sdbusplus::asio::connection>& conn,
37 uint8_t eid, boost::asio::io_context& io,
38 mctp::MctpRequester& mctpRequester,
39 sdbusplus::asio::object_server& objectServer) :
40 eid(eid), sensorPollMs(std::chrono::milliseconds{configs.pollRate}),
41 waitTimer(io, std::chrono::steady_clock::duration(0)),
42 mctpRequester(mctpRequester), conn(conn), objectServer(objectServer),
43 configs(configs), name(escapeName(name)), path(path)
44{
Rohit PAId87bf7f2025-06-11 08:52:29 +053045 inventory =
46 std::make_unique<Inventory>(conn, objectServer, name, mctpRequester,
47 Inventory::DeviceType::GPU, eid);
Harshit Agherafa2a5b92025-05-22 11:35:39 +053048 makeSensors();
49}
50
51void GpuDevice::makeSensors()
52{
53 tempSensor = std::make_shared<NvidiaGpuTempSensor>(
Harshit Aghera0e1718c2025-05-05 12:26:35 +053054 conn, mctpRequester, name + "_TEMP_0", path, eid, gpuTempSensorId,
55 objectServer, std::vector<thresholds::Threshold>{});
56
Harshit Agherac20108d2025-05-07 16:20:16 +053057 readThermalParameters(
58 eid,
59 std::vector<gpuThresholdId>{gpuTLimitWarnringThresholdId,
60 gpuTLimitCriticalThresholdId,
61 gpuTLimitHardshutDownThresholdId},
62 mctpRequester, [this](uint8_t rc, std::vector<int32_t> thresholds) {
63 std::vector<thresholds::Threshold> tLimitThresholds{};
64 if (rc == 0)
65 {
66 tLimitThresholds = {
67 thresholds::Threshold{thresholds::Level::WARNING,
68 thresholds::Direction::LOW,
69 static_cast<double>(thresholds[0])},
70 thresholds::Threshold{thresholds::Level::CRITICAL,
71 thresholds::Direction::LOW,
72 static_cast<double>(thresholds[1])},
73 thresholds::Threshold{thresholds::Level::HARDSHUTDOWN,
74 thresholds::Direction::LOW,
75 static_cast<double>(thresholds[2])}};
76 }
77
78 tLimitSensor = std::make_shared<NvidiaGpuTempSensor>(
79 conn, mctpRequester, name + "_TEMP_1", path, eid,
80 gpuTLimitSensorId, objectServer, std::move(tLimitThresholds));
81 });
Harshit Agherafa2a5b92025-05-22 11:35:39 +053082
Harshit Agherac8dab722025-05-08 15:57:42 +053083 powerSensor = std::make_shared<NvidiaGpuPowerSensor>(
84 conn, mctpRequester, name + "_Power_0", path, eid, gpuPowerSensorId,
85 objectServer, std::vector<thresholds::Threshold>{});
86
Harshit Aghera128c91d2025-05-27 14:20:24 +053087 energySensor = std::make_shared<NvidiaGpuEnergySensor>(
88 conn, mctpRequester, name + "_Energy_0", path, eid, gpuEnergySensorId,
89 objectServer, std::vector<thresholds::Threshold>{});
90
Harshit Agherab55847f2025-05-27 14:53:56 +053091 voltageSensor = std::make_shared<NvidiaGpuVoltageSensor>(
92 conn, mctpRequester, name + "_Voltage_0", path, eid, gpuVoltageSensorId,
93 objectServer, std::vector<thresholds::Threshold>{});
94
Harshit Agherafa2a5b92025-05-22 11:35:39 +053095 lg2::info("Added GPU {NAME} Sensors with chassis path: {PATH}.", "NAME",
96 name, "PATH", path);
97
98 read();
99}
100
101void GpuDevice::read()
102{
103 tempSensor->update();
Harshit Agherac20108d2025-05-07 16:20:16 +0530104 if (tLimitSensor)
105 {
106 tLimitSensor->update();
107 }
Harshit Agherac8dab722025-05-08 15:57:42 +0530108 powerSensor->update();
Harshit Aghera128c91d2025-05-27 14:20:24 +0530109 energySensor->update();
Harshit Agherab55847f2025-05-27 14:53:56 +0530110 voltageSensor->update();
Harshit Agherafa2a5b92025-05-22 11:35:39 +0530111
112 waitTimer.expires_after(std::chrono::milliseconds(sensorPollMs));
113 waitTimer.async_wait([this](const boost::system::error_code& ec) {
114 if (ec)
115 {
116 return;
117 }
118 read();
119 });
120}