blob: d25860d7585d67d1bc8b0e849f3d26cff82ee422 [file] [log] [blame]
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +05301/*
2 * SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION &
3 * AFFILIATES. All rights reserved.
4 * SPDX-License-Identifier: Apache-2.0
5 */
6
7#include "NvidiaGpuDevice.hpp"
8
9#include "NvidiaDeviceDiscovery.hpp"
10#include "NvidiaGpuSensor.hpp"
11#include "Thresholds.hpp"
12#include "Utils.hpp"
13
14#include <bits/basic_string.h>
15
16#include <MctpRequester.hpp>
Harshit Aghera775199d2025-05-27 14:20:24 +053017#include <NvidiaGpuEnergySensor.hpp>
Harshit Aghera902c6492025-05-08 15:57:42 +053018#include <NvidiaGpuPowerSensor.hpp>
Harshit Aghera5e7decc2025-05-07 16:20:16 +053019#include <NvidiaGpuThresholds.hpp>
Harshit Agherabef4d412025-05-27 14:53:56 +053020#include <NvidiaGpuVoltageSensor.hpp>
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +053021#include <boost/asio/io_context.hpp>
22#include <phosphor-logging/lg2.hpp>
23#include <sdbusplus/asio/connection.hpp>
24#include <sdbusplus/asio/object_server.hpp>
25
26#include <chrono>
27#include <cstdint>
Harshit Aghera5e7decc2025-05-07 16:20:16 +053028#include <functional>
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +053029#include <memory>
30#include <string>
Harshit Aghera5e7decc2025-05-07 16:20:16 +053031#include <utility>
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +053032#include <vector>
33
34GpuDevice::GpuDevice(const SensorConfigs& configs, const std::string& name,
35 const std::string& path,
36 const std::shared_ptr<sdbusplus::asio::connection>& conn,
37 uint8_t eid, boost::asio::io_context& io,
38 mctp::MctpRequester& mctpRequester,
39 sdbusplus::asio::object_server& objectServer) :
40 eid(eid), sensorPollMs(std::chrono::milliseconds{configs.pollRate}),
41 waitTimer(io, std::chrono::steady_clock::duration(0)),
42 mctpRequester(mctpRequester), conn(conn), objectServer(objectServer),
43 configs(configs), name(escapeName(name)), path(path)
44{
45 makeSensors();
46}
47
48void GpuDevice::makeSensors()
49{
50 tempSensor = std::make_shared<NvidiaGpuTempSensor>(
Harshit Agheraba138da2025-05-05 12:26:35 +053051 conn, mctpRequester, name + "_TEMP_0", path, eid, gpuTempSensorId,
52 objectServer, std::vector<thresholds::Threshold>{});
53
Harshit Aghera5e7decc2025-05-07 16:20:16 +053054 readThermalParameters(
55 eid,
56 std::vector<gpuThresholdId>{gpuTLimitWarnringThresholdId,
57 gpuTLimitCriticalThresholdId,
58 gpuTLimitHardshutDownThresholdId},
59 mctpRequester,
60 std::bind_front(&GpuDevice::processTLimitThresholds, this));
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +053061
Harshit Agherab10a67b2025-05-27 12:19:29 +053062 dramTempSensor = std::make_shared<NvidiaGpuTempSensor>(
63 conn, mctpRequester, name + "_DRAM_0_TEMP_0", path, eid,
64 gpuDramTempSensorId, objectServer,
65 std::vector<thresholds::Threshold>{thresholds::Threshold{
66 thresholds::Level::CRITICAL, thresholds::Direction::HIGH, 95.0}});
67
Harshit Aghera902c6492025-05-08 15:57:42 +053068 powerSensor = std::make_shared<NvidiaGpuPowerSensor>(
69 conn, mctpRequester, name + "_Power_0", path, eid, gpuPowerSensorId,
70 objectServer, std::vector<thresholds::Threshold>{});
71
Harshit Aghera775199d2025-05-27 14:20:24 +053072 energySensor = std::make_shared<NvidiaGpuEnergySensor>(
73 conn, mctpRequester, name + "_Energy_0", path, eid, gpuEnergySensorId,
74 objectServer, std::vector<thresholds::Threshold>{});
75
Harshit Agherabef4d412025-05-27 14:53:56 +053076 voltageSensor = std::make_shared<NvidiaGpuVoltageSensor>(
77 conn, mctpRequester, name + "_Voltage_0", path, eid, gpuVoltageSensorId,
78 objectServer, std::vector<thresholds::Threshold>{});
79
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +053080 lg2::info("Added GPU {NAME} Sensors with chassis path: {PATH}.", "NAME",
81 name, "PATH", path);
82
83 read();
84}
85
Harshit Aghera5e7decc2025-05-07 16:20:16 +053086void GpuDevice::processTLimitThresholds(uint8_t rc,
87 const std::vector<int32_t>& thresholds)
88{
89 std::vector<thresholds::Threshold> tLimitThresholds{};
90 if (rc == 0)
91 {
92 tLimitThresholds = {
93 thresholds::Threshold{thresholds::Level::WARNING,
94 thresholds::Direction::LOW,
95 static_cast<double>(thresholds[0])},
96 thresholds::Threshold{thresholds::Level::CRITICAL,
97 thresholds::Direction::LOW,
98 static_cast<double>(thresholds[1])},
99 thresholds::Threshold{thresholds::Level::HARDSHUTDOWN,
100 thresholds::Direction::LOW,
101 static_cast<double>(thresholds[2])}};
102 }
103
104 tLimitSensor = std::make_shared<NvidiaGpuTempSensor>(
105 conn, mctpRequester, name + "_TEMP_1", path, eid, gpuTLimitSensorId,
106 objectServer, std::move(tLimitThresholds));
107}
108
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +0530109void GpuDevice::read()
110{
111 tempSensor->update();
Harshit Aghera5e7decc2025-05-07 16:20:16 +0530112 if (tLimitSensor)
113 {
114 tLimitSensor->update();
115 }
Harshit Agherab10a67b2025-05-27 12:19:29 +0530116 dramTempSensor->update();
Harshit Aghera902c6492025-05-08 15:57:42 +0530117 powerSensor->update();
Harshit Aghera775199d2025-05-27 14:20:24 +0530118 energySensor->update();
Harshit Agherabef4d412025-05-27 14:53:56 +0530119 voltageSensor->update();
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +0530120
121 waitTimer.expires_after(std::chrono::milliseconds(sensorPollMs));
122 waitTimer.async_wait([this](const boost::system::error_code& ec) {
123 if (ec)
124 {
125 return;
126 }
127 read();
128 });
129}