blob: 082fbbe9bd3208859561d5ff033fc9bd41822ff5 [file] [log] [blame]
/*
 * SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION &
 * AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: Apache-2.0
 */
6
7#include "NvidiaGpuDevice.hpp"
8
9#include "NvidiaDeviceDiscovery.hpp"
10#include "NvidiaGpuSensor.hpp"
11#include "Thresholds.hpp"
12#include "Utils.hpp"
13
14#include <bits/basic_string.h>
15
16#include <MctpRequester.hpp>
Harshit Aghera775199d2025-05-27 14:20:24 +053017#include <NvidiaGpuEnergySensor.hpp>
Harshit Aghera902c6492025-05-08 15:57:42 +053018#include <NvidiaGpuPowerSensor.hpp>
Harshit Aghera5e7decc2025-05-07 16:20:16 +053019#include <NvidiaGpuThresholds.hpp>
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +053020#include <boost/asio/io_context.hpp>
21#include <phosphor-logging/lg2.hpp>
22#include <sdbusplus/asio/connection.hpp>
23#include <sdbusplus/asio/object_server.hpp>
24
25#include <chrono>
26#include <cstdint>
Harshit Aghera5e7decc2025-05-07 16:20:16 +053027#include <functional>
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +053028#include <memory>
29#include <string>
Harshit Aghera5e7decc2025-05-07 16:20:16 +053030#include <utility>
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +053031#include <vector>
32
33GpuDevice::GpuDevice(const SensorConfigs& configs, const std::string& name,
34 const std::string& path,
35 const std::shared_ptr<sdbusplus::asio::connection>& conn,
36 uint8_t eid, boost::asio::io_context& io,
37 mctp::MctpRequester& mctpRequester,
38 sdbusplus::asio::object_server& objectServer) :
39 eid(eid), sensorPollMs(std::chrono::milliseconds{configs.pollRate}),
40 waitTimer(io, std::chrono::steady_clock::duration(0)),
41 mctpRequester(mctpRequester), conn(conn), objectServer(objectServer),
42 configs(configs), name(escapeName(name)), path(path)
43{
44 makeSensors();
45}
46
47void GpuDevice::makeSensors()
48{
49 tempSensor = std::make_shared<NvidiaGpuTempSensor>(
Harshit Agheraba138da2025-05-05 12:26:35 +053050 conn, mctpRequester, name + "_TEMP_0", path, eid, gpuTempSensorId,
51 objectServer, std::vector<thresholds::Threshold>{});
52
Harshit Aghera5e7decc2025-05-07 16:20:16 +053053 readThermalParameters(
54 eid,
55 std::vector<gpuThresholdId>{gpuTLimitWarnringThresholdId,
56 gpuTLimitCriticalThresholdId,
57 gpuTLimitHardshutDownThresholdId},
58 mctpRequester,
59 std::bind_front(&GpuDevice::processTLimitThresholds, this));
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +053060
Harshit Aghera902c6492025-05-08 15:57:42 +053061 powerSensor = std::make_shared<NvidiaGpuPowerSensor>(
62 conn, mctpRequester, name + "_Power_0", path, eid, gpuPowerSensorId,
63 objectServer, std::vector<thresholds::Threshold>{});
64
Harshit Aghera775199d2025-05-27 14:20:24 +053065 energySensor = std::make_shared<NvidiaGpuEnergySensor>(
66 conn, mctpRequester, name + "_Energy_0", path, eid, gpuEnergySensorId,
67 objectServer, std::vector<thresholds::Threshold>{});
68
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +053069 lg2::info("Added GPU {NAME} Sensors with chassis path: {PATH}.", "NAME",
70 name, "PATH", path);
71
72 read();
73}
74
Harshit Aghera5e7decc2025-05-07 16:20:16 +053075void GpuDevice::processTLimitThresholds(uint8_t rc,
76 const std::vector<int32_t>& thresholds)
77{
78 std::vector<thresholds::Threshold> tLimitThresholds{};
79 if (rc == 0)
80 {
81 tLimitThresholds = {
82 thresholds::Threshold{thresholds::Level::WARNING,
83 thresholds::Direction::LOW,
84 static_cast<double>(thresholds[0])},
85 thresholds::Threshold{thresholds::Level::CRITICAL,
86 thresholds::Direction::LOW,
87 static_cast<double>(thresholds[1])},
88 thresholds::Threshold{thresholds::Level::HARDSHUTDOWN,
89 thresholds::Direction::LOW,
90 static_cast<double>(thresholds[2])}};
91 }
92
93 tLimitSensor = std::make_shared<NvidiaGpuTempSensor>(
94 conn, mctpRequester, name + "_TEMP_1", path, eid, gpuTLimitSensorId,
95 objectServer, std::move(tLimitThresholds));
96}
97
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +053098void GpuDevice::read()
99{
100 tempSensor->update();
Harshit Aghera5e7decc2025-05-07 16:20:16 +0530101 if (tLimitSensor)
102 {
103 tLimitSensor->update();
104 }
Harshit Aghera902c6492025-05-08 15:57:42 +0530105 powerSensor->update();
Harshit Aghera775199d2025-05-27 14:20:24 +0530106 energySensor->update();
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +0530107
108 waitTimer.expires_after(std::chrono::milliseconds(sensorPollMs));
109 waitTimer.async_wait([this](const boost::system::error_code& ec) {
110 if (ec)
111 {
112 return;
113 }
114 read();
115 });
116}