blob: 03550e5aaa04e6ac97793a7ceb927ae4c78229af [file] [log] [blame]
/*
 * SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION &
 * AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: Apache-2.0
 */
6
7#include "NvidiaGpuDevice.hpp"
8
9#include "NvidiaDeviceDiscovery.hpp"
10#include "NvidiaGpuSensor.hpp"
11#include "Thresholds.hpp"
12#include "Utils.hpp"
13
14#include <bits/basic_string.h>
15
16#include <MctpRequester.hpp>
Harshit Agherac20108d2025-05-07 16:20:16 +053017#include <NvidiaGpuThresholds.hpp>
Harshit Agherafa2a5b92025-05-22 11:35:39 +053018#include <boost/asio/io_context.hpp>
19#include <phosphor-logging/lg2.hpp>
20#include <sdbusplus/asio/connection.hpp>
21#include <sdbusplus/asio/object_server.hpp>
22
23#include <chrono>
24#include <cstdint>
25#include <memory>
26#include <string>
Harshit Agherac20108d2025-05-07 16:20:16 +053027#include <utility>
Harshit Agherafa2a5b92025-05-22 11:35:39 +053028#include <vector>
29
30GpuDevice::GpuDevice(const SensorConfigs& configs, const std::string& name,
31 const std::string& path,
32 const std::shared_ptr<sdbusplus::asio::connection>& conn,
33 uint8_t eid, boost::asio::io_context& io,
34 mctp::MctpRequester& mctpRequester,
35 sdbusplus::asio::object_server& objectServer) :
36 eid(eid), sensorPollMs(std::chrono::milliseconds{configs.pollRate}),
37 waitTimer(io, std::chrono::steady_clock::duration(0)),
38 mctpRequester(mctpRequester), conn(conn), objectServer(objectServer),
39 configs(configs), name(escapeName(name)), path(path)
40{
41 makeSensors();
42}
43
44void GpuDevice::makeSensors()
45{
46 tempSensor = std::make_shared<NvidiaGpuTempSensor>(
Harshit Aghera0e1718c2025-05-05 12:26:35 +053047 conn, mctpRequester, name + "_TEMP_0", path, eid, gpuTempSensorId,
48 objectServer, std::vector<thresholds::Threshold>{});
49
Harshit Agherac20108d2025-05-07 16:20:16 +053050 readThermalParameters(
51 eid,
52 std::vector<gpuThresholdId>{gpuTLimitWarnringThresholdId,
53 gpuTLimitCriticalThresholdId,
54 gpuTLimitHardshutDownThresholdId},
55 mctpRequester, [this](uint8_t rc, std::vector<int32_t> thresholds) {
56 std::vector<thresholds::Threshold> tLimitThresholds{};
57 if (rc == 0)
58 {
59 tLimitThresholds = {
60 thresholds::Threshold{thresholds::Level::WARNING,
61 thresholds::Direction::LOW,
62 static_cast<double>(thresholds[0])},
63 thresholds::Threshold{thresholds::Level::CRITICAL,
64 thresholds::Direction::LOW,
65 static_cast<double>(thresholds[1])},
66 thresholds::Threshold{thresholds::Level::HARDSHUTDOWN,
67 thresholds::Direction::LOW,
68 static_cast<double>(thresholds[2])}};
69 }
70
71 tLimitSensor = std::make_shared<NvidiaGpuTempSensor>(
72 conn, mctpRequester, name + "_TEMP_1", path, eid,
73 gpuTLimitSensorId, objectServer, std::move(tLimitThresholds));
74 });
Harshit Agherafa2a5b92025-05-22 11:35:39 +053075
76 lg2::info("Added GPU {NAME} Sensors with chassis path: {PATH}.", "NAME",
77 name, "PATH", path);
78
79 read();
80}
81
82void GpuDevice::read()
83{
84 tempSensor->update();
Harshit Agherac20108d2025-05-07 16:20:16 +053085 if (tLimitSensor)
86 {
87 tLimitSensor->update();
88 }
Harshit Agherafa2a5b92025-05-22 11:35:39 +053089
90 waitTimer.expires_after(std::chrono::milliseconds(sensorPollMs));
91 waitTimer.async_wait([this](const boost::system::error_code& ec) {
92 if (ec)
93 {
94 return;
95 }
96 read();
97 });
98}