blob: 38cbf242194dcc3a0b6270f56ad238fdf5a307e1 [file] [log] [blame]
/*
 * SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION &
 * AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: Apache-2.0
 */
6
7#include "NvidiaGpuDevice.hpp"
8
9#include "NvidiaDeviceDiscovery.hpp"
10#include "NvidiaGpuSensor.hpp"
11#include "Thresholds.hpp"
12#include "Utils.hpp"
13
14#include <bits/basic_string.h>
15
16#include <MctpRequester.hpp>
Harshit Agherac8dab722025-05-08 15:57:42 +053017#include <NvidiaGpuPowerSensor.hpp>
Harshit Agherac20108d2025-05-07 16:20:16 +053018#include <NvidiaGpuThresholds.hpp>
Harshit Agherafa2a5b92025-05-22 11:35:39 +053019#include <boost/asio/io_context.hpp>
20#include <phosphor-logging/lg2.hpp>
21#include <sdbusplus/asio/connection.hpp>
22#include <sdbusplus/asio/object_server.hpp>
23
24#include <chrono>
25#include <cstdint>
26#include <memory>
27#include <string>
Harshit Agherac20108d2025-05-07 16:20:16 +053028#include <utility>
Harshit Agherafa2a5b92025-05-22 11:35:39 +053029#include <vector>
30
31GpuDevice::GpuDevice(const SensorConfigs& configs, const std::string& name,
32 const std::string& path,
33 const std::shared_ptr<sdbusplus::asio::connection>& conn,
34 uint8_t eid, boost::asio::io_context& io,
35 mctp::MctpRequester& mctpRequester,
36 sdbusplus::asio::object_server& objectServer) :
37 eid(eid), sensorPollMs(std::chrono::milliseconds{configs.pollRate}),
38 waitTimer(io, std::chrono::steady_clock::duration(0)),
39 mctpRequester(mctpRequester), conn(conn), objectServer(objectServer),
40 configs(configs), name(escapeName(name)), path(path)
41{
42 makeSensors();
43}
44
45void GpuDevice::makeSensors()
46{
47 tempSensor = std::make_shared<NvidiaGpuTempSensor>(
Harshit Aghera0e1718c2025-05-05 12:26:35 +053048 conn, mctpRequester, name + "_TEMP_0", path, eid, gpuTempSensorId,
49 objectServer, std::vector<thresholds::Threshold>{});
50
Harshit Agherac20108d2025-05-07 16:20:16 +053051 readThermalParameters(
52 eid,
53 std::vector<gpuThresholdId>{gpuTLimitWarnringThresholdId,
54 gpuTLimitCriticalThresholdId,
55 gpuTLimitHardshutDownThresholdId},
56 mctpRequester, [this](uint8_t rc, std::vector<int32_t> thresholds) {
57 std::vector<thresholds::Threshold> tLimitThresholds{};
58 if (rc == 0)
59 {
60 tLimitThresholds = {
61 thresholds::Threshold{thresholds::Level::WARNING,
62 thresholds::Direction::LOW,
63 static_cast<double>(thresholds[0])},
64 thresholds::Threshold{thresholds::Level::CRITICAL,
65 thresholds::Direction::LOW,
66 static_cast<double>(thresholds[1])},
67 thresholds::Threshold{thresholds::Level::HARDSHUTDOWN,
68 thresholds::Direction::LOW,
69 static_cast<double>(thresholds[2])}};
70 }
71
72 tLimitSensor = std::make_shared<NvidiaGpuTempSensor>(
73 conn, mctpRequester, name + "_TEMP_1", path, eid,
74 gpuTLimitSensorId, objectServer, std::move(tLimitThresholds));
75 });
Harshit Agherafa2a5b92025-05-22 11:35:39 +053076
Harshit Agherac8dab722025-05-08 15:57:42 +053077 powerSensor = std::make_shared<NvidiaGpuPowerSensor>(
78 conn, mctpRequester, name + "_Power_0", path, eid, gpuPowerSensorId,
79 objectServer, std::vector<thresholds::Threshold>{});
80
Harshit Agherafa2a5b92025-05-22 11:35:39 +053081 lg2::info("Added GPU {NAME} Sensors with chassis path: {PATH}.", "NAME",
82 name, "PATH", path);
83
84 read();
85}
86
87void GpuDevice::read()
88{
89 tempSensor->update();
Harshit Agherac20108d2025-05-07 16:20:16 +053090 if (tLimitSensor)
91 {
92 tLimitSensor->update();
93 }
Harshit Agherac8dab722025-05-08 15:57:42 +053094 powerSensor->update();
Harshit Agherafa2a5b92025-05-22 11:35:39 +053095
96 waitTimer.expires_after(std::chrono::milliseconds(sensorPollMs));
97 waitTimer.async_wait([this](const boost::system::error_code& ec) {
98 if (ec)
99 {
100 return;
101 }
102 read();
103 });
104}