blob: 5f3e83b587d8c1bef9a1923589d77a24cf6ce897 [file] [log] [blame]
Harshit Agherafa2a5b92025-05-22 11:35:39 +05301/*
2 * SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION &
3 * AFFILIATES. All rights reserved.
4 * SPDX-License-Identifier: Apache-2.0
5 */
6
7#include "NvidiaGpuDevice.hpp"
8
9#include "NvidiaDeviceDiscovery.hpp"
10#include "NvidiaGpuSensor.hpp"
11#include "Thresholds.hpp"
12#include "Utils.hpp"
13
14#include <bits/basic_string.h>
15
16#include <MctpRequester.hpp>
Harshit Aghera128c91d2025-05-27 14:20:24 +053017#include <NvidiaGpuEnergySensor.hpp>
Harshit Agherac8dab722025-05-08 15:57:42 +053018#include <NvidiaGpuPowerSensor.hpp>
Harshit Agherac20108d2025-05-07 16:20:16 +053019#include <NvidiaGpuThresholds.hpp>
Harshit Agherafa2a5b92025-05-22 11:35:39 +053020#include <boost/asio/io_context.hpp>
21#include <phosphor-logging/lg2.hpp>
22#include <sdbusplus/asio/connection.hpp>
23#include <sdbusplus/asio/object_server.hpp>
24
25#include <chrono>
26#include <cstdint>
27#include <memory>
28#include <string>
Harshit Agherac20108d2025-05-07 16:20:16 +053029#include <utility>
Harshit Agherafa2a5b92025-05-22 11:35:39 +053030#include <vector>
31
32GpuDevice::GpuDevice(const SensorConfigs& configs, const std::string& name,
33 const std::string& path,
34 const std::shared_ptr<sdbusplus::asio::connection>& conn,
35 uint8_t eid, boost::asio::io_context& io,
36 mctp::MctpRequester& mctpRequester,
37 sdbusplus::asio::object_server& objectServer) :
38 eid(eid), sensorPollMs(std::chrono::milliseconds{configs.pollRate}),
39 waitTimer(io, std::chrono::steady_clock::duration(0)),
40 mctpRequester(mctpRequester), conn(conn), objectServer(objectServer),
41 configs(configs), name(escapeName(name)), path(path)
42{
43 makeSensors();
44}
45
46void GpuDevice::makeSensors()
47{
48 tempSensor = std::make_shared<NvidiaGpuTempSensor>(
Harshit Aghera0e1718c2025-05-05 12:26:35 +053049 conn, mctpRequester, name + "_TEMP_0", path, eid, gpuTempSensorId,
50 objectServer, std::vector<thresholds::Threshold>{});
51
Harshit Agherac20108d2025-05-07 16:20:16 +053052 readThermalParameters(
53 eid,
54 std::vector<gpuThresholdId>{gpuTLimitWarnringThresholdId,
55 gpuTLimitCriticalThresholdId,
56 gpuTLimitHardshutDownThresholdId},
57 mctpRequester, [this](uint8_t rc, std::vector<int32_t> thresholds) {
58 std::vector<thresholds::Threshold> tLimitThresholds{};
59 if (rc == 0)
60 {
61 tLimitThresholds = {
62 thresholds::Threshold{thresholds::Level::WARNING,
63 thresholds::Direction::LOW,
64 static_cast<double>(thresholds[0])},
65 thresholds::Threshold{thresholds::Level::CRITICAL,
66 thresholds::Direction::LOW,
67 static_cast<double>(thresholds[1])},
68 thresholds::Threshold{thresholds::Level::HARDSHUTDOWN,
69 thresholds::Direction::LOW,
70 static_cast<double>(thresholds[2])}};
71 }
72
73 tLimitSensor = std::make_shared<NvidiaGpuTempSensor>(
74 conn, mctpRequester, name + "_TEMP_1", path, eid,
75 gpuTLimitSensorId, objectServer, std::move(tLimitThresholds));
76 });
Harshit Agherafa2a5b92025-05-22 11:35:39 +053077
Harshit Agherac8dab722025-05-08 15:57:42 +053078 powerSensor = std::make_shared<NvidiaGpuPowerSensor>(
79 conn, mctpRequester, name + "_Power_0", path, eid, gpuPowerSensorId,
80 objectServer, std::vector<thresholds::Threshold>{});
81
Harshit Aghera128c91d2025-05-27 14:20:24 +053082 energySensor = std::make_shared<NvidiaGpuEnergySensor>(
83 conn, mctpRequester, name + "_Energy_0", path, eid, gpuEnergySensorId,
84 objectServer, std::vector<thresholds::Threshold>{});
85
Harshit Agherafa2a5b92025-05-22 11:35:39 +053086 lg2::info("Added GPU {NAME} Sensors with chassis path: {PATH}.", "NAME",
87 name, "PATH", path);
88
89 read();
90}
91
92void GpuDevice::read()
93{
94 tempSensor->update();
Harshit Agherac20108d2025-05-07 16:20:16 +053095 if (tLimitSensor)
96 {
97 tLimitSensor->update();
98 }
Harshit Agherac8dab722025-05-08 15:57:42 +053099 powerSensor->update();
Harshit Aghera128c91d2025-05-27 14:20:24 +0530100 energySensor->update();
Harshit Agherafa2a5b92025-05-22 11:35:39 +0530101
102 waitTimer.expires_after(std::chrono::milliseconds(sensorPollMs));
103 waitTimer.async_wait([this](const boost::system::error_code& ec) {
104 if (ec)
105 {
106 return;
107 }
108 read();
109 });
110}