blob: 509a353d8f65ab93d8e8d5396babe6686193a6e4 [file] [log] [blame]
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +05301/*
2 * SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION &
3 * AFFILIATES. All rights reserved.
4 * SPDX-License-Identifier: Apache-2.0
5 */
6
7#include "NvidiaGpuDevice.hpp"
8
9#include "NvidiaDeviceDiscovery.hpp"
10#include "NvidiaGpuSensor.hpp"
11#include "Thresholds.hpp"
12#include "Utils.hpp"
13
14#include <bits/basic_string.h>
15
16#include <MctpRequester.hpp>
Harshit Aghera5e7decc2025-05-07 16:20:16 +053017#include <NvidiaGpuThresholds.hpp>
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +053018#include <boost/asio/io_context.hpp>
19#include <phosphor-logging/lg2.hpp>
20#include <sdbusplus/asio/connection.hpp>
21#include <sdbusplus/asio/object_server.hpp>
22
23#include <chrono>
24#include <cstdint>
Harshit Aghera5e7decc2025-05-07 16:20:16 +053025#include <functional>
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +053026#include <memory>
27#include <string>
Harshit Aghera5e7decc2025-05-07 16:20:16 +053028#include <utility>
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +053029#include <vector>
30
31GpuDevice::GpuDevice(const SensorConfigs& configs, const std::string& name,
32 const std::string& path,
33 const std::shared_ptr<sdbusplus::asio::connection>& conn,
34 uint8_t eid, boost::asio::io_context& io,
35 mctp::MctpRequester& mctpRequester,
36 sdbusplus::asio::object_server& objectServer) :
37 eid(eid), sensorPollMs(std::chrono::milliseconds{configs.pollRate}),
38 waitTimer(io, std::chrono::steady_clock::duration(0)),
39 mctpRequester(mctpRequester), conn(conn), objectServer(objectServer),
40 configs(configs), name(escapeName(name)), path(path)
41{
42 makeSensors();
43}
44
45void GpuDevice::makeSensors()
46{
47 tempSensor = std::make_shared<NvidiaGpuTempSensor>(
Harshit Agheraba138da2025-05-05 12:26:35 +053048 conn, mctpRequester, name + "_TEMP_0", path, eid, gpuTempSensorId,
49 objectServer, std::vector<thresholds::Threshold>{});
50
Harshit Aghera5e7decc2025-05-07 16:20:16 +053051 readThermalParameters(
52 eid,
53 std::vector<gpuThresholdId>{gpuTLimitWarnringThresholdId,
54 gpuTLimitCriticalThresholdId,
55 gpuTLimitHardshutDownThresholdId},
56 mctpRequester,
57 std::bind_front(&GpuDevice::processTLimitThresholds, this));
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +053058
59 lg2::info("Added GPU {NAME} Sensors with chassis path: {PATH}.", "NAME",
60 name, "PATH", path);
61
62 read();
63}
64
Harshit Aghera5e7decc2025-05-07 16:20:16 +053065void GpuDevice::processTLimitThresholds(uint8_t rc,
66 const std::vector<int32_t>& thresholds)
67{
68 std::vector<thresholds::Threshold> tLimitThresholds{};
69 if (rc == 0)
70 {
71 tLimitThresholds = {
72 thresholds::Threshold{thresholds::Level::WARNING,
73 thresholds::Direction::LOW,
74 static_cast<double>(thresholds[0])},
75 thresholds::Threshold{thresholds::Level::CRITICAL,
76 thresholds::Direction::LOW,
77 static_cast<double>(thresholds[1])},
78 thresholds::Threshold{thresholds::Level::HARDSHUTDOWN,
79 thresholds::Direction::LOW,
80 static_cast<double>(thresholds[2])}};
81 }
82
83 tLimitSensor = std::make_shared<NvidiaGpuTempSensor>(
84 conn, mctpRequester, name + "_TEMP_1", path, eid, gpuTLimitSensorId,
85 objectServer, std::move(tLimitThresholds));
86}
87
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +053088void GpuDevice::read()
89{
90 tempSensor->update();
Harshit Aghera5e7decc2025-05-07 16:20:16 +053091 if (tLimitSensor)
92 {
93 tLimitSensor->update();
94 }
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +053095
96 waitTimer.expires_after(std::chrono::milliseconds(sensorPollMs));
97 waitTimer.async_wait([this](const boost::system::error_code& ec) {
98 if (ec)
99 {
100 return;
101 }
102 read();
103 });
104}