blob: 9560220f8c6829d1811a069c3f0d4e37d250c4ad [file] [log] [blame]
/*
 * SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION &
 * AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: Apache-2.0
 */
6
7#include "NvidiaGpuDevice.hpp"
8
Rohit PAI0a888262025-06-11 08:52:29 +05309#include "Inventory.hpp"
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +053010#include "NvidiaDeviceDiscovery.hpp"
11#include "NvidiaGpuSensor.hpp"
12#include "Thresholds.hpp"
13#include "Utils.hpp"
14
15#include <bits/basic_string.h>
16
17#include <MctpRequester.hpp>
Harshit Aghera775199d2025-05-27 14:20:24 +053018#include <NvidiaGpuEnergySensor.hpp>
Rohit PAI0a888262025-06-11 08:52:29 +053019#include <NvidiaGpuMctpVdm.hpp>
Harshit Aghera6b712322025-07-31 19:25:12 +053020#include <NvidiaGpuPowerPeakReading.hpp>
Harshit Aghera902c6492025-05-08 15:57:42 +053021#include <NvidiaGpuPowerSensor.hpp>
Harshit Aghera5e7decc2025-05-07 16:20:16 +053022#include <NvidiaGpuThresholds.hpp>
Harshit Agherabef4d412025-05-27 14:53:56 +053023#include <NvidiaGpuVoltageSensor.hpp>
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +053024#include <boost/asio/io_context.hpp>
25#include <phosphor-logging/lg2.hpp>
26#include <sdbusplus/asio/connection.hpp>
27#include <sdbusplus/asio/object_server.hpp>
28
29#include <chrono>
30#include <cstdint>
Harshit Aghera5e7decc2025-05-07 16:20:16 +053031#include <functional>
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +053032#include <memory>
33#include <string>
Harshit Aghera5e7decc2025-05-07 16:20:16 +053034#include <utility>
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +053035#include <vector>
36
37GpuDevice::GpuDevice(const SensorConfigs& configs, const std::string& name,
38 const std::string& path,
39 const std::shared_ptr<sdbusplus::asio::connection>& conn,
40 uint8_t eid, boost::asio::io_context& io,
41 mctp::MctpRequester& mctpRequester,
42 sdbusplus::asio::object_server& objectServer) :
43 eid(eid), sensorPollMs(std::chrono::milliseconds{configs.pollRate}),
44 waitTimer(io, std::chrono::steady_clock::duration(0)),
45 mctpRequester(mctpRequester), conn(conn), objectServer(objectServer),
46 configs(configs), name(escapeName(name)), path(path)
47{
Rohit PAI0a888262025-06-11 08:52:29 +053048 inventory = std::make_shared<Inventory>(
Rohit PAIada6baa2025-07-01 18:26:19 +053049 conn, objectServer, name, mctpRequester,
50 gpu::DeviceIdentification::DEVICE_GPU, eid, io);
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +053051 makeSensors();
52}
53
54void GpuDevice::makeSensors()
55{
56 tempSensor = std::make_shared<NvidiaGpuTempSensor>(
Harshit Agheraba138da2025-05-05 12:26:35 +053057 conn, mctpRequester, name + "_TEMP_0", path, eid, gpuTempSensorId,
58 objectServer, std::vector<thresholds::Threshold>{});
59
Harshit Aghera5e7decc2025-05-07 16:20:16 +053060 readThermalParameters(
61 eid,
62 std::vector<gpuThresholdId>{gpuTLimitWarnringThresholdId,
63 gpuTLimitCriticalThresholdId,
64 gpuTLimitHardshutDownThresholdId},
65 mctpRequester,
66 std::bind_front(&GpuDevice::processTLimitThresholds, this));
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +053067
Harshit Agherab10a67b2025-05-27 12:19:29 +053068 dramTempSensor = std::make_shared<NvidiaGpuTempSensor>(
69 conn, mctpRequester, name + "_DRAM_0_TEMP_0", path, eid,
70 gpuDramTempSensorId, objectServer,
71 std::vector<thresholds::Threshold>{thresholds::Threshold{
72 thresholds::Level::CRITICAL, thresholds::Direction::HIGH, 95.0}});
73
Harshit Aghera902c6492025-05-08 15:57:42 +053074 powerSensor = std::make_shared<NvidiaGpuPowerSensor>(
75 conn, mctpRequester, name + "_Power_0", path, eid, gpuPowerSensorId,
76 objectServer, std::vector<thresholds::Threshold>{});
77
Harshit Aghera6b712322025-07-31 19:25:12 +053078 peakPower = std::make_shared<NvidiaGpuPowerPeakReading>(
79 mctpRequester, name + "_Power_0", eid, gpuPeakPowerSensorId,
80 objectServer);
81
Harshit Aghera775199d2025-05-27 14:20:24 +053082 energySensor = std::make_shared<NvidiaGpuEnergySensor>(
83 conn, mctpRequester, name + "_Energy_0", path, eid, gpuEnergySensorId,
84 objectServer, std::vector<thresholds::Threshold>{});
85
Harshit Agherabef4d412025-05-27 14:53:56 +053086 voltageSensor = std::make_shared<NvidiaGpuVoltageSensor>(
87 conn, mctpRequester, name + "_Voltage_0", path, eid, gpuVoltageSensorId,
88 objectServer, std::vector<thresholds::Threshold>{});
89
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +053090 lg2::info("Added GPU {NAME} Sensors with chassis path: {PATH}.", "NAME",
91 name, "PATH", path);
92
93 read();
94}
95
Harshit Aghera5e7decc2025-05-07 16:20:16 +053096void GpuDevice::processTLimitThresholds(uint8_t rc,
97 const std::vector<int32_t>& thresholds)
98{
99 std::vector<thresholds::Threshold> tLimitThresholds{};
100 if (rc == 0)
101 {
102 tLimitThresholds = {
103 thresholds::Threshold{thresholds::Level::WARNING,
104 thresholds::Direction::LOW,
105 static_cast<double>(thresholds[0])},
106 thresholds::Threshold{thresholds::Level::CRITICAL,
107 thresholds::Direction::LOW,
108 static_cast<double>(thresholds[1])},
109 thresholds::Threshold{thresholds::Level::HARDSHUTDOWN,
110 thresholds::Direction::LOW,
111 static_cast<double>(thresholds[2])}};
112 }
113
114 tLimitSensor = std::make_shared<NvidiaGpuTempSensor>(
115 conn, mctpRequester, name + "_TEMP_1", path, eid, gpuTLimitSensorId,
116 objectServer, std::move(tLimitThresholds));
117}
118
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +0530119void GpuDevice::read()
120{
121 tempSensor->update();
Harshit Aghera5e7decc2025-05-07 16:20:16 +0530122 if (tLimitSensor)
123 {
124 tLimitSensor->update();
125 }
Harshit Agherab10a67b2025-05-27 12:19:29 +0530126 dramTempSensor->update();
Harshit Aghera902c6492025-05-08 15:57:42 +0530127 powerSensor->update();
Harshit Aghera6b712322025-07-31 19:25:12 +0530128 peakPower->update();
Harshit Aghera775199d2025-05-27 14:20:24 +0530129 energySensor->update();
Harshit Agherabef4d412025-05-27 14:53:56 +0530130 voltageSensor->update();
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +0530131
132 waitTimer.expires_after(std::chrono::milliseconds(sensorPollMs));
133 waitTimer.async_wait([this](const boost::system::error_code& ec) {
134 if (ec)
135 {
136 return;
137 }
138 read();
139 });
140}