blob: 78984c517eb880a504f0d886123053cb69532df4 [file] [log] [blame]
/*
 * SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION &
 * AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: Apache-2.0
 */
6
7#include "NvidiaGpuDevice.hpp"
8
Marc Olberdingd0125c92025-10-08 14:37:19 -07009#include "NvidiaGpuThresholds.hpp"
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +053010#include "Thresholds.hpp"
11#include "Utils.hpp"
12
Marc Olberdingd0125c92025-10-08 14:37:19 -070013#include <Inventory.hpp>
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +053014#include <MctpRequester.hpp>
Marc Olberdingd0125c92025-10-08 14:37:19 -070015#include <NvidiaDeviceDiscovery.hpp>
Harshit Aghera775199d2025-05-27 14:20:24 +053016#include <NvidiaGpuEnergySensor.hpp>
Rohit PAI0a888262025-06-11 08:52:29 +053017#include <NvidiaGpuMctpVdm.hpp>
Harshit Aghera6b712322025-07-31 19:25:12 +053018#include <NvidiaGpuPowerPeakReading.hpp>
Harshit Aghera902c6492025-05-08 15:57:42 +053019#include <NvidiaGpuPowerSensor.hpp>
Marc Olberdingd0125c92025-10-08 14:37:19 -070020#include <NvidiaGpuSensor.hpp>
Harshit Agherabef4d412025-05-27 14:53:56 +053021#include <NvidiaGpuVoltageSensor.hpp>
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +053022#include <boost/asio/io_context.hpp>
23#include <phosphor-logging/lg2.hpp>
24#include <sdbusplus/asio/connection.hpp>
25#include <sdbusplus/asio/object_server.hpp>
26
27#include <chrono>
28#include <cstdint>
Harshit Aghera5e7decc2025-05-07 16:20:16 +053029#include <functional>
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +053030#include <memory>
31#include <string>
Harshit Aghera5e7decc2025-05-07 16:20:16 +053032#include <utility>
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +053033#include <vector>
34
35GpuDevice::GpuDevice(const SensorConfigs& configs, const std::string& name,
36 const std::string& path,
37 const std::shared_ptr<sdbusplus::asio::connection>& conn,
38 uint8_t eid, boost::asio::io_context& io,
39 mctp::MctpRequester& mctpRequester,
40 sdbusplus::asio::object_server& objectServer) :
41 eid(eid), sensorPollMs(std::chrono::milliseconds{configs.pollRate}),
42 waitTimer(io, std::chrono::steady_clock::duration(0)),
43 mctpRequester(mctpRequester), conn(conn), objectServer(objectServer),
44 configs(configs), name(escapeName(name)), path(path)
45{
Rohit PAI0a888262025-06-11 08:52:29 +053046 inventory = std::make_shared<Inventory>(
Rohit PAIada6baa2025-07-01 18:26:19 +053047 conn, objectServer, name, mctpRequester,
48 gpu::DeviceIdentification::DEVICE_GPU, eid, io);
Marc Olberdingac920732025-09-28 21:56:54 -070049}
50
51void GpuDevice::init()
52{
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +053053 makeSensors();
Marc Olberdingac920732025-09-28 21:56:54 -070054 inventory->init();
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +053055}
56
57void GpuDevice::makeSensors()
58{
59 tempSensor = std::make_shared<NvidiaGpuTempSensor>(
Harshit Agheraba138da2025-05-05 12:26:35 +053060 conn, mctpRequester, name + "_TEMP_0", path, eid, gpuTempSensorId,
61 objectServer, std::vector<thresholds::Threshold>{});
62
Harshit Aghera5e7decc2025-05-07 16:20:16 +053063 readThermalParameters(
64 eid,
65 std::vector<gpuThresholdId>{gpuTLimitWarnringThresholdId,
66 gpuTLimitCriticalThresholdId,
67 gpuTLimitHardshutDownThresholdId},
68 mctpRequester,
69 std::bind_front(&GpuDevice::processTLimitThresholds, this));
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +053070
Harshit Agherab10a67b2025-05-27 12:19:29 +053071 dramTempSensor = std::make_shared<NvidiaGpuTempSensor>(
72 conn, mctpRequester, name + "_DRAM_0_TEMP_0", path, eid,
73 gpuDramTempSensorId, objectServer,
74 std::vector<thresholds::Threshold>{thresholds::Threshold{
75 thresholds::Level::CRITICAL, thresholds::Direction::HIGH, 95.0}});
76
Harshit Aghera902c6492025-05-08 15:57:42 +053077 powerSensor = std::make_shared<NvidiaGpuPowerSensor>(
78 conn, mctpRequester, name + "_Power_0", path, eid, gpuPowerSensorId,
79 objectServer, std::vector<thresholds::Threshold>{});
80
Harshit Aghera6b712322025-07-31 19:25:12 +053081 peakPower = std::make_shared<NvidiaGpuPowerPeakReading>(
82 mctpRequester, name + "_Power_0", eid, gpuPeakPowerSensorId,
83 objectServer);
84
Harshit Aghera775199d2025-05-27 14:20:24 +053085 energySensor = std::make_shared<NvidiaGpuEnergySensor>(
86 conn, mctpRequester, name + "_Energy_0", path, eid, gpuEnergySensorId,
87 objectServer, std::vector<thresholds::Threshold>{});
88
Harshit Agherabef4d412025-05-27 14:53:56 +053089 voltageSensor = std::make_shared<NvidiaGpuVoltageSensor>(
90 conn, mctpRequester, name + "_Voltage_0", path, eid, gpuVoltageSensorId,
91 objectServer, std::vector<thresholds::Threshold>{});
92
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +053093 lg2::info("Added GPU {NAME} Sensors with chassis path: {PATH}.", "NAME",
94 name, "PATH", path);
95
96 read();
97}
98
Harshit Aghera5e7decc2025-05-07 16:20:16 +053099void GpuDevice::processTLimitThresholds(uint8_t rc,
100 const std::vector<int32_t>& thresholds)
101{
102 std::vector<thresholds::Threshold> tLimitThresholds{};
103 if (rc == 0)
104 {
105 tLimitThresholds = {
106 thresholds::Threshold{thresholds::Level::WARNING,
107 thresholds::Direction::LOW,
108 static_cast<double>(thresholds[0])},
109 thresholds::Threshold{thresholds::Level::CRITICAL,
110 thresholds::Direction::LOW,
111 static_cast<double>(thresholds[1])},
112 thresholds::Threshold{thresholds::Level::HARDSHUTDOWN,
113 thresholds::Direction::LOW,
114 static_cast<double>(thresholds[2])}};
115 }
116
117 tLimitSensor = std::make_shared<NvidiaGpuTempSensor>(
118 conn, mctpRequester, name + "_TEMP_1", path, eid, gpuTLimitSensorId,
119 objectServer, std::move(tLimitThresholds));
120}
121
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +0530122void GpuDevice::read()
123{
124 tempSensor->update();
Harshit Aghera5e7decc2025-05-07 16:20:16 +0530125 if (tLimitSensor)
126 {
127 tLimitSensor->update();
128 }
Harshit Agherab10a67b2025-05-27 12:19:29 +0530129 dramTempSensor->update();
Harshit Aghera902c6492025-05-08 15:57:42 +0530130 powerSensor->update();
Harshit Aghera6b712322025-07-31 19:25:12 +0530131 peakPower->update();
Harshit Aghera775199d2025-05-27 14:20:24 +0530132 energySensor->update();
Harshit Agherabef4d412025-05-27 14:53:56 +0530133 voltageSensor->update();
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +0530134
135 waitTimer.expires_after(std::chrono::milliseconds(sensorPollMs));
136 waitTimer.async_wait([this](const boost::system::error_code& ec) {
137 if (ec)
138 {
139 return;
140 }
141 read();
142 });
Marc Olberdingd0125c92025-10-08 14:37:19 -0700143};