blob: 7c70d557484fc9551f2a658144b2ab086d37f3d2 [file] [log] [blame]
/*
 * SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION &
 * AFFILIATES. All rights reserved. SPDX-License-Identifier: Apache-2.0
 */
5
6#pragma once
7
Harshit Agheraa3f24f42025-04-21 20:04:56 +05308#include "MctpRequester.hpp"
Harshit Agheraacd375a2025-04-21 19:50:10 +05309#include "Thresholds.hpp"
10#include "sensor.hpp"
11
12#include <boost/asio/io_context.hpp>
13#include <boost/asio/steady_timer.hpp>
14#include <boost/container/flat_map.hpp>
15#include <sdbusplus/asio/connection.hpp>
16#include <sdbusplus/asio/object_server.hpp>
17#include <sdbusplus/message.hpp>
18
Harshit Agheraa3f24f42025-04-21 20:04:56 +053019#include <chrono>
Harshit Agheraacd375a2025-04-21 19:50:10 +053020#include <cstdint>
21#include <map>
22#include <memory>
23#include <string>
24#include <utility>
25#include <variant>
26#include <vector>
27
/**
 * @brief Leading part of a sensor object path
 * @details The value ends with '/', so a further path component can be
 *          appended to it directly.
 */
constexpr const char* sensorPathPrefix = "/xyz/openbmc_project/sensors/";

/**
 * @brief Type string associated with the sensors declared in this header
 * @details NOTE(review): presumably the configuration type these sensors are
 *          matched against -- confirm in the implementation file.
 */
constexpr const char* sensorType = "NvidiaMctpVdm";
30
/**
 * @brief List of (string, list of (string, list of strings)) entries
 * @details NOTE(review): presumably the reply shape of an object-mapper
 *          subtree query (object path -> service -> interfaces) -- confirm
 *          against the caller.
 */
using getSubTreeRet = std::vector<
    std::pair<std::string,
              std::vector<std::pair<std::string, std::vector<std::string>>>>>;

/**
 * @brief Map from a property name to its value
 * @details The value is a variant that can hold a string, a bool, a 32-bit or
 *          8-bit unsigned integer, a 64-bit signed integer, or a byte vector.
 */
using GpuSensorConfigMap =
    std::map<std::string, std::variant<std::string, bool, uint32_t, uint8_t,
                                       int64_t, std::vector<uint8_t>>>;
37
/**
 * @struct DeviceInfo
 * @brief Contains information about a device
 * @details Plain aggregate. Both members are value-initialized, so a
 *          default-constructed instance holds zeros rather than
 *          indeterminate values; brace initialization such as
 *          DeviceInfo{type, instance} keeps working.
 */
struct DeviceInfo
{
    /**
     * @brief Device type code
     * @details NOTE(review): the set of valid values is not defined in this
     *          header -- confirm against the protocol definitions.
     */
    uint8_t deviceType{};

    /**
     * @brief Instance identifier of the device
     */
    uint8_t instanceId{};
};
47
48/**
49 * @struct GpuTempSensor
50 * @brief Implements a GPU temperature sensor that monitors temperature values
51 * @details Inherits from Sensor base class and enables shared pointer
52 * management via std::enable_shared_from_this
53 */
54struct GpuTempSensor :
55 public Sensor,
56 public std::enable_shared_from_this<GpuTempSensor>
57{
58 public:
59 /**
60 * @brief Constructor for GpuTempSensor
61 * @param conn D-Bus connection
62 * @param io Boost ASIO I/O context for asynchronous operations
63 * @param mctpRequester MCTP protocol requester for GPU communication
64 * @param name Name of the sensor
65 * @param sensorConfiguration Configuration string for the sensor
66 * @param objectServer D-Bus object server
67 * @param thresholdData Vector of threshold configurations
68 * @param pollRate How often to poll for new readings
69 * @param deviceInfo Information about the GPU device
70 * @param verbose Whether to enable verbose logging
71 */
72 GpuTempSensor(std::shared_ptr<sdbusplus::asio::connection>& conn,
Harshit Agheraa3f24f42025-04-21 20:04:56 +053073 boost::asio::io_context& io,
74 mctp::MctpRequester& mctpRequester, const std::string& name,
Harshit Agheraacd375a2025-04-21 19:50:10 +053075 const std::string& sensorConfiguration,
76 sdbusplus::asio::object_server& objectServer,
Harshit Agheraa3f24f42025-04-21 20:04:56 +053077 std::vector<thresholds::Threshold>&& thresholdData,
78 std::chrono::milliseconds pollRate);
Harshit Agheraacd375a2025-04-21 19:50:10 +053079
80 /**
81 * @brief Destructor
82 */
83 ~GpuTempSensor() override;
84
85 /**
86 * @brief Check if any thresholds have been crossed
87 * @details Overrides the base class method to implement GPU-specific
88 * threshold checking
89 */
90 void checkThresholds() override;
91
92 private:
93 /**
Harshit Agheraa3f24f42025-04-21 20:04:56 +053094 * @brief Read the current temperature value from the GPU
95 */
96 void read();
97
98 /**
Harshit Agheraacd375a2025-04-21 19:50:10 +053099 * @brief Initialize the sensor
100 */
101 void init();
102
103 /**
Harshit Agheraa3f24f42025-04-21 20:04:56 +0530104 * @brief Update the sensor reading
105 */
106 void update();
107
108 /**
Harshit Agheraacd375a2025-04-21 19:50:10 +0530109 * @brief Discover available GPUs on the system
110 */
111 void discoverGpus();
112
113 /**
114 * @brief Process MCTP endpoints discovered on the system
115 *
116 * @param[in] ec Error code from the D-Bus method call
117 * @param[in] ret Object tree results containing MCTP endpoint information
118 */
119 void processMctpEndpoints(const boost::system::error_code& ec,
120 const getSubTreeRet& ret);
121
122 /**
123 * @brief Process configuration properties for MCTP endpoints
124 *
125 * @param[in] ec Error code from the D-Bus properties method call
126 * @param[in] configs Map of configuration properties for the endpoint
127 */
128 void processEndpointConfigs(const boost::system::error_code& ec,
129 const GpuSensorConfigMap& configs);
Harshit Agheraa3f24f42025-04-21 20:04:56 +0530130 /**
131 * @brief Process a discovered GPU endpoint
132 * @param eid The endpoint ID of the discovered GPU
133 */
134 void processGpuEndpoint(uint8_t eid);
135
136 /**
137 * @brief MCTP endpoint ID
138 */
139 uint8_t eid{};
140
141 /**
142 * @brief The sensor ID
143 */
144 uint8_t sensorId;
145
146 /**
147 * @brief How often to poll the sensor in milliseconds
148 */
149 std::chrono::milliseconds sensorPollMs;
Harshit Agheraacd375a2025-04-21 19:50:10 +0530150
151 /**
152 * @brief Timer for scheduling sensor reads
153 */
154 boost::asio::steady_timer waitTimer;
155
156 /**
Harshit Agheraa3f24f42025-04-21 20:04:56 +0530157 * @brief Reference to the MCTP requester for communication
158 */
159 mctp::MctpRequester& mctpRequester;
160
161 /**
Harshit Agheraacd375a2025-04-21 19:50:10 +0530162 * @brief D-Bus connection
163 */
164 std::shared_ptr<sdbusplus::asio::connection> conn;
165
166 /**
167 * @brief D-Bus object server
168 */
169 sdbusplus::asio::object_server& objectServer;
170};
171
172/**
173 * @brief Create GPU temperature sensors
174 * @param io Boost ASIO I/O context
175 * @param objectServer D-Bus object server
176 * @param sensors Map to store created sensors
177 * @param dbusConnection D-Bus connection
Harshit Agheraa3f24f42025-04-21 20:04:56 +0530178 * @param mctpRequester MCTP requester for GPU communication
Harshit Agheraacd375a2025-04-21 19:50:10 +0530179 */
180void createSensors(
181 boost::asio::io_context& io, sdbusplus::asio::object_server& objectServer,
182 boost::container::flat_map<std::string, std::shared_ptr<GpuTempSensor>>&
183 sensors,
Harshit Agheraa3f24f42025-04-21 20:04:56 +0530184 std::shared_ptr<sdbusplus::asio::connection>& dbusConnection,
185 mctp::MctpRequester& mctpRequester);
Harshit Agheraacd375a2025-04-21 19:50:10 +0530186
187/**
188 * @brief Handle D-Bus interface removal events
189 * @param message D-Bus message containing interface removal information
190 * @param sensors Map of GPU temperature sensors to check for removal
191 */
192void interfaceRemoved(
193 sdbusplus::message_t& message,
194 boost::container::flat_map<std::string, std::shared_ptr<GpuTempSensor>>&
195 sensors);