blob: e5128be7baccc0903306f19ee4387e5dbe2329ff [file] [log] [blame]
Harshit Aghera11b9c1a2025-04-29 17:34:25 +05301/*
2 * SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION &
3 * AFFILIATES. All rights reserved. SPDX-License-Identifier: Apache-2.0
4 */
5
6#pragma once
7
8#include "MctpRequester.hpp"
9#include "Thresholds.hpp"
10#include "UpdatableSensor.hpp"
11
12#include <boost/asio/io_context.hpp>
13#include <boost/asio/steady_timer.hpp>
14#include <boost/container/flat_map.hpp>
15#include <sdbusplus/asio/connection.hpp>
16#include <sdbusplus/asio/object_server.hpp>
17#include <sdbusplus/message.hpp>
18
19#include <chrono>
20#include <cstdint>
21#include <map>
22#include <memory>
23#include <string>
24#include <utility>
25#include <variant>
26#include <vector>
27
/**
 * @brief Common leading portion of sensor D-Bus object paths.
 * @details Declared `inline` so that every translation unit including this
 * header refers to the same object instead of a per-TU internal-linkage copy.
 */
inline constexpr const char* sensorPathPrefix = "/xyz/openbmc_project/sensors/";

/**
 * @brief Type string identifying this sensor family.
 * @details NOTE(review): presumably matched against the configuration "Type"
 * reported for NVIDIA MCTP VDM devices - confirm against the users of this
 * constant.
 */
inline constexpr const char* sensorType = "NvidiaMctpVdm";
30
/**
 * @brief Result shape of an object-tree lookup.
 * @details A list of (string, list of (string, list of strings)) entries.
 * processMctpEndpoints() receives a value of this type as its "object tree
 * results".
 */
using getSubTreeRet = std::vector<std::pair<
    std::string,
    std::vector<std::pair<std::string, std::vector<std::string>>>>>;

/**
 * @brief Configuration properties keyed by property name.
 * @details Each value holds exactly one of the listed alternatives; the order
 * of the alternatives is part of the type and must not be changed.
 */
using GpuSensorConfigMap = std::map<
    std::string,
    std::variant<std::string, bool, std::uint32_t, std::uint8_t, std::int64_t,
                 std::vector<std::uint8_t>>>;
37
38/**
39 * @struct GpuDevice
40 * @brief Represents a GPU device in the system
41 * @details Manages the lifecycle of a GPU device including discovery, sensor
42 * creation, communication, and monitoring. Handles MCTP protocol interactions
43 * with the physical GPU hardware.
44 */
45struct GpuDevice
46{
47 public:
48 /**
49 * @brief Constructor for GpuDevice
50 * @details Initializes a GPU device object with the provided parameters and
51 * starts the process of discovering available sensors on the
52 * device
53 *
54 * @param name Name of the GPU device for identification
55 * @param path D-Bus object path for this GPU device
56 * @param conn D-Bus connection for system communication
57 * @param io Boost ASIO I/O context for asynchronous operations
58 * @param mctpRequester MCTP protocol requester for GPU communication
59 * @param objectServer D-Bus object server for exposing interfaces
60 */
61 GpuDevice(const std::string& name, const std::string& path,
62 std::shared_ptr<sdbusplus::asio::connection>& conn,
63 boost::asio::io_context& io, mctp::MctpRequester& mctpRequester,
64 sdbusplus::asio::object_server& objectServer);
65
66 const std::string& getPath()
67 {
68 return path;
69 }
70
71 private:
72 /**
73 * @brief Add a sensor to this GPU device
74 * @details Associates a sensor with this GPU device and adds it to the
75 * internal list of sensors managed by this device
76 *
77 * @param name Name of the sensor to add
78 * @param sensor Shared pointer to the sensor object
79 */
80 void addSensor(const std::string& name,
81 const std::shared_ptr<Sensor>& sensor);
82
83 /**
84 * @brief Create sensors for this GPU device
85 * @details Discovers and creates all available sensor types on this GPU
86 */
87 void createSensors();
88
89 /**
90 * @brief Read the current temperature value from the GPU
91 */
92 void read();
93
94 /**
95 * @brief Discover available GPUs on the system
96 */
97 void discoverGpus();
98
99 /**
100 * @brief Process MCTP endpoints discovered on the system
101 *
102 * @param[in] ec Error code from the D-Bus method call
103 * @param[in] ret Object tree results containing MCTP endpoint information
104 */
105 void processMctpEndpoints(const boost::system::error_code& ec,
106 const getSubTreeRet& ret);
107
108 /**
109 * @brief Process configuration properties for MCTP endpoints
110 *
111 * @param[in] ec Error code from the D-Bus properties method call
112 * @param[in] configs Map of configuration properties for the endpoint
113 */
114 void processEndpointConfigs(const boost::system::error_code& ec,
115 const GpuSensorConfigMap& configs);
116
117 /**
118 * @brief Process a discovered GPU endpoint
119 * @param eid The endpoint ID of the discovered GPU
120 */
121 void processGpuEndpoint(uint8_t eid);
122
123 /**
124 * @brief MCTP endpoint ID
125 */
126 uint8_t eid{};
127
128 /**
129 * @brief How often to poll the sensor in milliseconds
130 */
131 std::chrono::milliseconds sensorPollMs;
132
133 /**
134 * @brief Timer for scheduling sensor reads
135 */
136 boost::asio::steady_timer waitTimer;
137
138 /**
139 * @brief Reference to the MCTP requester for communication
140 */
141 mctp::MctpRequester& mctpRequester;
142
143 /**
144 * @brief D-Bus connection
145 */
146 std::shared_ptr<sdbusplus::asio::connection> conn;
147
148 /**
149 * @brief D-Bus object server
150 */
151 sdbusplus::asio::object_server& objectServer;
152
153 /**
154 * @brief Collection of sensors associated with this GPU device
155 * @details Stores all sensor objects created for this GPU
156 */
157 std::vector<std::shared_ptr<GpuSensor>> sensors;
158
159 /**
160 * @brief Name of this GPU device
161 * @details Human-readable identifier for the GPU
162 */
163 std::string name;
164
165 /**
166 * @brief D-Bus object path for this GPU device
167 * @details Path where this GPU device is exposed in the D-Bus object
168 * hierarchy
169 */
170 std::string path;
171};
172
173/**
174 * @brief Create GPU temperature sensors
175 * @details Discovers and creates GPU devices and their associated sensors in
176 * the system. This function is called at startup and whenever configuration
177 * changes are detected.
178 *
179 * @param io Boost ASIO I/O context for scheduling asynchronous operations
180 * @param objectServer D-Bus object server for exposing sensor interfaces
181 * @param gpuDevice Map to store created GPU device objects, keyed by their
182 * paths
183 * @param dbusConnection D-Bus connection for system communication
184 * @param mctpRequester MCTP requester for GPU communication protocol
185 */
186void createSensors(
187 boost::asio::io_context& io, sdbusplus::asio::object_server& objectServer,
188 boost::container::flat_map<std::string, std::shared_ptr<GpuDevice>>&
189 gpuDevice,
190 std::shared_ptr<sdbusplus::asio::connection>& dbusConnection,
191 mctp::MctpRequester& mctpRequester);
192
193/**
194 * @brief Handle D-Bus interface removal events
195 * @param message D-Bus message containing interface removal information
196 * @param gpuDevice Map of GPU devices to check for removal
197 */
198void interfaceRemoved(
199 sdbusplus::message_t& message,
200 boost::container::flat_map<std::string, std::shared_ptr<GpuDevice>>&
201 gpuDevice);