gpu: introduce the notion of a device
The concept of a device is being introduced for the GPU, which enables a
more efficient and scalable approach to managing multiple sensors for a
single endpoint.
Refactoring of Entity-Manager Configuration and Endpoint Discovery:
To support multiple sensors for an endpoint, the following changes are
being made:
- Entity-Manager Configuration Discovery: This task will be moved out of
  the GPU Temperature Sensor implementation.
- Endpoint Discovery: Similarly, endpoint discovery will also be performed
  outside of the GPU Temperature Sensor implementation.
- Frequency of Task Execution: Both entity-manager configuration discovery
  and endpoint discovery will be performed only once per endpoint, rather
  than repeatedly for each sensor. This optimization will improve
  performance and reduce redundancy.
Tested by reading the GPU temperature sensor over Redfish:
```
$ curl -k -u 'root:0penBmc' https://10.137.203.137/redfish/v1/Chassis/NVIDIA_GB200_1/Sensors/temperature_NVIDIA_GB200_GPU_TEMP_0
{
"@odata.id": "/redfish/v1/Chassis/NVIDIA_GB200_1/Sensors/temperature_NVIDIA_GB200_GPU_TEMP_0",
"@odata.type": "#Sensor.v1_2_0.Sensor",
"Id": "temperature_NVIDIA_GB200_GPU_TEMP_0",
"Name": "NVIDIA GB200 GPU TEMP 0",
"Reading": 35.96875,
"ReadingRangeMax": 127.0,
"ReadingRangeMin": -128.0,
"ReadingType": "Temperature",
"ReadingUnits": "Cel",
"Status": {
"Health": "OK",
"State": "Enabled"
}
}
```
Change-Id: Ie3dcd43caa031b4aaa61d8be3f5d71aefd53bc9a
Signed-off-by: Harshit Aghera <haghera@nvidia.com>
diff --git a/src/gpu/GpuSensorMain.cpp b/src/gpu/GpuSensorMain.cpp
index 254a11a..9345b84 100644
--- a/src/gpu/GpuSensorMain.cpp
+++ b/src/gpu/GpuSensorMain.cpp
@@ -3,17 +3,16 @@
* AFFILIATES. All rights reserved. SPDX-License-Identifier: Apache-2.0
*/
-#include "GpuSensor.hpp"
#include "MctpRequester.hpp"
#include "OcpMctpVdm.hpp"
#include "Utils.hpp"
+#include <GpuDevice.hpp>
#include <boost/asio/error.hpp>
#include <boost/asio/io_context.hpp>
#include <boost/asio/post.hpp>
#include <boost/asio/steady_timer.hpp>
#include <boost/container/flat_map.hpp>
-#include <phosphor-logging/lg2.hpp>
#include <sdbusplus/asio/connection.hpp>
#include <sdbusplus/asio/object_server.hpp>
#include <sdbusplus/bus.hpp>
@@ -27,15 +26,24 @@
#include <string>
#include <vector>
-boost::container::flat_map<std::string, std::shared_ptr<GpuTempSensor>> sensors;
+/**
+ * @brief Global map of GPU devices, keyed by a string (presumably a path)
+ * @details Handed to createSensors() at startup and on config-timer expiry,
+ * and to interfaceRemoved() from the InterfacesRemoved signal handler
+ */
+boost::container::flat_map<std::string, std::shared_ptr<GpuDevice>> gpuDevice;
/**
- * @brief config timer expiry callback
- * @param io Boost ASIO I/O context
- * @param objectServer D-Bus object server
- * @param dbusConnection D-Bus connection
- * @param mctpRequester MCTP requester for GPU communication
- * @param ec Boost ASIO error code
+ * @brief Callback function executed when configuration timer expires
+ * @details Triggers sensor creation or reconfiguration process when
+ * configuration changes are detected in the system. If the timer was canceled
+ * due to application shutdown or other reasons, the function returns early.
+ *
+ * @param io Boost ASIO I/O context for scheduling asynchronous operations
+ * @param objectServer D-Bus object server for exposing sensor interfaces
+ * @param dbusConnection D-Bus connection for system communication
+ * @param mctpRequester MCTP requester for GPU communication protocol
+ * @param ec Boost ASIO error code indicating success or failure reason
*/
void configTimerExpiryCallback(
boost::asio::io_context& io, sdbusplus::asio::object_server& objectServer,
@@ -46,11 +54,7 @@
{
return; // we're being canceled
}
- createSensors(io, objectServer, sensors, dbusConnection, mctpRequester);
- if (sensors.empty())
- {
- lg2::info("Configuration not detected");
- }
+ createSensors(io, objectServer, gpuDevice, dbusConnection, mctpRequester);
}
int main()
@@ -65,7 +69,7 @@
ocp::accelerator_management::messageType);
boost::asio::post(io, [&]() {
- createSensors(io, objectServer, sensors, systemBus, mctpRequester);
+ createSensors(io, objectServer, gpuDevice, systemBus, mctpRequester);
});
boost::asio::steady_timer configTimer(io);
@@ -90,7 +94,7 @@
static_cast<sdbusplus::bus_t&>(*systemBus),
"type='signal',member='InterfacesRemoved',arg0path='" +
std::string(inventoryPath) + "/'",
- [](sdbusplus::message_t& msg) { interfaceRemoved(msg, sensors); });
+ [](sdbusplus::message_t& msg) { interfaceRemoved(msg, gpuDevice); });
io.run();
return 0;