nvidia-gpu: introduce notion of a device
Perform device discovery tasks only once per device to prepare for
introducing additional GPU sensors.
In the current implementation, sensor updates and device discovery via
MCTP are managed within a single class for simplicity. However, since a
GPU device typically includes multiple sensors, performing device
discovery for each individual sensor is inefficient. Instead, it would
be more effective to execute device discovery once per device.
Tested: Built an image for the gb200nvl-obmc machine with the following
patches cherry-picked. These patches are needed to enable the MCTP stack.
https://gerrit.openbmc.org/c/openbmc/openbmc/+/79422
```
$ curl -k -u 'root:0penBmc' https://10.137.203.137/redfish/v1/Chassis/NVIDIA_GB200_1/Sensors/temperature_NVIDIA_GB200_GPU_0_TEMP_0
{
"@odata.id": "/redfish/v1/Chassis/NVIDIA_GB200_1/Sensors/temperature_NVIDIA_GB200_GPU_0_TEMP_0",
"@odata.type": "#Sensor.v1_2_0.Sensor",
"Id": "temperature_NVIDIA_GB200_GPU_0_TEMP_0",
"Name": "NVIDIA GB200 GPU 0 TEMP 0",
"Reading": 37.6875,
"ReadingRangeMax": 127.0,
"ReadingRangeMin": -128.0,
"ReadingType": "Temperature",
"ReadingUnits": "Cel",
"Status": {
"Health": "OK",
"State": "Enabled"
}
}
```
Change-Id: Ie3dcd43caa031b4aaa61d8be3f5d71aefd53bc9a
Signed-off-by: Harshit Aghera <haghera@nvidia.com>
diff --git a/src/nvidia-gpu/NvidiaGpuSensorMain.cpp b/src/nvidia-gpu/NvidiaGpuSensorMain.cpp
index cf764d3..6ccbb05 100644
--- a/src/nvidia-gpu/NvidiaGpuSensorMain.cpp
+++ b/src/nvidia-gpu/NvidiaGpuSensorMain.cpp
@@ -5,15 +5,14 @@
*/
#include "MctpRequester.hpp"
-#include "NvidiaGpuSensor.hpp"
#include "Utils.hpp"
+#include <NvidiaDeviceDiscovery.hpp>
#include <boost/asio/error.hpp>
#include <boost/asio/io_context.hpp>
#include <boost/asio/post.hpp>
#include <boost/asio/steady_timer.hpp>
#include <boost/container/flat_map.hpp>
-#include <phosphor-logging/lg2.hpp>
#include <sdbusplus/asio/connection.hpp>
#include <sdbusplus/asio/object_server.hpp>
#include <sdbusplus/bus.hpp>
@@ -27,7 +26,7 @@
#include <string>
#include <vector>
-boost::container::flat_map<std::string, std::shared_ptr<GpuTempSensor>> sensors;
+boost::container::flat_map<std::string, std::shared_ptr<GpuDevice>> gpuDevices;
void configTimerExpiryCallback(
boost::asio::io_context& io, sdbusplus::asio::object_server& objectServer,
@@ -38,11 +37,7 @@
{
return; // we're being canceled
}
- createSensors(io, objectServer, sensors, dbusConnection, mctpRequester);
- if (sensors.empty())
- {
- lg2::info("Configuration not detected");
- }
+ createSensors(io, objectServer, gpuDevices, dbusConnection, mctpRequester);
}
int main()
@@ -56,7 +51,7 @@
mctp::MctpRequester mctpRequester(io);
boost::asio::post(io, [&]() {
- createSensors(io, objectServer, sensors, systemBus, mctpRequester);
+ createSensors(io, objectServer, gpuDevices, systemBus, mctpRequester);
});
boost::asio::steady_timer configTimer(io);
@@ -73,7 +68,7 @@
std::vector<std::unique_ptr<sdbusplus::bus::match_t>> matches =
setupPropertiesChangedMatches(
- *systemBus, std::to_array<const char*>({sensorType}), eventHandler);
+ *systemBus, std::to_array<const char*>({deviceType}), eventHandler);
// Watch for entity-manager to remove configuration interfaces
// so the corresponding sensors can be removed.
@@ -81,7 +76,7 @@
static_cast<sdbusplus::bus_t&>(*systemBus),
sdbusplus::bus::match::rules::interfacesRemovedAtPath(
std::string(inventoryPath)),
- [](sdbusplus::message_t& msg) { interfaceRemoved(msg, sensors); });
+ [](sdbusplus::message_t& msg) { interfaceRemoved(msg, gpuDevices); });
io.run();
return 0;