nvidia-gpu: introduce notion of a device
Perform device discovery tasks only once per device to prepare for
introducing additional GPU sensors.
In the current implementation, sensor updates and device discovery via
MCTP are managed within a single class for simplicity. However, since a
GPU device typically includes multiple sensors, performing device
discovery for each individual sensor is inefficient. Instead, it would
be more effective to execute device discovery once per device.
Tested: Built an image for the gb200nvl-obmc machine with the following
patches cherry-picked. These patches are needed to enable the MCTP stack.
https://gerrit.openbmc.org/c/openbmc/openbmc/+/79422
```
$ curl -k -u 'root:0penBmc' https://10.137.203.137/redfish/v1/Chassis/NVIDIA_GB200_1/Sensors/temperature_NVIDIA_GB200_GPU_0_TEMP_0
{
"@odata.id": "/redfish/v1/Chassis/NVIDIA_GB200_1/Sensors/temperature_NVIDIA_GB200_GPU_0_TEMP_0",
"@odata.type": "#Sensor.v1_2_0.Sensor",
"Id": "temperature_NVIDIA_GB200_GPU_0_TEMP_0",
"Name": "NVIDIA GB200 GPU 0 TEMP 0",
"Reading": 37.6875,
"ReadingRangeMax": 127.0,
"ReadingRangeMin": -128.0,
"ReadingType": "Temperature",
"ReadingUnits": "Cel",
"Status": {
"Health": "OK",
"State": "Enabled"
}
}
```
Change-Id: Ie3dcd43caa031b4aaa61d8be3f5d71aefd53bc9a
Signed-off-by: Harshit Aghera <haghera@nvidia.com>
diff --git a/src/nvidia-gpu/NvidiaGpuSensor.hpp b/src/nvidia-gpu/NvidiaGpuSensor.hpp
index 158dc41..25fe069 100644
--- a/src/nvidia-gpu/NvidiaGpuSensor.hpp
+++ b/src/nvidia-gpu/NvidiaGpuSensor.hpp
@@ -8,102 +8,52 @@
#include "MctpRequester.hpp"
#include "Thresholds.hpp"
-#include "Utils.hpp"
#include "sensor.hpp"
#include <NvidiaGpuMctpVdm.hpp>
-#include <OcpMctpVdm.hpp>
-#include <boost/asio/io_context.hpp>
-#include <boost/asio/steady_timer.hpp>
-#include <boost/container/flat_map.hpp>
#include <sdbusplus/asio/connection.hpp>
#include <sdbusplus/asio/object_server.hpp>
-#include <sdbusplus/message.hpp>
#include <array>
-#include <chrono>
#include <cstdint>
#include <memory>
#include <string>
#include <vector>
-constexpr const char* sensorPathPrefix = "/xyz/openbmc_project/sensors/";
-constexpr const char* sensorType = "NvidiaMctpVdm";
-
-struct GpuTempSensor :
+struct NvidiaGpuTempSensor :
public Sensor,
- public std::enable_shared_from_this<GpuTempSensor>
+ public std::enable_shared_from_this<NvidiaGpuTempSensor>
{
public:
- GpuTempSensor(std::shared_ptr<sdbusplus::asio::connection>& conn,
- boost::asio::io_context& io,
- mctp::MctpRequester& mctpRequester, const std::string& name,
- const std::string& sensorConfiguration,
- sdbusplus::asio::object_server& objectServer,
- std::vector<thresholds::Threshold>&& thresholdData,
- std::chrono::milliseconds pollRate);
+ NvidiaGpuTempSensor(std::shared_ptr<sdbusplus::asio::connection>& conn,
+ mctp::MctpRequester& mctpRequester,
+ const std::string& name,
+ const std::string& sensorConfiguration, uint8_t eid,
+ sdbusplus::asio::object_server& objectServer,
+ std::vector<thresholds::Threshold>&& thresholdData);
- ~GpuTempSensor() override;
+ ~NvidiaGpuTempSensor() override;
void checkThresholds() override;
- private:
- void read();
-
void update();
- void discoverGpus();
-
+ private:
void processResponse(int sendRecvMsgResult);
- void processQueryDeviceIdResponse(uint8_t eid, int sendRecvMsgResult);
-
- void queryEndpoints(const boost::system::error_code& ec,
- const GetSubTreeType& ret);
-
- void processEndpoint(const boost::system::error_code& ec,
- const SensorBaseConfigMap& endpoint);
- void processGpuEndpoint(uint8_t eid);
-
uint8_t eid{};
uint8_t sensorId;
- std::chrono::milliseconds sensorPollMs;
-
- boost::asio::steady_timer waitTimer;
+ std::shared_ptr<sdbusplus::asio::connection> conn;
mctp::MctpRequester& mctpRequester;
- std::shared_ptr<sdbusplus::asio::connection> conn;
-
sdbusplus::asio::object_server& objectServer;
- std::array<uint8_t, sizeof(ocp::accelerator_management::Message) +
- sizeof(gpu::GetTemperatureReadingRequest)>
+ std::array<uint8_t, sizeof(gpu::GetTemperatureReadingRequest)>
getTemperatureReadingRequest{};
- std::array<uint8_t, sizeof(ocp::accelerator_management::Message) +
- sizeof(gpu::GetTemperatureReadingResponse)>
+ std::array<uint8_t, sizeof(gpu::GetTemperatureReadingResponse)>
getTemperatureReadingResponse{};
-
- std::array<uint8_t, sizeof(ocp::accelerator_management::Message) +
- sizeof(gpu::QueryDeviceIdentificationRequest)>
- queryDeviceIdentificationRequest{};
-
- std::array<uint8_t, sizeof(ocp::accelerator_management::Message) +
- sizeof(gpu::QueryDeviceIdentificationResponse)>
- queryDeviceIdentificationResponse{};
};
-
-void createSensors(
- boost::asio::io_context& io, sdbusplus::asio::object_server& objectServer,
- boost::container::flat_map<std::string, std::shared_ptr<GpuTempSensor>>&
- sensors,
- std::shared_ptr<sdbusplus::asio::connection>& dbusConnection,
- mctp::MctpRequester& mctpRequester);
-
-void interfaceRemoved(
- sdbusplus::message_t& message,
- boost::container::flat_map<std::string, std::shared_ptr<GpuTempSensor>>&
- sensors);