gpu: add support for per EID request queuing
The NVIDIA extension of the OCP MCTP VDM protocol specifies that there
must be at most one outstanding request message to a GPU device
implementing the protocol. This introduces a requirement for per-EID
request queuing, which this patch implements.
This patch renames MctpRequester to Requester and introduces a new
QueuingRequester that wraps the Requester and adds per-EID queuing.
Each call to `sendRecvMsg` now enqueues the request instead of sending
it immediately. If no request is in flight for that EID, the requester
sends it right away; otherwise it waits for the in-flight request to
complete before sending the next queued request. This serializes
requests per EID and guarantees that only one request is "in flight" at
a time. To keep client changes minimal, QueuingRequester is type-aliased
to MctpRequester.
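As a rough illustration of the queuing scheme (not the code in this
patch), the per-EID dispatch logic looks like the sketch below. The
names PerEidQueue, submit, complete, and dispatch are hypothetical and
only stand in for QueuingRequester::sendRecvMsg, handleResult, and
processQueue:
'''
// Illustrative sketch of per-EID request serialization.
#include <cstdint>
#include <deque>
#include <functional>
#include <unordered_map>
#include <utility>

class PerEidQueue
{
  public:
    // Enqueue a request for an EID; dispatch immediately only when the
    // queue for that EID was empty, i.e. nothing is in flight.
    void submit(uint8_t eid, std::function<void(int)> callback)
    {
        auto& queue = queues[eid];
        queue.push_back(std::move(callback));
        if (queue.size() == 1)
        {
            dispatch(eid);
        }
    }

    // Completion handler for the in-flight request of an EID: invoke
    // its callback, drop it, then send the next queued request if any.
    void complete(uint8_t eid, int result)
    {
        auto& queue = queues[eid];
        queue.front()(result);
        queue.pop_front();
        if (!queue.empty())
        {
            dispatch(eid);
        }
    }

  private:
    void dispatch(uint8_t eid)
    {
        // Stand-in for Requester::sendRecvMsg(); the real code also
        // carries the request/response buffers and arranges for
        // complete() to run from the MCTP socket completion handler.
        (void)eid;
    }

    std::unordered_map<uint8_t, std::deque<std::function<void(int)>>> queues;
};
'''
In the actual change the queued element also carries the request and
response spans (see RequestContext in the diff below), and the
underlying Requester keeps one completion callback per EID.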
Tested:
Built an image for the gb200nvl-obmc machine with the following change
cherry-picked (needed to enable the MCTP stack):
https://gerrit.openbmc.org/c/openbmc/openbmc/+/79422
Cherry-pick the following changes (in order), which enable multiple GPU sensors:
'''
https://gerrit.openbmc.org/c/openbmc/dbus-sensors/+/79970
https://gerrit.openbmc.org/c/openbmc/dbus-sensors/+/80031
https://gerrit.openbmc.org/c/openbmc/dbus-sensors/+/80078
https://gerrit.openbmc.org/c/openbmc/dbus-sensors/+/80099
https://gerrit.openbmc.org/c/openbmc/dbus-sensors/+/80566
https://gerrit.openbmc.org/c/openbmc/dbus-sensors/+/80567
'''
Check that all sensors are available via Redfish:
'''
~ % curl -s -k -u 'root:0penBmc' https://10.137.203.137/redfish/v1/Chassis/NVIDIA_GB200_1/Sensors/
{
"@odata.id": "/redfish/v1/Chassis/NVIDIA_GB200_1/Sensors",
"@odata.type": "#SensorCollection.SensorCollection",
"Description": "Collection of Sensors for this Chassis",
"Members": [
{
"@odata.id": "/redfish/v1/Chassis/NVIDIA_GB200_1/Sensors/energy_NVIDIA_GB200_GPU_0_Energy_0"
},
{
"@odata.id": "/redfish/v1/Chassis/NVIDIA_GB200_1/Sensors/power_NVIDIA_GB200_GPU_0_Power_0"
},
{
"@odata.id": "/redfish/v1/Chassis/NVIDIA_GB200_1/Sensors/temperature_NVIDIA_GB200_GPU_0_TEMP_0"
},
{
"@odata.id": "/redfish/v1/Chassis/NVIDIA_GB200_1/Sensors/temperature_NVIDIA_GB200_GPU_0_TEMP_1"
},
{
"@odata.id": "/redfish/v1/Chassis/NVIDIA_GB200_1/Sensors/voltage_NVIDIA_GB200_GPU_0_Voltage_0"
}
],
"Members@odata.count": 5,
"Name": "Sensors"
}
'''
Check individual sensor updates:
'''
curl -s -k -u 'root:0penBmc' https://10.137.203.245/redfish/v1/Chassis/NVIDIA_GB200_1/Sensors/temperature_NVIDIA_GB200_GPU_0_TEMP_0
{
"@odata.id": "/redfish/v1/Chassis/NVIDIA_GB200_1/Sensors/temperature_NVIDIA_GB200_GPU_0_TEMP_0",
"@odata.type": "#Sensor.v1_2_0.Sensor",
"Id": "temperature_NVIDIA_GB200_GPU_0_TEMP_0",
"Name": "NVIDIA GB200 GPU 0 TEMP 0",
"Reading": 27.71875,
"ReadingRangeMax": 127.0,
"ReadingRangeMin": -128.0,
"ReadingType": "Temperature",
"ReadingUnits": "Cel",
"Status": {
"Health": "OK",
"State": "Enabled"
}
}
curl -s -k -u 'root:0penBmc' https://10.137.203.245/redfish/v1/Chassis/NVIDIA_GB200_1/Sensors/temperature_NVIDIA_GB200_GPU_0_TEMP_1
{
"@odata.id": "/redfish/v1/Chassis/NVIDIA_GB200_1/Sensors/temperature_NVIDIA_GB200_GPU_0_TEMP_1",
"@odata.type": "#Sensor.v1_2_0.Sensor",
"Id": "temperature_NVIDIA_GB200_GPU_0_TEMP_1",
"Name": "NVIDIA GB200 GPU 0 TEMP 1",
"Reading": 57.28125,
"ReadingRangeMax": 127.0,
"ReadingRangeMin": -128.0,
"ReadingType": "Temperature",
"ReadingUnits": "Cel",
"Status": {
"Health": "OK",
"State": "Enabled"
}
}
curl -s -k -u 'root:0penBmc' https://10.137.203.245/redfish/v1/Chassis/NVIDIA_GB200_1/Sensors/power_NVIDIA_GB200_GPU_0_Power_0
{
"@odata.id": "/redfish/v1/Chassis/NVIDIA_GB200_1/Sensors/power_NVIDIA_GB200_GPU_0_Power_0",
"@odata.type": "#Sensor.v1_2_0.Sensor",
"Id": "power_NVIDIA_GB200_GPU_0_Power_0",
"Name": "NVIDIA GB200 GPU 0 Power 0",
"Reading": 27.468,
"ReadingRangeMax": 4294967.295,
"ReadingRangeMin": 0.0,
"ReadingType": "Power",
"ReadingUnits": "W",
"Status": {
"Health": "OK",
"State": "Enabled"
}
}
curl -s -k -u 'root:0penBmc' https://10.137.203.245/redfish/v1/Chassis/NVIDIA_GB200_1/Sensors/energy_NVIDIA_GB200_GPU_0_Energy_0
{
"@odata.id": "/redfish/v1/Chassis/NVIDIA_GB200_1/Sensors/energy_NVIDIA_GB200_GPU_0_Energy_0",
"@odata.type": "#Sensor.v1_2_0.Sensor",
"Id": "energy_NVIDIA_GB200_GPU_0_Energy_0",
"Name": "NVIDIA GB200 GPU 0 Energy 0",
"Reading": 45058.545,
"ReadingRangeMax": 1.8446744073709552e+16,
"ReadingRangeMin": 0.0,
"ReadingType": "EnergyJoules",
"ReadingUnits": "J",
"Status": {
"Health": "OK",
"State": "Enabled"
}
}
curl -s -k -u 'root:0penBmc' https://10.137.203.245/redfish/v1/Chassis/NVIDIA_GB200_1/Sensors/voltage_NVIDIA_GB200_GPU_0_Voltage_0
{
"@odata.id": "/redfish/v1/Chassis/NVIDIA_GB200_1/Sensors/voltage_NVIDIA_GB200_GPU_0_Voltage_0",
"@odata.type": "#Sensor.v1_2_0.Sensor",
"Id": "voltage_NVIDIA_GB200_GPU_0_Voltage_0",
"Name": "NVIDIA GB200 GPU 0 Voltage 0",
"Reading": 0.735,
"ReadingRangeMax": 4294.967295,
"ReadingRangeMin": 0.0,
"ReadingType": "Voltage",
"ReadingUnits": "V",
"Status": {
"Health": "OK",
"State": "Enabled"
}
}
'''
Change-Id: Ic3b892ef2c76c4c703aa55f5b2a66c22a5d71bdf
Signed-off-by: Aditya Kurdunkar <akurdunkar@nvidia.com>
diff --git a/src/nvidia-gpu/MctpRequester.hpp b/src/nvidia-gpu/MctpRequester.hpp
index 289e800..9d05ebd 100644
--- a/src/nvidia-gpu/MctpRequester.hpp
+++ b/src/nvidia-gpu/MctpRequester.hpp
@@ -10,35 +10,39 @@
#include <boost/asio/generic/datagram_protocol.hpp>
#include <boost/asio/io_context.hpp>
#include <boost/asio/steady_timer.hpp>
+#include <boost/container/devector.hpp>
#include <cstddef>
#include <cstdint>
#include <functional>
+#include <memory>
#include <span>
+#include <unordered_map>
+#include <utility>
namespace mctp
{
-class MctpRequester
+class Requester
{
public:
- MctpRequester() = delete;
+ Requester() = delete;
- MctpRequester(const MctpRequester&) = delete;
+ Requester(const Requester&) = delete;
- MctpRequester(MctpRequester&&) = delete;
+ Requester(Requester&&) = delete;
- MctpRequester& operator=(const MctpRequester&) = delete;
+ Requester& operator=(const Requester&) = delete;
- MctpRequester& operator=(MctpRequester&&) = delete;
+ Requester& operator=(Requester&&) = delete;
- explicit MctpRequester(boost::asio::io_context& ctx);
+ explicit Requester(boost::asio::io_context& ctx);
void sendRecvMsg(uint8_t eid, std::span<const uint8_t> reqMsg,
std::span<uint8_t> respMsg,
std::move_only_function<void(int)> callback);
private:
- void processRecvMsg(uint8_t eid, std::span<const uint8_t> reqMsg,
+ void processRecvMsg(std::span<const uint8_t> reqMsg,
std::span<uint8_t> respMsg,
const boost::system::error_code& ec, size_t length);
@@ -57,8 +61,56 @@
boost::asio::steady_timer expiryTimer;
- std::move_only_function<void(int)> completionCallback;
+ std::unordered_map<uint8_t, std::move_only_function<void(int)>>
+ completionCallbacks;
static constexpr uint8_t msgType = ocp::accelerator_management::messageType;
};
+
+class QueuingRequester
+{
+ public:
+ QueuingRequester() = delete;
+ QueuingRequester(const QueuingRequester&) = delete;
+ QueuingRequester(QueuingRequester&&) = delete;
+ QueuingRequester& operator=(const QueuingRequester&) = delete;
+ QueuingRequester& operator=(QueuingRequester&&) = delete;
+
+ explicit QueuingRequester(boost::asio::io_context& ctx) : requester(ctx) {}
+
+ void sendRecvMsg(uint8_t eid, std::span<const uint8_t> reqMsg,
+ std::span<uint8_t> respMsg,
+ std::move_only_function<void(int)> callback);
+
+ private:
+ struct RequestContext
+ {
+ std::span<const uint8_t> reqMsg;
+ std::span<uint8_t> respMsg;
+ std::move_only_function<void(int)> callback;
+
+ RequestContext(const RequestContext&) = delete;
+ RequestContext& operator=(const RequestContext&) = delete;
+
+ RequestContext(RequestContext&&) = default;
+ RequestContext& operator=(RequestContext&&) = default;
+ ~RequestContext() = default;
+
+ explicit RequestContext(std::span<const uint8_t> req,
+ std::span<uint8_t> resp,
+ std::move_only_function<void(int)>&& cb) :
+ reqMsg(req), respMsg(resp), callback(std::move(cb))
+ {}
+ };
+
+ void handleResult(uint8_t eid, int result);
+ void processQueue(uint8_t eid);
+
+ Requester requester;
+ std::unordered_map<
+ uint8_t, boost::container::devector<std::unique_ptr<RequestContext>>>
+ requestContextQueues;
+};
+
+using MctpRequester = QueuingRequester;
} // namespace mctp