gpu: add support for per-EID request queuing

The NVIDIA extension of the OCP MCTP VDM protocol specifies that there
should be only one outstanding request message to a GPU device
implementing the VDM protocol. This introduces a requirement for
per-EID request queuing, which this patch implements.

This patch renames MctpRequester to Requester and introduces a new
QueuingRequester that composes a Requester and adds per-EID queuing.
Each call to `sendRecvMsg` now enqueues the request instead of sending
it immediately. If there is no request in progress for the EID, the
requester sends the new request right away; otherwise it waits for the
in-progress request to complete before sending the next queued request.
This serializes requests and guarantees that only one request is "in
flight" per EID at a time. To keep client changes minimal,
QueuingRequester is type-aliased to MctpRequester.
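
The sketch below is a standalone illustration of the queuing pattern only,
not code from this patch: MockRequester, QueuingSketch, and the main()
driver are hypothetical stand-ins for Requester, QueuingRequester, and the
sensor clients, and it uses std::function instead of
std::move_only_function for brevity.

'''
// Illustrative sketch of per-EID request queuing over an async requester.
#include <boost/asio.hpp>

#include <cstdint>
#include <functional>
#include <iostream>
#include <queue>
#include <unordered_map>

struct MockRequester
{
    explicit MockRequester(boost::asio::io_context& ctx) : ctx(ctx) {}

    // Pretend to send a request and complete asynchronously with result 0.
    void sendRecvMsg(uint8_t eid, std::function<void(int)> cb)
    {
        boost::asio::post(ctx, [eid, cb = std::move(cb)] {
            std::cout << "completed request to EID " << int(eid) << '\n';
            cb(0);
        });
    }

    boost::asio::io_context& ctx;
};

struct QueuingSketch
{
    explicit QueuingSketch(boost::asio::io_context& ctx) : requester(ctx) {}

    void sendRecvMsg(uint8_t eid, std::function<void(int)> cb)
    {
        // Always enqueue; processQueue decides whether to send now.
        queues[eid].push(std::move(cb));
        processQueue(eid);
    }

    void processQueue(uint8_t eid)
    {
        auto& queue = queues[eid];
        if (queue.empty() || active.contains(eid))
        {
            return; // nothing queued, or a request is already in flight
        }

        active[eid] = std::move(queue.front());
        queue.pop();

        requester.sendRecvMsg(eid, [this, eid](int result) {
            active[eid](result); // invoke the original callback
            active.erase(eid);
            processQueue(eid);   // kick off the next queued request
        });
    }

    MockRequester requester;
    std::unordered_map<uint8_t, std::queue<std::function<void(int)>>> queues;
    std::unordered_map<uint8_t, std::function<void(int)>> active;
};

int main()
{
    boost::asio::io_context ctx;
    QueuingSketch q(ctx);

    // Two requests to the same EID: the second stays queued until the
    // first completes, so only one request is ever outstanding per EID.
    q.sendRecvMsg(10, [](int rc) { std::cout << "first done rc=" << rc << '\n'; });
    q.sendRecvMsg(10, [](int rc) { std::cout << "second done rc=" << rc << '\n'; });

    ctx.run();
}
'''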

Tested:

Build an image for the gb200nvl-obmc machine with the following patch
cherry-picked. This patch is needed to enable the MCTP stack.

https://gerrit.openbmc.org/c/openbmc/openbmc/+/79422

Cherry-pick the following changes (in order) to enable multiple GPU sensors:

'''
https://gerrit.openbmc.org/c/openbmc/dbus-sensors/+/79970
https://gerrit.openbmc.org/c/openbmc/dbus-sensors/+/80031
https://gerrit.openbmc.org/c/openbmc/dbus-sensors/+/80078
https://gerrit.openbmc.org/c/openbmc/dbus-sensors/+/80099
https://gerrit.openbmc.org/c/openbmc/dbus-sensors/+/80566
https://gerrit.openbmc.org/c/openbmc/dbus-sensors/+/80567
'''

Check that all sensors are available via Redfish.

'''
~ % curl -s -k -u 'root:0penBmc' https://10.137.203.137/redfish/v1/Chassis/NVIDIA_GB200_1/Sensors/
{
  "@odata.id": "/redfish/v1/Chassis/NVIDIA_GB200_1/Sensors",
  "@odata.type": "#SensorCollection.SensorCollection",
  "Description": "Collection of Sensors for this Chassis",
  "Members": [
    {
      "@odata.id": "/redfish/v1/Chassis/NVIDIA_GB200_1/Sensors/energy_NVIDIA_GB200_GPU_0_Energy_0"
    },
    {
      "@odata.id": "/redfish/v1/Chassis/NVIDIA_GB200_1/Sensors/power_NVIDIA_GB200_GPU_0_Power_0"
    },
    {
      "@odata.id": "/redfish/v1/Chassis/NVIDIA_GB200_1/Sensors/temperature_NVIDIA_GB200_GPU_0_TEMP_0"
    },
    {
      "@odata.id": "/redfish/v1/Chassis/NVIDIA_GB200_1/Sensors/temperature_NVIDIA_GB200_GPU_0_TEMP_1"
    },
    {
      "@odata.id": "/redfish/v1/Chassis/NVIDIA_GB200_1/Sensors/voltage_NVIDIA_GB200_GPU_0_Voltage_0"
    }
  ],
  "Members@odata.count": 5,
  "Name": "Sensors"
}
'''

Check individual sensor updates.

'''
~ % curl -s -k -u 'root:0penBmc' https://10.137.203.137/redfish/v1/Chassis/NVIDIA_GB200_1/Sensors/energy_NVIDIA_GB200_GPU_0_Energy_0
{
  "@odata.id": "/redfish/v1/Chassis/NVIDIA_GB200_1/Sensors/energy_NVIDIA_GB200_GPU_0_Energy_0",
  "@odata.type": "#Sensor.v1_2_0.Sensor",
  "Id": "energy_NVIDIA_GB200_GPU_0_Energy_0",
  "Name": "NVIDIA GB200 GPU 0 Energy 0",
  "Reading": 489574.403,
  "ReadingRangeMax": 1.8446744073709552e+16,
  "ReadingRangeMin": 0.0,
  "ReadingType": "EnergyJoules",
  "ReadingUnits": "J",
  "Status": {
    "Health": "OK",
    "State": "Enabled"
  }
}
~ % curl -s -k -u 'root:0penBmc' https://10.137.203.137/redfish/v1/Chassis/NVIDIA_GB200_1/Sensors/power_NVIDIA_GB200_GPU_0_Power_0
{
  "@odata.id": "/redfish/v1/Chassis/NVIDIA_GB200_1/Sensors/power_NVIDIA_GB200_GPU_0_Power_0",
  "@odata.type": "#Sensor.v1_2_0.Sensor",
  "Id": "power_NVIDIA_GB200_GPU_0_Power_0",
  "Name": "NVIDIA GB200 GPU 0 Power 0",
  "Reading": 27.229,
  "ReadingRangeMax": 4294967295.0,
  "ReadingRangeMin": 0.0,
  "ReadingType": "Power",
  "ReadingUnits": "W",
  "Status": {
    "Health": "OK",
    "State": "Enabled"
  }
}
~ % curl -s -k -u 'root:0penBmc' https://10.137.203.137/redfish/v1/Chassis/NVIDIA_GB200_1/Sensors/temperature_NVIDIA_GB200_GPU_0_TEMP_0
{
  "@odata.id": "/redfish/v1/Chassis/NVIDIA_GB200_1/Sensors/temperature_NVIDIA_GB200_GPU_0_TEMP_0",
  "@odata.type": "#Sensor.v1_2_0.Sensor",
  "Id": "temperature_NVIDIA_GB200_GPU_0_TEMP_0",
  "Name": "NVIDIA GB200 GPU 0 TEMP 0",
  "Reading": 27.6015625,
  "ReadingRangeMax": 127.0,
  "ReadingRangeMin": -128.0,
  "ReadingType": "Temperature",
  "ReadingUnits": "Cel",
  "Status": {
    "Health": "OK",
    "State": "Enabled"
  }
}
~ % curl -s -k -u 'root:0penBmc' https://10.137.203.137/redfish/v1/Chassis/NVIDIA_GB200_1/Sensors/voltage_NVIDIA_GB200_GPU_0_Voltage_0
{
  "@odata.id": "/redfish/v1/Chassis/NVIDIA_GB200_1/Sensors/voltage_NVIDIA_GB200_GPU_0_Voltage_0",
  "@odata.type": "#Sensor.v1_2_0.Sensor",
  "Id": "voltage_NVIDIA_GB200_GPU_0_Voltage_0",
  "Name": "NVIDIA GB200 GPU 0 Voltage 0",
  "Reading": 0.735,
  "ReadingRangeMax": 1.8446744073709552e+16,
  "ReadingRangeMin": 0.0,
  "ReadingType": "Voltage",
  "ReadingUnits": "V",
  "Status": {
    "Health": "OK",
    "State": "Enabled"
  }
}
~ % curl -s -k -u 'root:0penBmc' https://10.137.203.137/redfish/v1/Chassis/NVIDIA_GB200_1/Sensors/temperature_NVIDIA_GB200_GPU_0_TEMP_1
{
  "@odata.id": "/redfish/v1/Chassis/NVIDIA_GB200_1/Sensors/temperature_NVIDIA_GB200_GPU_0_TEMP_1",
  "@odata.type": "#Sensor.v1_2_0.Sensor",
  "Id": "temperature_NVIDIA_GB200_GPU_0_TEMP_1",
  "Name": "NVIDIA GB200 GPU 0 TEMP 1",
  "Reading": 57.37109375,
  "ReadingRangeMax": 127.0,
  "ReadingRangeMin": -128.0,
  "ReadingType": "Temperature",
  "ReadingUnits": "Cel",
  "Status": {
    "Health": "OK",
    "State": "Enabled"
  },
  "Thresholds": {
    "LowerCaution": {
      "Reading": 0.0
    },
    "LowerCritical": {
      "Reading": 0.0
    },
    "LowerFatal": {
      "Reading": 0.0
    }
  }
}
'''

Change-Id: Ic3b892ef2c76c4c703aa55f5b2a66c22a5d71bdf
Signed-off-by: Aditya Kurdunkar <akurdunkar@nvidia.com>
diff --git a/src/nvidia-gpu/MctpRequester.cpp b/src/nvidia-gpu/MctpRequester.cpp
index 024f8cc..c238d92 100644
--- a/src/nvidia-gpu/MctpRequester.cpp
+++ b/src/nvidia-gpu/MctpRequester.cpp
@@ -22,6 +22,7 @@
 #include <cstdint>
 #include <cstring>
 #include <functional>
+#include <memory>
 #include <span>
 #include <utility>
 
@@ -30,12 +31,12 @@
 namespace mctp
 {
 
-MctpRequester::MctpRequester(boost::asio::io_context& ctx) :
+Requester::Requester(boost::asio::io_context& ctx) :
     mctpSocket(ctx, boost::asio::generic::datagram_protocol{AF_MCTP, 0}),
     expiryTimer(ctx)
 {}
 
-void MctpRequester::processRecvMsg(
+void Requester::processRecvMsg(
     uint8_t eid, const std::span<const uint8_t> reqMsg,
     const std::span<uint8_t> respMsg, const boost::system::error_code& ec,
     const size_t /*length*/)
@@ -47,7 +48,7 @@
         lg2::error(
             "MctpRequester failed to receive data from the MCTP socket - ErrorCode={EC}, Error={ER}.",
             "EC", ec.value(), "ER", ec.message());
-        completionCallback(EIO);
+        completionCallbacks[eid](EIO);
         return;
     }
 
@@ -58,7 +59,7 @@
     if (respAddr->smctp_type != msgType)
     {
         lg2::error("MctpRequester: Message type mismatch");
-        completionCallback(EPROTO);
+        completionCallbacks[eid](EPROTO);
         return;
     }
 
@@ -69,7 +70,7 @@
         lg2::error(
             "MctpRequester: EID mismatch - expected={EID}, received={REID}",
             "EID", eid, "REID", respEid);
-        completionCallback(EPROTO);
+        completionCallbacks[eid](EPROTO);
         return;
     }
 
@@ -96,15 +97,15 @@
                 "MctpRequester: Instance ID mismatch - request={REQ}, response={RESP}",
                 "REQ", static_cast<int>(reqInstanceId), "RESP",
                 static_cast<int>(respInstanceId));
-            completionCallback(EPROTO);
+            completionCallbacks[eid](EPROTO);
             return;
         }
     }
 
-    completionCallback(0);
+    completionCallbacks[eid](0);
 }
 
-void MctpRequester::handleSendMsgCompletion(
+void Requester::handleSendMsgCompletion(
     uint8_t eid, const std::span<const uint8_t> reqMsg,
     std::span<uint8_t> respMsg, const boost::system::error_code& ec,
     size_t /* length */)
@@ -114,28 +115,28 @@
         lg2::error(
             "MctpRequester failed to send data from the MCTP socket - ErrorCode={EC}, Error={ER}.",
             "EC", ec.value(), "ER", ec.message());
-        completionCallback(EIO);
+        completionCallbacks[eid](EIO);
         return;
     }
 
     expiryTimer.expires_after(2s);
 
-    expiryTimer.async_wait([this](const boost::system::error_code& ec) {
+    expiryTimer.async_wait([this, eid](const boost::system::error_code& ec) {
         if (ec != boost::asio::error::operation_aborted)
         {
-            completionCallback(ETIME);
+            completionCallbacks[eid](ETIME);
         }
     });
 
     mctpSocket.async_receive_from(
         boost::asio::mutable_buffer(respMsg), recvEndPoint,
-        std::bind_front(&MctpRequester::processRecvMsg, this, eid, reqMsg,
+        std::bind_front(&Requester::processRecvMsg, this, eid, reqMsg,
                         respMsg));
 }
 
-void MctpRequester::sendRecvMsg(
-    uint8_t eid, const std::span<const uint8_t> reqMsg,
-    std::span<uint8_t> respMsg, std::move_only_function<void(int)> callback)
+void Requester::sendRecvMsg(uint8_t eid, const std::span<const uint8_t> reqMsg,
+                            std::span<uint8_t> respMsg,
+                            std::move_only_function<void(int)> callback)
 {
     if (reqMsg.size() < sizeof(ocp::accelerator_management::BindingPciVid))
     {
@@ -144,7 +145,7 @@
         return;
     }
 
-    completionCallback = std::move(callback);
+    completionCallbacks[eid] = std::move(callback);
 
     struct sockaddr_mctp addr{};
     addr.smctp_family = AF_MCTP;
@@ -156,7 +157,47 @@
 
     mctpSocket.async_send_to(
         boost::asio::const_buffer(reqMsg), sendEndPoint,
-        std::bind_front(&MctpRequester::handleSendMsgCompletion, this, eid,
-                        reqMsg, respMsg));
+        std::bind_front(&Requester::handleSendMsgCompletion, this, eid, reqMsg,
+                        respMsg));
 }
+
+void QueuingRequester::sendRecvMsg(uint8_t eid, std::span<const uint8_t> reqMsg,
+                                   std::span<uint8_t> respMsg,
+                                   std::move_only_function<void(int)> callback)
+{
+    auto reqCtx =
+        std::make_unique<RequestContext>(reqMsg, respMsg, std::move(callback));
+
+    // Add request to queue
+    auto& queue = requestContextQueues[eid];
+    queue.push(std::move(reqCtx));
+
+    processQueue(eid);
+}
+
+void QueuingRequester::processQueue(uint8_t eid)
+{
+    auto& queue = requestContextQueues[eid];
+    if (queue.empty() || activeRequestContexts.contains(eid))
+    {
+        return;
+    }
+
+    activeRequestContexts[eid] = std::move(queue.front());
+    queue.pop();
+
+    const auto& reqCtx = activeRequestContexts[eid];
+
+    requester.sendRecvMsg(
+        eid, reqCtx->reqMsg, reqCtx->respMsg, [this, eid](int result) {
+            const auto& reqCtx = activeRequestContexts[eid];
+
+            reqCtx->callback(result); // Call the original callback
+
+            activeRequestContexts.erase(eid);
+
+            processQueue(eid);
+        });
+}
+
 } // namespace mctp
diff --git a/src/nvidia-gpu/MctpRequester.hpp b/src/nvidia-gpu/MctpRequester.hpp
index 289e800..5b5a270 100644
--- a/src/nvidia-gpu/MctpRequester.hpp
+++ b/src/nvidia-gpu/MctpRequester.hpp
@@ -14,24 +14,28 @@
 #include <cstddef>
 #include <cstdint>
 #include <functional>
+#include <memory>
+#include <queue>
 #include <span>
+#include <unordered_map>
+#include <utility>
 
 namespace mctp
 {
-class MctpRequester
+class Requester
 {
   public:
-    MctpRequester() = delete;
+    Requester() = delete;
 
-    MctpRequester(const MctpRequester&) = delete;
+    Requester(const Requester&) = delete;
 
-    MctpRequester(MctpRequester&&) = delete;
+    Requester(Requester&&) = delete;
 
-    MctpRequester& operator=(const MctpRequester&) = delete;
+    Requester& operator=(const Requester&) = delete;
 
-    MctpRequester& operator=(MctpRequester&&) = delete;
+    Requester& operator=(Requester&&) = delete;
 
-    explicit MctpRequester(boost::asio::io_context& ctx);
+    explicit Requester(boost::asio::io_context& ctx);
 
     void sendRecvMsg(uint8_t eid, std::span<const uint8_t> reqMsg,
                      std::span<uint8_t> respMsg,
@@ -57,8 +61,56 @@
 
     boost::asio::steady_timer expiryTimer;
 
-    std::move_only_function<void(int)> completionCallback;
+    std::unordered_map<uint8_t, std::move_only_function<void(int)>>
+        completionCallbacks;
 
     static constexpr uint8_t msgType = ocp::accelerator_management::messageType;
 };
+
+class QueuingRequester
+{
+  public:
+    QueuingRequester() = delete;
+    QueuingRequester(const QueuingRequester&) = delete;
+    QueuingRequester(QueuingRequester&&) = delete;
+    QueuingRequester& operator=(const QueuingRequester&) = delete;
+    QueuingRequester& operator=(QueuingRequester&&) = delete;
+
+    explicit QueuingRequester(boost::asio::io_context& ctx) : requester(ctx) {}
+
+    void sendRecvMsg(uint8_t eid, std::span<const uint8_t> reqMsg,
+                     std::span<uint8_t> respMsg,
+                     std::move_only_function<void(int)> callback);
+
+  private:
+    struct RequestContext
+    {
+        std::span<const uint8_t> reqMsg;
+        std::span<uint8_t> respMsg;
+        std::move_only_function<void(int)> callback;
+
+        RequestContext(const RequestContext&) = delete;
+        RequestContext& operator=(const RequestContext&) = delete;
+
+        RequestContext(RequestContext&&) = default;
+        RequestContext& operator=(RequestContext&&) = default;
+        ~RequestContext() = default;
+
+        explicit RequestContext(std::span<const uint8_t> req,
+                                std::span<uint8_t> resp,
+                                std::move_only_function<void(int)> cb) :
+            reqMsg(req), respMsg(resp), callback(std::move(cb))
+        {}
+    };
+
+    void processQueue(uint8_t eid);
+
+    Requester requester;
+    std::unordered_map<uint8_t, std::queue<std::unique_ptr<RequestContext>>>
+        requestContextQueues;
+    std::unordered_map<uint8_t, std::unique_ptr<RequestContext>>
+        activeRequestContexts;
+};
+
+using MctpRequester = QueuingRequester;
 } // namespace mctp