nvidia-gpu: Fix up buffering in MctpRequester
This change does a lot, for better or worse:
1. Change MctpRequester to hold both buffers for send and receive
2. This requires changing the callback structure, so the reach is far
3. Changes error reporting to be through std::error_code
4. Collapses the QueuingRequester and Requester to be MctpRequester
5. Doing 4 gets rid of a level of indirection and an extra unordered_map
6. Adds proper iid support, which is made significantly easier by 4/5
7. Fixes issues around expiry timers where we would cancel the timer
   for a given request whenever a new packet would come in to be sent.
   This could cause a lockup if a packet truly did time out and an
   interleaved packet finished sending. This moves each queue
   to have its own timer.
This fixes an issue where we were receiving buffers from clients
and then binding them to receive calls without ensuring that they
belonged to the correct message. Thus, when the receive handler was
called, it was called with the buffer last bound to
async_receive_from. This would cause a number of issues, ranging from
incorrect device discovery results to core dumps as well as incorrect
sensor readings.
This change moves the receive and send buffers to be owned by
the MctpRequester, and a non-owning view is provided via
callback to the client. All existing clients just decode in place
given that buffer.
Tested: Loaded onto nvl32-obmc. The correct number of sensors showed up
and the readings were nominal.
Change-Id: I67c843691ca79e9fcccfa16df6d611918f25f6ca
Signed-off-by: Marc Olberding <molberding@nvidia.com>
diff --git a/src/nvidia-gpu/Inventory.cpp b/src/nvidia-gpu/Inventory.cpp
index 7a1c3cc..0541d81 100644
--- a/src/nvidia-gpu/Inventory.cpp
+++ b/src/nvidia-gpu/Inventory.cpp
@@ -16,7 +16,9 @@
 #include <cstdint>
 #include <memory>
 #include <optional>
+#include <span>
 #include <string>
+#include <system_error>
 #include <unordered_map>
 #include <variant>
 #include <vector>
@@ -39,9 +41,6 @@
     name(escapeName(inventoryName)), mctpRequester(mctpRequester),
     deviceType(deviceTypeIn), eid(eid), retryTimer(io)
 {
-    requestBuffer = std::make_shared<InventoryRequestBuffer>();
-    responseBuffer = std::make_shared<InventoryResponseBuffer>();
-
     std::string path = inventoryPrefix + name;
 
     assetIface = objectServer.add_interface(path, assetIfaceName);
@@ -134,7 +133,7 @@
     gpu::InventoryPropertyId propertyId)
 {
     int rc = gpu::encodeGetInventoryInformationRequest(
-        0, static_cast<uint8_t>(propertyId), *requestBuffer);
+        0, static_cast<uint8_t>(propertyId), requestBuffer);
     if (rc != 0)
     {
         lg2::error(
@@ -148,15 +147,17 @@
         "Sending inventory request for property ID {PROP_ID} to EID {EID} for {NAME}",
         "PROP_ID", static_cast<uint8_t>(propertyId), "EID", eid, "NAME", name);
 
-    mctpRequester.sendRecvMsg(eid, *requestBuffer, *responseBuffer,
-                              [this, propertyId](int sendRecvMsgResult) {
-                                  this->handleInventoryPropertyResponse(
-                                      propertyId, sendRecvMsgResult);
-                              });
+    mctpRequester.sendRecvMsg(
+        eid, requestBuffer,
+        [this, propertyId](const std::error_code& result,
+                           std::span<const uint8_t> buffer) {
+            this->handleInventoryPropertyResponse(propertyId, result, buffer);
+        });
 }
 
 void Inventory::handleInventoryPropertyResponse(
-    gpu::InventoryPropertyId propertyId, int sendRecvMsgResult)
+    gpu::InventoryPropertyId propertyId, const std::error_code& ec,
+    std::span<const uint8_t> buffer)
 {
     auto it = properties.find(propertyId);
     if (it == properties.end())
@@ -168,19 +169,19 @@
     }
 
     bool success = false;
-    if (sendRecvMsgResult == 0)
+    if (!ec)
     {
         ocp::accelerator_management::CompletionCode cc{};
         uint16_t reasonCode = 0;
         gpu::InventoryValue info;
         int rc = gpu::decodeGetInventoryInformationResponse(
-            *responseBuffer, cc, reasonCode, propertyId, info);
+            buffer, cc, reasonCode, propertyId, info);
 
         lg2::info(
             "Response for property ID {PROP_ID} from {NAME}, sendRecvMsgResult: {RESULT}, decode_rc: {RC}, completion_code: {CC}, reason_code: {REASON}",
             "PROP_ID", static_cast<uint8_t>(propertyId), "NAME", name, "RESULT",
-            sendRecvMsgResult, "RC", rc, "CC", static_cast<uint8_t>(cc),
-            "REASON", reasonCode);
+            ec.message(), "RC", rc, "CC", static_cast<uint8_t>(cc), "REASON",
+            reasonCode);
 
         if (rc == 0 &&
             cc == ocp::accelerator_management::CompletionCode::SUCCESS)