nvidia-gpu: Fix up buffering in MctpRequester. This change does a lot, for better or worse: 1. Changes MctpRequester to hold both buffers for send and receive. 2. This requires changing the callback structure, so the reach is far. 3. Changes error reporting to be through std::error_code. 4. Collapses the QueuingRequester and Requester into MctpRequester. 5. Doing 4 gets rid of a level of indirection and an extra unordered_map. 6. Adds proper iid support, which is made significantly easier by 4/5. 7. Fixes issues around expiry timers, where we would cancel the timer for a given request whenever a new packet would come in to be sent. This could cause lockup if a packet truly did time out and an interleaved packet finished sending. This moves each queue to have its own timer. This fixes an issue where we were receiving buffers from clients and then binding them to receive calls without ensuring that they are for the correct message; thus, when receive was called, it was called with the last buffer bound to async_receive_from. This would cause a number of issues, ranging from incorrect device discovery results to core dumps as well as incorrect sensor readings. This change moves the receive and send buffers to be owned by the MctpRequester, and a non-owning view is provided via callback to the client. All existing clients just decode in place given that buffer. Tested: loaded onto nvl32-obmc. Correct number of sensors showed up and the readings were nominal. Change-Id: I67c843691ca79e9fcccfa16df6d611918f25f6ca Signed-off-by: Marc Olberding <molberding@nvidia.com>

commit: d0125c9cdf0f0ae7f1943f773c99fc512db0a68e [log] [tgz]
author: Marc Olberding <molberding@nvidia.com> Wed Oct 08 14:37:19 2025 -0700
committer: Marc Olberding <molberding@nvidia.com> Thu Oct 16 15:02:15 2025 -0700
tree: f6dbb3eba497d5d56de0565fe598ced25a1d0077
parent: 6d481b5d0636dc3adaa00c2292ac92fc8840fd94 [diff] [blame]
diff --git a/src/nvidia-gpu/Inventory.cpp b/src/nvidia-gpu/Inventory.cpp
index 7a1c3cc..0541d81 100644
--- a/src/nvidia-gpu/Inventory.cpp
+++ b/src/nvidia-gpu/Inventory.cpp

@@ -16,7 +16,9 @@
 #include <cstdint>
 #include <memory>
 #include <optional>
+#include <span>
 #include <string>
+#include <system_error>
 #include <unordered_map>
 #include <variant>
 #include <vector>
@@ -39,9 +41,6 @@
     name(escapeName(inventoryName)), mctpRequester(mctpRequester),
     deviceType(deviceTypeIn), eid(eid), retryTimer(io)
 {
-    requestBuffer = std::make_shared<InventoryRequestBuffer>();
-    responseBuffer = std::make_shared<InventoryResponseBuffer>();
-
     std::string path = inventoryPrefix + name;
 
     assetIface = objectServer.add_interface(path, assetIfaceName);
@@ -134,7 +133,7 @@
     gpu::InventoryPropertyId propertyId)
 {
     int rc = gpu::encodeGetInventoryInformationRequest(
-        0, static_cast<uint8_t>(propertyId), *requestBuffer);
+        0, static_cast<uint8_t>(propertyId), requestBuffer);
     if (rc != 0)
     {
         lg2::error(
@@ -148,15 +147,17 @@
         "Sending inventory request for property ID {PROP_ID} to EID {EID} for {NAME}",
         "PROP_ID", static_cast<uint8_t>(propertyId), "EID", eid, "NAME", name);
 
-    mctpRequester.sendRecvMsg(eid, *requestBuffer, *responseBuffer,
-                              [this, propertyId](int sendRecvMsgResult) {
-                                  this->handleInventoryPropertyResponse(
-                                      propertyId, sendRecvMsgResult);
-                              });
+    mctpRequester.sendRecvMsg(
+        eid, requestBuffer,
+        [this, propertyId](const std::error_code& result,
+                           std::span<const uint8_t> buffer) {
+            this->handleInventoryPropertyResponse(propertyId, result, buffer);
+        });
 }
 
 void Inventory::handleInventoryPropertyResponse(
-    gpu::InventoryPropertyId propertyId, int sendRecvMsgResult)
+    gpu::InventoryPropertyId propertyId, const std::error_code& ec,
+    std::span<const uint8_t> buffer)
 {
     auto it = properties.find(propertyId);
     if (it == properties.end())
@@ -168,19 +169,19 @@
     }
 
     bool success = false;
-    if (sendRecvMsgResult == 0)
+    if (!ec)
     {
         ocp::accelerator_management::CompletionCode cc{};
         uint16_t reasonCode = 0;
         gpu::InventoryValue info;
         int rc = gpu::decodeGetInventoryInformationResponse(
-            *responseBuffer, cc, reasonCode, propertyId, info);
+            buffer, cc, reasonCode, propertyId, info);
 
         lg2::info(
             "Response for property ID {PROP_ID} from {NAME}, sendRecvMsgResult: {RESULT}, decode_rc: {RC}, completion_code: {CC}, reason_code: {REASON}",
             "PROP_ID", static_cast<uint8_t>(propertyId), "NAME", name, "RESULT",
-            sendRecvMsgResult, "RC", rc, "CC", static_cast<uint8_t>(cc),
-            "REASON", reasonCode);
+            ec.message(), "RC", rc, "CC", static_cast<uint8_t>(cc), "REASON",
+            reasonCode);
 
         if (rc == 0 &&
             cc == ocp::accelerator_management::CompletionCode::SUCCESS)
commit	d0125c9cdf0f0ae7f1943f773c99fc512db0a68e	[log] [tgz]
author	Marc Olberding <molberding@nvidia.com>	Wed Oct 08 14:37:19 2025 -0700
committer	Marc Olberding <molberding@nvidia.com>	Thu Oct 16 15:02:15 2025 -0700
tree	f6dbb3eba497d5d56de0565fe598ced25a1d0077
parent	6d481b5d0636dc3adaa00c2292ac92fc8840fd94 [diff] [blame]