nvidia-gpu: Fix up buffering in MctpRequester

This change does a lot, for better or worse:
1. Changes MctpRequester to hold both the send and receive buffers.
2. This requires changing the callback structure, so the reach is far.
3. Changes error reporting to go through std::error_code.
4. Collapses QueuingRequester and Requester into MctpRequester.
5. Doing 4 gets rid of a level of indirection and an extra unordered_map.
6. Adds proper iid support, which is made significantly easier by 4/5.
7. Fixes issues around expiry timers, where we would cancel the timer for a given request whenever a new packet came in to be sent. This could cause a lockup if a packet truly did time out and an interleaved packet finished sending. This moves each queue to have its own timer.

This fixes an issue where we were receiving buffers from clients and then binding them to receive calls without ensuring that they were for the correct message; thus, when the receive handler ran, it was called with the last buffer bound to async_receive_from. This would cause a number of issues, ranging from incorrect device discovery results to core dumps as well as incorrect sensor readings.

This change moves the receive and send buffers to be owned by the MctpRequester, and a non-owning view is provided via callback to the client. All existing clients just decode in place given that buffer.

Tested: loaded onto nvl32-obmc. Correct number of sensors showed up and the readings were nominal.

Change-Id: I67c843691ca79e9fcccfa16df6d611918f25f6ca
Signed-off-by: Marc Olberding <molberding@nvidia.com>

commit: d0125c9cdf0f0ae7f1943f773c99fc512db0a68e [log] [tgz]
author: Marc Olberding <molberding@nvidia.com> Wed Oct 08 14:37:19 2025 -0700
committer: Marc Olberding <molberding@nvidia.com> Thu Oct 16 15:02:15 2025 -0700
tree: f6dbb3eba497d5d56de0565fe598ced25a1d0077
parent: 6d481b5d0636dc3adaa00c2292ac92fc8840fd94 [diff]
diff --git a/src/nvidia-gpu/Inventory.cpp b/src/nvidia-gpu/Inventory.cpp
index 7a1c3cc..0541d81 100644
--- a/src/nvidia-gpu/Inventory.cpp
+++ b/src/nvidia-gpu/Inventory.cpp

@@ -16,7 +16,9 @@
 #include <cstdint>
 #include <memory>
 #include <optional>
+#include <span>
 #include <string>
+#include <system_error>
 #include <unordered_map>
 #include <variant>
 #include <vector>
@@ -39,9 +41,6 @@
     name(escapeName(inventoryName)), mctpRequester(mctpRequester),
     deviceType(deviceTypeIn), eid(eid), retryTimer(io)
 {
-    requestBuffer = std::make_shared<InventoryRequestBuffer>();
-    responseBuffer = std::make_shared<InventoryResponseBuffer>();
-
     std::string path = inventoryPrefix + name;
 
     assetIface = objectServer.add_interface(path, assetIfaceName);
@@ -134,7 +133,7 @@
     gpu::InventoryPropertyId propertyId)
 {
     int rc = gpu::encodeGetInventoryInformationRequest(
-        0, static_cast<uint8_t>(propertyId), *requestBuffer);
+        0, static_cast<uint8_t>(propertyId), requestBuffer);
     if (rc != 0)
     {
         lg2::error(
@@ -148,15 +147,17 @@
         "Sending inventory request for property ID {PROP_ID} to EID {EID} for {NAME}",
         "PROP_ID", static_cast<uint8_t>(propertyId), "EID", eid, "NAME", name);
 
-    mctpRequester.sendRecvMsg(eid, *requestBuffer, *responseBuffer,
-                              [this, propertyId](int sendRecvMsgResult) {
-                                  this->handleInventoryPropertyResponse(
-                                      propertyId, sendRecvMsgResult);
-                              });
+    mctpRequester.sendRecvMsg(
+        eid, requestBuffer,
+        [this, propertyId](const std::error_code& result,
+                           std::span<const uint8_t> buffer) {
+            this->handleInventoryPropertyResponse(propertyId, result, buffer);
+        });
 }
 
 void Inventory::handleInventoryPropertyResponse(
-    gpu::InventoryPropertyId propertyId, int sendRecvMsgResult)
+    gpu::InventoryPropertyId propertyId, const std::error_code& ec,
+    std::span<const uint8_t> buffer)
 {
     auto it = properties.find(propertyId);
     if (it == properties.end())
@@ -168,19 +169,19 @@
     }
 
     bool success = false;
-    if (sendRecvMsgResult == 0)
+    if (!ec)
     {
         ocp::accelerator_management::CompletionCode cc{};
         uint16_t reasonCode = 0;
         gpu::InventoryValue info;
         int rc = gpu::decodeGetInventoryInformationResponse(
-            *responseBuffer, cc, reasonCode, propertyId, info);
+            buffer, cc, reasonCode, propertyId, info);
 
         lg2::info(
             "Response for property ID {PROP_ID} from {NAME}, sendRecvMsgResult: {RESULT}, decode_rc: {RC}, completion_code: {CC}, reason_code: {REASON}",
             "PROP_ID", static_cast<uint8_t>(propertyId), "NAME", name, "RESULT",
-            sendRecvMsgResult, "RC", rc, "CC", static_cast<uint8_t>(cc),
-            "REASON", reasonCode);
+            ec.message(), "RC", rc, "CC", static_cast<uint8_t>(cc), "REASON",
+            reasonCode);
 
         if (rc == 0 &&
             cc == ocp::accelerator_management::CompletionCode::SUCCESS)

diff --git a/src/nvidia-gpu/Inventory.hpp b/src/nvidia-gpu/Inventory.hpp
index ea38f6f..5a24e56 100644
--- a/src/nvidia-gpu/Inventory.hpp
+++ b/src/nvidia-gpu/Inventory.hpp

@@ -16,11 +16,6 @@
 #include <string>
 #include <unordered_map>
 
-using InventoryRequestBuffer =
-    std::array<uint8_t, sizeof(gpu::GetInventoryInformationRequest)>;
-using InventoryResponseBuffer =
-    std::array<uint8_t, sizeof(gpu::GetInventoryInformationResponse)>;
-
 class Inventory : public std::enable_shared_from_this<Inventory>
 {
   public:
@@ -41,7 +36,8 @@
     };
     void sendInventoryPropertyRequest(gpu::InventoryPropertyId propertyId);
     void handleInventoryPropertyResponse(gpu::InventoryPropertyId propertyId,
-                                         int sendRecvMsgResult);
+                                         const std::error_code& ec,
+                                         std::span<const uint8_t> buffer);
     void processNextProperty();
     void processInventoryProperty(gpu::InventoryPropertyId propertyId);
     void registerProperty(
@@ -67,8 +63,8 @@
     uint8_t eid;
     boost::asio::steady_timer retryTimer;
     std::unordered_map<gpu::InventoryPropertyId, PropertyInfo> properties;
-    std::shared_ptr<InventoryRequestBuffer> requestBuffer;
-    std::shared_ptr<InventoryResponseBuffer> responseBuffer;
+    std::array<uint8_t, sizeof(gpu::GetInventoryInformationRequest)>
+        requestBuffer{};
     static constexpr std::chrono::seconds retryDelay{5};
     static constexpr int maxRetryAttempts = 3;
 };

diff --git a/src/nvidia-gpu/MctpRequester.cpp b/src/nvidia-gpu/MctpRequester.cpp
index 765859e..f28371f 100644
--- a/src/nvidia-gpu/MctpRequester.cpp
+++ b/src/nvidia-gpu/MctpRequester.cpp

@@ -18,13 +18,17 @@
 #include <boost/container/devector.hpp>
 #include <phosphor-logging/lg2.hpp>
 
-#include <cerrno>
+#include <bit>
 #include <cstddef>
 #include <cstdint>
 #include <cstring>
+#include <expected>
+#include <format>
 #include <functional>
-#include <memory>
+#include <optional>
 #include <span>
+#include <stdexcept>
+#include <system_error>
 #include <utility>
 
 using namespace std::literals;
@@ -32,131 +36,314 @@
 namespace mctp
 {
 
-Requester::Requester(boost::asio::io_context& ctx) :
-    mctpSocket(ctx, boost::asio::generic::datagram_protocol{AF_MCTP, 0}),
-    expiryTimer(ctx)
-{}
-
-void Requester::processRecvMsg(
-    const std::span<const uint8_t> reqMsg, const std::span<uint8_t> respMsg,
-    const boost::system::error_code& ec, const size_t /*length*/)
+static const ocp::accelerator_management::BindingPciVid* getHeaderFromBuffer(
+    std::span<const uint8_t> buffer)
 {
-    const auto* respAddr =
-        // NOLINTNEXTLINE(cppcoreguidelines-pro-type-reinterpret-cast)
-        reinterpret_cast<const struct sockaddr_mctp*>(recvEndPoint.data());
-
-    uint8_t eid = respAddr->smctp_addr.s_addr;
-
-    if (!completionCallbacks.contains(eid))
+    if (buffer.size() < sizeof(ocp::accelerator_management::BindingPciVid))
     {
-        lg2::error(
-            "MctpRequester failed to get the callback for the EID: {EID}",
-            "EID", static_cast<int>(eid));
+        return nullptr;
+    }
+
+    return std::bit_cast<const ocp::accelerator_management::BindingPciVid*>(
+        buffer.data());
+}
+
+static std::optional<uint8_t> getIid(std::span<const uint8_t> buffer)
+{
+    const ocp::accelerator_management::BindingPciVid* header =
+        getHeaderFromBuffer(buffer);
+    if (header == nullptr)
+    {
+        return std::nullopt;
+    }
+    return header->instance_id & ocp::accelerator_management::instanceIdBitMask;
+}
+
+static std::optional<bool> getRequestBit(std::span<const uint8_t> buffer)
+{
+    const ocp::accelerator_management::BindingPciVid* header =
+        getHeaderFromBuffer(buffer);
+    if (header == nullptr)
+    {
+        return std::nullopt;
+    }
+    return header->instance_id & ocp::accelerator_management::requestBitMask;
+}
+
+MctpRequester::MctpRequester(boost::asio::io_context& ctx) :
+    io{ctx},
+    mctpSocket(ctx, boost::asio::generic::datagram_protocol{AF_MCTP, 0})
+{
+    startReceive();
+}
+
+void MctpRequester::startReceive()
+{
+    mctpSocket.async_receive_from(
+        boost::asio::buffer(buffer), recvEndPoint.endpoint,
+        std::bind_front(&MctpRequester::processRecvMsg, this));
+}
+
+void MctpRequester::processRecvMsg(const boost::system::error_code& ec,
+                                   const size_t length)
+{
+    std::optional<uint8_t> expectedEid = recvEndPoint.eid();
+    std::optional<uint8_t> receivedMsgType = recvEndPoint.type();
+
+    if (!expectedEid || !receivedMsgType)
+    {
+        // we were handed an endpoint that can't be treated as an MCTP endpoint
+        // This is probably a kernel bug...yell about it and rebind.
+        lg2::error("MctpRequester: invalid endpoint");
         return;
     }
 
-    auto& callback = completionCallbacks.at(eid);
-
-    if (respAddr->smctp_type != msgType)
+    if (*receivedMsgType != msgType)
     {
-        lg2::error("MctpRequester: Message type mismatch");
-        callback(EPROTO);
+        // we received a message that this handler doesn't support
+        // drop it on the floor and rebind receive_from
+        lg2::error("MctpRequester: Message type mismatch. We received {MSG}",
+                   "MSG", *receivedMsgType);
         return;
     }
 
-    expiryTimer.cancel();
+    uint8_t eid = *expectedEid;
 
     if (ec)
     {
         lg2::error(
             "MctpRequester failed to receive data from the MCTP socket - ErrorCode={EC}, Error={ER}.",
             "EC", ec.value(), "ER", ec.message());
-        callback(EIO);
+        handleResult(eid, static_cast<std::error_code>(ec), {});
         return;
     }
 
-    if (respMsg.size() > sizeof(ocp::accelerator_management::BindingPciVid))
+    // if the received length was greater than our buffer, we would've truncated
+    // and gotten an error code in asio
+    std::span<const uint8_t> responseBuffer{buffer.data(), length};
+
+    std::optional<uint8_t> optionalIid = getIid(responseBuffer);
+    std::optional<bool> isRq = getRequestBit(responseBuffer);
+    if (!optionalIid || !isRq)
     {
-        const auto* reqHdr =
-            // NOLINTNEXTLINE(cppcoreguidelines-pro-type-reinterpret-cast)
-            reinterpret_cast<const ocp::accelerator_management::BindingPciVid*>(
-                reqMsg.data());
-
-        uint8_t reqInstanceId = reqHdr->instance_id &
-                                ocp::accelerator_management::instanceIdBitMask;
-        const auto* respHdr =
-            // NOLINTNEXTLINE(cppcoreguidelines-pro-type-reinterpret-cast)
-            reinterpret_cast<const ocp::accelerator_management::BindingPciVid*>(
-                respMsg.data());
-
-        uint8_t respInstanceId = respHdr->instance_id &
-                                 ocp::accelerator_management::instanceIdBitMask;
-
-        if (reqInstanceId != respInstanceId)
-        {
-            lg2::error(
-                "MctpRequester: Instance ID mismatch - request={REQ}, response={RESP}",
-                "REQ", static_cast<int>(reqInstanceId), "RESP",
-                static_cast<int>(respInstanceId));
-            callback(EPROTO);
-            return;
-        }
+        // we received something from the device,
+        // but we aren't able to parse iid byte
+        // drop this packet on the floor
+        // and rely on the timer to notify the client
+        lg2::error("MctpRequester: Unable to decode message from eid {EID}",
+                   "EID", eid);
+        return;
     }
 
-    callback(0);
+    if (isRq.value())
+    {
+        // we received a request from a downstream device.
+        // We don't currently support this, drop the packet
+        // on the floor and rebind receive, keep the timer running
+        return;
+    }
+
+    uint8_t iid = *optionalIid;
+
+    auto it = requestContextQueues.find(eid);
+    if (it == requestContextQueues.end())
+    {
+        // something very bad has happened here
+        // we've received a packet that is a response
+        // from a device we've never talked to
+        // do our best and rebind receive and keep the timer running
+        lg2::error("Unable to match request to response");
+        return;
+    }
+
+    if (iid != it->second.iid)
+    {
+        // we received an iid that doesn't match the one we sent
+        // rebind async_receive_from and drop this packet on the floor
+        lg2::error("Invalid iid {IID} from eid {EID}, expected {E_IID}", "IID",
+                   iid, "EID", eid, "E_IID", it->second.iid);
+        return;
+    }
+
+    handleResult(eid, std::error_code{}, responseBuffer);
 }
 
-void Requester::handleSendMsgCompletion(
-    uint8_t eid, const std::span<const uint8_t> reqMsg,
-    std::span<uint8_t> respMsg, const boost::system::error_code& ec,
-    size_t /* length */)
+void MctpRequester::handleSendMsgCompletion(
+    uint8_t eid, const boost::system::error_code& ec, size_t /* length */)
 {
-    if (!completionCallbacks.contains(eid))
-    {
-        lg2::error(
-            "MctpRequester failed to get the callback for the EID: {EID}",
-            "EID", static_cast<int>(eid));
-        return;
-    }
-
-    auto& callback = completionCallbacks.at(eid);
-
     if (ec)
     {
         lg2::error(
             "MctpRequester failed to send data from the MCTP socket - ErrorCode={EC}, Error={ER}.",
             "EC", ec.value(), "ER", ec.message());
-        callback(EIO);
+        handleResult(eid, static_cast<std::error_code>(ec), {});
         return;
     }
 
+    auto it = requestContextQueues.find(eid);
+    if (it == requestContextQueues.end())
+    {
+        // something very bad has happened here,
+        // we've sent something to a device that we have
+        // no record of. yell loudly and bail
+        lg2::error(
+            "MctpRequester completed send for an EID that we have no record of");
+        return;
+    }
+
+    boost::asio::steady_timer& expiryTimer = it->second.timer;
     expiryTimer.expires_after(2s);
 
     expiryTimer.async_wait([this, eid](const boost::system::error_code& ec) {
         if (ec != boost::asio::error::operation_aborted)
         {
-            auto& callback = completionCallbacks.at(eid);
-            callback(ETIME);
+            lg2::error("Operation timed out on eid {EID}", "EID", eid);
+            handleResult(eid, std::make_error_code(std::errc::timed_out), {});
         }
     });
-
-    mctpSocket.async_receive_from(
-        boost::asio::mutable_buffer(respMsg), recvEndPoint,
-        std::bind_front(&Requester::processRecvMsg, this, reqMsg, respMsg));
 }
 
-void Requester::sendRecvMsg(uint8_t eid, const std::span<const uint8_t> reqMsg,
-                            std::span<uint8_t> respMsg,
-                            std::move_only_function<void(int)> callback)
+void MctpRequester::sendRecvMsg(
+    uint8_t eid, std::span<const uint8_t> reqMsg,
+    std::move_only_function<void(const std::error_code&,
+                                 std::span<const uint8_t>)>
+        callback)
 {
-    if (reqMsg.size() < sizeof(ocp::accelerator_management::BindingPciVid))
+    RequestContext reqCtx{reqMsg, std::move(callback)};
+
+    // try_emplace only affects the result if the key does not already exist
+    auto [it, inserted] = requestContextQueues.try_emplace(eid, io);
+    (void)inserted;
+
+    auto& queue = it->second.queue;
+    queue.push_back(std::move(reqCtx));
+
+    if (queue.size() == 1)
     {
-        lg2::error("MctpRequester: Message too small");
-        callback(EPROTO);
+        processQueue(eid);
+    }
+}
+
+static bool isFatalError(const std::error_code& ec)
+{
+    return ec &&
+           (ec != std::errc::timed_out && ec != std::errc::host_unreachable);
+}
+
+void MctpRequester::handleResult(uint8_t eid, const std::error_code& ec,
+                                 std::span<const uint8_t> buffer)
+{
+    auto it = requestContextQueues.find(eid);
+    if (it == requestContextQueues.end())
+    {
+        lg2::error("We tried to a handle a result for an eid we don't have");
+
+        startReceive();
         return;
     }
 
-    completionCallbacks[eid] = std::move(callback);
+    auto& queue = it->second.queue;
+    auto& reqCtx = queue.front();
+
+    it->second.timer.cancel();
+
+    reqCtx.callback(ec, buffer); // Call the original callback
+
+    if (isFatalError(ec))
+    {
+        // some errors are fatal, since these are datagrams,
+        // we won't get a receive path error message.
+        // and since this daemon services all nvidia iana commands
+        // for a given system, we should only restart the service if its
+        // unrecoverable, i.e. if we get error codes that the client
+        // can't reasonably deal with. If thats the cause, restart
+        // and hope that we can deal with it then.
+        // since we're fully async, the only reasonable way to bubble
+        // this issue up is to chuck an exception and let main deal with it.
+        // alternatively we could call cancel on the io_context, but there's
+        // not a great way to figure *what* happened.
+        throw std::runtime_error(std::format(
+            "eid {} encountered a fatal error: {}", eid, ec.message()));
+    }
+
+    startReceive();
+
+    queue.pop_front();
+
+    processQueue(eid);
+}
+
+std::optional<uint8_t> MctpRequester::getNextIid(uint8_t eid)
+{
+    auto it = requestContextQueues.find(eid);
+    if (it == requestContextQueues.end())
+    {
+        return std::nullopt;
+    }
+
+    uint8_t& iid = it->second.iid;
+    ++iid;
+    iid &= ocp::accelerator_management::instanceIdBitMask;
+    return iid;
+}
+
+static std::expected<void, std::error_code> injectIid(std::span<uint8_t> buffer,
+                                                      uint8_t iid)
+{
+    if (buffer.size() < sizeof(ocp::accelerator_management::BindingPciVid))
+    {
+        return std::unexpected(
+            std::make_error_code(std::errc::invalid_argument));
+    }
+
+    if (iid > ocp::accelerator_management::instanceIdBitMask)
+    {
+        return std::unexpected(
+            std::make_error_code(std::errc::invalid_argument));
+    }
+
+    auto* header = std::bit_cast<ocp::accelerator_management::BindingPciVid*>(
+        buffer.data());
+
+    header->instance_id &= ~ocp::accelerator_management::instanceIdBitMask;
+    header->instance_id |= iid;
+    return {};
+}
+
+void MctpRequester::processQueue(uint8_t eid)
+{
+    auto it = requestContextQueues.find(eid);
+    if (it == requestContextQueues.end())
+    {
+        lg2::error("We are attempting to process a queue that doesn't exist");
+        return;
+    }
+
+    auto& queue = it->second.queue;
+
+    if (queue.empty())
+    {
+        return;
+    }
+    auto& reqCtx = queue.front();
+
+    std::span<uint8_t> req{reqCtx.reqMsg.data(), reqCtx.reqMsg.size()};
+
+    std::optional<uint8_t> iid = getNextIid(eid);
+    if (!iid)
+    {
+        lg2::error("MctpRequester: Unable to get next iid");
+        handleResult(eid, std::make_error_code(std::errc::no_such_device), {});
+        return;
+    }
+
+    std::expected<void, std::error_code> success = injectIid(req, *iid);
+    if (!success)
+    {
+        lg2::error("MctpRequester: unable to set iid");
+        handleResult(eid, success.error(), {});
+        return;
+    }
 
     struct sockaddr_mctp addr{};
     addr.smctp_family = AF_MCTP;
@@ -167,54 +354,8 @@
     sendEndPoint = {&addr, sizeof(addr)};
 
     mctpSocket.async_send_to(
-        boost::asio::const_buffer(reqMsg), sendEndPoint,
-        std::bind_front(&Requester::handleSendMsgCompletion, this, eid, reqMsg,
-                        respMsg));
-}
-
-void QueuingRequester::sendRecvMsg(uint8_t eid, std::span<const uint8_t> reqMsg,
-                                   std::span<uint8_t> respMsg,
-                                   std::move_only_function<void(int)> callback)
-{
-    auto reqCtx =
-        std::make_unique<RequestContext>(reqMsg, respMsg, std::move(callback));
-
-    // Add request to queue
-    auto& queue = requestContextQueues[eid];
-    queue.push_back(std::move(reqCtx));
-
-    if (queue.size() == 1)
-    {
-        processQueue(eid);
-    }
-}
-
-void QueuingRequester::handleResult(uint8_t eid, int result)
-{
-    auto& queue = requestContextQueues[eid];
-    const auto& reqCtx = queue.front();
-
-    reqCtx->callback(result); // Call the original callback
-
-    queue.pop_front();
-
-    processQueue(eid);
-}
-
-void QueuingRequester::processQueue(uint8_t eid)
-{
-    auto& queue = requestContextQueues[eid];
-
-    if (queue.empty())
-    {
-        return;
-    }
-
-    const auto& reqCtx = queue.front();
-
-    requester.sendRecvMsg(
-        eid, reqCtx->reqMsg, reqCtx->respMsg,
-        std::bind_front(&QueuingRequester::handleResult, this, eid));
+        boost::asio::const_buffer(req.data(), req.size()), sendEndPoint,
+        std::bind_front(&MctpRequester::handleSendMsgCompletion, this, eid));
 }
 
 } // namespace mctp

diff --git a/src/nvidia-gpu/MctpRequester.hpp b/src/nvidia-gpu/MctpRequester.hpp
index 9d05ebd..0fd4683 100644
--- a/src/nvidia-gpu/MctpRequester.hpp
+++ b/src/nvidia-gpu/MctpRequester.hpp

@@ -6,88 +6,61 @@
 
 #pragma once
 
+#include <MctpAsioEndpoint.hpp>
 #include <OcpMctpVdm.hpp>
 #include <boost/asio/generic/datagram_protocol.hpp>
 #include <boost/asio/io_context.hpp>
 #include <boost/asio/steady_timer.hpp>
+#include <boost/circular_buffer.hpp>
 #include <boost/container/devector.hpp>
+#include <boost/container/flat_map.hpp>
+#include <boost/container/small_vector.hpp>
 
 #include <cstddef>
 #include <cstdint>
+#include <expected>
 #include <functional>
+#include <iostream>
 #include <memory>
+#include <queue>
 #include <span>
+#include <system_error>
 #include <unordered_map>
 #include <utility>
 
 namespace mctp
 {
-class Requester
+class MctpRequester
 {
   public:
-    Requester() = delete;
+    MctpRequester() = delete;
 
-    Requester(const Requester&) = delete;
+    MctpRequester(const MctpRequester&) = delete;
 
-    Requester(Requester&&) = delete;
+    MctpRequester(MctpRequester&&) = delete;
 
-    Requester& operator=(const Requester&) = delete;
+    MctpRequester& operator=(const MctpRequester&) = delete;
 
-    Requester& operator=(Requester&&) = delete;
+    MctpRequester& operator=(MctpRequester&&) = delete;
 
-    explicit Requester(boost::asio::io_context& ctx);
+    explicit MctpRequester(boost::asio::io_context& ctx);
 
     void sendRecvMsg(uint8_t eid, std::span<const uint8_t> reqMsg,
-                     std::span<uint8_t> respMsg,
-                     std::move_only_function<void(int)> callback);
+                     std::move_only_function<void(const std::error_code&,
+                                                  std::span<const uint8_t>)>
+                         callback);
 
   private:
-    void processRecvMsg(std::span<const uint8_t> reqMsg,
-                        std::span<uint8_t> respMsg,
-                        const boost::system::error_code& ec, size_t length);
-
-    void handleSendMsgCompletion(uint8_t eid, std::span<const uint8_t> reqMsg,
-                                 std::span<uint8_t> respMsg,
-                                 const boost::system::error_code& ec,
-                                 size_t length);
-
-    boost::asio::generic::datagram_protocol::socket mctpSocket;
+    using cb_t = std::move_only_function<void(const std::error_code&,
+                                              std::span<const uint8_t>)>;
 
     static constexpr size_t maxMessageSize = 65536 + 256;
-
-    boost::asio::generic::datagram_protocol::endpoint sendEndPoint;
-
-    boost::asio::generic::datagram_protocol::endpoint recvEndPoint;
-
-    boost::asio::steady_timer expiryTimer;
-
-    std::unordered_map<uint8_t, std::move_only_function<void(int)>>
-        completionCallbacks;
-
     static constexpr uint8_t msgType = ocp::accelerator_management::messageType;
-};
 
-class QueuingRequester
-{
-  public:
-    QueuingRequester() = delete;
-    QueuingRequester(const QueuingRequester&) = delete;
-    QueuingRequester(QueuingRequester&&) = delete;
-    QueuingRequester& operator=(const QueuingRequester&) = delete;
-    QueuingRequester& operator=(QueuingRequester&&) = delete;
-
-    explicit QueuingRequester(boost::asio::io_context& ctx) : requester(ctx) {}
-
-    void sendRecvMsg(uint8_t eid, std::span<const uint8_t> reqMsg,
-                     std::span<uint8_t> respMsg,
-                     std::move_only_function<void(int)> callback);
-
-  private:
     struct RequestContext
     {
-        std::span<const uint8_t> reqMsg;
-        std::span<uint8_t> respMsg;
-        std::move_only_function<void(int)> callback;
+        std::vector<uint8_t> reqMsg;
+        cb_t callback;
 
         RequestContext(const RequestContext&) = delete;
         RequestContext& operator=(const RequestContext&) = delete;
@@ -96,21 +69,42 @@
         RequestContext& operator=(RequestContext&&) = default;
         ~RequestContext() = default;
 
-        explicit RequestContext(std::span<const uint8_t> req,
-                                std::span<uint8_t> resp,
-                                std::move_only_function<void(int)>&& cb) :
-            reqMsg(req), respMsg(resp), callback(std::move(cb))
+        explicit RequestContext(std::span<const uint8_t> req, cb_t&& cb) :
+            reqMsg(req.begin(), req.end()), callback(std::move(cb))
         {}
     };
 
-    void handleResult(uint8_t eid, int result);
+    struct EidContext
+    {
+        boost::asio::steady_timer timer;
+        uint8_t iid{};
+        boost::container::devector<RequestContext> queue;
+        EidContext(boost::asio::io_context& io) : timer{io}, iid{0xFF} {}
+        EidContext(EidContext&&) noexcept = default;
+        EidContext& operator=(EidContext&&) noexcept = default;
+        EidContext& operator=(const EidContext&) = delete;
+        EidContext(const EidContext&) = delete;
+        ~EidContext() = default;
+    };
+
+    std::optional<uint8_t> getNextIid(uint8_t eid);
+    void startReceive();
+    void processRecvMsg(const boost::system::error_code& ec, size_t length);
+    void handleSendMsgCompletion(uint8_t eid,
+                                 const boost::system::error_code& ec,
+                                 size_t length);
+
+    void handleResult(uint8_t eid, const std::error_code& ec,
+                      std::span<const uint8_t> buffer);
     void processQueue(uint8_t eid);
 
-    Requester requester;
-    std::unordered_map<
-        uint8_t, boost::container::devector<std::unique_ptr<RequestContext>>>
-        requestContextQueues;
+    boost::asio::io_context& io;
+    boost::asio::generic::datagram_protocol::endpoint sendEndPoint;
+
+    boost::asio::generic::datagram_protocol::socket mctpSocket;
+    std::array<uint8_t, maxMessageSize> buffer{};
+    MctpAsioEndpoint recvEndPoint;
+    std::unordered_map<uint8_t, EidContext> requestContextQueues;
 };
 
-using MctpRequester = QueuingRequester;
 } // namespace mctp

diff --git a/src/nvidia-gpu/NvidiaDeviceDiscovery.cpp b/src/nvidia-gpu/NvidiaDeviceDiscovery.cpp
index dd682cf..47ec061 100644
--- a/src/nvidia-gpu/NvidiaDeviceDiscovery.cpp
+++ b/src/nvidia-gpu/NvidiaDeviceDiscovery.cpp

@@ -30,6 +30,7 @@
 #include <span>
 #include <stdexcept>
 #include <string>
+#include <system_error>
 #include <utility>
 #include <variant>
 #include <vector>
@@ -44,14 +45,15 @@
         smaDevices,
     const std::shared_ptr<sdbusplus::asio::connection>& conn,
     mctp::MctpRequester& mctpRequester, const SensorConfigs& configs,
-    const std::string& path, uint8_t eid, int sendRecvMsgResult,
-    std::span<uint8_t> queryDeviceIdentificationResponse)
+    const std::string& path, uint8_t eid,
+    const std::error_code& sendRecvMsgResult,
+    std::span<const uint8_t> queryDeviceIdentificationResponse)
 {
-    if (sendRecvMsgResult != 0)
+    if (sendRecvMsgResult)
     {
         lg2::error(
             "Error processing MCTP endpoint with eid {EID} : sending message over MCTP failed, rc={RC}",
-            "EID", eid, "RC", sendRecvMsgResult);
+            "EID", eid, "RC", sendRecvMsgResult.message());
         return;
     }
 
@@ -121,9 +123,6 @@
     auto queryDeviceIdentificationRequest = std::make_shared<
         std::array<uint8_t, sizeof(gpu::QueryDeviceIdentificationRequest)>>();
 
-    auto queryDeviceIdentificationResponse = std::make_shared<
-        std::array<uint8_t, sizeof(gpu::QueryDeviceIdentificationResponse)>>();
-
     auto rc = gpu::encodeQueryDeviceIdentificationRequest(
         0, *queryDeviceIdentificationRequest);
     if (rc != 0)
@@ -136,14 +135,12 @@
 
     mctpRequester.sendRecvMsg(
         eid, *queryDeviceIdentificationRequest,
-        *queryDeviceIdentificationResponse,
         [&io, &objectServer, &gpuDevices, &smaDevices, conn, &mctpRequester,
-         configs, path, eid, queryDeviceIdentificationRequest,
-         queryDeviceIdentificationResponse](int sendRecvMsgResult) {
-            processQueryDeviceIdResponse(
-                io, objectServer, gpuDevices, smaDevices, conn, mctpRequester,
-                configs, path, eid, sendRecvMsgResult,
-                *queryDeviceIdentificationResponse);
+         configs, path, eid, queryDeviceIdentificationRequest](
+            const std::error_code& ec, std::span<const uint8_t> response) {
+            processQueryDeviceIdResponse(io, objectServer, gpuDevices,
+                                         smaDevices, conn, mctpRequester,
+                                         configs, path, eid, ec, response);
         });
 }
 

diff --git a/src/nvidia-gpu/NvidiaGpuDevice.cpp b/src/nvidia-gpu/NvidiaGpuDevice.cpp
index 9560220..4bba49c 100644
--- a/src/nvidia-gpu/NvidiaGpuDevice.cpp
+++ b/src/nvidia-gpu/NvidiaGpuDevice.cpp

@@ -6,20 +6,18 @@
 
 #include "NvidiaGpuDevice.hpp"
 
-#include "Inventory.hpp"
-#include "NvidiaDeviceDiscovery.hpp"
-#include "NvidiaGpuSensor.hpp"
+#include "NvidiaGpuThresholds.hpp"
 #include "Thresholds.hpp"
 #include "Utils.hpp"
 
-#include <bits/basic_string.h>
-
+#include <Inventory.hpp>
 #include <MctpRequester.hpp>
+#include <NvidiaDeviceDiscovery.hpp>
 #include <NvidiaGpuEnergySensor.hpp>
 #include <NvidiaGpuMctpVdm.hpp>
 #include <NvidiaGpuPowerPeakReading.hpp>
 #include <NvidiaGpuPowerSensor.hpp>
-#include <NvidiaGpuThresholds.hpp>
+#include <NvidiaGpuSensor.hpp>
 #include <NvidiaGpuVoltageSensor.hpp>
 #include <boost/asio/io_context.hpp>
 #include <phosphor-logging/lg2.hpp>
@@ -137,4 +135,4 @@
         }
         read();
     });
-}
+};

diff --git a/src/nvidia-gpu/NvidiaGpuEnergySensor.cpp b/src/nvidia-gpu/NvidiaGpuEnergySensor.cpp
index 6d0fcd8..5d2b9af 100644
--- a/src/nvidia-gpu/NvidiaGpuEnergySensor.cpp
+++ b/src/nvidia-gpu/NvidiaGpuEnergySensor.cpp

@@ -25,7 +25,9 @@
 #include <cstdint>
 #include <limits>
 #include <memory>
+#include <span>
 #include <string>
+#include <system_error>
 #include <utility>
 #include <vector>
 
@@ -80,13 +82,14 @@
     thresholds::checkThresholds(this);
 }
 
-void NvidiaGpuEnergySensor::processResponse(int sendRecvMsgResult)
+void NvidiaGpuEnergySensor::processResponse(const std::error_code& ec,
+                                            std::span<const uint8_t> buffer)
 {
-    if (sendRecvMsgResult != 0)
+    if (ec)
     {
         lg2::error(
             "Error updating Energy Sensor for eid {EID} and sensor id {SID} : sending message over MCTP failed, rc={RC}",
-            "EID", eid, "SID", sensorId, "RC", sendRecvMsgResult);
+            "EID", eid, "SID", sensorId, "RC", ec.message());
         return;
     }
 
@@ -94,8 +97,8 @@
     uint16_t reasonCode = 0;
     uint64_t energyValue = 0;
 
-    auto rc = gpu::decodeGetCurrentEnergyCounterResponse(
-        response, cc, reasonCode, energyValue);
+    auto rc = gpu::decodeGetCurrentEnergyCounterResponse(buffer, cc, reasonCode,
+                                                         energyValue);
 
     if (rc != 0 || cc != ocp::accelerator_management::CompletionCode::SUCCESS)
     {
@@ -124,6 +127,8 @@
     }
 
     mctpRequester.sendRecvMsg(
-        eid, request, response,
-        [this](int sendRecvMsgResult) { processResponse(sendRecvMsgResult); });
+        eid, request,
+        [this](const std::error_code& ec, std::span<const uint8_t> buffer) {
+            processResponse(ec, buffer);
+        });
 }

diff --git a/src/nvidia-gpu/NvidiaGpuEnergySensor.hpp b/src/nvidia-gpu/NvidiaGpuEnergySensor.hpp
index 19bb982..1d9d1f5 100644
--- a/src/nvidia-gpu/NvidiaGpuEnergySensor.hpp
+++ b/src/nvidia-gpu/NvidiaGpuEnergySensor.hpp

@@ -39,7 +39,8 @@
     void update();
 
   private:
-    void processResponse(int sendRecvMsgResult);
+    void processResponse(const std::error_code& ec,
+                         std::span<const uint8_t> buffer);
 
     uint8_t eid{};
 
@@ -52,7 +53,4 @@
     sdbusplus::asio::object_server& objectServer;
 
     std::array<uint8_t, sizeof(gpu::GetCurrentEnergyCounterRequest)> request{};
-
-    std::array<uint8_t, sizeof(gpu::GetCurrentEnergyCounterResponse)>
-        response{};
 };

diff --git a/src/nvidia-gpu/NvidiaGpuPowerPeakReading.cpp b/src/nvidia-gpu/NvidiaGpuPowerPeakReading.cpp
index 06693e6..839dfe4 100644
--- a/src/nvidia-gpu/NvidiaGpuPowerPeakReading.cpp
+++ b/src/nvidia-gpu/NvidiaGpuPowerPeakReading.cpp

@@ -20,7 +20,9 @@
 #include <cstdint>
 #include <functional>
 #include <memory>
+#include <span>
 #include <string>
+#include <system_error>
 
 using namespace std::literals;
 
@@ -50,13 +52,14 @@
     objectServer.remove_interface(telemetryReportInterface);
 }
 
-void NvidiaGpuPowerPeakReading::processResponse(int sendRecvMsgResult)
+void NvidiaGpuPowerPeakReading::processResponse(const std::error_code& ec,
+                                                std::span<const uint8_t> buffer)
 {
-    if (sendRecvMsgResult != 0)
+    if (ec)
     {
         lg2::error(
             "Error updating Peak Power Sensor for eid {EID} and sensor id {SID} : sending message over MCTP failed, rc={RC}",
-            "EID", eid, "SID", sensorId, "RC", sendRecvMsgResult);
+            "EID", eid, "SID", sensorId, "RC", ec.message());
         return;
     }
 
@@ -65,7 +68,7 @@
     uint32_t peakPower = 0;
 
     const int rc =
-        gpu::decodeGetPowerDrawResponse(response, cc, reasonCode, peakPower);
+        gpu::decodeGetPowerDrawResponse(buffer, cc, reasonCode, peakPower);
 
     if (rc != 0 || cc != ocp::accelerator_management::CompletionCode::SUCCESS)
     {
@@ -97,6 +100,8 @@
     }
 
     mctpRequester.sendRecvMsg(
-        eid, request, response,
-        [this](int sendRecvMsgResult) { processResponse(sendRecvMsgResult); });
+        eid, request,
+        [this](const std::error_code& ec, std::span<const uint8_t> buffer) {
+            processResponse(ec, buffer);
+        });
 }

diff --git a/src/nvidia-gpu/NvidiaGpuPowerPeakReading.hpp b/src/nvidia-gpu/NvidiaGpuPowerPeakReading.hpp
index 02ba104..cc229d2 100644
--- a/src/nvidia-gpu/NvidiaGpuPowerPeakReading.hpp
+++ b/src/nvidia-gpu/NvidiaGpuPowerPeakReading.hpp

@@ -36,7 +36,8 @@
     void update();
 
   private:
-    void processResponse(int sendRecvMsgResult);
+    void processResponse(const std::error_code& ec,
+                         std::span<const uint8_t> buffer);
 
     uint8_t eid{};
 
@@ -55,7 +56,5 @@
 
     std::array<uint8_t, sizeof(gpu::GetPowerDrawRequest)> request{};
 
-    std::array<uint8_t, sizeof(gpu::GetPowerDrawResponse)> response{};
-
     std::shared_ptr<sdbusplus::asio::dbus_interface> telemetryReportInterface;
 };

diff --git a/src/nvidia-gpu/NvidiaGpuPowerSensor.cpp b/src/nvidia-gpu/NvidiaGpuPowerSensor.cpp
index bac935d..b21ce0e 100644
--- a/src/nvidia-gpu/NvidiaGpuPowerSensor.cpp
+++ b/src/nvidia-gpu/NvidiaGpuPowerSensor.cpp

@@ -26,7 +26,9 @@
 #include <functional>
 #include <limits>
 #include <memory>
+#include <span>
 #include <string>
+#include <system_error>
 #include <utility>
 #include <vector>
 
@@ -82,13 +84,14 @@
     thresholds::checkThresholds(this);
 }
 
-void NvidiaGpuPowerSensor::processResponse(int sendRecvMsgResult)
+void NvidiaGpuPowerSensor::processResponse(const std::error_code& ec,
+                                           std::span<const uint8_t> buffer)
 {
-    if (sendRecvMsgResult != 0)
+    if (ec)
     {
         lg2::error(
             "Error updating Power Sensor for eid {EID} and sensor id {SID} : sending message over MCTP failed, rc={RC}",
-            "EID", eid, "SID", sensorId, "RC", sendRecvMsgResult);
+            "EID", eid, "SID", sensorId, "RC", ec.message());
         return;
     }
 
@@ -97,7 +100,7 @@
     uint32_t power = 0;
 
     const int rc =
-        gpu::decodeGetPowerDrawResponse(response, cc, reasonCode, power);
+        gpu::decodeGetPowerDrawResponse(buffer, cc, reasonCode, power);
 
     if (rc != 0 || cc != ocp::accelerator_management::CompletionCode::SUCCESS)
     {
@@ -127,6 +130,8 @@
     }
 
     mctpRequester.sendRecvMsg(
-        eid, request, response,
-        [this](int sendRecvMsgResult) { processResponse(sendRecvMsgResult); });
+        eid, request,
+        [this](const std::error_code& ec, std::span<const uint8_t> buffer) {
+            processResponse(ec, buffer);
+        });
 }

diff --git a/src/nvidia-gpu/NvidiaGpuPowerSensor.hpp b/src/nvidia-gpu/NvidiaGpuPowerSensor.hpp
index c217200..8fa967a 100644
--- a/src/nvidia-gpu/NvidiaGpuPowerSensor.hpp
+++ b/src/nvidia-gpu/NvidiaGpuPowerSensor.hpp

@@ -17,7 +17,9 @@
 #include <array>
 #include <cstdint>
 #include <memory>
+#include <span>
 #include <string>
+#include <system_error>
 #include <vector>
 
 constexpr uint8_t gpuPowerSensorId{0};
@@ -39,7 +41,8 @@
     void update();
 
   private:
-    void processResponse(int sendRecvMsgResult);
+    void processResponse(const std::error_code& ec,
+                         std::span<const uint8_t> buffer);
 
     uint8_t eid{};
 
@@ -54,6 +57,4 @@
     sdbusplus::asio::object_server& objectServer;
 
     std::array<uint8_t, sizeof(gpu::GetPowerDrawRequest)> request{};
-
-    std::array<uint8_t, sizeof(gpu::GetPowerDrawResponse)> response{};
 };

diff --git a/src/nvidia-gpu/NvidiaGpuSensor.cpp b/src/nvidia-gpu/NvidiaGpuSensor.cpp
index 3853048..26fb1c4 100644
--- a/src/nvidia-gpu/NvidiaGpuSensor.cpp
+++ b/src/nvidia-gpu/NvidiaGpuSensor.cpp

@@ -25,7 +25,9 @@
 #include <cstdint>
 #include <functional>
 #include <memory>
+#include <span>
 #include <string>
+#include <system_error>
 #include <utility>
 #include <vector>
 
@@ -79,13 +81,14 @@
     thresholds::checkThresholds(this);
 }
 
-void NvidiaGpuTempSensor::processResponse(int sendRecvMsgResult)
+void NvidiaGpuTempSensor::processResponse(const std::error_code& ec,
+                                          std::span<const uint8_t> buffer)
 {
-    if (sendRecvMsgResult != 0)
+    if (ec)
     {
         lg2::error(
             "Error updating Temperature Sensor for eid {EID} and sensor id {SID} : sending message over MCTP failed, rc={RC}",
-            "EID", eid, "SID", sensorId, "RC", sendRecvMsgResult);
+            "EID", eid, "SID", sensorId, "RC", ec.message());
         return;
     }
 
@@ -93,8 +96,8 @@
     uint16_t reasonCode = 0;
     double tempValue = 0;
 
-    auto rc = gpu::decodeGetTemperatureReadingResponse(
-        getTemperatureReadingResponse, cc, reasonCode, tempValue);
+    auto rc = gpu::decodeGetTemperatureReadingResponse(buffer, cc, reasonCode,
+                                                       tempValue);
 
     if (rc != 0 || cc != ocp::accelerator_management::CompletionCode::SUCCESS)
     {
@@ -122,6 +125,8 @@
     }
 
     mctpRequester.sendRecvMsg(
-        eid, getTemperatureReadingRequest, getTemperatureReadingResponse,
-        [this](int sendRecvMsgResult) { processResponse(sendRecvMsgResult); });
+        eid, getTemperatureReadingRequest,
+        [this](const std::error_code& ec, std::span<const uint8_t> buffer) {
+            processResponse(ec, buffer);
+        });
 }

diff --git a/src/nvidia-gpu/NvidiaGpuSensor.hpp b/src/nvidia-gpu/NvidiaGpuSensor.hpp
index 0e9b7af..2b0c49c 100644
--- a/src/nvidia-gpu/NvidiaGpuSensor.hpp
+++ b/src/nvidia-gpu/NvidiaGpuSensor.hpp

@@ -44,7 +44,8 @@
     void update();
 
   private:
-    void processResponse(int sendRecvMsgResult);
+    void processResponse(const std::error_code& ec,
+                         std::span<const uint8_t> buffer);
 
     uint8_t eid{};
 
@@ -58,7 +59,4 @@
 
     std::array<uint8_t, sizeof(gpu::GetTemperatureReadingRequest)>
         getTemperatureReadingRequest{};
-
-    std::array<uint8_t, sizeof(gpu::GetTemperatureReadingResponse)>
-        getTemperatureReadingResponse{};
 };

diff --git a/src/nvidia-gpu/NvidiaGpuSensorMain.cpp b/src/nvidia-gpu/NvidiaGpuSensorMain.cpp
index 7e0ab9d..9a1caf9 100644
--- a/src/nvidia-gpu/NvidiaGpuSensorMain.cpp
+++ b/src/nvidia-gpu/NvidiaGpuSensorMain.cpp

@@ -14,6 +14,7 @@
 #include <boost/asio/post.hpp>
 #include <boost/asio/steady_timer.hpp>
 #include <boost/container/flat_map.hpp>
+#include <phosphor-logging/lg2.hpp>
 #include <sdbusplus/asio/connection.hpp>
 #include <sdbusplus/asio/object_server.hpp>
 #include <sdbusplus/bus.hpp>
@@ -22,6 +23,8 @@
 
 #include <array>
 #include <chrono>
+#include <cstdlib>
+#include <exception>
 #include <functional>
 #include <memory>
 #include <string>
@@ -85,6 +88,16 @@
             interfaceRemoved(msg, gpuDevices, smaDevices);
         });
 
-    io.run();
+    try
+    {
+        io.run();
+    }
+    catch (const std::exception& e)
+    {
+        lg2::error("fatal error caught during processing: {MSG}", "MSG",
+                   e.what());
+        return EXIT_FAILURE;
+    }
+
     return 0;
 }

diff --git a/src/nvidia-gpu/NvidiaGpuThresholds.cpp b/src/nvidia-gpu/NvidiaGpuThresholds.cpp
index 16141f1..79a14b2 100644
--- a/src/nvidia-gpu/NvidiaGpuThresholds.cpp
+++ b/src/nvidia-gpu/NvidiaGpuThresholds.cpp

@@ -18,17 +18,18 @@
 #include <functional>
 #include <memory>
 #include <span>
+#include <system_error>
 #include <vector>
 
 void processReadThermalParameterResponse(
     const std::function<void(uint8_t, int32_t)>& callback,
-    const std::span<const uint8_t> respMsg, int sendRecvMsgResult)
+    const std::error_code& ec, std::span<const uint8_t> respMsg)
 {
-    if (sendRecvMsgResult != 0)
+    if (ec)
     {
         lg2::error(
             "Error reading thermal parameter: sending message over MCTP failed, rc={RC}",
-            "RC", sendRecvMsgResult);
+            "RC", ec.message());
         callback(EPROTO, 0);
         return;
     }
@@ -59,9 +60,6 @@
     auto reqMsg = std::make_shared<
         std::array<uint8_t, sizeof(gpu::ReadThermalParametersRequest)>>();
 
-    auto respMsg = std::make_shared<
-        std::array<uint8_t, sizeof(gpu::ReadThermalParametersResponse)>>();
-
     auto rc = gpu::encodeReadThermalParametersRequest(0, id, *reqMsg);
     if (rc != 0)
     {
@@ -73,10 +71,10 @@
     }
 
     mctpRequester.sendRecvMsg(
-        eid, *reqMsg, *respMsg,
-        [reqMsg, respMsg, callback](int sendRecvMsgResult) {
-            processReadThermalParameterResponse(callback, *respMsg,
-                                                sendRecvMsgResult);
+        eid, *reqMsg,
+        [reqMsg,
+         callback](const std::error_code& ec, std::span<const uint8_t> buff) {
+            processReadThermalParameterResponse(callback, ec, buff);
         });
 }
 

diff --git a/src/nvidia-gpu/NvidiaGpuThresholds.hpp b/src/nvidia-gpu/NvidiaGpuThresholds.hpp
index 9d1970f..4252c97 100644
--- a/src/nvidia-gpu/NvidiaGpuThresholds.hpp
+++ b/src/nvidia-gpu/NvidiaGpuThresholds.hpp

@@ -14,9 +14,9 @@
 
 using gpuThresholdId = uint8_t;
 
-constexpr gpuThresholdId gpuTLimitCriticalThresholdId{1};
-constexpr gpuThresholdId gpuTLimitWarnringThresholdId{2};
-constexpr gpuThresholdId gpuTLimitHardshutDownThresholdId{4};
+static constexpr gpuThresholdId gpuTLimitCriticalThresholdId{1};
+static constexpr gpuThresholdId gpuTLimitWarnringThresholdId{2};
+static constexpr gpuThresholdId gpuTLimitHardshutDownThresholdId{4};
 
 void readThermalParameters(
     uint8_t eid, const std::vector<gpuThresholdId>& ids,

diff --git a/src/nvidia-gpu/NvidiaGpuVoltageSensor.cpp b/src/nvidia-gpu/NvidiaGpuVoltageSensor.cpp
index 6fe71e6..d32f6b2 100644
--- a/src/nvidia-gpu/NvidiaGpuVoltageSensor.cpp
+++ b/src/nvidia-gpu/NvidiaGpuVoltageSensor.cpp

@@ -26,7 +26,9 @@
 #include <functional>
 #include <limits>
 #include <memory>
+#include <span>
 #include <string>
+#include <system_error>
 #include <utility>
 #include <vector>
 
@@ -80,13 +82,14 @@
     thresholds::checkThresholds(this);
 }
 
-void NvidiaGpuVoltageSensor::processResponse(int sendRecvMsgResult)
+void NvidiaGpuVoltageSensor::processResponse(const std::error_code& ec,
+                                             std::span<const uint8_t> buffer)
 {
-    if (sendRecvMsgResult != 0)
+    if (ec)
     {
         lg2::error(
             "Error updating Voltage Sensor: sending message over MCTP failed, rc={RC}",
-            "RC", sendRecvMsgResult);
+            "RC", ec.message());
         return;
     }
 
@@ -95,7 +98,7 @@
     uint32_t voltageValue = 0;
 
     auto rc =
-        gpu::decodeGetVoltageResponse(response, cc, reasonCode, voltageValue);
+        gpu::decodeGetVoltageResponse(buffer, cc, reasonCode, voltageValue);
 
     if (rc != 0 || cc != ocp::accelerator_management::CompletionCode::SUCCESS)
     {
@@ -122,6 +125,8 @@
     }
 
     mctpRequester.sendRecvMsg(
-        eid, request, response,
-        [this](int sendRecvMsgResult) { processResponse(sendRecvMsgResult); });
+        eid, request,
+        [this](const std::error_code& ec, std::span<const uint8_t> buffer) {
+            processResponse(ec, buffer);
+        });
 }

diff --git a/src/nvidia-gpu/NvidiaGpuVoltageSensor.hpp b/src/nvidia-gpu/NvidiaGpuVoltageSensor.hpp
index a8c1038..e5e187a 100644
--- a/src/nvidia-gpu/NvidiaGpuVoltageSensor.hpp
+++ b/src/nvidia-gpu/NvidiaGpuVoltageSensor.hpp

@@ -17,7 +17,9 @@
 #include <array>
 #include <cstdint>
 #include <memory>
+#include <span>
 #include <string>
+#include <system_error>
 #include <vector>
 
 constexpr uint8_t gpuVoltageSensorId{0};
@@ -39,7 +41,8 @@
     void update();
 
   private:
-    void processResponse(int sendRecvMsgResult);
+    void processResponse(const std::error_code& ec,
+                         std::span<const uint8_t> buffer);
 
     uint8_t eid{};
 
@@ -52,6 +55,4 @@
     sdbusplus::asio::object_server& objectServer;
 
     std::array<uint8_t, sizeof(gpu::GetVoltageRequest)> request{};
-
-    std::array<uint8_t, sizeof(gpu::GetVoltageResponse)> response{};
 };
commit	d0125c9cdf0f0ae7f1943f773c99fc512db0a68e	[log] [tgz]
author	Marc Olberding <molberding@nvidia.com>	Wed Oct 08 14:37:19 2025 -0700
committer	Marc Olberding <molberding@nvidia.com>	Thu Oct 16 15:02:15 2025 -0700
tree	f6dbb3eba497d5d56de0565fe598ced25a1d0077
parent	6d481b5d0636dc3adaa00c2292ac92fc8840fd94 [diff]