nvidia-gpu: NvidiaGpuDevice fix use after free
Fixes use after free for NvidiaGpuThresholds.
Moves the storage used for communication to be part
of the NvidiaGpuDevice class instead of ephemerally
passed around through free functions
Also makes NvidiaGpuDevice inherit from
std::enable_shared_from_this
Testing: Issue found previous was coredumps
on nvl32-obmc. Asan discovered it was a use
after free in the shared pointer in ThermalLimits
Afterwards, no core dumps or issues reported by asan.
Ran on an nvl32-obmc model with 8 GPU's
Change-Id: I61b606f3a129499089718e7ec804926db5f22c64
Signed-off-by: Marc Olberding <molberding@nvidia.com>
diff --git a/src/nvidia-gpu/NvidiaGpuDevice.cpp b/src/nvidia-gpu/NvidiaGpuDevice.cpp
index 78984c5..7ec47ff 100644
--- a/src/nvidia-gpu/NvidiaGpuDevice.cpp
+++ b/src/nvidia-gpu/NvidiaGpuDevice.cpp
@@ -6,7 +6,6 @@
 
 #include "NvidiaGpuDevice.hpp"
 
-#include "NvidiaGpuThresholds.hpp"
 #include "Thresholds.hpp"
 #include "Utils.hpp"
 
@@ -19,19 +18,32 @@
 #include <NvidiaGpuPowerSensor.hpp>
 #include <NvidiaGpuSensor.hpp>
 #include <NvidiaGpuVoltageSensor.hpp>
+#include <OcpMctpVdm.hpp>
 #include <boost/asio/io_context.hpp>
 #include <phosphor-logging/lg2.hpp>
 #include <sdbusplus/asio/connection.hpp>
 #include <sdbusplus/asio/object_server.hpp>
 
+#include <array>
 #include <chrono>
 #include <cstdint>
 #include <functional>
 #include <memory>
+#include <span>
 #include <string>
+#include <system_error>
 #include <utility>
 #include <vector>
 
+static constexpr uint8_t gpuTLimitCriticalThresholdId{1};
+static constexpr uint8_t gpuTLimitWarningThresholdId{2};
+static constexpr uint8_t gpuTLimitHardshutDownThresholdId{4};
+
+// nota bene: the order has to match the order in processTLimitThresholds
+static constexpr std::array<uint8_t, 3> thresholdIds{
+    gpuTLimitWarningThresholdId, gpuTLimitCriticalThresholdId,
+    gpuTLimitHardshutDownThresholdId};
+
 GpuDevice::GpuDevice(const SensorConfigs& configs, const std::string& name,
                      const std::string& path,
                      const std::shared_ptr<sdbusplus::asio::connection>& conn,
@@ -60,14 +72,6 @@
         conn, mctpRequester, name + "_TEMP_0", path, eid, gpuTempSensorId,
         objectServer, std::vector<thresholds::Threshold>{});
 
-    readThermalParameters(
-        eid,
-        std::vector<gpuThresholdId>{gpuTLimitWarnringThresholdId,
-                                    gpuTLimitCriticalThresholdId,
-                                    gpuTLimitHardshutDownThresholdId},
-        mctpRequester,
-        std::bind_front(&GpuDevice::processTLimitThresholds, this));
-
     dramTempSensor = std::make_shared<NvidiaGpuTempSensor>(
         conn, mctpRequester, name + "_DRAM_0_TEMP_0", path, eid,
         gpuDramTempSensorId, objectServer,
@@ -90,17 +94,93 @@
         conn, mctpRequester, name + "_Voltage_0", path, eid, gpuVoltageSensorId,
         objectServer, std::vector<thresholds::Threshold>{});
 
+    getTLimitThresholds();
+
     lg2::info("Added GPU {NAME} Sensors with chassis path: {PATH}.", "NAME",
               name, "PATH", path);
-
     read();
 }
 
-void GpuDevice::processTLimitThresholds(uint8_t rc,
-                                        const std::vector<int32_t>& thresholds)
+void GpuDevice::getTLimitThresholds()
+{
+    thresholds = {};
+    current_threshold_index = 0;
+    getNextThermalParameter();
+}
+
+void GpuDevice::readThermalParameterCallback(const std::error_code& ec,
+                                             std::span<const uint8_t> buffer)
+{
+    if (ec)
+    {
+        lg2::error(
+            "Error reading thermal parameter: sending message over MCTP failed, rc={RC}",
+            "RC", ec.message());
+        processTLimitThresholds(ec);
+        return;
+    }
+
+    ocp::accelerator_management::CompletionCode cc{};
+    uint16_t reasonCode = 0;
+    int32_t threshold = 0;
+
+    int rc = gpu::decodeReadThermalParametersResponse(buffer, cc, reasonCode,
+                                                      threshold);
+
+    if (rc != 0 || cc != ocp::accelerator_management::CompletionCode::SUCCESS)
+    {
+        lg2::error(
+            "Error reading thermal parameter: decode failed, rc={RC}, cc={CC}, reasonCode={RESC}",
+            "RC", rc, "CC", cc, "RESC", reasonCode);
+        processTLimitThresholds(ec);
+        return;
+    }
+
+    thresholds[current_threshold_index] = threshold;
+
+    current_threshold_index++;
+
+    if (current_threshold_index < thresholdIds.size())
+    {
+        getNextThermalParameter();
+        return;
+    }
+    processTLimitThresholds(std::error_code{});
+}
+
+void GpuDevice::getNextThermalParameter()
+{
+    uint8_t id = thresholdIds[current_threshold_index];
+    auto rc =
+        gpu::encodeReadThermalParametersRequest(0, id, thermalParamReqMsg);
+    if (rc != 0)
+    {
+        lg2::error(
+            "Error reading thermal parameter for eid {EID} and parameter id {PID} : encode failed. rc={RC}",
+            "EID", eid, "PID", id, "RC", rc);
+        processTLimitThresholds(
+            std::make_error_code(static_cast<std::errc>(rc)));
+        return;
+    }
+
+    mctpRequester.sendRecvMsg(
+        eid, thermalParamReqMsg,
+        [weak{weak_from_this()}](const std::error_code& ec,
+                                 std::span<const uint8_t> buffer) {
+            std::shared_ptr<GpuDevice> self = weak.lock();
+            if (!self)
+            {
+                lg2::error("Failed to get lock on GpuDevice");
+                return;
+            }
+            self->readThermalParameterCallback(ec, buffer);
+        });
+}
+
+void GpuDevice::processTLimitThresholds(const std::error_code& ec)
 {
     std::vector<thresholds::Threshold> tLimitThresholds{};
-    if (rc == 0)
+    if (!ec)
     {
         tLimitThresholds = {
             thresholds::Threshold{thresholds::Level::WARNING,
diff --git a/src/nvidia-gpu/NvidiaGpuDevice.hpp b/src/nvidia-gpu/NvidiaGpuDevice.hpp
index ac83994..df0c2e2 100644
--- a/src/nvidia-gpu/NvidiaGpuDevice.hpp
+++ b/src/nvidia-gpu/NvidiaGpuDevice.hpp
@@ -26,7 +26,7 @@
 #include <string>
 #include <vector>
 
-class GpuDevice
+class GpuDevice : public std::enable_shared_from_this<GpuDevice>
 {
   public:
     GpuDevice(const SensorConfigs& configs, const std::string& name,
@@ -48,11 +48,16 @@
 
     void read();
 
-    void processTLimitThresholds(uint8_t rc,
-                                 const std::vector<int32_t>& thresholds);
+    void processTLimitThresholds(const std::error_code& ec);
+
+    void getTLimitThresholds();
 
     uint8_t eid{};
 
+    void getNextThermalParameter();
+    void readThermalParameterCallback(const std::error_code& ec,
+                                      std::span<const uint8_t> buffer);
+
     std::chrono::milliseconds sensorPollMs;
 
     boost::asio::steady_timer waitTimer;
@@ -71,6 +76,11 @@
     std::shared_ptr<NvidiaGpuEnergySensor> energySensor;
     std::shared_ptr<NvidiaGpuVoltageSensor> voltageSensor;
 
+    std::array<uint8_t, sizeof(gpu::ReadThermalParametersRequest)>
+        thermalParamReqMsg{};
+    std::array<uint8_t, 3> thresholds{};
+    size_t current_threshold_index{};
+
     SensorConfigs configs;
 
     std::string name;
diff --git a/src/nvidia-gpu/NvidiaGpuThresholds.cpp b/src/nvidia-gpu/NvidiaGpuThresholds.cpp
deleted file mode 100644
index 79a14b2..0000000
--- a/src/nvidia-gpu/NvidiaGpuThresholds.cpp
+++ /dev/null
@@ -1,126 +0,0 @@
-/*
- * SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION &
- * AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: Apache-2.0
- */
-
-#include "NvidiaGpuThresholds.hpp"
-
-#include <MctpRequester.hpp>
-#include <NvidiaGpuMctpVdm.hpp>
-#include <OcpMctpVdm.hpp>
-#include <phosphor-logging/lg2.hpp>
-
-#include <array>
-#include <cerrno>
-#include <cstddef>
-#include <cstdint>
-#include <functional>
-#include <memory>
-#include <span>
-#include <system_error>
-#include <vector>
-
-void processReadThermalParameterResponse(
-    const std::function<void(uint8_t, int32_t)>& callback,
-    const std::error_code& ec, std::span<const uint8_t> respMsg)
-{
-    if (ec)
-    {
-        lg2::error(
-            "Error reading thermal parameter: sending message over MCTP failed, rc={RC}",
-            "RC", ec.message());
-        callback(EPROTO, 0);
-        return;
-    }
-
-    ocp::accelerator_management::CompletionCode cc{};
-    uint16_t reasonCode = 0;
-    int32_t threshold = 0;
-
-    auto rc = gpu::decodeReadThermalParametersResponse(respMsg, cc, reasonCode,
-                                                       threshold);
-
-    if (rc != 0 || cc != ocp::accelerator_management::CompletionCode::SUCCESS)
-    {
-        lg2::error(
-            "Error reading thermal parameter: decode failed, rc={RC}, cc={CC}, reasonCode={RESC}",
-            "RC", rc, "CC", cc, "RESC", reasonCode);
-        callback(EPROTO, 0);
-        return;
-    }
-
-    callback(0, threshold);
-};
-
-void readThermalParameter(uint8_t eid, uint8_t id,
-                          mctp::MctpRequester& mctpRequester,
-                          const std::function<void(uint8_t, int32_t)>& callback)
-{
-    auto reqMsg = std::make_shared<
-        std::array<uint8_t, sizeof(gpu::ReadThermalParametersRequest)>>();
-
-    auto rc = gpu::encodeReadThermalParametersRequest(0, id, *reqMsg);
-    if (rc != 0)
-    {
-        lg2::error(
-            "Error reading thermal parameter for eid {EID} and parameter id {PID} : encode failed. rc={RC}",
-            "EID", eid, "PID", id, "RC", rc);
-        callback(rc, 0);
-        return;
-    }
-
-    mctpRequester.sendRecvMsg(
-        eid, *reqMsg,
-        [reqMsg,
-         callback](const std::error_code& ec, std::span<const uint8_t> buff) {
-            processReadThermalParameterResponse(callback, ec, buff);
-        });
-}
-
-void readThermalParameterCallback(
-    uint8_t eid, const std::shared_ptr<std::vector<uint8_t>>& ids,
-    mctp::MctpRequester& mctpRequester,
-    const std::function<void(uint8_t, std::vector<int32_t>)>& callback,
-    size_t index, const std::shared_ptr<std::vector<int32_t>>& thresholds,
-    uint8_t rc, int32_t threshold)
-{
-    if (rc != 0)
-    {
-        lg2::error(
-            "Error reading thermal parameter for eid {EID} and parameter id {PID}. rc={RC}",
-            "EID", eid, "PID", (*ids)[index], "RC", rc);
-        callback(rc, *thresholds);
-        return;
-    }
-
-    thresholds->push_back(threshold);
-
-    ++index;
-    if (index == ids->size())
-    {
-        callback(rc, *thresholds);
-    }
-    else
-    {
-        readThermalParameter(eid, (*ids)[index], mctpRequester,
-                             std::bind_front(readThermalParameterCallback, eid,
-                                             ids, std::ref(mctpRequester),
-                                             callback, index, thresholds));
-    }
-}
-
-void readThermalParameters(
-    uint8_t eid, const std::vector<uint8_t>& ids,
-    mctp::MctpRequester& mctpRequester,
-    const std::function<void(uint8_t, std::vector<int32_t>)>& callback)
-{
-    auto thresholds = std::make_shared<std::vector<int32_t>>();
-    size_t index = 0;
-
-    readThermalParameter(
-        eid, ids[index], mctpRequester,
-        std::bind_front(readThermalParameterCallback, eid,
-                        std::make_shared<std::vector<uint8_t>>(ids),
-                        std::ref(mctpRequester), callback, index, thresholds));
-}
diff --git a/src/nvidia-gpu/NvidiaGpuThresholds.hpp b/src/nvidia-gpu/NvidiaGpuThresholds.hpp
deleted file mode 100644
index 4252c97..0000000
--- a/src/nvidia-gpu/NvidiaGpuThresholds.hpp
+++ /dev/null
@@ -1,24 +0,0 @@
-/*
- * SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION &
- * AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: Apache-2.0
- */
-
-#pragma once
-
-#include "MctpRequester.hpp"
-
-#include <cstdint>
-#include <functional>
-#include <vector>
-
-using gpuThresholdId = uint8_t;
-
-static constexpr gpuThresholdId gpuTLimitCriticalThresholdId{1};
-static constexpr gpuThresholdId gpuTLimitWarnringThresholdId{2};
-static constexpr gpuThresholdId gpuTLimitHardshutDownThresholdId{4};
-
-void readThermalParameters(
-    uint8_t eid, const std::vector<gpuThresholdId>& ids,
-    mctp::MctpRequester& mctpRequester,
-    const std::function<void(uint8_t, std::vector<int32_t>)>& callback);
diff --git a/src/nvidia-gpu/meson.build b/src/nvidia-gpu/meson.build
index 9d592f3..b1763c0 100644
--- a/src/nvidia-gpu/meson.build
+++ b/src/nvidia-gpu/meson.build
@@ -8,7 +8,6 @@
     'NvidiaGpuPowerPeakReading.cpp',
     'NvidiaGpuPowerSensor.cpp',
     'NvidiaGpuSensor.cpp',
-    'NvidiaGpuThresholds.cpp',
     'NvidiaGpuVoltageSensor.cpp',
     'NvidiaSmaDevice.cpp',
     'OcpMctpVdm.cpp',