nvidia-gpu: fix use-after-free in NvidiaGpuDevice
Fixes a use-after-free involving NvidiaGpuThresholds.
Moves the storage used for communication into the NvidiaGpuDevice
class instead of ephemerally passing it around through free
functions, so the request buffer lives as long as the device and
cannot dangle while an MCTP exchange is in flight. A sketch of the
idea follows.
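
A minimal, self-contained sketch of the lifetime pattern (the
Requester and Device names here are hypothetical stand-ins, not the
classes touched by this patch):

    #include <array>
    #include <cstdint>
    #include <span>

    // Stand-in for an async transport: it keeps a view of the request
    // until the exchange completes, so the caller's buffer must
    // outlive the call.
    struct Requester
    {
        std::span<const uint8_t> pending;
        void sendRecvMsg(std::span<const uint8_t> req) { pending = req; }
    };

    class Device
    {
      public:
        void read(Requester& requester)
        {
            // reqMsg is a member: the span handed to the requester
            // stays valid for the lifetime of the Device, instead of
            // dangling once a helper function's stack frame unwinds.
            requester.sendRecvMsg(reqMsg);
        }

      private:
        std::array<uint8_t, 16> reqMsg{};
    };

    int main()
    {
        Requester requester;
        Device device;
        device.read(requester); // requester.pending stays valid
    }
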
Also makes NvidiaGpuDevice inherit from
std::enable_shared_from_this, so asynchronous callbacks can capture a
weak reference to the device and skip the completion work if the
device has already been destroyed; see the sketch below.
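
The guard pattern, as a minimal runnable sketch (Device and handle
are hypothetical names; the patch applies the same shape via
weak_from_this in getNextThermalParameter):

    #include <functional>
    #include <iostream>
    #include <memory>

    class Device : public std::enable_shared_from_this<Device>
    {
      public:
        std::function<void()> makeCallback()
        {
            // Capture a weak_ptr, not `this`: if the Device dies
            // before the callback runs, lock() fails instead of
            // dereferencing freed memory.
            return [weak{weak_from_this()}]() {
                if (std::shared_ptr<Device> self = weak.lock())
                {
                    self->handle();
                }
            };
        }

      private:
        void handle()
        {
            std::cout << "device still alive\n";
        }
    };

    int main()
    {
        // The temporary shared_ptr dies at the end of this statement,
        // so the callback sees an expired weak_ptr and does nothing.
        auto cb = std::make_shared<Device>()->makeCallback();
        cb();
    }
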
Testing: The issue previously manifested as core dumps on
nvl32-obmc. ASan traced them to a use-after-free on the shared
pointer in ThermalLimits. After this change, a run on an nvl32-obmc
model with 8 GPUs showed no core dumps and no issues reported by
ASan.
Change-Id: I61b606f3a129499089718e7ec804926db5f22c64
Signed-off-by: Marc Olberding <molberding@nvidia.com>
diff --git a/src/nvidia-gpu/NvidiaGpuDevice.cpp b/src/nvidia-gpu/NvidiaGpuDevice.cpp
index 78984c5..7ec47ff 100644
--- a/src/nvidia-gpu/NvidiaGpuDevice.cpp
+++ b/src/nvidia-gpu/NvidiaGpuDevice.cpp
@@ -6,7 +6,6 @@
#include "NvidiaGpuDevice.hpp"
-#include "NvidiaGpuThresholds.hpp"
#include "Thresholds.hpp"
#include "Utils.hpp"
@@ -19,19 +18,32 @@
#include <NvidiaGpuPowerSensor.hpp>
#include <NvidiaGpuSensor.hpp>
#include <NvidiaGpuVoltageSensor.hpp>
+#include <OcpMctpVdm.hpp>
#include <boost/asio/io_context.hpp>
#include <phosphor-logging/lg2.hpp>
#include <sdbusplus/asio/connection.hpp>
#include <sdbusplus/asio/object_server.hpp>
+#include <array>
#include <chrono>
#include <cstdint>
#include <functional>
#include <memory>
+#include <span>
#include <string>
+#include <system_error>
#include <utility>
#include <vector>
+static constexpr uint8_t gpuTLimitCriticalThresholdId{1};
+static constexpr uint8_t gpuTLimitWarningThresholdId{2};
+static constexpr uint8_t gpuTLimitHardshutDownThresholdId{4};
+
+// Note: the order must match the order processTLimitThresholds expects
+static constexpr std::array<uint8_t, 3> thresholdIds{
+ gpuTLimitWarningThresholdId, gpuTLimitCriticalThresholdId,
+ gpuTLimitHardshutDownThresholdId};
+
GpuDevice::GpuDevice(const SensorConfigs& configs, const std::string& name,
const std::string& path,
const std::shared_ptr<sdbusplus::asio::connection>& conn,
@@ -60,14 +72,6 @@
conn, mctpRequester, name + "_TEMP_0", path, eid, gpuTempSensorId,
objectServer, std::vector<thresholds::Threshold>{});
- readThermalParameters(
- eid,
- std::vector<gpuThresholdId>{gpuTLimitWarnringThresholdId,
- gpuTLimitCriticalThresholdId,
- gpuTLimitHardshutDownThresholdId},
- mctpRequester,
- std::bind_front(&GpuDevice::processTLimitThresholds, this));
-
dramTempSensor = std::make_shared<NvidiaGpuTempSensor>(
conn, mctpRequester, name + "_DRAM_0_TEMP_0", path, eid,
gpuDramTempSensorId, objectServer,
@@ -90,17 +94,93 @@
conn, mctpRequester, name + "_Voltage_0", path, eid, gpuVoltageSensorId,
objectServer, std::vector<thresholds::Threshold>{});
+ getTLimitThresholds();
+
lg2::info("Added GPU {NAME} Sensors with chassis path: {PATH}.", "NAME",
name, "PATH", path);
-
read();
}
-void GpuDevice::processTLimitThresholds(uint8_t rc,
- const std::vector<int32_t>& thresholds)
+void GpuDevice::getTLimitThresholds()
+{
+ thresholds = {};
+ current_threshold_index = 0;
+ getNextThermalParameter();
+}
+
+void GpuDevice::readThermalParameterCallback(const std::error_code& ec,
+ std::span<const uint8_t> buffer)
+{
+ if (ec)
+ {
+ lg2::error(
+ "Error reading thermal parameter: sending message over MCTP failed, rc={RC}",
+ "RC", ec.message());
+ processTLimitThresholds(ec);
+ return;
+ }
+
+ ocp::accelerator_management::CompletionCode cc{};
+ uint16_t reasonCode = 0;
+ int32_t threshold = 0;
+
+ int rc = gpu::decodeReadThermalParametersResponse(buffer, cc, reasonCode,
+ threshold);
+
+ if (rc != 0 || cc != ocp::accelerator_management::CompletionCode::SUCCESS)
+ {
+ lg2::error(
+ "Error reading thermal parameter: decode failed, rc={RC}, cc={CC}, reasonCode={RESC}",
+ "RC", rc, "CC", cc, "RESC", reasonCode);
+ processTLimitThresholds(ec);
+ return;
+ }
+
+ thresholds[current_threshold_index] = threshold;
+
+ current_threshold_index++;
+
+ if (current_threshold_index < thresholdIds.size())
+ {
+ getNextThermalParameter();
+ return;
+ }
+ processTLimitThresholds(std::error_code{});
+}
+
+void GpuDevice::getNextThermalParameter()
+{
+ uint8_t id = thresholdIds[current_threshold_index];
+ auto rc =
+ gpu::encodeReadThermalParametersRequest(0, id, thermalParamReqMsg);
+ if (rc != 0)
+ {
+ lg2::error(
+ "Error reading thermal parameter for eid {EID} and parameter id {PID} : encode failed. rc={RC}",
+ "EID", eid, "PID", id, "RC", rc);
+ processTLimitThresholds(
+ std::make_error_code(static_cast<std::errc>(rc)));
+ return;
+ }
+
+ mctpRequester.sendRecvMsg(
+ eid, thermalParamReqMsg,
+ [weak{weak_from_this()}](const std::error_code& ec,
+ std::span<const uint8_t> buffer) {
+ std::shared_ptr<GpuDevice> self = weak.lock();
+ if (!self)
+ {
+ lg2::error("Failed to get lock on GpuDevice");
+ return;
+ }
+ self->readThermalParameterCallback(ec, buffer);
+ });
+}
+
+void GpuDevice::processTLimitThresholds(const std::error_code& ec)
{
std::vector<thresholds::Threshold> tLimitThresholds{};
- if (rc == 0)
+ if (!ec)
{
tLimitThresholds = {
thresholds::Threshold{thresholds::Level::WARNING,