nvidia-gpu: Fix use-after-free in NvidiaGpuDevice
Fixes use after free for NvidiaGpuThresholds.
Moves the storage used for communication to be part
of the NvidiaGpuDevice class instead of being passed
around ephemerally through free functions.
Also makes NvidiaGpuDevice inherit from
std::enable_shared_from_this.
Testing: The issue previously showed up as core dumps
on nvl32-obmc. ASan discovered it was a use-after-free
in the shared pointer in ThermalLimits.
Afterwards, no core dumps or issues were reported by ASan.
Ran on an nvl32-obmc model with 8 GPUs.
Change-Id: I61b606f3a129499089718e7ec804926db5f22c64
Signed-off-by: Marc Olberding <molberding@nvidia.com>
diff --git a/src/nvidia-gpu/NvidiaGpuDevice.hpp b/src/nvidia-gpu/NvidiaGpuDevice.hpp
index ac83994..df0c2e2 100644
--- a/src/nvidia-gpu/NvidiaGpuDevice.hpp
+++ b/src/nvidia-gpu/NvidiaGpuDevice.hpp
@@ -26,7 +26,7 @@
 #include <string>
 #include <vector>
 
-class GpuDevice
+class GpuDevice : public std::enable_shared_from_this<GpuDevice>
 {
   public:
     GpuDevice(const SensorConfigs& configs, const std::string& name,
@@ -48,11 +48,16 @@
 
     void read();
 
-    void processTLimitThresholds(uint8_t rc,
-                                 const std::vector<int32_t>& thresholds);
+    void processTLimitThresholds(const std::error_code& ec);
+
+    void getTLimitThresholds();
 
     uint8_t eid{};
 
+    void getNextThermalParameter();
+    void readThermalParameterCallback(const std::error_code& ec,
+                                      std::span<const uint8_t> buffer);
+
     std::chrono::milliseconds sensorPollMs;
 
     boost::asio::steady_timer waitTimer;
@@ -71,6 +76,11 @@
     std::shared_ptr<NvidiaGpuEnergySensor> energySensor;
     std::shared_ptr<NvidiaGpuVoltageSensor> voltageSensor;
 
+    std::array<uint8_t, sizeof(gpu::ReadThermalParametersRequest)>
+        thermalParamReqMsg{};
+    std::array<uint8_t, 3> thresholds{};
+    size_t current_threshold_index{};
+
     SensorConfigs configs;
 
     std::string name;