sensor: Implement sensor "ASYNC_READ_TIMEOUT"
This change will prevent sensors from blocking all other sensor reads
and D-Bus if they do not report failures quickly enough.
If "ASYNC_READ_TIMEOUT" environment variable is defined in the
sensor's config file for a key_type, the sensor read will be
asynchronous with timeout set in milliseconds.
For example for "sensor1":
ASYNC_READ_TIMEOUT_sensor1="1000"  // Timeout will be set to 1 sec
If the read times out, the sensor read will be skipped and the
sensor's functional property will be set to 'false'. Timed out futures
will be placed in a map to prevent their destructor from running and
blocking until the read completes (limitation of std::async).
Tested: This patch has been running downstream for over a year to
        solve a slow I2C sensor reads causing IPMI slowdown.
Change-Id: I3d9ed4d5c9cc87d3196fc281451834f3001d0b48
Signed-off-by: Brandon Kim <brandonkim@google.com>
diff --git a/mainloop.cpp b/mainloop.cpp
index 75c8c12..0b6d070 100644
--- a/mainloop.cpp
+++ b/mainloop.cpp
@@ -34,6 +34,7 @@
 #include <cassert>
 #include <cstdlib>
 #include <functional>
+#include <future>
 #include <iostream>
 #include <memory>
 #include <phosphor-logging/elog-errors.hpp>
@@ -242,7 +243,7 @@
     {
         // Add status interface based on _fault file being present
         sensorObj->addStatus(info);
-        valueInterface = sensorObj->addValue(retryIO, info);
+        valueInterface = sensorObj->addValue(retryIO, info, _timedoutMap);
     }
     catch (const std::system_error& e)
     {
@@ -484,10 +485,28 @@
                 // RAII object for GPIO unlock / lock
                 auto locker = sensor::gpioUnlock(sensor->getGpio());
 
-                // Retry for up to a second if device is busy
-                // or has a transient error.
-                value = _ioAccess->read(sensorSysfsType, sensorSysfsNum, input,
+                // For sensors with attribute ASYNC_READ_TIMEOUT,
+                // spawn a thread with timeout
+                auto asyncReadTimeout =
+                    env::getEnv("ASYNC_READ_TIMEOUT", sensorSetKey);
+                if (!asyncReadTimeout.empty())
+                {
+                    std::chrono::milliseconds asyncTimeout{
+                        std::stoi(asyncReadTimeout)};
+                    value = sensor::asyncRead(
+                        sensorSetKey, _ioAccess, asyncTimeout, _timedoutMap,
+                        sensorSysfsType, sensorSysfsNum, input,
+                        hwmonio::retries, hwmonio::delay);
+                }
+                else
+                {
+                    // Retry for up to a second if device is busy
+                    // or has a transient error.
+                    value =
+                        _ioAccess->read(sensorSysfsType, sensorSysfsNum, input,
                                         hwmonio::retries, hwmonio::delay);
+                }
+
                 // Set functional property to true if we could read sensor
                 statusIface->functional(true);
 
diff --git a/mainloop.hpp b/mainloop.hpp
index af92479..530d4f3 100644
--- a/mainloop.hpp
+++ b/mainloop.hpp
@@ -9,6 +9,7 @@
 #include "types.hpp"
 
 #include <any>
+#include <future>
 #include <memory>
 #include <optional>
 #include <sdbusplus/server.hpp>
@@ -120,6 +121,8 @@
     /** @brief Store the specifications of sensor objects */
     std::map<SensorSet::key_type, std::unique_ptr<sensor::Sensor>>
         _sensorObjects;
+    /** @brief Store the async futures of timed out sensor objects */
+    sensor::TimedoutMap _timedoutMap;
 
     /**
      * @brief Map of removed sensors
diff --git a/meson.build b/meson.build
index 764cdd6..d080492 100644
--- a/meson.build
+++ b/meson.build
@@ -45,6 +45,7 @@
     dependency('sdbusplus'),
     dependency('sdeventplus'),
     dependency('stdplus'),
+    dependency('threads'),
     sysfs_dep,
 ]
 
diff --git a/sensor.cpp b/sensor.cpp
index 43af2f6..ddca883 100644
--- a/sensor.cpp
+++ b/sensor.cpp
@@ -15,6 +15,7 @@
 #include <cmath>
 #include <cstring>
 #include <filesystem>
+#include <future>
 #include <phosphor-logging/elog-errors.hpp>
 #include <thread>
 #include <xyz/openbmc_project/Common/error.hpp>
@@ -117,7 +118,8 @@
 }
 
 std::shared_ptr<ValueObject> Sensor::addValue(const RetryIO& retryIO,
-                                              ObjectInfo& info)
+                                              ObjectInfo& info,
+                                              TimedoutMap& timedoutMap)
 {
     static constexpr bool deferSignals = true;
 
@@ -144,12 +146,27 @@
             // RAII object for GPIO unlock / lock
             auto locker = gpioUnlock(getGpio());
 
-            // Retry for up to a second if device is busy
-            // or has a transient error.
-            val =
-                _ioAccess->read(_sensor.first, _sensor.second,
+            // For sensors with attribute ASYNC_READ_TIMEOUT,
+            // spawn a thread with timeout
+            auto asyncReadTimeout = env::getEnv("ASYNC_READ_TIMEOUT", _sensor);
+            if (!asyncReadTimeout.empty())
+            {
+                std::chrono::milliseconds asyncTimeout{
+                    std::stoi(asyncReadTimeout)};
+                val = asyncRead(_sensor, _ioAccess, asyncTimeout, timedoutMap,
+                                _sensor.first, _sensor.second,
                                 hwmon::entry::cinput, std::get<size_t>(retryIO),
                                 std::get<std::chrono::milliseconds>(retryIO));
+            }
+            else
+            {
+                // Retry for up to a second if device is busy
+                // or has a transient error.
+                val = _ioAccess->read(
+                    _sensor.first, _sensor.second, hwmon::entry::cinput,
+                    std::get<size_t>(retryIO),
+                    std::get<std::chrono::milliseconds>(retryIO));
+            }
         }
 #if UPDATE_FUNCTIONAL_ON_FAIL
         catch (const std::system_error& e)
@@ -266,4 +283,68 @@
     return GpioLocker(std::move(handle));
 }
 
+SensorValueType asyncRead(const SensorSet::key_type& sensorSetKey,
+                          const hwmonio::HwmonIOInterface* ioAccess,
+                          std::chrono::milliseconds asyncTimeout,
+                          TimedoutMap& timedoutMap, const std::string& type,
+                          const std::string& id, const std::string& sensor,
+                          const size_t retries,
+                          const std::chrono::milliseconds delay)
+{
+    // Default async read timeout
+    bool valueIsValid = false;
+    std::future<int64_t> asyncThread;
+
+    auto asyncIter = timedoutMap.find(sensorSetKey);
+    if (asyncIter == timedoutMap.end())
+    {
+        // If sensor not found in timedoutMap, spawn an async thread
+        asyncThread =
+            std::async(std::launch::async, &hwmonio::HwmonIOInterface::read,
+                       ioAccess, type, id, sensor, retries, delay);
+        valueIsValid = true;
+    }
+    else
+    {
+        // If we already have the async thread in the timedoutMap, it means this
+        // sensor has already timed out in the previous reads. No need to wait
+        // on subsequent reads - proceed to check the future_status to see when
+        // the async thread finishes
+        asyncTimeout = std::chrono::seconds(0);
+        asyncThread = std::move(asyncIter->second);
+    }
+
+    // TODO: This is still not a true asynchronous read as it still blocks the
+    // main thread for asyncTimeout amount of time. To make this completely
+    // asynchronous, schedule a read and register a callback to update the
+    // sensor value
+    std::future_status status = asyncThread.wait_for(asyncTimeout);
+    switch (status)
+    {
+        case std::future_status::ready:
+            // Read has finished
+            if (valueIsValid)
+            {
+                return asyncThread.get();
+                // Good sensor reads should skip the code below
+            }
+            // Async read thread has completed but had previously timed out (was
+            // found in the timedoutMap). Erase from timedoutMap and throw to
+            // allow retry in the next read cycle. Not returning the read value
+            // as the sensor reading may be bad / corrupted if it took so long.
+            timedoutMap.erase(sensorSetKey);
+            throw AsyncSensorReadTimeOut();
+        default:
+            // Read timed out so add the thread to the timedoutMap (if the entry
+            // already exists, operator[] updates it).
+            //
+            // Keeping the timed out futures in a map is required to prevent
+            // their destructor from being called when returning from this
+            // stack. The destructor will otherwise block until the read
+            // completes due to the limitation of std::async.
+            timedoutMap[sensorSetKey] = std::move(asyncThread);
+            throw AsyncSensorReadTimeOut();
+    }
+}
+
 } // namespace sensor
diff --git a/sensor.hpp b/sensor.hpp
index 4b2d281..a1e0524 100644
--- a/sensor.hpp
+++ b/sensor.hpp
@@ -4,7 +4,10 @@
 #include "sensorset.hpp"
 #include "types.hpp"
 
+#include <cerrno>
+#include <future>
 #include <gpioplus/handle.hpp>
+#include <map>
 #include <memory>
 #include <optional>
 #include <stdplus/handle/managed.hpp>
@@ -13,6 +16,8 @@
 namespace sensor
 {
 
+using TimedoutMap = std::map<SensorSet::key_type, std::future<int64_t>>;
+
 struct valueAdjust
 {
     double gain = 1.0;
@@ -20,6 +25,17 @@
     std::unordered_set<int> rmRCs;
 };
 
+/** @brief Custom exception for async sensor reading timeout
+ */
+struct AsyncSensorReadTimeOut : public std::system_error
+{
+    AsyncSensorReadTimeOut() :
+        system_error(std::error_code(ETIMEDOUT, std::system_category()),
+                     "Async sensor read timed out")
+    {
+    }
+};
+
 /** @class Sensor
  *  @brief Sensor object based on a SensorSet container's key type
  *  @details Sensor object to create and modify an associated device's sensor
@@ -87,10 +103,13 @@
      *                      (number of and delay between)
      * @param[in] info - Sensor object information
      *
+     * @param[in] timedoutMap - Map to track timed out threads
+     *
      * @return - Shared pointer to the value object
      */
     std::shared_ptr<ValueObject> addValue(const RetryIO& retryIO,
-                                          ObjectInfo& info);
+                                          ObjectInfo& info,
+                                          TimedoutMap& timedoutMap);
 
     /**
      * @brief Add status interface and functional property for sensor
@@ -177,4 +196,29 @@
  */
 std::optional<GpioLocker> gpioUnlock(const gpioplus::HandleInterface* handle);
 
+/**
+ * @brief Asynchronously read a sensor with timeout defined by
+ *        ASYNC_READ_TIMEOUT environment variable
+ *
+ * @param[in] sensorSetKey - Sensor object's identifiers
+ * @param[in] ioAccess - Hwmon sysfs access
+ * @param[in] asyncTimeout - Async read timeout in milliseconds
+ * @param[in] timedoutMap - Map to track timed out threads
+ *
+ * (Params needed for HwmonIO::read)
+ * @param[in] type - The hwmon type (ex. temp).
+ * @param[in] id - The hwmon id (ex. 1).
+ * @param[in] sensor - The hwmon sensor (ex. input).
+ * @param[in] retries - The number of times to retry.
+ * @param[in] delay - The time to sleep between retry attempts.
+ *
+ * @return - SensorValueType read asynchronously, will throw if timed out
+ */
+SensorValueType asyncRead(const SensorSet::key_type& sensorSetKey,
+                          const hwmonio::HwmonIOInterface* ioAccess,
+                          std::chrono::milliseconds asyncTimeout,
+                          TimedoutMap& timedoutMap, const std::string& type,
+                          const std::string& id, const std::string& sensor,
+                          const size_t retries,
+                          const std::chrono::milliseconds delay);
 } // namespace sensor