sysfs IO enhancements

Add retries for some sysfs IO errors:
 EIO: Tolerate intermittant device or bus failures.
 ETIMEDOUT: Tolerate intermittant timeouts.
 EAGAIN: Tolerate drivers that do not block.
 ENXIO: Tolerate momentarily unplugged devices on busses that don't
    support hotplug.
 EBADMSG: Tolerate CRC errors.

Flush stdio buffers after writes.
Remove redundant retry and delay constants.

Resolves: openbmc/openbmc#2262

Change-Id: I2104139bf7ced96bb10f7450b42ca36e61c84287
Signed-off-by: Brad Bishop <bradleyb@fuzziesquirrel.com>
diff --git a/fan_speed.cpp b/fan_speed.cpp
index 5e4c9fc..c156258 100644
--- a/fan_speed.cpp
+++ b/fan_speed.cpp
@@ -23,7 +23,10 @@
                     value,
                     type,
                     id,
-                    entry::target);
+                    entry::target,
+                    sysfs::hwmonio::retries,
+                    sysfs::hwmonio::delay);
+
         }
         catch (const std::system_error& e)
         {
@@ -60,7 +63,9 @@
                     enable::rpmMode,
                     type::pwm,
                     id,
-                    entry::enable);
+                    entry::enable,
+                    sysfs::hwmonio::retries,
+                    sysfs::hwmonio::delay);
         }
         catch (const std::system_error& e)
         {
diff --git a/mainloop.cpp b/mainloop.cpp
index 7399df4..217c34c 100644
--- a/mainloop.cpp
+++ b/mainloop.cpp
@@ -16,7 +16,6 @@
 #include <iostream>
 #include <memory>
 #include <cstdlib>
-#include <algorithm>
 
 #include <phosphor-logging/elog-errors.hpp>
 #include "config.h"
@@ -61,8 +60,6 @@
 decltype(Thresholds<CriticalObject>::alarmHi) Thresholds<CriticalObject>::alarmHi =
     &CriticalObject::criticalAlarmHigh;
 
-
-
 static constexpr auto typeAttrMap =
 {
     // 1 - hwmon class
@@ -157,48 +154,17 @@
     auto& obj = std::get<Object>(info);
     auto& objPath = std::get<std::string>(info);
 
-    auto readWithRetry = [&ioAccess](
-            const auto& type,
-            const auto& id,
-            const auto& sensor,
-            auto retries,
-            const auto& delay)
-    {
-        auto value = 0;
-
-        while(true)
-        {
-            try
-            {
-                value = ioAccess.read(type, id, sensor);
-            }
-            catch (const std::system_error& e)
-            {
-                if (e.code().value() != EAGAIN || !retries)
-                {
-                    throw;
-                }
-
-                --retries;
-                std::this_thread::sleep_for(delay);
-                continue;
-            }
-            break;
-        }
-        return value;
-    };
-
-    static constexpr auto retries = 10;
     auto val = 0;
     try
     {
         // Retry for up to a second if device is busy
-        val = readWithRetry(
+        // or has a transient error.
+        val = ioAccess.read(
                 sensor.first,
                 sensor.second,
-                hwmon::entry::input,
-                retries,
-                std::chrono::milliseconds{100});
+                hwmon::entry::cinput,
+                sysfs::hwmonio::retries,
+                sysfs::hwmonio::delay);
     }
     catch (const std::system_error& e)
     {
@@ -389,10 +355,15 @@
                 int value;
                 try
                 {
+                    // Retry for up to a second if device is busy
+                    // or has a transient error.
+
                     value = ioAccess.read(
                             i.first.first,
                             i.first.second,
-                            hwmon::entry::input);
+                            hwmon::entry::cinput,
+                            sysfs::hwmonio::retries,
+                            sysfs::hwmonio::delay);
 
                     auto& objInfo = std::get<ObjectInfo>(i.second);
                     auto& obj = std::get<Object>(objInfo);
@@ -423,14 +394,6 @@
                 }
                 catch (const std::system_error& e)
                 {
-                    if (e.code().value() == EAGAIN)
-                    {
-                        //Just go with the current values and try again later.
-                        //TODO: openbmc/openbmc#2048 could keep an eye on
-                        //how long the device is actually busy.
-                        continue;
-                    }
-
                     using namespace sdbusplus::xyz::openbmc_project::
                         Sensor::Device::Error;
                     report<ReadFailure>(
@@ -439,7 +402,6 @@
                             xyz::openbmc_project::Sensor::Device::
                                 ReadFailure::CALLOUT_DEVICE_PATH(
                                     _devPath.c_str()));
-
 #ifdef REMOVE_ON_FAIL
                     destroy.push_back(i.first);
 #else
diff --git a/sysfs.cpp b/sysfs.cpp
index 5f53b31..9cf1552 100644
--- a/sysfs.cpp
+++ b/sysfs.cpp
@@ -13,11 +13,13 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+#include <algorithm>
 #include <cerrno>
 #include <cstdlib>
 #include <experimental/filesystem>
 #include <fstream>
 #include <memory>
+#include <thread>
 #include "sysfs.hpp"
 
 using namespace std::string_literals;
@@ -25,6 +27,35 @@
 
 namespace sysfs {
 
+static constexpr auto retryableErrors = {
+    /*
+     * Retry on bus or device errors or timeouts in case
+     * they are transient.
+     */
+    EIO,
+    ETIMEDOUT,
+
+    /*
+     * Retry CRC errors.
+     */
+    EBADMSG,
+
+    /*
+     * Some hwmon drivers do this when they aren't ready
+     * instead of blocking.  Retry.
+     */
+    EAGAIN,
+    /*
+     * We'll see this when for example i2c devices are
+     * unplugged but the driver is still bound.  Retry
+     * rather than exit on the off chance the device is
+     * plugged back in and the driver doesn't do a
+     * remove/probe.  If a remove does occur, we'll
+     * eventually get ENOENT.
+     */
+    ENXIO,
+};
+
 static const auto emptyString = ""s;
 static constexpr auto ofRoot = "/sys/firmware/devicetree/base";
 
@@ -210,7 +241,9 @@
 uint32_t HwmonIO::read(
         const std::string& type,
         const std::string& id,
-        const std::string& sensor) const
+        const std::string& sensor,
+        size_t retries,
+        std::chrono::milliseconds delay) const
 {
     uint32_t val;
     std::ifstream ifs;
@@ -221,37 +254,59 @@
             std::ifstream::failbit |
                 std::ifstream::badbit |
                 std::ifstream::eofbit);
-    try
+
+    while (true)
     {
-        ifs.open(fullPath);
-        ifs >> val;
-    }
-    catch (const std::exception& e)
-    {
-        auto rc = errno;
-
-        if (rc == ENOENT)
+        try
         {
-            // If the directory disappeared then this application should
-            // gracefully exit.  There are race conditions between the
-            // unloading of a hwmon driver and the stopping of this service
-            // by systemd.  To prevent this application from falsely failing
-            // in these scenarios, it will simply exit if the directory or
-            // file can not be found.  It is up to the user(s) of this
-            // provided hwmon object to log the appropriate errors if the
-            // object disappears when it should not.
-            exit(0);
+            if (!ifs.is_open())
+                ifs.open(fullPath);
+            ifs.clear();
+            ifs.seekg(0);
+            ifs >> val;
         }
-
-        if (rc)
+        catch (const std::exception& e)
         {
-            // Work around GCC bugs 53984 and 66145 for callers by
-            // explicitly raising system_error here.
-            throw std::system_error(rc, std::generic_category());
-        }
+            auto rc = errno;
 
-        throw;
+            if (!rc)
+            {
+                throw;
+            }
+
+            if (rc == ENOENT)
+            {
+                // If the directory disappeared then this application should
+                // gracefully exit.  There are race conditions between the
+                // unloading of a hwmon driver and the stopping of this service
+                // by systemd.  To prevent this application from falsely failing
+                // in these scenarios, it will simply exit if the directory or
+                // file can not be found.  It is up to the user(s) of this
+                // provided hwmon object to log the appropriate errors if the
+                // object disappears when it should not.
+                exit(0);
+            }
+
+            if (0 == std::count(
+                        retryableErrors.begin(),
+                        retryableErrors.end(),
+                        rc) ||
+                    !retries)
+            {
+                // Not a retryable error or out of retries.
+
+                // Work around GCC bugs 53984 and 66145 for callers by
+                // explicitly raising system_error here.
+                throw std::system_error(rc, std::generic_category());
+            }
+
+            --retries;
+            std::this_thread::sleep_for(delay);
+            continue;
+        }
+        break;
     }
+
     return val;
 }
 
@@ -259,7 +314,10 @@
         uint32_t val,
         const std::string& type,
         const std::string& id,
-        const std::string& sensor) const
+        const std::string& sensor,
+        size_t retries,
+        std::chrono::milliseconds delay) const
+
 {
     std::ofstream ofs;
     auto fullPath = sysfs::make_sysfs_path(
@@ -273,26 +331,49 @@
     // See comments in the read method for an explanation of the odd exception
     // handling behavior here.
 
-    try
+    while (true)
     {
-        ofs.open(fullPath);
-        ofs << val;
-    }
-    catch (const std::exception& e)
-    {
-        auto rc = errno;
-
-        if (rc == ENOENT)
+        try
         {
-            exit(0);
+            if (!ofs.is_open())
+                ofs.open(fullPath);
+            ofs.clear();
+            ofs.seekp(0);
+            ofs << val;
+            ofs.flush();
         }
-
-        if (rc)
+        catch (const std::exception& e)
         {
-            throw std::system_error(rc, std::generic_category());
-        }
+            auto rc = errno;
 
-        throw;
+            if (!rc)
+            {
+                throw;
+            }
+
+            if (rc == ENOENT)
+            {
+                exit(0);
+            }
+
+            if (0 == std::count(
+                        retryableErrors.begin(),
+                        retryableErrors.end(),
+                        rc) ||
+                    !retries)
+            {
+                // Not a retryable error or out of retries.
+
+                // Work around GCC bugs 53984 and 66145 for callers by
+                // explicitly raising system_error here.
+                throw std::system_error(rc, std::generic_category());
+            }
+
+            --retries;
+            std::this_thread::sleep_for(delay);
+            continue;
+        }
+        break;
     }
 }
 
diff --git a/sysfs.hpp b/sysfs.hpp
index 934c7c2..2a22952 100644
--- a/sysfs.hpp
+++ b/sysfs.hpp
@@ -1,5 +1,6 @@
 #pragma once
 
+#include <chrono>
 #include <exception>
 #include <fstream>
 #include <string>
@@ -59,6 +60,8 @@
 
 namespace hwmonio
 {
+static constexpr auto retries = 10;
+static constexpr auto delay = std::chrono::milliseconds{100};
 
 /** @class HwmonIO
  *  @brief Convenience wrappers for HWMON sysfs attribute IO.
@@ -93,16 +96,23 @@
          *  the underlying hwmon driver is unbound and
          *  the program is inadvertently left running.
          *
+         *  For possibly transient errors will retry up to
+         *  the specified number of times.
+         *
          *  @param[in] type - The hwmon type (ex. temp).
          *  @param[in] id - The hwmon id (ex. 1).
          *  @param[in] sensor - The hwmon sensor (ex. input).
+         *  @param[in] retries - The number of times to retry.
+         *  @param[in] delay - The time to sleep between retry attempts.
          *
          *  @return val - The read value.
          */
         uint32_t read(
                 const std::string& type,
                 const std::string& id,
-                const std::string& sensor) const;
+                const std::string& sensor,
+                size_t retries,
+                std::chrono::milliseconds delay) const;
 
         /** @brief Perform formatted hwmon sysfs write.
          *
@@ -111,16 +121,23 @@
          *  the underlying hwmon driver is unbound and
          *  the program is inadvertently left running.
          *
+         *  For possibly transient errors will retry up to
+         *  the specified number of times.
+         *
          *  @param[in] val - The value to be written.
          *  @param[in] type - The hwmon type (ex. fan).
          *  @param[in] id - The hwmon id (ex. 1).
-         *  @param[in] sensor - The hwmon sensor (ex. target).
+         *  @param[in] retries - The number of times to retry.
+         *  @param[in] delay - The time to sleep between retry attempts.
          */
         void write(
                 uint32_t val,
                 const std::string& type,
                 const std::string& id,
-                const std::string& sensor) const;
+                const std::string& sensor,
+                size_t retries,
+                std::chrono::milliseconds delay) const;
+
 
         /** @brief Hwmon instance path access.
          *
diff --git a/test/hwmonio.cpp b/test/hwmonio.cpp
index 2c6bdaf..00c4ae1 100644
--- a/test/hwmonio.cpp
+++ b/test/hwmonio.cpp
@@ -32,11 +32,16 @@
 
     if ("read"s == argv[1])
     {
-        std::cout << io.read(argv[3], argv[4], argv[5]) << std::endl;
+        std::cout << io.read(argv[3], argv[4], argv[5],
+                sysfs::hwmonio::retries, sysfs::hwmonio::delay) <<
+            std::endl;
     }
     else
     {
-        io.write(strtol(argv[6], nullptr, 0), argv[3], argv[4], argv[5]);
+        io.write(
+                strtol(argv[6], nullptr, 0),
+                argv[3], argv[4], argv[5], sysfs::hwmonio::retries,
+                sysfs::hwmonio::delay);
     }
 
     return 0;