sysfs IO enhancements

Add retries for some sysfs IO errors:
 EIO: Tolerate intermittant device or bus failures.
 ETIMEDOUT: Tolerate intermittant timeouts.
 EAGAIN: Tolerate drivers that do not block.
 ENXIO: Tolerate momentarily unplugged devices on busses that don't
    support hotplug.
 EBADMSG: Tolerate CRC errors.

Flush stdio buffers after writes.
Remove redundant retry and delay constants.

Resolves: openbmc/openbmc#2262

Change-Id: I2104139bf7ced96bb10f7450b42ca36e61c84287
Signed-off-by: Brad Bishop <bradleyb@fuzziesquirrel.com>
diff --git a/sysfs.cpp b/sysfs.cpp
index 5f53b31..9cf1552 100644
--- a/sysfs.cpp
+++ b/sysfs.cpp
@@ -13,11 +13,13 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+#include <algorithm>
 #include <cerrno>
 #include <cstdlib>
 #include <experimental/filesystem>
 #include <fstream>
 #include <memory>
+#include <thread>
 #include "sysfs.hpp"
 
 using namespace std::string_literals;
@@ -25,6 +27,35 @@
 
 namespace sysfs {
 
+static constexpr auto retryableErrors = {
+    /*
+     * Retry on bus or device errors or timeouts in case
+     * they are transient.
+     */
+    EIO,
+    ETIMEDOUT,
+
+    /*
+     * Retry CRC errors.
+     */
+    EBADMSG,
+
+    /*
+     * Some hwmon drivers do this when they aren't ready
+     * instead of blocking.  Retry.
+     */
+    EAGAIN,
+    /*
+     * We'll see this when for example i2c devices are
+     * unplugged but the driver is still bound.  Retry
+     * rather than exit on the off chance the device is
+     * plugged back in and the driver doesn't do a
+     * remove/probe.  If a remove does occur, we'll
+     * eventually get ENOENT.
+     */
+    ENXIO,
+};
+
 static const auto emptyString = ""s;
 static constexpr auto ofRoot = "/sys/firmware/devicetree/base";
 
@@ -210,7 +241,9 @@
 uint32_t HwmonIO::read(
         const std::string& type,
         const std::string& id,
-        const std::string& sensor) const
+        const std::string& sensor,
+        size_t retries,
+        std::chrono::milliseconds delay) const
 {
     uint32_t val;
     std::ifstream ifs;
@@ -221,37 +254,59 @@
             std::ifstream::failbit |
                 std::ifstream::badbit |
                 std::ifstream::eofbit);
-    try
+
+    while (true)
     {
-        ifs.open(fullPath);
-        ifs >> val;
-    }
-    catch (const std::exception& e)
-    {
-        auto rc = errno;
-
-        if (rc == ENOENT)
+        try
         {
-            // If the directory disappeared then this application should
-            // gracefully exit.  There are race conditions between the
-            // unloading of a hwmon driver and the stopping of this service
-            // by systemd.  To prevent this application from falsely failing
-            // in these scenarios, it will simply exit if the directory or
-            // file can not be found.  It is up to the user(s) of this
-            // provided hwmon object to log the appropriate errors if the
-            // object disappears when it should not.
-            exit(0);
+            if (!ifs.is_open())
+                ifs.open(fullPath);
+            ifs.clear();
+            ifs.seekg(0);
+            ifs >> val;
         }
-
-        if (rc)
+        catch (const std::exception& e)
         {
-            // Work around GCC bugs 53984 and 66145 for callers by
-            // explicitly raising system_error here.
-            throw std::system_error(rc, std::generic_category());
-        }
+            auto rc = errno;
 
-        throw;
+            if (!rc)
+            {
+                throw;
+            }
+
+            if (rc == ENOENT)
+            {
+                // If the directory disappeared then this application should
+                // gracefully exit.  There are race conditions between the
+                // unloading of a hwmon driver and the stopping of this service
+                // by systemd.  To prevent this application from falsely failing
+                // in these scenarios, it will simply exit if the directory or
+                // file can not be found.  It is up to the user(s) of this
+                // provided hwmon object to log the appropriate errors if the
+                // object disappears when it should not.
+                exit(0);
+            }
+
+            if (0 == std::count(
+                        retryableErrors.begin(),
+                        retryableErrors.end(),
+                        rc) ||
+                    !retries)
+            {
+                // Not a retryable error or out of retries.
+
+                // Work around GCC bugs 53984 and 66145 for callers by
+                // explicitly raising system_error here.
+                throw std::system_error(rc, std::generic_category());
+            }
+
+            --retries;
+            std::this_thread::sleep_for(delay);
+            continue;
+        }
+        break;
     }
+
     return val;
 }
 
@@ -259,7 +314,10 @@
         uint32_t val,
         const std::string& type,
         const std::string& id,
-        const std::string& sensor) const
+        const std::string& sensor,
+        size_t retries,
+        std::chrono::milliseconds delay) const
+
 {
     std::ofstream ofs;
     auto fullPath = sysfs::make_sysfs_path(
@@ -273,26 +331,49 @@
     // See comments in the read method for an explanation of the odd exception
     // handling behavior here.
 
-    try
+    while (true)
     {
-        ofs.open(fullPath);
-        ofs << val;
-    }
-    catch (const std::exception& e)
-    {
-        auto rc = errno;
-
-        if (rc == ENOENT)
+        try
         {
-            exit(0);
+            if (!ofs.is_open())
+                ofs.open(fullPath);
+            ofs.clear();
+            ofs.seekp(0);
+            ofs << val;
+            ofs.flush();
         }
-
-        if (rc)
+        catch (const std::exception& e)
         {
-            throw std::system_error(rc, std::generic_category());
-        }
+            auto rc = errno;
 
-        throw;
+            if (!rc)
+            {
+                throw;
+            }
+
+            if (rc == ENOENT)
+            {
+                exit(0);
+            }
+
+            if (0 == std::count(
+                        retryableErrors.begin(),
+                        retryableErrors.end(),
+                        rc) ||
+                    !retries)
+            {
+                // Not a retryable error or out of retries.
+
+                // Work around GCC bugs 53984 and 66145 for callers by
+                // explicitly raising system_error here.
+                throw std::system_error(rc, std::generic_category());
+            }
+
+            --retries;
+            std::this_thread::sleep_for(delay);
+            continue;
+        }
+        break;
     }
 }