sysfs IO enhancements
Add retries for some sysfs IO errors:
EIO: Tolerate intermittant device or bus failures.
ETIMEDOUT: Tolerate intermittant timeouts.
EAGAIN: Tolerate drivers that do not block.
ENXIO: Tolerate momentarily unplugged devices on busses that don't
support hotplug.
EBADMSG: Tolerate CRC errors.
Flush stdio buffers after writes.
Remove redundant retry and delay constants.
Resolves: openbmc/openbmc#2262
Change-Id: I2104139bf7ced96bb10f7450b42ca36e61c84287
Signed-off-by: Brad Bishop <bradleyb@fuzziesquirrel.com>
diff --git a/fan_speed.cpp b/fan_speed.cpp
index 5e4c9fc..c156258 100644
--- a/fan_speed.cpp
+++ b/fan_speed.cpp
@@ -23,7 +23,10 @@
value,
type,
id,
- entry::target);
+ entry::target,
+ sysfs::hwmonio::retries,
+ sysfs::hwmonio::delay);
+
}
catch (const std::system_error& e)
{
@@ -60,7 +63,9 @@
enable::rpmMode,
type::pwm,
id,
- entry::enable);
+ entry::enable,
+ sysfs::hwmonio::retries,
+ sysfs::hwmonio::delay);
}
catch (const std::system_error& e)
{
diff --git a/mainloop.cpp b/mainloop.cpp
index 7399df4..217c34c 100644
--- a/mainloop.cpp
+++ b/mainloop.cpp
@@ -16,7 +16,6 @@
#include <iostream>
#include <memory>
#include <cstdlib>
-#include <algorithm>
#include <phosphor-logging/elog-errors.hpp>
#include "config.h"
@@ -61,8 +60,6 @@
decltype(Thresholds<CriticalObject>::alarmHi) Thresholds<CriticalObject>::alarmHi =
&CriticalObject::criticalAlarmHigh;
-
-
static constexpr auto typeAttrMap =
{
// 1 - hwmon class
@@ -157,48 +154,17 @@
auto& obj = std::get<Object>(info);
auto& objPath = std::get<std::string>(info);
- auto readWithRetry = [&ioAccess](
- const auto& type,
- const auto& id,
- const auto& sensor,
- auto retries,
- const auto& delay)
- {
- auto value = 0;
-
- while(true)
- {
- try
- {
- value = ioAccess.read(type, id, sensor);
- }
- catch (const std::system_error& e)
- {
- if (e.code().value() != EAGAIN || !retries)
- {
- throw;
- }
-
- --retries;
- std::this_thread::sleep_for(delay);
- continue;
- }
- break;
- }
- return value;
- };
-
- static constexpr auto retries = 10;
auto val = 0;
try
{
// Retry for up to a second if device is busy
- val = readWithRetry(
+ // or has a transient error.
+ val = ioAccess.read(
sensor.first,
sensor.second,
- hwmon::entry::input,
- retries,
- std::chrono::milliseconds{100});
+ hwmon::entry::cinput,
+ sysfs::hwmonio::retries,
+ sysfs::hwmonio::delay);
}
catch (const std::system_error& e)
{
@@ -389,10 +355,15 @@
int value;
try
{
+ // Retry for up to a second if device is busy
+ // or has a transient error.
+
value = ioAccess.read(
i.first.first,
i.first.second,
- hwmon::entry::input);
+ hwmon::entry::cinput,
+ sysfs::hwmonio::retries,
+ sysfs::hwmonio::delay);
auto& objInfo = std::get<ObjectInfo>(i.second);
auto& obj = std::get<Object>(objInfo);
@@ -423,14 +394,6 @@
}
catch (const std::system_error& e)
{
- if (e.code().value() == EAGAIN)
- {
- //Just go with the current values and try again later.
- //TODO: openbmc/openbmc#2048 could keep an eye on
- //how long the device is actually busy.
- continue;
- }
-
using namespace sdbusplus::xyz::openbmc_project::
Sensor::Device::Error;
report<ReadFailure>(
@@ -439,7 +402,6 @@
xyz::openbmc_project::Sensor::Device::
ReadFailure::CALLOUT_DEVICE_PATH(
_devPath.c_str()));
-
#ifdef REMOVE_ON_FAIL
destroy.push_back(i.first);
#else
diff --git a/sysfs.cpp b/sysfs.cpp
index 5f53b31..9cf1552 100644
--- a/sysfs.cpp
+++ b/sysfs.cpp
@@ -13,11 +13,13 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
+#include <algorithm>
#include <cerrno>
#include <cstdlib>
#include <experimental/filesystem>
#include <fstream>
#include <memory>
+#include <thread>
#include "sysfs.hpp"
using namespace std::string_literals;
@@ -25,6 +27,35 @@
namespace sysfs {
+static constexpr auto retryableErrors = {
+ /*
+ * Retry on bus or device errors or timeouts in case
+ * they are transient.
+ */
+ EIO,
+ ETIMEDOUT,
+
+ /*
+ * Retry CRC errors.
+ */
+ EBADMSG,
+
+ /*
+ * Some hwmon drivers do this when they aren't ready
+ * instead of blocking. Retry.
+ */
+ EAGAIN,
+ /*
+ * We'll see this when for example i2c devices are
+ * unplugged but the driver is still bound. Retry
+ * rather than exit on the off chance the device is
+ * plugged back in and the driver doesn't do a
+ * remove/probe. If a remove does occur, we'll
+ * eventually get ENOENT.
+ */
+ ENXIO,
+};
+
static const auto emptyString = ""s;
static constexpr auto ofRoot = "/sys/firmware/devicetree/base";
@@ -210,7 +241,9 @@
uint32_t HwmonIO::read(
const std::string& type,
const std::string& id,
- const std::string& sensor) const
+ const std::string& sensor,
+ size_t retries,
+ std::chrono::milliseconds delay) const
{
uint32_t val;
std::ifstream ifs;
@@ -221,37 +254,59 @@
std::ifstream::failbit |
std::ifstream::badbit |
std::ifstream::eofbit);
- try
+
+ while (true)
{
- ifs.open(fullPath);
- ifs >> val;
- }
- catch (const std::exception& e)
- {
- auto rc = errno;
-
- if (rc == ENOENT)
+ try
{
- // If the directory disappeared then this application should
- // gracefully exit. There are race conditions between the
- // unloading of a hwmon driver and the stopping of this service
- // by systemd. To prevent this application from falsely failing
- // in these scenarios, it will simply exit if the directory or
- // file can not be found. It is up to the user(s) of this
- // provided hwmon object to log the appropriate errors if the
- // object disappears when it should not.
- exit(0);
+ if (!ifs.is_open())
+ ifs.open(fullPath);
+ ifs.clear();
+ ifs.seekg(0);
+ ifs >> val;
}
-
- if (rc)
+ catch (const std::exception& e)
{
- // Work around GCC bugs 53984 and 66145 for callers by
- // explicitly raising system_error here.
- throw std::system_error(rc, std::generic_category());
- }
+ auto rc = errno;
- throw;
+ if (!rc)
+ {
+ throw;
+ }
+
+ if (rc == ENOENT)
+ {
+ // If the directory disappeared then this application should
+ // gracefully exit. There are race conditions between the
+ // unloading of a hwmon driver and the stopping of this service
+ // by systemd. To prevent this application from falsely failing
+ // in these scenarios, it will simply exit if the directory or
+ // file can not be found. It is up to the user(s) of this
+ // provided hwmon object to log the appropriate errors if the
+ // object disappears when it should not.
+ exit(0);
+ }
+
+ if (0 == std::count(
+ retryableErrors.begin(),
+ retryableErrors.end(),
+ rc) ||
+ !retries)
+ {
+ // Not a retryable error or out of retries.
+
+ // Work around GCC bugs 53984 and 66145 for callers by
+ // explicitly raising system_error here.
+ throw std::system_error(rc, std::generic_category());
+ }
+
+ --retries;
+ std::this_thread::sleep_for(delay);
+ continue;
+ }
+ break;
}
+
return val;
}
@@ -259,7 +314,10 @@
uint32_t val,
const std::string& type,
const std::string& id,
- const std::string& sensor) const
+ const std::string& sensor,
+ size_t retries,
+ std::chrono::milliseconds delay) const
+
{
std::ofstream ofs;
auto fullPath = sysfs::make_sysfs_path(
@@ -273,26 +331,49 @@
// See comments in the read method for an explanation of the odd exception
// handling behavior here.
- try
+ while (true)
{
- ofs.open(fullPath);
- ofs << val;
- }
- catch (const std::exception& e)
- {
- auto rc = errno;
-
- if (rc == ENOENT)
+ try
{
- exit(0);
+ if (!ofs.is_open())
+ ofs.open(fullPath);
+ ofs.clear();
+ ofs.seekp(0);
+ ofs << val;
+ ofs.flush();
}
-
- if (rc)
+ catch (const std::exception& e)
{
- throw std::system_error(rc, std::generic_category());
- }
+ auto rc = errno;
- throw;
+ if (!rc)
+ {
+ throw;
+ }
+
+ if (rc == ENOENT)
+ {
+ exit(0);
+ }
+
+ if (0 == std::count(
+ retryableErrors.begin(),
+ retryableErrors.end(),
+ rc) ||
+ !retries)
+ {
+ // Not a retryable error or out of retries.
+
+ // Work around GCC bugs 53984 and 66145 for callers by
+ // explicitly raising system_error here.
+ throw std::system_error(rc, std::generic_category());
+ }
+
+ --retries;
+ std::this_thread::sleep_for(delay);
+ continue;
+ }
+ break;
}
}
diff --git a/sysfs.hpp b/sysfs.hpp
index 934c7c2..2a22952 100644
--- a/sysfs.hpp
+++ b/sysfs.hpp
@@ -1,5 +1,6 @@
#pragma once
+#include <chrono>
#include <exception>
#include <fstream>
#include <string>
@@ -59,6 +60,8 @@
namespace hwmonio
{
+static constexpr auto retries = 10;
+static constexpr auto delay = std::chrono::milliseconds{100};
/** @class HwmonIO
* @brief Convenience wrappers for HWMON sysfs attribute IO.
@@ -93,16 +96,23 @@
* the underlying hwmon driver is unbound and
* the program is inadvertently left running.
*
+ * For possibly transient errors will retry up to
+ * the specified number of times.
+ *
* @param[in] type - The hwmon type (ex. temp).
* @param[in] id - The hwmon id (ex. 1).
* @param[in] sensor - The hwmon sensor (ex. input).
+ * @param[in] retries - The number of times to retry.
+ * @param[in] delay - The time to sleep between retry attempts.
*
* @return val - The read value.
*/
uint32_t read(
const std::string& type,
const std::string& id,
- const std::string& sensor) const;
+ const std::string& sensor,
+ size_t retries,
+ std::chrono::milliseconds delay) const;
/** @brief Perform formatted hwmon sysfs write.
*
@@ -111,16 +121,23 @@
* the underlying hwmon driver is unbound and
* the program is inadvertently left running.
*
+ * For possibly transient errors will retry up to
+ * the specified number of times.
+ *
* @param[in] val - The value to be written.
* @param[in] type - The hwmon type (ex. fan).
* @param[in] id - The hwmon id (ex. 1).
- * @param[in] sensor - The hwmon sensor (ex. target).
+ * @param[in] retries - The number of times to retry.
+ * @param[in] delay - The time to sleep between retry attempts.
*/
void write(
uint32_t val,
const std::string& type,
const std::string& id,
- const std::string& sensor) const;
+ const std::string& sensor,
+ size_t retries,
+ std::chrono::milliseconds delay) const;
+
/** @brief Hwmon instance path access.
*
diff --git a/test/hwmonio.cpp b/test/hwmonio.cpp
index 2c6bdaf..00c4ae1 100644
--- a/test/hwmonio.cpp
+++ b/test/hwmonio.cpp
@@ -32,11 +32,16 @@
if ("read"s == argv[1])
{
- std::cout << io.read(argv[3], argv[4], argv[5]) << std::endl;
+ std::cout << io.read(argv[3], argv[4], argv[5],
+ sysfs::hwmonio::retries, sysfs::hwmonio::delay) <<
+ std::endl;
}
else
{
- io.write(strtol(argv[6], nullptr, 0), argv[3], argv[4], argv[5]);
+ io.write(
+ strtol(argv[6], nullptr, 0),
+ argv[3], argv[4], argv[5], sysfs::hwmonio::retries,
+ sysfs::hwmonio::delay);
}
return 0;