regulators: Retry failed sensor monitoring
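If a failure occurs while trying to read voltage regulator sensors,
retry on subsequent monitoring attempts and only create an error log
entry after 6 consecutive failures (the initial failure plus 5 retries).
Each of those failures is still logged to the journal.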
This provides "de-glitching" to ignore transient hardware problems.
Signed-off-by: Shawn McCarney <shawnmm@us.ibm.com>
Change-Id: I310c15eb0f0d36d938057d6280a12b5aef854d20
diff --git a/phosphor-regulators/src/sensor_monitoring.cpp b/phosphor-regulators/src/sensor_monitoring.cpp
index b5e31ef..04a0547 100644
--- a/phosphor-regulators/src/sensor_monitoring.cpp
+++ b/phosphor-regulators/src/sensor_monitoring.cpp
@@ -31,6 +31,15 @@
namespace phosphor::power::regulators
{
+/**
+ * Number of consecutive sensor monitoring errors at which an error log entry
+ * is created. Shorter runs are treated as transient ("de-glitching").
+ *
+ * Also the maximum number of consecutive errors that will be logged to the
+ * journal. The count is reset when sensors are read successfully.
+ */
+constexpr unsigned short maxErrorCount{6};
+
void SensorMonitoring::execute(Services& services, System& system,
Chassis& chassis, Device& device, Rail& rail)
{
@@ -40,7 +49,6 @@
chassis.getInventoryPath());
// Read all sensors defined for this rail
- bool errorOccurred{false};
try
{
// Create ActionEnvironment
@@ -49,27 +57,32 @@
// Execute the actions
action_utils::execute(actions, environment);
+
+ // Reset consecutive error count since sensors were read successfully
+ errorCount = 0;
}
catch (const std::exception& e)
{
- // Set flag to notify sensors service that an error occurred
- errorOccurred = true;
-
- // Log error messages in journal for the first 3 errors
- if (++errorCount <= 3)
+ // If we haven't hit the maximum consecutive error count yet
+ if (errorCount < maxErrorCount)
{
+ // Log error messages in journal
services.getJournal().logError(exception_utils::getMessages(e));
services.getJournal().logError(
"Unable to monitor sensors for rail " + rail.getID());
- }
- // Create error log entry if this type hasn't already been logged
- error_logging_utils::logError(std::current_exception(),
- Entry::Level::Warning, services,
- errorHistory);
+ // Increment error count. If now at max, create error log entry.
+ if (++errorCount >= maxErrorCount)
+ {
+ error_logging_utils::logError(std::current_exception(),
+ Entry::Level::Warning, services,
+ errorHistory);
+ }
+ }
}
// Notify sensors service that monitoring has ended for this rail
+ bool errorOccurred = (errorCount > 0);
sensors.endRail(errorOccurred);
}
diff --git a/phosphor-regulators/src/sensor_monitoring.hpp b/phosphor-regulators/src/sensor_monitoring.hpp
index fcb02a5..47db47e 100644
--- a/phosphor-regulators/src/sensor_monitoring.hpp
+++ b/phosphor-regulators/src/sensor_monitoring.hpp
@@ -117,9 +117,9 @@
ErrorHistory errorHistory{};
/**
- * Number of errors that have occurred.
+ * Number of consecutive errors that have occurred.
*/
- unsigned int errorCount{0};
+ unsigned short errorCount{0};
};
} // namespace phosphor::power::regulators