monitor: Add up/down count fault detection
Create an up/down count fault determination algorithm that
could be used in place of the current timer based outOfRange()
function.
The up/down count is a different method for determining when
a fan is faulted by counting up each iteration a rotor is
out of spec and removing those counts when the rotor
returns within spec.
Tested:
1. Remove a fan and run Mihawk; the counter adds 1 when the sensor
is out of spec, and when the fan is put back before the threshold
is hit, the counter decrements back to 0.
2. Remove a fan; the counter adds 1 and the removed fan is marked
nonfunctional when the counter reaches the threshold. When the
fan is put back, the counter decrements back to 0 and the fan
returns to functional.
Change-Id: I632dd2c7553b007beb7ae6bb694a590d2cfc2a1c
Signed-off-by: Jolie Ku <jolie_ku@wistron.com>
Signed-off-by: Matthew Barth <msbarth@us.ibm.com>
diff --git a/monitor/json_parser.cpp b/monitor/json_parser.cpp
index b2b0c06..82fc621 100644
--- a/monitor/json_parser.cpp
+++ b/monitor/json_parser.cpp
@@ -20,6 +20,7 @@
#include "nonzero_speed_trust.hpp"
#include "power_interface.hpp"
#include "power_off_rule.hpp"
+#include "tach_sensor.hpp"
#include "types.hpp"
#include <fmt/format.h>
@@ -57,6 +58,8 @@
{"nonzerospeed", tClass::getNonZeroSpeed}};
const std::map<std::string, condHandler> conditions = {
{"propertiesmatch", condition::getPropertiesMatch}};
+const std::map<std::string, size_t> methods = {
+ {"timebased", MethodMode::timebased}, {"count", MethodMode::count}};
const std::vector<CreateGroupFunction> getTrustGrps(const json& obj)
{
@@ -155,10 +158,16 @@
{
offset = sensor["offset"].get<int64_t>();
}
+ // Threshold is optional and defaults to 1
+ auto threshold = 1;
+ if (sensor.contains("threshold"))
+ {
+ threshold = sensor["threshold"].get<size_t>();
+ }
- sensorDefs.emplace_back(std::tuple(sensor["name"].get<std::string>(),
- sensor["has_target"].get<bool>(),
- targetIntf, factor, offset));
+ sensorDefs.emplace_back(std::tuple(
+ sensor["name"].get<std::string>(), sensor["has_target"].get<bool>(),
+ targetIntf, factor, offset, threshold));
}
return sensorDefs;
@@ -170,16 +179,14 @@
for (const auto& fan : obj["fans"])
{
- if (!fan.contains("inventory") ||
- !fan.contains("allowed_out_of_range_time") ||
- !fan.contains("deviation") || !fan.contains("sensors"))
+ if (!fan.contains("inventory") || !fan.contains("deviation") ||
+ !fan.contains("sensors"))
{
// Log error on missing required parameters
log<level::ERR>(
"Missing required fan monitor definition parameters",
entry("REQUIRED_PARAMETERS=%s",
- "{inventory, allowed_out_of_range_time, deviation, "
- "sensors}"));
+ "{inventory, deviation, sensors}"));
throw std::runtime_error(
"Missing required fan monitor definition parameters");
}
@@ -193,6 +200,45 @@
funcDelay = fan["functional_delay"].get<size_t>();
}
+ // Method is optional and defaults to time based functional
+ // determination
+ size_t method = MethodMode::timebased;
+ if (fan.contains("method"))
+ {
+ auto methodConf = fan["method"].get<std::string>();
+ auto methodFunc = methods.find(methodConf);
+ if (methodFunc != methods.end())
+ {
+ method = methodFunc->second;
+ }
+ else
+ {
+ // Log error on unsupported method parameter
+ log<level::ERR>("Invalid fan method");
+ throw std::runtime_error("Invalid fan method");
+ }
+ }
+
+ // Timeout defaults to 0
+ size_t timeout = 0;
+ if (method == MethodMode::timebased)
+ {
+ if (!fan.contains("allowed_out_of_range_time"))
+ {
+ // Log error on missing required parameter
+ log<level::ERR>(
+ "Missing required fan monitor definition parameters",
+ entry("REQUIRED_PARAMETER=%s",
+ "{allowed_out_of_range_time}"));
+ throw std::runtime_error(
+ "Missing required fan monitor definition parameters");
+ }
+ else
+ {
+ timeout = fan["allowed_out_of_range_time"].get<size_t>();
+ }
+ }
+
// Monitor start delay is optional and defaults to 0
size_t monitorDelay = 0;
if (fan.contains("monitor_start_delay"))
@@ -261,9 +307,9 @@
entry("JSON_DUMP=%s", fan["condition"].dump().c_str()));
}
}
+
fanDefs.emplace_back(std::tuple(
- fan["inventory"].get<std::string>(), funcDelay,
- fan["allowed_out_of_range_time"].get<size_t>(),
+ fan["inventory"].get<std::string>(), method, funcDelay, timeout,
fan["deviation"].get<size_t>(), nonfuncSensorsCount, monitorDelay,
nonfuncRotorErrorDelay, fanMissingErrorDelay, sensorDefs, cond));
}