monitor: Add up/down count fault detection
Create an up/down count fault determination algorithm that
could be used in place of the current timer based outOfRange()
function.
The up/down count is a different method for determining when
a fan is faulted by counting up each iteration a rotor is
out of spec and removing those counts when the rotor
returns within spec.
Tested:
1. Removed a fan while running on Mihawk; the counter incremented by 1
while the sensor was out of spec. Replaced the fan before the
threshold was reached and the counter decremented back to 0.
2. Removed a fan; the counter incremented and the removed fan was
marked nonfunctional when the counter reached the threshold.
Replaced the fan; the counter decremented back to 0 and the fan
returned to functional.
Change-Id: I632dd2c7553b007beb7ae6bb694a590d2cfc2a1c
Signed-off-by: Jolie Ku <jolie_ku@wistron.com>
Signed-off-by: Matthew Barth <msbarth@us.ibm.com>
diff --git a/monitor/example/monitor.yaml b/monitor/example/monitor.yaml
index 59be7e6..8d70c4f 100644
--- a/monitor/example/monitor.yaml
+++ b/monitor/example/monitor.yaml
@@ -7,10 +7,14 @@
#fans:
# - inventory:
# [The system inventory location for the fan]
+# method: [timebased|count] (optional, defaults to timebased)
+# [Available methods:
+# * timebased - use timers to detect a fault (default)
+# * count - run up/down count fault detection]
# functional_delay:
# [Delay (in secs) before a fan is marked functional after returning
# within the allowed deviation]
-# allowed_out_of_range_time:
+# allowed_out_of_range_time: (required for method timebased)
# [Time (in secs) actual speed can be outside of deviation of
# target speed]
# deviation:
@@ -88,3 +92,24 @@
# - name: fan0_0
# in_trust: false
# - name: fan0_1
+
+#Example entries for count method fan system:
+#fans:
+# - inventory: /system/chassis/motherboard/fan2
+# method: count
+# functional_delay: 5
+# deviation: 15
+# num_sensors_nonfunc_for_fan_nonfunc: 1
+# sensors:
+# - name: fan2
+# has_target: true
+# # Only create fan functional properties and monitor on air cooled
+# condition:
+# name: propertiesMatch
+# properties:
+# - object: /xyz/openbmc_project/inventory/system/chassis
+# interface: xyz.openbmc_project.Inventory.Decorator.CoolingType
+# property:
+# name: WaterCooled
+# type: bool
+# value: false
diff --git a/monitor/fan.cpp b/monitor/fan.cpp
index 3f36dc7..abc9743 100644
--- a/monitor/fan.cpp
+++ b/monitor/fan.cpp
@@ -71,7 +71,8 @@
mode, bus, *this, std::get<sensorNameField>(s),
std::get<hasTargetField>(s), std::get<funcDelay>(def),
std::get<targetInterfaceField>(s), std::get<factorField>(s),
- std::get<offsetField>(s), std::get<timeoutField>(def),
+ std::get<offsetField>(s), std::get<methodField>(def),
+ std::get<thresholdField>(s), std::get<timeoutField>(def),
std::get<nonfuncRotorErrDelayField>(def), event));
_trustManager->registerSensor(_sensors.back());
@@ -167,20 +168,44 @@
{
if (sensor.functional())
{
- // Start nonfunctional timer if not already running
- sensor.startTimer(TimerMode::nonfunc);
+ switch (sensor.getMethod())
+ {
+ case MethodMode::timebased:
+ // Start nonfunctional timer if not already running
+ sensor.startTimer(TimerMode::nonfunc);
+ break;
+ case MethodMode::count:
+ sensor.setCounter(true);
+ if (sensor.getCounter() >= sensor.getThreshold())
+ {
+ updateState(sensor);
+ }
+ break;
+ }
}
}
else
{
- if (sensor.functional())
+ switch (sensor.getMethod())
{
- sensor.stopTimer();
- }
- else
- {
- // Start functional timer if not already running
- sensor.startTimer(TimerMode::func);
+ case MethodMode::timebased:
+ if (sensor.functional())
+ {
+ sensor.stopTimer();
+ }
+ else
+ {
+ // Start functional timer if not already running
+ sensor.startTimer(TimerMode::func);
+ }
+ break;
+ case MethodMode::count:
+ sensor.setCounter(false);
+ if (!sensor.functional() && sensor.getCounter() == 0)
+ {
+ updateState(sensor);
+ }
+ break;
}
}
}
@@ -233,7 +258,7 @@
return false;
}
-void Fan::timerExpired(TachSensor& sensor)
+void Fan::updateState(TachSensor& sensor)
{
sensor.setFunctional(!sensor.functional());
diff --git a/monitor/fan.hpp b/monitor/fan.hpp
index a50bbb5..91ea395 100644
--- a/monitor/fan.hpp
+++ b/monitor/fan.hpp
@@ -107,16 +107,16 @@
void tachChanged();
/**
- * @brief The callback function for the timer
+ * @brief Callback for a timer expiring or a count limit being reached
*
* Sets the sensor to not functional.
* If enough sensors are now not functional,
* updates the functional status of the whole
* fan in the inventory.
*
- * @param[in] sensor - the sensor whose timer expired
+ * @param[in] sensor - the sensor whose functional state is updated
*/
- void timerExpired(TachSensor& sensor);
+ void updateState(TachSensor& sensor);
/**
* @brief Get the name of the fan
diff --git a/monitor/gen-fan-monitor-defs.py b/monitor/gen-fan-monitor-defs.py
index 7386508..4452a85 100755
--- a/monitor/gen-fan-monitor-defs.py
+++ b/monitor/gen-fan-monitor-defs.py
@@ -50,8 +50,9 @@
{
%for fan_data in data.get('fans', {}):
FanDefinition{"${fan_data['inventory']}",
+ ${fan_data.get('method', {})},
${fan_data.get('functional_delay', 0)},
- ${fan_data['allowed_out_of_range_time']},
+ ${fan_data.get('allowed_out_of_range_time', {})},
${fan_data['deviation']},
${fan_data['num_sensors_nonfunc_for_fan_nonfunc']},
0, // Monitor start delay - not used in YAML configs
@@ -67,12 +68,14 @@
'xyz.openbmc_project.Control.FanSpeed')
factor = sensor.get('factor', 1)
offset = sensor.get('offset', 0)
+ threshold = sensor.get('threshold', 1)
%> \
SensorDefinition{"${sensor['name']}",
${has_target},
"${target_interface}",
${factor},
- ${offset}},
+ ${offset},
+ ${threshold}},
%endfor
},
%if ('condition' in fan_data) and \
diff --git a/monitor/json_parser.cpp b/monitor/json_parser.cpp
index b2b0c06..82fc621 100644
--- a/monitor/json_parser.cpp
+++ b/monitor/json_parser.cpp
@@ -20,6 +20,7 @@
#include "nonzero_speed_trust.hpp"
#include "power_interface.hpp"
#include "power_off_rule.hpp"
+#include "tach_sensor.hpp"
#include "types.hpp"
#include <fmt/format.h>
@@ -57,6 +58,8 @@
{"nonzerospeed", tClass::getNonZeroSpeed}};
const std::map<std::string, condHandler> conditions = {
{"propertiesmatch", condition::getPropertiesMatch}};
+const std::map<std::string, size_t> methods = {
+ {"timebased", MethodMode::timebased}, {"count", MethodMode::count}};
const std::vector<CreateGroupFunction> getTrustGrps(const json& obj)
{
@@ -155,10 +158,16 @@
{
offset = sensor["offset"].get<int64_t>();
}
+        // Threshold is optional and defaults to 1 (size_t to match SensorDefinition)
+        size_t threshold = 1;
+        if (sensor.contains("threshold"))
+        {
+            threshold = sensor["threshold"].get<size_t>();
+        }
- sensorDefs.emplace_back(std::tuple(sensor["name"].get<std::string>(),
- sensor["has_target"].get<bool>(),
- targetIntf, factor, offset));
+ sensorDefs.emplace_back(std::tuple(
+ sensor["name"].get<std::string>(), sensor["has_target"].get<bool>(),
+ targetIntf, factor, offset, threshold));
}
return sensorDefs;
@@ -170,16 +179,14 @@
for (const auto& fan : obj["fans"])
{
- if (!fan.contains("inventory") ||
- !fan.contains("allowed_out_of_range_time") ||
- !fan.contains("deviation") || !fan.contains("sensors"))
+ if (!fan.contains("inventory") || !fan.contains("deviation") ||
+ !fan.contains("sensors"))
{
// Log error on missing required parameters
log<level::ERR>(
"Missing required fan monitor definition parameters",
entry("REQUIRED_PARAMETERS=%s",
- "{inventory, allowed_out_of_range_time, deviation, "
- "sensors}"));
+ "{inventory, deviation, sensors}"));
throw std::runtime_error(
"Missing required fan monitor definition parameters");
}
@@ -193,6 +200,45 @@
funcDelay = fan["functional_delay"].get<size_t>();
}
+ // Method is optional and defaults to time based functional
+ // determination
+ size_t method = MethodMode::timebased;
+ if (fan.contains("method"))
+ {
+ auto methodConf = fan["method"].get<std::string>();
+ auto methodFunc = methods.find(methodConf);
+ if (methodFunc != methods.end())
+ {
+ method = methodFunc->second;
+ }
+ else
+ {
+ // Log error on unsupported method parameter
+ log<level::ERR>("Invalid fan method");
+ throw std::runtime_error("Invalid fan method");
+ }
+ }
+
+ // Timeout defaults to 0
+ size_t timeout = 0;
+ if (method == MethodMode::timebased)
+ {
+ if (!fan.contains("allowed_out_of_range_time"))
+ {
+ // Log error on missing required parameter
+ log<level::ERR>(
+ "Missing required fan monitor definition parameters",
+ entry("REQUIRED_PARAMETER=%s",
+ "{allowed_out_of_range_time}"));
+ throw std::runtime_error(
+ "Missing required fan monitor definition parameters");
+ }
+ else
+ {
+ timeout = fan["allowed_out_of_range_time"].get<size_t>();
+ }
+ }
+
// Monitor start delay is optional and defaults to 0
size_t monitorDelay = 0;
if (fan.contains("monitor_start_delay"))
@@ -261,9 +307,9 @@
entry("JSON_DUMP=%s", fan["condition"].dump().c_str()));
}
}
+
fanDefs.emplace_back(std::tuple(
- fan["inventory"].get<std::string>(), funcDelay,
- fan["allowed_out_of_range_time"].get<size_t>(),
+ fan["inventory"].get<std::string>(), method, funcDelay, timeout,
fan["deviation"].get<size_t>(), nonfuncSensorsCount, monitorDelay,
nonfuncRotorErrorDelay, fanMissingErrorDelay, sensorDefs, cond));
}
diff --git a/monitor/tach_sensor.cpp b/monitor/tach_sensor.cpp
index 637ff68..ed00a9b 100644
--- a/monitor/tach_sensor.cpp
+++ b/monitor/tach_sensor.cpp
@@ -70,15 +70,15 @@
TachSensor::TachSensor(Mode mode, sdbusplus::bus::bus& bus, Fan& fan,
const std::string& id, bool hasTarget, size_t funcDelay,
const std::string& interface, double factor,
- int64_t offset, size_t timeout,
- const std::optional<size_t>& errorDelay,
+ int64_t offset, size_t method, size_t threshold,
+ size_t timeout, const std::optional<size_t>& errorDelay,
const sdeventplus::Event& event) :
_bus(bus),
_fan(fan), _name(FAN_SENSOR_PATH + id), _invName(path(fan.getName()) / id),
_hasTarget(hasTarget), _funcDelay(funcDelay), _interface(interface),
- _factor(factor), _offset(offset), _timeout(timeout),
- _timerMode(TimerMode::func),
- _timer(event, std::bind(&Fan::timerExpired, &fan, std::ref(*this))),
+ _factor(factor), _offset(offset), _method(method), _threshold(threshold),
+ _timeout(timeout), _timerMode(TimerMode::func),
+ _timer(event, std::bind(&Fan::updateState, &fan, std::ref(*this))),
_errorDelay(errorDelay)
{
// Start from a known state of functional
@@ -251,6 +251,24 @@
}
}
+void TachSensor::setCounter(bool count)
+{
+    // Counting down: step toward zero, never going below it
+    if (!count)
+    {
+        if (_counter != 0)
+        {
+            --_counter;
+        }
+        return;
+    }
+    // Counting up: step toward the threshold, never going above it
+    if (_counter < _threshold)
+    {
+        ++_counter;
+    }
+}
+
void TachSensor::updateInventory(bool functional)
{
auto objectMap =
diff --git a/monitor/tach_sensor.hpp b/monitor/tach_sensor.hpp
index 814df69..76a800d 100644
--- a/monitor/tach_sensor.hpp
+++ b/monitor/tach_sensor.hpp
@@ -42,6 +42,17 @@
};
/**
+ * The method used to decide when a sensor's functional state changes:
+ * - timebased - Start/stop the nonfunctional/functional timers
+ * - count - Run up/down count fault detection
+ */
+enum MethodMode
+{
+ timebased = 0,
+ count
+};
+
+/**
* @class TachSensor
*
* This class represents the sensor that reads a tach value.
@@ -78,6 +89,8 @@
* @param[in] interface - the interface of the target
* @param[in] factor - the factor of the sensor target
* @param[in] offset - the offset of the sensor target
+ * @param[in] method - the out of range detection method (MethodMode)
+ * @param[in] threshold - the counter threshold for the count method
* @param[in] timeout - Normal timeout value to use
* @param[in] errorDelay - Delay in seconds before creating an error
* or std::nullopt if no errors.
@@ -87,7 +100,8 @@
TachSensor(Mode mode, sdbusplus::bus::bus& bus, Fan& fan,
const std::string& id, bool hasTarget, size_t funcDelay,
const std::string& interface, double factor, int64_t offset,
- size_t timeout, const std::optional<size_t>& errorDelay,
+ size_t method, size_t threshold, size_t timeout,
+ const std::optional<size_t>& errorDelay,
const sdeventplus::Event& event);
/**
@@ -136,6 +150,35 @@
}
/**
+ * @brief Returns the out of range detection method (a MethodMode value)
+ */
+ inline size_t getMethod() const
+ {
+ return _method;
+ }
+
+ /**
+ * @brief Returns the counter threshold used by the count method
+ */
+ inline size_t getThreshold() const
+ {
+ return _threshold;
+ }
+
+ /**
+ * @brief Increments (true) or decrements (false) the sensor faulted counter
+ */
+ void setCounter(bool count);
+
+ /**
+ * @brief Returns the current value of the sensor faulted counter
+ */
+ inline size_t getCounter() const
+ {
+ return _counter;
+ }
+
+ /**
* Returns true if the hardware behind this
* sensor is considered working OK/functional.
*/
@@ -290,6 +333,21 @@
const int64_t _offset;
/**
+ * @brief The out of range detection method (a MethodMode value)
+ */
+ const size_t _method;
+
+ /**
+ * @brief The upper limit of the fault counter for the count method
+ */
+ const size_t _threshold;
+
+ /**
+ * @brief The current up/down fault counter for the count method
+ */
+ size_t _counter = 0;
+
+ /**
* @brief The input speed, from the Value dbus property
*/
double _tachInput = 0;
diff --git a/monitor/types.hpp b/monitor/types.hpp
index 770998c..609199a 100644
--- a/monitor/types.hpp
+++ b/monitor/types.hpp
@@ -1,5 +1,6 @@
#pragma once
+#include "tach_sensor.hpp"
#include "trust_group.hpp"
#include <nlohmann/json.hpp>
@@ -97,23 +98,25 @@
constexpr auto targetInterfaceField = 2;
constexpr auto factorField = 3;
constexpr auto offsetField = 4;
+constexpr auto thresholdField = 5;
using SensorDefinition =
- std::tuple<std::string, bool, std::string, double, int64_t>;
+ std::tuple<std::string, bool, std::string, double, int64_t, size_t>;
constexpr auto fanNameField = 0;
-constexpr auto funcDelay = 1;
-constexpr auto timeoutField = 2;
-constexpr auto fanDeviationField = 3;
-constexpr auto numSensorFailsForNonfuncField = 4;
-constexpr auto monitorStartDelayField = 5;
-constexpr auto nonfuncRotorErrDelayField = 6;
-constexpr auto fanMissingErrDelayField = 7;
-constexpr auto sensorListField = 8;
-constexpr auto conditionField = 9;
+constexpr auto methodField = 1;
+constexpr auto funcDelay = 2;
+constexpr auto timeoutField = 3;
+constexpr auto fanDeviationField = 4;
+constexpr auto numSensorFailsForNonfuncField = 5;
+constexpr auto monitorStartDelayField = 6;
+constexpr auto nonfuncRotorErrDelayField = 7;
+constexpr auto fanMissingErrDelayField = 8;
+constexpr auto sensorListField = 9;
+constexpr auto conditionField = 10;
using FanDefinition =
- std::tuple<std::string, size_t, size_t, size_t, size_t, size_t,
+ std::tuple<std::string, size_t, size_t, size_t, size_t, size_t, size_t,
std::optional<size_t>, std::optional<size_t>,
std::vector<SensorDefinition>, std::optional<Condition>>;