monitor: Add up/down count fault detection

Create an up/down count fault determination algorithm that
could be used in place of the current timer based outOfRange()
function.
The up/down count is a different method for determining when
a fan is faulted by counting up each iteration a rotor is
out of spec and removing those counts when the rotor
returns within spec.

Tested:
    1. Remove a fan and run Mihawk: the counter increments by 1 each
       time the sensor is out of spec. Put the fan back before the
       threshold is hit and the counter decrements back to 0.
    2. Remove a fan: the counter increments and the removed fan is
       marked nonfunctional when the counter reaches the threshold.
       Put the fan back and the counter decrements back to 0 and the
       fan returns to functional.

Change-Id: I632dd2c7553b007beb7ae6bb694a590d2cfc2a1c
Signed-off-by: Jolie Ku <jolie_ku@wistron.com>
Signed-off-by: Matthew Barth <msbarth@us.ibm.com>
diff --git a/monitor/example/monitor.yaml b/monitor/example/monitor.yaml
index 59be7e6..8d70c4f 100644
--- a/monitor/example/monitor.yaml
+++ b/monitor/example/monitor.yaml
@@ -7,10 +7,14 @@
 #fans:
 # - inventory:
 #    [The system inventory location for the fan]
+#  method: [timebased|count](optional, defaults to timebased)
+#    [Available methods:
+#     * timebased - fault when out of range for allowed_out_of_range_time
+#     * count - run up/down count fault detection]
 #  functional_delay:
 #    [Delay (in secs) before a fan is marked functional after returning
 #     within the allowed deviation]
-#  allowed_out_of_range_time:
+#  allowed_out_of_range_time: (required for method timebased)
 #    [Time (in secs) actual speed can be outside of deviation of
 #     target speed]
 #  deviation:
@@ -88,3 +92,24 @@
 #      - name: fan0_0
 #        in_trust: false
 #      - name: fan0_1
+
+#Example entries for count method fan system:
+#fans:
+#  - inventory: /system/chassis/motherboard/fan2
+#    method: count
+#    functional_delay: 5
+#    deviation: 15
+#    num_sensors_nonfunc_for_fan_nonfunc: 1
+#    sensors:
+#      - name: fan2
+#        has_target: true
+#    # Only create fan functional properties and monitor on air cooled
+#    condition:
+#      name: propertiesMatch
+#      properties:
+#        - object: /xyz/openbmc_project/inventory/system/chassis
+#          interface: xyz.openbmc_project.Inventory.Decorator.CoolingType
+#          property:
+#            name: WaterCooled
+#            type: bool
+#            value: false
diff --git a/monitor/fan.cpp b/monitor/fan.cpp
index 3f36dc7..abc9743 100644
--- a/monitor/fan.cpp
+++ b/monitor/fan.cpp
@@ -71,7 +71,8 @@
                 mode, bus, *this, std::get<sensorNameField>(s),
                 std::get<hasTargetField>(s), std::get<funcDelay>(def),
                 std::get<targetInterfaceField>(s), std::get<factorField>(s),
-                std::get<offsetField>(s), std::get<timeoutField>(def),
+                std::get<offsetField>(s), std::get<methodField>(def),
+                std::get<thresholdField>(s), std::get<timeoutField>(def),
                 std::get<nonfuncRotorErrDelayField>(def), event));
 
             _trustManager->registerSensor(_sensors.back());
@@ -167,20 +168,44 @@
     {
         if (sensor.functional())
         {
-            // Start nonfunctional timer if not already running
-            sensor.startTimer(TimerMode::nonfunc);
+            switch (sensor.getMethod())
+            {
+                case MethodMode::timebased:
+                    // Start nonfunctional timer if not already running
+                    sensor.startTimer(TimerMode::nonfunc);
+                    break;
+                case MethodMode::count:
+                    sensor.setCounter(true);
+                    if (sensor.getCounter() >= sensor.getThreshold())
+                    {
+                        updateState(sensor);
+                    }
+                    break;
+            }
         }
     }
     else
     {
-        if (sensor.functional())
+        switch (sensor.getMethod())
         {
-            sensor.stopTimer();
-        }
-        else
-        {
-            // Start functional timer if not already running
-            sensor.startTimer(TimerMode::func);
+            case MethodMode::timebased:
+                if (sensor.functional())
+                {
+                    sensor.stopTimer();
+                }
+                else
+                {
+                    // Start functional timer if not already running
+                    sensor.startTimer(TimerMode::func);
+                }
+                break;
+            case MethodMode::count:
+                sensor.setCounter(false);
+                if (!sensor.functional() && sensor.getCounter() == 0)
+                {
+                    updateState(sensor);
+                }
+                break;
         }
     }
 }
@@ -233,7 +258,7 @@
     return false;
 }
 
-void Fan::timerExpired(TachSensor& sensor)
+void Fan::updateState(TachSensor& sensor)
 {
     sensor.setFunctional(!sensor.functional());
 
diff --git a/monitor/fan.hpp b/monitor/fan.hpp
index a50bbb5..91ea395 100644
--- a/monitor/fan.hpp
+++ b/monitor/fan.hpp
@@ -107,16 +107,16 @@
     void tachChanged();
 
     /**
-     * @brief The callback function for the timer
+     * @brief The state update callback for the timer or count method
      *
      * Sets the sensor to not functional.
      * If enough sensors are now not functional,
      * updates the functional status of the whole
      * fan in the inventory.
      *
-     * @param[in] sensor - the sensor whose timer expired
+     * @param[in] sensor - the sensor for state update
      */
-    void timerExpired(TachSensor& sensor);
+    void updateState(TachSensor& sensor);
 
     /**
      * @brief Get the name of the fan
diff --git a/monitor/gen-fan-monitor-defs.py b/monitor/gen-fan-monitor-defs.py
index 7386508..4452a85 100755
--- a/monitor/gen-fan-monitor-defs.py
+++ b/monitor/gen-fan-monitor-defs.py
@@ -50,8 +50,9 @@
 {
 %for fan_data in data.get('fans', {}):
     FanDefinition{"${fan_data['inventory']}",
+                  ${fan_data.get('method', {})},
                   ${fan_data.get('functional_delay', 0)},
-                  ${fan_data['allowed_out_of_range_time']},
+                  ${fan_data.get('allowed_out_of_range_time', {})},
                   ${fan_data['deviation']},
                   ${fan_data['num_sensors_nonfunc_for_fan_nonfunc']},
                   0, // Monitor start delay - not used in YAML configs
@@ -67,12 +68,14 @@
                           'xyz.openbmc_project.Control.FanSpeed')
                       factor = sensor.get('factor', 1)
                       offset = sensor.get('offset', 0)
+                      threshold = sensor.get('threshold', 1)
                   %> \
                       SensorDefinition{"${sensor['name']}",
                                        ${has_target},
                                        "${target_interface}",
                                        ${factor},
-                                       ${offset}},
+                                       ${offset},
+                                       ${threshold}},
                   %endfor
                   },
                   %if ('condition' in fan_data) and \
diff --git a/monitor/json_parser.cpp b/monitor/json_parser.cpp
index b2b0c06..82fc621 100644
--- a/monitor/json_parser.cpp
+++ b/monitor/json_parser.cpp
@@ -20,6 +20,7 @@
 #include "nonzero_speed_trust.hpp"
 #include "power_interface.hpp"
 #include "power_off_rule.hpp"
+#include "tach_sensor.hpp"
 #include "types.hpp"
 
 #include <fmt/format.h>
@@ -57,6 +58,8 @@
     {"nonzerospeed", tClass::getNonZeroSpeed}};
 const std::map<std::string, condHandler> conditions = {
     {"propertiesmatch", condition::getPropertiesMatch}};
+const std::map<std::string, size_t> methods = {
+    {"timebased", MethodMode::timebased}, {"count", MethodMode::count}};
 
 const std::vector<CreateGroupFunction> getTrustGrps(const json& obj)
 {
@@ -155,10 +158,16 @@
         {
             offset = sensor["offset"].get<int64_t>();
         }
+        // Optional threshold, defaults to 1; size_t to match SensorDefinition
+        size_t threshold = 1;
+        if (sensor.contains("threshold"))
+        {
+            threshold = sensor["threshold"].get<size_t>();
+        }
 
-        sensorDefs.emplace_back(std::tuple(sensor["name"].get<std::string>(),
-                                           sensor["has_target"].get<bool>(),
-                                           targetIntf, factor, offset));
+        sensorDefs.emplace_back(std::tuple(
+            sensor["name"].get<std::string>(), sensor["has_target"].get<bool>(),
+            targetIntf, factor, offset, threshold));
     }
 
     return sensorDefs;
@@ -170,16 +179,14 @@
 
     for (const auto& fan : obj["fans"])
     {
-        if (!fan.contains("inventory") ||
-            !fan.contains("allowed_out_of_range_time") ||
-            !fan.contains("deviation") || !fan.contains("sensors"))
+        if (!fan.contains("inventory") || !fan.contains("deviation") ||
+            !fan.contains("sensors"))
         {
             // Log error on missing required parameters
             log<level::ERR>(
                 "Missing required fan monitor definition parameters",
                 entry("REQUIRED_PARAMETERS=%s",
-                      "{inventory, allowed_out_of_range_time, deviation, "
-                      "sensors}"));
+                      "{inventory, deviation, sensors}"));
             throw std::runtime_error(
                 "Missing required fan monitor definition parameters");
         }
@@ -193,6 +200,45 @@
             funcDelay = fan["functional_delay"].get<size_t>();
         }
 
+        // Method is optional and defaults to time based functional
+        // determination
+        size_t method = MethodMode::timebased;
+        if (fan.contains("method"))
+        {
+            auto methodConf = fan["method"].get<std::string>();
+            auto methodFunc = methods.find(methodConf);
+            if (methodFunc != methods.end())
+            {
+                method = methodFunc->second;
+            }
+            else
+            {
+                // Log error on unsupported method parameter
+                log<level::ERR>("Invalid fan method");
+                throw std::runtime_error("Invalid fan method");
+            }
+        }
+
+        // Timeout defaults to 0
+        size_t timeout = 0;
+        if (method == MethodMode::timebased)
+        {
+            if (!fan.contains("allowed_out_of_range_time"))
+            {
+                // Log error on missing required parameter
+                log<level::ERR>(
+                    "Missing required fan monitor definition parameters",
+                    entry("REQUIRED_PARAMETER=%s",
+                          "{allowed_out_of_range_time}"));
+                throw std::runtime_error(
+                    "Missing required fan monitor definition parameters");
+            }
+            else
+            {
+                timeout = fan["allowed_out_of_range_time"].get<size_t>();
+            }
+        }
+
         // Monitor start delay is optional and defaults to 0
         size_t monitorDelay = 0;
         if (fan.contains("monitor_start_delay"))
@@ -261,9 +307,9 @@
                     entry("JSON_DUMP=%s", fan["condition"].dump().c_str()));
             }
         }
+
         fanDefs.emplace_back(std::tuple(
-            fan["inventory"].get<std::string>(), funcDelay,
-            fan["allowed_out_of_range_time"].get<size_t>(),
+            fan["inventory"].get<std::string>(), method, funcDelay, timeout,
             fan["deviation"].get<size_t>(), nonfuncSensorsCount, monitorDelay,
             nonfuncRotorErrorDelay, fanMissingErrorDelay, sensorDefs, cond));
     }
diff --git a/monitor/tach_sensor.cpp b/monitor/tach_sensor.cpp
index 637ff68..ed00a9b 100644
--- a/monitor/tach_sensor.cpp
+++ b/monitor/tach_sensor.cpp
@@ -70,15 +70,15 @@
 TachSensor::TachSensor(Mode mode, sdbusplus::bus::bus& bus, Fan& fan,
                        const std::string& id, bool hasTarget, size_t funcDelay,
                        const std::string& interface, double factor,
-                       int64_t offset, size_t timeout,
-                       const std::optional<size_t>& errorDelay,
+                       int64_t offset, size_t method, size_t threshold,
+                       size_t timeout, const std::optional<size_t>& errorDelay,
                        const sdeventplus::Event& event) :
     _bus(bus),
     _fan(fan), _name(FAN_SENSOR_PATH + id), _invName(path(fan.getName()) / id),
     _hasTarget(hasTarget), _funcDelay(funcDelay), _interface(interface),
-    _factor(factor), _offset(offset), _timeout(timeout),
-    _timerMode(TimerMode::func),
-    _timer(event, std::bind(&Fan::timerExpired, &fan, std::ref(*this))),
+    _factor(factor), _offset(offset), _method(method), _threshold(threshold),
+    _timeout(timeout), _timerMode(TimerMode::func),
+    _timer(event, std::bind(&Fan::updateState, &fan, std::ref(*this))),
     _errorDelay(errorDelay)
 {
     // Start from a known state of functional
@@ -251,6 +251,24 @@
     }
 }
 
+// Steps the fault counter: up (saturating at _threshold) when count is
+// true, down (stopping at zero) when it is false.
+void TachSensor::setCounter(bool count)
+{
+    if (!count)
+    {
+        if (_counter != 0)
+        {
+            --_counter;
+        }
+        return;
+    }
+    if (_counter < _threshold)
+    {
+        ++_counter;
+    }
+}
+
 void TachSensor::updateInventory(bool functional)
 {
     auto objectMap =
diff --git a/monitor/tach_sensor.hpp b/monitor/tach_sensor.hpp
index 814df69..76a800d 100644
--- a/monitor/tach_sensor.hpp
+++ b/monitor/tach_sensor.hpp
@@ -42,6 +42,17 @@
 };
 
 /**
+ * The method used to decide when a sensor is faulted:
+ *   - timebased - Fault once out of range for the allowed time (timer)
+ *   - count - Run up/down count fault detection
+ */
+enum MethodMode
+{
+    timebased = 0,
+    count
+};
+
+/**
  * @class TachSensor
  *
  * This class represents the sensor that reads a tach value.
@@ -78,6 +89,8 @@
      * @param[in] interface - the interface of the target
      * @param[in] factor - the factor of the sensor target
      * @param[in] offset - the offset of the sensor target
+     * @param[in] method - the fault detection method (MethodMode value)
+     * @param[in] threshold - count method: count that faults the sensor
      * @param[in] timeout - Normal timeout value to use
      * @param[in] errorDelay - Delay in seconds before creating an error
      *                         or std::nullopt if no errors.
@@ -87,7 +100,8 @@
     TachSensor(Mode mode, sdbusplus::bus::bus& bus, Fan& fan,
                const std::string& id, bool hasTarget, size_t funcDelay,
                const std::string& interface, double factor, int64_t offset,
-               size_t timeout, const std::optional<size_t>& errorDelay,
+               size_t method, size_t threshold, size_t timeout,
+               const std::optional<size_t>& errorDelay,
                const sdeventplus::Event& event);
 
     /**
@@ -136,6 +150,35 @@
     }
 
     /**
+     * @brief Returns the fault detection method (a MethodMode value)
+     */
+    inline size_t getMethod() const
+    {
+        return _method;
+    }
+
+    /**
+     * @brief Returns the count at which the count method faults the sensor
+     */
+    inline size_t getThreshold() const
+    {
+        return _threshold;
+    }
+
+    /**
+     * @brief Steps the faulted counter up (count true) or down (false)
+     */
+    void setCounter(bool count);
+
+    /**
+     * @brief Returns the current faulted count (0 up to the threshold)
+     */
+    inline size_t getCounter() const
+    {
+        return _counter;
+    }
+
+    /**
      * Returns true if the hardware behind this
      * sensor is considered working OK/functional.
      */
@@ -290,6 +333,21 @@
     const int64_t _offset;
 
     /**
+     * @brief The fault detection method (a MethodMode value)
+     */
+    const size_t _method;
+
+    /**
+     * @brief Count method: faulted count at which the sensor is faulted
+     */
+    const size_t _threshold;
+
+    /**
+     * @brief Count method: current faulted count (0 up to _threshold)
+     */
+    size_t _counter = 0;
+
+    /**
      * @brief The input speed, from the Value dbus property
      */
     double _tachInput = 0;
diff --git a/monitor/types.hpp b/monitor/types.hpp
index 770998c..609199a 100644
--- a/monitor/types.hpp
+++ b/monitor/types.hpp
@@ -1,5 +1,6 @@
 #pragma once
 
+#include "tach_sensor.hpp"
 #include "trust_group.hpp"
 
 #include <nlohmann/json.hpp>
@@ -97,23 +98,25 @@
 constexpr auto targetInterfaceField = 2;
 constexpr auto factorField = 3;
 constexpr auto offsetField = 4;
+constexpr auto thresholdField = 5;
 
 using SensorDefinition =
-    std::tuple<std::string, bool, std::string, double, int64_t>;
+    std::tuple<std::string, bool, std::string, double, int64_t, size_t>;
 
 constexpr auto fanNameField = 0;
-constexpr auto funcDelay = 1;
-constexpr auto timeoutField = 2;
-constexpr auto fanDeviationField = 3;
-constexpr auto numSensorFailsForNonfuncField = 4;
-constexpr auto monitorStartDelayField = 5;
-constexpr auto nonfuncRotorErrDelayField = 6;
-constexpr auto fanMissingErrDelayField = 7;
-constexpr auto sensorListField = 8;
-constexpr auto conditionField = 9;
+constexpr auto methodField = 1;
+constexpr auto funcDelay = 2;
+constexpr auto timeoutField = 3;
+constexpr auto fanDeviationField = 4;
+constexpr auto numSensorFailsForNonfuncField = 5;
+constexpr auto monitorStartDelayField = 6;
+constexpr auto nonfuncRotorErrDelayField = 7;
+constexpr auto fanMissingErrDelayField = 8;
+constexpr auto sensorListField = 9;
+constexpr auto conditionField = 10;
 
 using FanDefinition =
-    std::tuple<std::string, size_t, size_t, size_t, size_t, size_t,
+    std::tuple<std::string, size_t, size_t, size_t, size_t, size_t, size_t,
                std::optional<size_t>, std::optional<size_t>,
                std::vector<SensorDefinition>, std::optional<Condition>>;