monitor: Create count timer
When the method for monitoring fan speeds is set to 'count', which is
when an up/down counter is used determine when a fan should be
considered faulted/nonfunctional, there needs to be some interval at
which the speeds are checked and the count is changed if necessary.
Otherwise, if the checks just happened in the tach changed callback as
they do today, then a fan with a non-changing rotor speed would never
trigger a check.
This commit creates a new timer in the Fan class that is constantly
running when the monitor is active if any of the fan's sensors use this
mode. In the expiration function, the sensors' tach targets will be
compared to their input values and the error count will be
incremented/decremented as was previously happening in the tach changed
callback.
The interval to use is read from a new 'count_interval' property in the
JSON, which defaults to 1 if not present and the count method is
enabled.
The timer is started in the startMonitor function, and stopped when a
power off is detected.
This commit also added some DEBUG journal traces when the counter on a
sensor changes values to help with debugging the new code, and so it can
be enabled on a system if desired by enabling debug level tracing.
Signed-off-by: Matt Spinler <spinler@us.ibm.com>
Change-Id: I238606e95bb91df93afd6ec7c00bd0577bc603f2
diff --git a/monitor/fan.cpp b/monitor/fan.cpp
index 8a30300..52b4031 100644
--- a/monitor/fan.cpp
+++ b/monitor/fan.cpp
@@ -61,8 +61,11 @@
rules::argNpath(0, util::INVENTORY_PATH + _name),
std::bind(std::mem_fn(&Fan::presenceIfaceAdded), this,
std::placeholders::_1)),
- _fanMissingErrorDelay(std::get<fanMissingErrDelayField>(def))
+ _fanMissingErrorDelay(std::get<fanMissingErrDelayField>(def)),
+ _countInterval(std::get<countIntervalField>(def))
{
+ bool enableCountTimer = false;
+
// Start from a known state of functional (even if
// _numSensorFailsForNonFunc is 0)
updateInventory(true);
@@ -80,6 +83,23 @@
std::get<nonfuncRotorErrDelayField>(def), event));
_trustManager->registerSensor(_sensors.back());
+ if (_sensors.back()->getMethod() == MethodMode::count)
+ {
+ enableCountTimer = true;
+ }
+ }
+
+ // If the error checking method will be 'count', then it needs a timer.
+ // The timer is repeating but is disabled immediately because it doesn't
+ // need to start yet.
+ if (enableCountTimer)
+ {
+ _countTimer = std::make_unique<
+ sdeventplus::utility::Timer<sdeventplus::ClockId::Monotonic>>(
+ event, std::bind(&Fan::countTimerExpired, this),
+ std::chrono::seconds(_countInterval));
+
+ _countTimer->setEnabled(false);
}
#ifndef MONITOR_USE_JSON
@@ -170,6 +190,12 @@
{
_monitorReady = true;
+ if (_countTimer)
+ {
+ _countTimer->resetRemaining();
+ _countTimer->setEnabled(true);
+ }
+
std::for_each(_sensors.begin(), _sensors.end(), [this](auto& sensor) {
if (_present)
{
@@ -234,7 +260,28 @@
}
}
- process(sensor);
+ // If using the timebased method to determine functional status,
+ // check now, otherwise let _countTimer handle it. A timer is
+ // used for the count method so that stuck sensors will continue
+ // to be checked.
+ if (sensor.getMethod() == MethodMode::timebased)
+ {
+ process(sensor);
+ }
+}
+
+void Fan::countTimerExpired()
+{
+ // For sensors that use the 'count' method, time to check their
+ // status and increment/decrement counts as necessary.
+ for (auto& sensor : _sensors)
+ {
+ if (_trustManager->active() && !_trustManager->checkTrust(*sensor))
+ {
+ continue;
+ }
+ process(*sensor);
+ }
}
void Fan::process(TachSensor& sensor)
@@ -455,6 +502,12 @@
sensor->setFunctional(true);
_system.fanStatusChange(*this, true);
}
+
+ // Set the counters back to zero
+ if (sensor->getMethod() == MethodMode::count)
+ {
+ sensor->resetMethod();
+ }
}
catch (const util::DBusServiceError& e)
{
@@ -509,6 +562,11 @@
sensor->stopTimer();
}
});
+
+ if (_countTimer)
+ {
+ _countTimer->setEnabled(false);
+ }
}
#endif
}
diff --git a/monitor/fan.hpp b/monitor/fan.hpp
index 4bf1757..92016b1 100644
--- a/monitor/fan.hpp
+++ b/monitor/fan.hpp
@@ -176,6 +176,12 @@
*/
void powerStateChanged(bool powerStateOn);
+ /**
+ * @brief Timer callback function that deals with sensors using
+ * the 'count' method for determining functional status.
+ */
+ void countTimerExpired();
+
private:
/**
* @brief Returns true if the sensor input is not within
@@ -320,6 +326,20 @@
std::unique_ptr<
sdeventplus::utility::Timer<sdeventplus::ClockId::Monotonic>>
_fanMissingErrorTimer;
+
+ /**
+ * @brief The interval, in seconds, to use for the timer that runs
+ * the checks for sensors using the 'count' method.
+ */
+ size_t _countInterval;
+
+ /**
+ * @brief The timer whose callback function handles the sensors
+ * using the 'count' method for determining functional status.
+ */
+ std::unique_ptr<
+ sdeventplus::utility::Timer<sdeventplus::ClockId::Monotonic>>
+ _countTimer;
};
} // namespace monitor
diff --git a/monitor/gen-fan-monitor-defs.py b/monitor/gen-fan-monitor-defs.py
index 4452a85..b77ea59 100755
--- a/monitor/gen-fan-monitor-defs.py
+++ b/monitor/gen-fan-monitor-defs.py
@@ -56,6 +56,7 @@
${fan_data['deviation']},
${fan_data['num_sensors_nonfunc_for_fan_nonfunc']},
0, // Monitor start delay - not used in YAML configs
+ 0, // Count interval - not used in YAML configs
std::nullopt, // nonfuncRotorErrorDelay - also not used here
std::nullopt, // fanMissingErrorDelay - also not used here
std::vector<SensorDefinition>{
diff --git a/monitor/json_parser.cpp b/monitor/json_parser.cpp
index 9218634..888a57d 100644
--- a/monitor/json_parser.cpp
+++ b/monitor/json_parser.cpp
@@ -216,6 +216,7 @@
// Method is optional and defaults to time based functional
// determination
size_t method = MethodMode::timebased;
+ size_t countInterval = 1;
if (fan.contains("method"))
{
auto methodConf = fan["method"].get<std::string>();
@@ -230,6 +231,15 @@
log<level::ERR>("Invalid fan method");
throw std::runtime_error("Invalid fan method");
}
+
+ // Read the count interval value used with the count method.
+ if (method == MethodMode::count)
+ {
+ if (fan.contains("count_interval"))
+ {
+ countInterval = fan["count_interval"].get<size_t>();
+ }
+ }
}
// Timeout defaults to 0
@@ -323,7 +333,7 @@
fanDefs.emplace_back(std::tuple(
fan["inventory"].get<std::string>(), method, funcDelay, timeout,
- deviation, nonfuncSensorsCount, monitorDelay,
+ deviation, nonfuncSensorsCount, monitorDelay, countInterval,
nonfuncRotorErrorDelay, fanMissingErrorDelay, sensorDefs, cond));
}
diff --git a/monitor/tach_sensor.cpp b/monitor/tach_sensor.cpp
index 83285b9..fa62045 100644
--- a/monitor/tach_sensor.cpp
+++ b/monitor/tach_sensor.cpp
@@ -167,7 +167,14 @@
void TachSensor::processState()
{
- _fan.process(*this);
+ // This function runs from inside trust::Manager::checkTrust(), which,
+ // for sensors using the count method, runs right before process()
+ // is called anyway inside Fan::countTimerExpired() so don't call
+ // it now if using that method.
+ if (_method == MethodMode::timebased)
+ {
+ _fan.process(*this);
+ }
}
void TachSensor::resetMethod()
@@ -275,6 +282,11 @@
if (_counter < _threshold)
{
++_counter;
+ log<level::DEBUG>(
+ fmt::format(
+ "Incremented error counter on {} to {} (threshold {})",
+ _name, _counter, _threshold)
+ .c_str());
}
}
else
@@ -282,6 +294,11 @@
if (_counter > 0)
{
--_counter;
+ log<level::DEBUG>(
+ fmt::format(
+ "Decremented error counter on {} to {} (threshold {})",
+ _name, _counter, _threshold)
+ .c_str());
}
}
}
diff --git a/monitor/types.hpp b/monitor/types.hpp
index a7f61d1..15b1f1a 100644
--- a/monitor/types.hpp
+++ b/monitor/types.hpp
@@ -119,14 +119,15 @@
constexpr auto fanDeviationField = 4;
constexpr auto numSensorFailsForNonfuncField = 5;
constexpr auto monitorStartDelayField = 6;
-constexpr auto nonfuncRotorErrDelayField = 7;
-constexpr auto fanMissingErrDelayField = 8;
-constexpr auto sensorListField = 9;
-constexpr auto conditionField = 10;
+constexpr auto countIntervalField = 7;
+constexpr auto nonfuncRotorErrDelayField = 8;
+constexpr auto fanMissingErrDelayField = 9;
+constexpr auto sensorListField = 10;
+constexpr auto conditionField = 11;
using FanDefinition =
std::tuple<std::string, size_t, size_t, size_t, size_t, size_t, size_t,
- std::optional<size_t>, std::optional<size_t>,
+ size_t, std::optional<size_t>, std::optional<size_t>,
std::vector<SensorDefinition>, std::optional<Condition>>;
constexpr auto presentHealthPos = 0;