monitor: Event logs for missing fans
This commit adds the code to create event logs calling out the fan when
it has been missing for a certain amount of time.
This is basically identical to the functionality that the fan presence
application in this repo provides, but having it in this application
means all fan errors are created from the same place. This will become
important when there is a power off due to a missing fan and the error
for it needs to be re-committed at power off time so it can be shown as
the cause of the power off.
The functionality is configured in the JSON:
fan_missing_error_delay:
Defines the number of seconds a fan must be missing with power on before
an error will be created. If this isn't present in the JSON, then no
errors will be created for missing fans.
Signed-off-by: Matt Spinler <spinler@us.ibm.com>
Change-Id: I76de9d8d1bf6e283560b1ce46e70f84522e2d708
diff --git a/logger.hpp b/logger.hpp
index 51e2b80..87590e8 100644
--- a/logger.hpp
+++ b/logger.hpp
@@ -43,7 +43,8 @@
enum Priority
{
error,
- info
+ info,
+ quiet
};
Logger() = delete;
@@ -78,7 +79,7 @@
phosphor::logging::log<phosphor::logging::level::ERR>(
message.c_str());
}
- else
+ else if (priority != Logger::quiet)
{
phosphor::logging::log<phosphor::logging::level::INFO>(
message.c_str());
diff --git a/monitor/fan.cpp b/monitor/fan.cpp
index 114b625..3f36dc7 100644
--- a/monitor/fan.cpp
+++ b/monitor/fan.cpp
@@ -54,7 +54,8 @@
rules::propertiesChanged(util::INVENTORY_PATH + _name,
util::INV_ITEM_IFACE),
std::bind(std::mem_fn(&Fan::presenceChanged), this,
- std::placeholders::_1))
+ std::placeholders::_1)),
+ _fanMissingErrorDelay(std::get<fanMissingErrDelayField>(def))
{
// Start from a known state of functional (even if
// _numSensorFailsForNonFunc is 0)
@@ -108,6 +109,24 @@
// Get the initial presence state
_present = util::SDBusPlus::getProperty<bool>(
util::INVENTORY_PATH + _name, util::INV_ITEM_IFACE, "Present");
+
+ if (_fanMissingErrorDelay)
+ {
+ _fanMissingErrorTimer = std::make_unique<
+ sdeventplus::utility::Timer<sdeventplus::ClockId::Monotonic>>(
+ event, std::bind(&System::fanMissingErrorTimerExpired, &system,
+ std::ref(*this)));
+
+ if (!_present)
+ {
+ // The fan presence application handles the journal for missing
+ // fans, so only internally log missing fan info here.
+ getLogger().log(fmt::format("On startup, fan {} is missing", _name),
+ Logger::quiet);
+ _fanMissingErrorTimer->restartOnce(
+ std::chrono::seconds{*_fanMissingErrorDelay});
+ }
+ }
}
void Fan::startMonitor()
@@ -285,7 +304,24 @@
{
_present = std::get<bool>(presentProp->second);
+ getLogger().log(
+ fmt::format("Fan {} presence state change to {}", _name, _present),
+ Logger::quiet);
+
_system.fanStatusChange(*this);
+
+ if (_fanMissingErrorDelay)
+ {
+ if (!_present)
+ {
+ _fanMissingErrorTimer->restartOnce(
+ std::chrono::seconds{*_fanMissingErrorDelay});
+ }
+ else if (_fanMissingErrorTimer->isEnabled())
+ {
+ _fanMissingErrorTimer->setEnabled(false);
+ }
+ }
}
}
diff --git a/monitor/fan.hpp b/monitor/fan.hpp
index 187d626..a50bbb5 100644
--- a/monitor/fan.hpp
+++ b/monitor/fan.hpp
@@ -281,6 +281,21 @@
* @brief The current presence state
*/
bool _present = false;
+
+ /**
+ * @brief The number of seconds to wait after a fan is removed before
+ * creating an event log for it. If std::nullopt, then no
+ * event log will be created.
+ */
+ const std::optional<size_t> _fanMissingErrorDelay;
+
+ /**
+ * @brief The timer that uses the _fanMissingErrorDelay timeout,
+ * at the end of which an event log will be created.
+ */
+ std::unique_ptr<
+ sdeventplus::utility::Timer<sdeventplus::ClockId::Monotonic>>
+ _fanMissingErrorTimer;
};
} // namespace monitor
diff --git a/monitor/gen-fan-monitor-defs.py b/monitor/gen-fan-monitor-defs.py
index 310f475..7386508 100755
--- a/monitor/gen-fan-monitor-defs.py
+++ b/monitor/gen-fan-monitor-defs.py
@@ -56,6 +56,7 @@
${fan_data['num_sensors_nonfunc_for_fan_nonfunc']},
0, // Monitor start delay - not used in YAML configs
std::nullopt, // nonfuncRotorErrorDelay - also not used here
+ std::nullopt, // fanMissingErrorDelay - also not used here
std::vector<SensorDefinition>{
%for sensor in fan_data['sensors']:
<%
diff --git a/monitor/json.md b/monitor/json.md
index 15fef02..4dbb543 100644
--- a/monitor/json.md
+++ b/monitor/json.md
@@ -88,6 +88,17 @@
Optional. If not present but there is a [fault handling
configuration](#fault-handling-configuration) section, then it defaults to 0.
+### fan_missing_error_delay
+
+This defines how many seconds a fan must be missing before an error will be
+created.
+
+```
+"fan_missing_error_delay": 5
+```
+
+Optional. If not present, no errors will be created for missing fans.
+
### sensors
This is an array with an entry for each tach sensor contained in the fan FRU.
diff --git a/monitor/json_parser.cpp b/monitor/json_parser.cpp
index 46ae049..1bb018d 100644
--- a/monitor/json_parser.cpp
+++ b/monitor/json_parser.cpp
@@ -223,6 +223,14 @@
nonfuncRotorErrorDelay = 0;
}
+ // fan_missing_error_delay is optional.
+ std::optional<size_t> fanMissingErrorDelay;
+ if (fan.contains("fan_missing_error_delay"))
+ {
+ fanMissingErrorDelay =
+ fan.at("fan_missing_error_delay").get<size_t>();
+ }
+
// Handle optional conditions
auto cond = std::optional<Condition>();
if (fan.contains("condition"))
@@ -253,11 +261,11 @@
entry("JSON_DUMP=%s", fan["condition"].dump().c_str()));
}
}
- fanDefs.emplace_back(
- std::tuple(fan["inventory"].get<std::string>(), funcDelay,
- fan["allowed_out_of_range_time"].get<size_t>(),
- fan["deviation"].get<size_t>(), nonfuncSensorsCount,
- monitorDelay, nonfuncRotorErrorDelay, sensorDefs, cond));
+ fanDefs.emplace_back(std::tuple(
+ fan["inventory"].get<std::string>(), funcDelay,
+ fan["allowed_out_of_range_time"].get<size_t>(),
+ fan["deviation"].get<size_t>(), nonfuncSensorsCount, monitorDelay,
+ nonfuncRotorErrorDelay, fanMissingErrorDelay, sensorDefs, cond));
}
return fanDefs;
diff --git a/monitor/system.cpp b/monitor/system.cpp
index c8bacca..6502615 100644
--- a/monitor/system.cpp
+++ b/monitor/system.cpp
@@ -260,6 +260,23 @@
// TODO: save error so it can be committed again on a power off
}
+void System::fanMissingErrorTimerExpired(const Fan& fan)
+{
+ std::string fanPath{util::INVENTORY_PATH + fan.getName()};
+ // Record at error priority that an event log is being created.
+ getLogger().log(
+ fmt::format("Creating event log for missing fan {}", fanPath),
+ Logger::error);
+ // Build the Fan.Error.Missing error against the fan's inventory path.
+ auto error = std::make_unique<FanError>(
+ "xyz.openbmc_project.Fan.Error.Missing", fanPath, "", Severity::Error);
+ // Capture the sensor data and pass it to commit() with the error.
+ auto sensorData = captureSensorData();
+ error->commit(sensorData);
+
+ // TODO: save error so it can be committed again on a power off
+}
+
json System::captureSensorData()
{
json data;
diff --git a/monitor/system.hpp b/monitor/system.hpp
index 05e08b5..ae2f4ce 100644
--- a/monitor/system.hpp
+++ b/monitor/system.hpp
@@ -83,6 +83,14 @@
*/
void sensorErrorTimerExpired(const Fan& fan, const TachSensor& sensor);
+ /**
+ * @brief Called when the timer that starts when a fan is missing
+ * has expired so an event log needs to be created.
+ *
+ * @param[in] fan - The missing fan.
+ */
+ void fanMissingErrorTimerExpired(const Fan& fan);
+
private:
/* The mode of fan monitor */
Mode _mode;
diff --git a/monitor/types.hpp b/monitor/types.hpp
index 8897d96..770998c 100644
--- a/monitor/types.hpp
+++ b/monitor/types.hpp
@@ -108,13 +108,14 @@
constexpr auto numSensorFailsForNonfuncField = 4;
constexpr auto monitorStartDelayField = 5;
constexpr auto nonfuncRotorErrDelayField = 6;
-constexpr auto sensorListField = 7;
-constexpr auto conditionField = 8;
+constexpr auto fanMissingErrDelayField = 7;
+constexpr auto sensorListField = 8;
+constexpr auto conditionField = 9;
using FanDefinition =
std::tuple<std::string, size_t, size_t, size_t, size_t, size_t,
- std::optional<size_t>, std::vector<SensorDefinition>,
- std::optional<Condition>>;
+ std::optional<size_t>, std::optional<size_t>,
+ std::vector<SensorDefinition>, std::optional<Condition>>;
constexpr auto presentHealthPos = 0;
constexpr auto sensorFuncHealthPos = 1;