monitor: Re-log fan error on a power off
When a power off rule runs to completion and powers off the system due
to either missing or faulted fans, re-post the event log for the
previous fan error at the point of power off.
This way, there is an error associated with the power off itself,
because, depending on the power off rule delays, the original error
could have happened several minutes or more in the past.
Signed-off-by: Matt Spinler <spinler@us.ibm.com>
Change-Id: I1a38062cf75ffd4a11baa417ef3983b6c1a47ada
diff --git a/monitor/json_parser.cpp b/monitor/json_parser.cpp
index 1bb018d..b2b0c06 100644
--- a/monitor/json_parser.cpp
+++ b/monitor/json_parser.cpp
@@ -336,7 +336,8 @@
std::unique_ptr<PowerOffAction>
getPowerOffAction(const json& powerOffConfig,
- std::shared_ptr<PowerInterfaceBase>& powerInterface)
+ std::shared_ptr<PowerInterfaceBase>& powerInterface,
+ PowerOffAction::PrePowerOffFunc& func)
{
std::unique_ptr<PowerOffAction> action;
if (!powerOffConfig.contains("type"))
@@ -368,19 +369,19 @@
if (type == "hard")
{
action = std::make_unique<HardPowerOff>(
- powerOffConfig.at("delay").get<uint32_t>(), powerInterface);
+ powerOffConfig.at("delay").get<uint32_t>(), powerInterface, func);
}
else if (type == "soft")
{
action = std::make_unique<SoftPowerOff>(
- powerOffConfig.at("delay").get<uint32_t>(), powerInterface);
+ powerOffConfig.at("delay").get<uint32_t>(), powerInterface, func);
}
else if (type == "epow")
{
action = std::make_unique<EpowPowerOff>(
powerOffConfig.at("service_mode_delay").get<uint32_t>(),
- powerOffConfig.at("meltdown_delay").get<uint32_t>(),
- powerInterface);
+ powerOffConfig.at("meltdown_delay").get<uint32_t>(), powerInterface,
+ func);
}
else
{
@@ -395,7 +396,8 @@
std::vector<std::unique_ptr<PowerOffRule>>
getPowerOffRules(const json& obj,
- std::shared_ptr<PowerInterfaceBase>& powerInterface)
+ std::shared_ptr<PowerInterfaceBase>& powerInterface,
+ PowerOffAction::PrePowerOffFunc& func)
{
std::vector<std::unique_ptr<PowerOffRule>> rules;
@@ -409,7 +411,7 @@
{
auto state = getPowerOffPowerRuleState(config);
auto cause = getPowerOffCause(config);
- auto action = getPowerOffAction(config, powerInterface);
+ auto action = getPowerOffAction(config, powerInterface, func);
auto rule = std::make_unique<PowerOffRule>(
std::move(state), std::move(cause), std::move(action));
diff --git a/monitor/json_parser.hpp b/monitor/json_parser.hpp
index ba53fda..c6b64ec 100644
--- a/monitor/json_parser.hpp
+++ b/monitor/json_parser.hpp
@@ -16,6 +16,7 @@
#pragma once
#include "json_config.hpp"
+#include "power_off_action.hpp"
#include "trust_group.hpp"
#include "types.hpp"
@@ -28,6 +29,7 @@
using json = nlohmann::json;
class PowerOffRule;
class PowerInterfaceBase;
+class System;
constexpr auto confAppName = "monitor";
constexpr auto confFileName = "config.json";
@@ -86,12 +88,16 @@
*
* @param[in] powerInterface - The power interface object to use
*
+ * @param[in] func - Optional user-defined function that gets called
+ * right before a power off occurs.
+ *
* @return std::vector<std::unique_ptr<PowerOffRule>> -
* The PowerOffRule objects
*/
std::vector<std::unique_ptr<PowerOffRule>>
getPowerOffRules(const json& obj,
- std::shared_ptr<PowerInterfaceBase>& powerInterface);
+ std::shared_ptr<PowerInterfaceBase>& powerInterface,
+ PowerOffAction::PrePowerOffFunc& func);
/**
* @brief Returns the 'num_nonfunc_rotors_before_error field
diff --git a/monitor/power_off_action.hpp b/monitor/power_off_action.hpp
index 05d4ff6..59b1406 100644
--- a/monitor/power_off_action.hpp
+++ b/monitor/power_off_action.hpp
@@ -33,6 +33,8 @@
class PowerOffAction
{
public:
+ using PrePowerOffFunc = std::function<void()>;
+
PowerOffAction() = delete;
virtual ~PowerOffAction() = default;
PowerOffAction(const PowerOffAction&) = delete;
@@ -44,13 +46,18 @@
* @brief Constructor
*
* @param[in] name - The action name. Used for tracing.
- * powerInterface - The object used to invoke the power off.
+ * @param[in] powerInterface - The object used to invoke the power off.
+ * @param[in] powerOffFunc - A function to call right before the power
+ * off occurs (after any delays). May be
+ * empty if no function is necessary.
*/
PowerOffAction(const std::string& name,
- std::shared_ptr<PowerInterfaceBase> powerInterface) :
+ std::shared_ptr<PowerInterfaceBase> powerInterface,
+ PrePowerOffFunc& powerOffFunc) :
_name(name),
_powerIface(std::move(powerInterface)),
- _event(sdeventplus::Event::get_default())
+ _event(sdeventplus::Event::get_default()),
+ _prePowerOffFunc(powerOffFunc)
{}
/**
@@ -120,6 +127,12 @@
* @brief The event loop object. Needed by timers.
*/
sdeventplus::Event _event;
+
+ /**
+ * @brief A function that will be called right before
+ * the power off.
+ */
+ PrePowerOffFunc _prePowerOffFunc;
};
/**
@@ -144,11 +157,15 @@
* @param[in] delay - The amount of time in seconds to wait before
* doing the power off
* @param[in] powerInterface - The object to use to do the power off
+ * @param[in] func - A function to call right before the power
+ * off occurs (after the delay). May be
+ * empty if no function is necessary.
*/
HardPowerOff(uint32_t delay,
- std::shared_ptr<PowerInterfaceBase> powerInterface) :
+ std::shared_ptr<PowerInterfaceBase> powerInterface,
+ PrePowerOffFunc func) :
PowerOffAction("Hard Power Off: " + std::to_string(delay) + "s",
- powerInterface),
+ powerInterface, func),
_delay(delay),
_timer(_event, std::bind(std::mem_fn(&HardPowerOff::powerOff), this))
{}
@@ -185,6 +202,12 @@
*/
void powerOff()
{
+
+ if (_prePowerOffFunc)
+ {
+ _prePowerOffFunc();
+ }
+
getLogger().log(
fmt::format("Action '{}' executing hard power off", name()));
_powerIface->hardPowerOff();
@@ -225,11 +248,15 @@
* @param[in] delay - The amount of time in seconds to wait before
* doing the power off
* @param[in] powerInterface - The object to use to do the power off
+ * @param[in] func - A function to call right before the power
+ * off occurs (after the delay). May be
+ * empty if no function is necessary.
*/
SoftPowerOff(uint32_t delay,
- std::shared_ptr<PowerInterfaceBase> powerInterface) :
+ std::shared_ptr<PowerInterfaceBase> powerInterface,
+ PrePowerOffFunc func) :
PowerOffAction("Soft Power Off: " + std::to_string(delay) + "s",
- powerInterface),
+ powerInterface, func),
_delay(delay),
_timer(_event, std::bind(std::mem_fn(&SoftPowerOff::powerOff), this))
{}
@@ -266,6 +293,11 @@
*/
void powerOff()
{
+ if (_prePowerOffFunc)
+ {
+ _prePowerOffFunc();
+ }
+
getLogger().log(
fmt::format("Action '{}' executing soft power off", name()));
_powerIface->softPowerOff();
@@ -302,10 +334,11 @@
EpowPowerOff& operator=(EpowPowerOff&&) = delete;
EpowPowerOff(uint32_t serviceModeDelay, uint32_t meltdownDelay,
- std::shared_ptr<PowerInterfaceBase> powerInterface) :
+ std::shared_ptr<PowerInterfaceBase> powerInterface,
+ PrePowerOffFunc func) :
PowerOffAction("EPOW Power Off: " + std::to_string(serviceModeDelay) +
"s/" + std::to_string(meltdownDelay) + "s",
- powerInterface),
+ powerInterface, func),
_serviceModeDelay(serviceModeDelay), _meltdownDelay(meltdownDelay)
{}
diff --git a/monitor/system.cpp b/monitor/system.cpp
index 6502615..229eca8 100644
--- a/monitor/system.cpp
+++ b/monitor/system.cpp
@@ -24,8 +24,6 @@
#include "json_parser.hpp"
#endif
-#include "fan_error.hpp"
-
#include <nlohmann/json.hpp>
#include <phosphor-logging/log.hpp>
#include <sdbusplus/bus.hpp>
@@ -190,7 +188,10 @@
std::shared_ptr<PowerInterfaceBase> powerInterface =
std::make_shared<PowerInterface>();
- _powerOffRules = getPowerOffRules(jsonObj, powerInterface);
+ PowerOffAction::PrePowerOffFunc func =
+ std::bind(std::mem_fn(&System::logShutdownError), this);
+
+ _powerOffRules = getPowerOffRules(jsonObj, powerInterface, func);
_numNonfuncSensorsBeforeError = getNumNonfuncRotorsBeforeError(jsonObj);
#endif
@@ -257,7 +258,8 @@
auto sensorData = captureSensorData();
error->commit(sensorData);
- // TODO: save error so it can be committed again on a power off
+ // Save the error so it can be committed again on a power off.
+ _lastError = std::move(error);
}
void System::fanMissingErrorTimerExpired(const Fan& fan)
@@ -274,7 +276,20 @@
auto sensorData = captureSensorData();
error->commit(sensorData);
- // TODO: save error so it can be committed again on a power off
+ // Save the error so it can be committed again on a power off.
+ _lastError = std::move(error);
+}
+
+void System::logShutdownError()
+{
+ if (_lastError)
+ {
+ getLogger().log("Re-committing previous fan error before power off");
+
+ // Still use the latest sensor data
+ auto sensorData = captureSensorData();
+ _lastError->commit(sensorData);
+ }
}
json System::captureSensorData()
diff --git a/monitor/system.hpp b/monitor/system.hpp
index ae2f4ce..744c775 100644
--- a/monitor/system.hpp
+++ b/monitor/system.hpp
@@ -16,6 +16,7 @@
#pragma once
#include "fan.hpp"
+#include "fan_error.hpp"
#include "power_off_rule.hpp"
#include "power_state.hpp"
#include "tach_sensor.hpp"
@@ -40,11 +41,11 @@
{
public:
System() = delete;
+ ~System() = default;
System(const System&) = delete;
System(System&&) = delete;
System& operator=(const System&) = delete;
System& operator=(System&&) = delete;
- ~System() = default;
/**
* Constructor
@@ -91,6 +92,14 @@
*/
void fanMissingErrorTimerExpired(const Fan& fan);
+ /**
+ * @brief Called by the power off actions to log an error when there is
+ * a power off due to fan problems.
+ *
+ * The error it logs is just the last fan error that occurred.
+ */
+ void logShutdownError();
+
private:
/* The mode of fan monitor */
Mode _mode;
@@ -132,6 +141,11 @@
std::optional<size_t> _numNonfuncSensorsBeforeError;
/**
+ * @brief The most recently committed fan error.
+ */
+ std::unique_ptr<FanError> _lastError;
+
+ /**
* @brief Captures tach sensor data as JSON for use in
* fan fault and fan missing event logs.
*
diff --git a/monitor/test/power_off_rule_test.cpp b/monitor/test/power_off_rule_test.cpp
index b708bb9..ca0493c 100644
--- a/monitor/test/power_off_rule_test.cpp
+++ b/monitor/test/power_off_rule_test.cpp
@@ -9,6 +9,7 @@
TEST(PowerOffRuleTest, TestRules)
{
+ PowerOffAction::PrePowerOffFunc func;
sd_event* event;
sd_event_default(&event);
sdeventplus::Event sdEvent{event};
@@ -59,7 +60,7 @@
EXPECT_CALL(mockIface, hardPowerOff).Times(1);
EXPECT_CALL(mockIface, softPowerOff).Times(1);
- auto rules = getPowerOffRules(faultConfig, powerIface);
+ auto rules = getPowerOffRules(faultConfig, powerIface, func);
ASSERT_EQ(rules.size(), 4);
FanHealth health{{"fan0", {false, {true, true}}},