monitor: Shut down if no readings at power on
If there are no tach sensors on D-Bus when the power state changes to
on, then create an event log and shut down the system. This is done
because in this case the code cannot determine the fan state, i.e. whether
any fans are present or spinning.
The most likely reason there are no sensors (aside from a glaring error
in the config file) is that the fan controller device driver failed its
probe and could not detect the device, maybe because the device didn't
have power or there was an I2C problem. To aid in root cause analysis
if this were to occur in the field, the code adds the following FFDC
(First Failure Data Capture) to the event log:
* All of the loaded hwmon drivers, taken from /sys/class/hwmon/*/name
* Failure-related lines in dmesg, which is where driver errors would
show up.
Tested: Unbound the fan device driver and then powered on the system.
Also disabled I2C to the fan controller device in simulation and tried a
power on.
Signed-off-by: Matt Spinler <spinler@us.ibm.com>
Change-Id: Ic0b80d67ec79c9401f59324fe1134ff12084112a
diff --git a/monitor/Makefile.am b/monitor/Makefile.am
index 7c16680..0152277 100644
--- a/monitor/Makefile.am
+++ b/monitor/Makefile.am
@@ -19,6 +19,7 @@
argument.cpp \
fan.cpp \
fan_error.cpp \
+ ../hwmon_ffdc.cpp \
power_interface.cpp \
logging.cpp \
main.cpp \
diff --git a/monitor/fan.cpp b/monitor/fan.cpp
index b3cc2a0..5f0414d 100644
--- a/monitor/fan.cpp
+++ b/monitor/fan.cpp
@@ -479,39 +479,46 @@
{
_monitorTimer.restartOnce(std::chrono::seconds(_monitorDelay));
+ _numSensorsOnDBusAtPowerOn = 0;
+
+ std::for_each(_sensors.begin(), _sensors.end(), [this](auto& sensor) {
+ try
+ {
+ // Force a getProperty call. If sensor is on D-Bus,
+ // then make sure it's functional.
+ sensor->updateTachAndTarget();
+
+ _numSensorsOnDBusAtPowerOn++;
+
+ if (_present)
+ {
+ // If not functional, set it back to functional.
+ if (!sensor->functional())
+ {
+ sensor->setFunctional(true);
+ _system.fanStatusChange(*this, true);
+ }
+
+ // Set the counters back to zero
+ if (sensor->getMethod() == MethodMode::count)
+ {
+ sensor->resetMethod();
+ }
+ }
+ }
+ catch (const util::DBusError& e)
+ {
+ // Properties still aren't on D-Bus. Let startMonitor()
+ // deal with it, or maybe System::powerStateChanged() if
+ // there aren't any sensors at all on D-Bus.
+ getLogger().log(fmt::format(
+ "At power on, tach sensor {} value not on D-Bus",
+ sensor->name()));
+ }
+ });
+
if (_present)
{
- std::for_each(
- _sensors.begin(), _sensors.end(), [this](auto& sensor) {
- try
- {
- // Force a getProperty call. If sensor is on D-Bus,
- // then make sure it's functional.
- sensor->updateTachAndTarget();
-
- // If not functional, set it back to functional.
- if (!sensor->functional())
- {
- sensor->setFunctional(true);
- _system.fanStatusChange(*this, true);
- }
-
- // Set the counters back to zero
- if (sensor->getMethod() == MethodMode::count)
- {
- sensor->resetMethod();
- }
- }
- catch (const util::DBusServiceError& e)
- {
- // Properties still aren't on D-Bus. Let startMonitor()
- // deal with it.
- getLogger().log(fmt::format(
- "At power on, tach sensor {} value not on D-Bus",
- sensor->name()));
- }
- });
-
// If configured to change functional state on the fan itself,
// Set it back to true now if necessary.
if (_numSensorFailsForNonFunc)
diff --git a/monitor/fan.hpp b/monitor/fan.hpp
index 713a1a1..d4bcb2d 100644
--- a/monitor/fan.hpp
+++ b/monitor/fan.hpp
@@ -184,6 +184,15 @@
*/
void countTimerExpired(TachSensor& sensor);
+ /**
+ * @brief Returns the number of tach sensors (Sensor.Value ifaces)
+ * on D-Bus at the last power on.
+ */
+ inline size_t numSensorsOnDBusAtPowerOn() const
+ {
+ return _numSensorsOnDBusAtPowerOn;
+ }
+
private:
/**
* @brief Returns true if the sensor input is not within
@@ -334,6 +343,14 @@
* a fan plug is detected.
*/
bool _setFuncOnPresent;
+
+ /**
+ * @brief The number of sensors that have their Sensor.Value interfaces
+ * on D-Bus at the last power on.
+ *
+ * Will be zero until the power turns on the first time.
+ */
+ size_t _numSensorsOnDBusAtPowerOn = 0;
};
} // namespace monitor
diff --git a/monitor/fan_error.cpp b/monitor/fan_error.cpp
index bb98598..ebb7f08 100644
--- a/monitor/fan_error.cpp
+++ b/monitor/fan_error.cpp
@@ -97,7 +97,11 @@
std::map<std::string, std::string> ad;
ad.emplace("_PID", std::to_string(getpid()));
- ad.emplace("CALLOUT_INVENTORY_PATH", _fanName);
+
+ if (!_fanName.empty())
+ {
+ ad.emplace("CALLOUT_INVENTORY_PATH", _fanName);
+ }
if (!_sensorName.empty())
{
diff --git a/monitor/fan_error.hpp b/monitor/fan_error.hpp
index 63e4e45..79573e1 100644
--- a/monitor/fan_error.hpp
+++ b/monitor/fan_error.hpp
@@ -105,6 +105,24 @@
{}
/**
+ * @brief Constructor
+ *
+ * This version doesn't take a fan or sensor name.
+ *
+ * @param[in] error - The error name, like
+ * xyz.openbmc_project.Fan.Error.Fault
+ * @param[in] severity - The severity of the error
+ */
+ FanError(const std::string& error,
+ sdbusplus::xyz::openbmc_project::Logging::server::Entry::Level
+ severity) :
+ _errorName(error),
+ _severity(
+ sdbusplus::xyz::openbmc_project::Logging::server::convertForMessage(
+ severity))
+ {}
+
+ /**
* @brief Commits the error by calling the D-Bus method to create
* the event log.
*
diff --git a/monitor/power_interface.cpp b/monitor/power_interface.cpp
index 372993f..575d2bc 100644
--- a/monitor/power_interface.cpp
+++ b/monitor/power_interface.cpp
@@ -31,11 +31,16 @@
"replace");
}
-void PowerInterface::hardPowerOff()
+void PowerInterface::executeHardPowerOff()
{
util::SDBusPlus::callMethod(
systemdService, systemdPath, systemdMgrIface, "StartUnit",
"obmc-chassis-hard-poweroff@0.target", "replace");
}
+void PowerInterface::hardPowerOff()
+{
+ executeHardPowerOff();
+}
+
} // namespace phosphor::fan::monitor
diff --git a/monitor/power_interface.hpp b/monitor/power_interface.hpp
index e0a802d..edbf325 100644
--- a/monitor/power_interface.hpp
+++ b/monitor/power_interface.hpp
@@ -83,6 +83,14 @@
_alert.enabled(alert);
}
+ /**
+ * @brief Calls the D-Bus method to execute the hard power off.
+ *
+ * A static function so this can be used by code that doesn't
+ * want to create a PowerInterface object.
+ */
+ static void executeHardPowerOff();
+
private:
/**
* @brief Reference to the thermal alert D-Bus object
diff --git a/monitor/system.cpp b/monitor/system.cpp
index 22279da..01f1a9b 100644
--- a/monitor/system.cpp
+++ b/monitor/system.cpp
@@ -26,6 +26,8 @@
#include "config.h"
+#include "hwmon_ffdc.hpp"
+
#include <nlohmann/json.hpp>
#include <phosphor-logging/log.hpp>
#include <sdbusplus/bus.hpp>
@@ -209,6 +211,16 @@
throw std::runtime_error("No conf file found at power on");
}
+ // If no fan has its sensors on D-Bus, then there is a problem
+ // with the fan controller. Log an error and shut down.
+ if (std::all_of(_fans.begin(), _fans.end(), [](const auto& fan) {
+ return fan->numSensorsOnDBusAtPowerOn() == 0;
+ }))
+ {
+ handleOfflineFanController();
+ return;
+ }
+
std::for_each(_powerOffRules.begin(), _powerOffRules.end(),
[this](auto& rule) {
rule->check(PowerRuleState::atPgood, _fanHealth);
@@ -326,4 +338,18 @@
return data;
}
+void System::handleOfflineFanController()
+{
+ getLogger().log("The fan controller appears to be offline. Shutting down.",
+ Logger::error);
+
+ auto ffdc = collectHwmonFFDC();
+
+ FanError error{"xyz.openbmc_project.Fan.Error.FanControllerOffline",
+ Severity::Critical};
+ error.commit(ffdc, true);
+
+ PowerInterface::executeHardPowerOff();
+}
+
} // namespace phosphor::fan::monitor
diff --git a/monitor/system.hpp b/monitor/system.hpp
index 3b4c309..6725b00 100644
--- a/monitor/system.hpp
+++ b/monitor/system.hpp
@@ -229,6 +229,11 @@
* @param[in] jsonObj - JSON object to parse from
*/
void setFaultConfig(const json& jsonObj);
+
+ /**
+ * @brief Log an error and shut down due to an offline fan controller
+ */
+ void handleOfflineFanController();
};
} // namespace phosphor::fan::monitor