Monitor fan apps for watchdog protection
This overrides the fan-control and fan-monitor service files
to call the fan-watchdog-monitor service on fails.
The watchdog monitor service will start the obmc-fan-watchdog-takeover
target to expire the fan watchdog when these apps have exceeded
their retry limits.
Note that fan-presence-tach and fan-control-init do not need watchdog
protection because they are in the poweron path and will fail the
poweron if they fail.
Resolves openbmc/openbmc#1688
Change-Id: I807da09f60213104163833114a8240f5fae9053a
Signed-off-by: Matt Spinler <spinler@us.ibm.com>
diff --git a/meta-openbmc-machines/meta-openpower/meta-ibm/meta-witherspoon/recipes-phosphor/fans/phosphor-fan%.bbappend b/meta-openbmc-machines/meta-openpower/meta-ibm/meta-witherspoon/recipes-phosphor/fans/phosphor-fan%.bbappend
index b9fc663..edcd8f2 100644
--- a/meta-openbmc-machines/meta-openpower/meta-ibm/meta-witherspoon/recipes-phosphor/fans/phosphor-fan%.bbappend
+++ b/meta-openbmc-machines/meta-openpower/meta-ibm/meta-witherspoon/recipes-phosphor/fans/phosphor-fan%.bbappend
@@ -22,3 +22,6 @@
SYSTEMD_ENVIRONMENT_FILE_phosphor-cooling-type += "${@compose_list(d, 'COOLING_ENV_FMT', 'OBMC_CHASSIS_INSTANCES')}"
+#These 2 services are protected by the watchdog
+SYSTEMD_OVERRIDE_phosphor-fan-control += "fan-watchdog-monitor.conf:phosphor-fan-control@0.service.d/fan-watchdog-monitor.conf"
+SYSTEMD_OVERRIDE_phosphor-fan-monitor += "fan-watchdog-monitor.conf:phosphor-fan-monitor@0.service.d/fan-watchdog-monitor.conf"
diff --git a/meta-openbmc-machines/meta-openpower/meta-ibm/meta-witherspoon/recipes-phosphor/fans/phosphor-fan%/fan-watchdog-monitor.conf b/meta-openbmc-machines/meta-openpower/meta-ibm/meta-witherspoon/recipes-phosphor/fans/phosphor-fan%/fan-watchdog-monitor.conf
new file mode 100644
index 0000000..023e61a
--- /dev/null
+++ b/meta-openbmc-machines/meta-openpower/meta-ibm/meta-witherspoon/recipes-phosphor/fans/phosphor-fan%/fan-watchdog-monitor.conf
@@ -0,0 +1,5 @@
+[Unit]
+#These overrides allow the fan watchdog to take over when this service dies
+OnFailure=witherspoon-fan-watchdog-monitor@%n.service
+StartLimitIntervalSec=5
+StartLimitBurst=3
diff --git a/meta-openbmc-machines/meta-openpower/meta-ibm/meta-witherspoon/recipes-phosphor/fans/witherspoon-fan-watchdog.bb b/meta-openbmc-machines/meta-openpower/meta-ibm/meta-witherspoon/recipes-phosphor/fans/witherspoon-fan-watchdog.bb
index cebeefd..a74293f 100644
--- a/meta-openbmc-machines/meta-openpower/meta-ibm/meta-witherspoon/recipes-phosphor/fans/witherspoon-fan-watchdog.bb
+++ b/meta-openbmc-machines/meta-openpower/meta-ibm/meta-witherspoon/recipes-phosphor/fans/witherspoon-fan-watchdog.bb
@@ -10,6 +10,8 @@
TGTFMT = "obmc-chassis-poweron@0.target"
RESET_FMT = "../${RESET_SERVICE}:${TGTFMT}.requires/${RESET_SERVICE}"
-SYSTEMD_SERVICE_${PN} += "${RESET_SERVICE}"
+MONITOR_SERVICE = "witherspoon-fan-watchdog-monitor@.service"
+
+SYSTEMD_SERVICE_${PN} += "${RESET_SERVICE} ${MONITOR_SERVICE}"
SYSTEMD_LINK_${PN} += "${RESET_FMT}"
SYSTEMD_ENVIRONMENT_FILE_${PN} += "obmc/witherspoon-fan-watchdog/witherspoon-reset-fan-watchdog.conf"
diff --git a/meta-openbmc-machines/meta-openpower/meta-ibm/meta-witherspoon/recipes-phosphor/fans/witherspoon-fan-watchdog/witherspoon-fan-watchdog-monitor@.service b/meta-openbmc-machines/meta-openpower/meta-ibm/meta-witherspoon/recipes-phosphor/fans/witherspoon-fan-watchdog/witherspoon-fan-watchdog-monitor@.service
new file mode 100644
index 0000000..9eb9697
--- /dev/null
+++ b/meta-openbmc-machines/meta-openpower/meta-ibm/meta-witherspoon/recipes-phosphor/fans/witherspoon-fan-watchdog/witherspoon-fan-watchdog-monitor@.service
@@ -0,0 +1,9 @@
+[Unit]
+Description=Fan Watchdog Failure Monitor
+
+#This can get called every time a process dies, so ensure it's never limited
+StartLimitIntervalSec=0
+
+[Service]
+Type=oneshot
+ExecStart=/usr/sbin/phosphor-unit-failure-monitor --source %i --target obmc-fan-watchdog-takeover.target --action start