Implement a fallback watchdog option
Sometimes our initial watchdog action is not enough to recover the host
from the state it transitioned into. However, always using a more
powerful form of power cycle is not desirable as we can lose useful CPU
crash state. It is desirable in this case to have two levels of watchdog
timers.
This patch implements the ability for the service to specify a fallback
watchdog action and interval. After the initial watchdog timeout is
encountered, the watchdog will be re-armed with the new parameters. Once
the watchdog times out again it will execute the fallback action.
Attempts to update the timeRemaining will reset the fallback just in
case something is still alive.
Change-Id: I69f4422c7e3963f02200815f3cef620af9e6cf8b
Signed-off-by: William A. Kennington III <wak@google.com>
diff --git a/test/watchdog_test.cpp b/test/watchdog_test.cpp
index a749435..af7c3ed 100644
--- a/test/watchdog_test.cpp
+++ b/test/watchdog_test.cpp
@@ -1,4 +1,6 @@
#include <chrono>
+#include <memory>
+#include <utility>
#include "watchdog_test.hpp"
@@ -170,3 +172,136 @@
EXPECT_TRUE(wdog->timerExpired());
EXPECT_FALSE(wdog->timerEnabled());
}
+
+/** @brief Make sure the watchdog is started and enabled with a fallback
+ * Wait through the initial trip and ensure the fallback is observed
+ * Make sure that fallback runs to completion, the watchdog ends up
+ * disabled, and that it can be enabled again afterwards
+ */
+TEST_F(WdogTest, enableWdogWithFallbackTillEnd)
+{
+ auto primaryInterval = 5s;
+ auto primaryIntervalMs = milliseconds(primaryInterval).count();
+ auto fallbackInterval = primaryInterval * 2;
+ auto fallbackIntervalMs = milliseconds(fallbackInterval).count();
+
+ // We need to make a wdog with the right fallback options
+ // The fallback interval is twice the primary interval so the two
+ // can always be told apart
+ Watchdog::Fallback fallback{
+ .action = Watchdog::Action::PowerOff,
+ .interval = static_cast<uint64_t>(fallbackIntervalMs),
+ };
+ std::map<Watchdog::Action, Watchdog::TargetName> emptyActionTargets;
+ wdog = std::make_unique<Watchdog>(bus, TEST_PATH, eventP,
+ std::move(emptyActionTargets), std::move(fallback));
+ EXPECT_EQ(primaryInterval, milliseconds(wdog->interval(primaryIntervalMs)));
+ EXPECT_FALSE(wdog->enabled());
+ EXPECT_EQ(0, wdog->timeRemaining());
+
+ // Enable and then verify
+ EXPECT_TRUE(wdog->enabled(true));
+
+ // Wait for the primary interval to expire
+ EXPECT_EQ(primaryInterval - 1s, waitForWatchdog(primaryInterval));
+
+ // We should now have entered the fallback once the primary expires
+ EXPECT_FALSE(wdog->enabled());
+ auto remaining = milliseconds(wdog->timeRemaining());
+ EXPECT_GE(fallbackInterval, remaining);
+ EXPECT_LT(primaryInterval, remaining);
+ EXPECT_FALSE(wdog->timerExpired());
+ EXPECT_TRUE(wdog->timerEnabled());
+
+ // We should still be ticking in fallback when setting action or interval
+ auto newInterval = primaryInterval - 1s;
+ auto newIntervalMs = milliseconds(newInterval).count();
+ EXPECT_EQ(newInterval, milliseconds(wdog->interval(newIntervalMs)));
+ EXPECT_EQ(Watchdog::Action::None,
+ wdog->expireAction(Watchdog::Action::None));
+
+ EXPECT_FALSE(wdog->enabled());
+ EXPECT_GE(remaining, milliseconds(wdog->timeRemaining()));
+ EXPECT_LT(primaryInterval, milliseconds(wdog->timeRemaining()));
+ EXPECT_FALSE(wdog->timerExpired());
+ EXPECT_TRUE(wdog->timerEnabled());
+
+ // Test that setting the timeRemaining always resets the timer to the
+ // fallback interval, even when the primary interval is requested
+ EXPECT_EQ(fallback.interval, wdog->timeRemaining(primaryIntervalMs));
+ EXPECT_FALSE(wdog->enabled());
+
+ remaining = milliseconds(wdog->timeRemaining());
+ EXPECT_GE(fallbackInterval, remaining);
+ EXPECT_LE(fallbackInterval - defaultDrift, remaining);
+ EXPECT_FALSE(wdog->timerExpired());
+ EXPECT_TRUE(wdog->timerEnabled());
+
+ // Wait for the fallback interval to expire
+ EXPECT_EQ(fallbackInterval - 1s, waitForWatchdog(fallbackInterval));
+
+ // We should now have disabled the watchdog after the fallback expires
+ EXPECT_FALSE(wdog->enabled());
+ EXPECT_EQ(0, wdog->timeRemaining());
+ EXPECT_TRUE(wdog->timerExpired());
+ EXPECT_FALSE(wdog->timerEnabled());
+
+ // Make sure enabling the watchdog again works
+ EXPECT_TRUE(wdog->enabled(true));
+
+ // We should have re-entered the primary
+ EXPECT_TRUE(wdog->enabled());
+ EXPECT_GE(primaryInterval, milliseconds(wdog->timeRemaining()));
+ EXPECT_FALSE(wdog->timerExpired());
+ EXPECT_TRUE(wdog->timerEnabled());
+}
+
+/** @brief Make sure the watchdog is started and enabled with a fallback
+ * Wait through the initial trip and ensure the fallback is observed
+ * Make sure that re-enabling during fallback returns to the primary
+ */
+TEST_F(WdogTest, enableWdogWithFallbackReEnable)
+{
+ auto primaryInterval = 5s;
+ auto primaryIntervalMs = milliseconds(primaryInterval).count();
+ auto fallbackInterval = primaryInterval * 2;
+ auto fallbackIntervalMs = milliseconds(fallbackInterval).count();
+
+ // We need to make a wdog with the right fallback options
+ // The fallback interval is twice the primary interval so the two
+ // can always be told apart
+ Watchdog::Fallback fallback{
+ .action = Watchdog::Action::PowerOff,
+ .interval = static_cast<uint64_t>(fallbackIntervalMs),
+ };
+ std::map<Watchdog::Action, Watchdog::TargetName> emptyActionTargets;
+ wdog = std::make_unique<Watchdog>(bus, TEST_PATH, eventP,
+ std::move(emptyActionTargets), std::move(fallback));
+ EXPECT_EQ(primaryInterval, milliseconds(wdog->interval(primaryIntervalMs)));
+ EXPECT_FALSE(wdog->enabled());
+ EXPECT_EQ(0, wdog->timeRemaining());
+ EXPECT_FALSE(wdog->timerExpired());
+ EXPECT_FALSE(wdog->timerEnabled());
+
+ // Enable the watchdog so the primary interval starts counting
+ EXPECT_TRUE(wdog->enabled(true));
+
+ // Wait for the primary interval to expire
+ EXPECT_EQ(primaryInterval - 1s, waitForWatchdog(primaryInterval));
+
+ // We should now have entered the fallback once the primary expires
+ EXPECT_FALSE(wdog->enabled());
+ auto remaining = milliseconds(wdog->timeRemaining());
+ EXPECT_GE(fallbackInterval, remaining);
+ EXPECT_LT(primaryInterval, remaining);
+ EXPECT_FALSE(wdog->timerExpired());
+ EXPECT_TRUE(wdog->timerEnabled());
+
+ EXPECT_TRUE(wdog->enabled(true)); // Re-enable while still in fallback
+
+ // We should have left the fallback and re-entered the primary
+ EXPECT_TRUE(wdog->enabled());
+ EXPECT_GE(primaryInterval, milliseconds(wdog->timeRemaining()));
+ EXPECT_FALSE(wdog->timerExpired());
+ EXPECT_TRUE(wdog->timerEnabled());
+}