Implement a fallback watchdog option

Sometimes our initial watchdog action is not enough to recover the host
from the state it transitioned into. However, always using a more power
form of power cycle is not desirable as we can lose useful CPU crash
state. It is desirable in this case to have two levels of watchog
timers.

This patch implements the ability for the service to specify a fallback
watchdog action and interval. After the initial watchdog timeout is
encountered, the watchdog will be re-armed with the new parameters. Once
the watchdog times out again it will execute the fallback action.
Attempts to update the timeRemaining will reset the fallback just in
case something is still alive.

Change-Id: I69f4422c7e3963f02200815f3cef620af9e6cf8b
Signed-off-by: William A. Kennington III <wak@google.com>
diff --git a/argument.cpp b/argument.cpp
index d212261..c2d99ce 100644
--- a/argument.cpp
+++ b/argument.cpp
@@ -28,15 +28,17 @@
 const std::vector<std::string> emptyArg;
 const std::string ArgumentParser::trueString = "true"s;
 
-const char* ArgumentParser::optionStr = "p:s:t:a:ch";
+const char* ArgumentParser::optionStr = "p:s:t:a:f:i:ch";
 const option ArgumentParser::options[] =
 {
-    { "path",          required_argument,  nullptr,   'p' },
-    { "service",       required_argument,  nullptr,   's' },
-    { "target",        required_argument,  nullptr,   't' },
-    { "action_target", required_argument,  nullptr,   'a' },
-    { "continue",      no_argument,        nullptr,   'c' },
-    { "help",          no_argument,        nullptr,   'h' },
+    { "path",                    required_argument,  nullptr,   'p' },
+    { "service",                 required_argument,  nullptr,   's' },
+    { "target",                  required_argument,  nullptr,   't' },
+    { "action_target",           required_argument,  nullptr,   'a' },
+    { "fallback_action",         required_argument,  nullptr,   'f' },
+    { "fallback_interval",       required_argument,  nullptr,   'i' },
+    { "continue",                no_argument,        nullptr,   'c' },
+    { "help",                    no_argument,        nullptr,   'h' },
     { 0, 0, 0, 0},
 };
 
@@ -102,6 +104,13 @@
     std::cerr << " [--action_target=<action>=<systemd unit>] Map of action to "
                      "systemd unit to be called on timeout if that action is "
                      "set for ExpireAction when the timer expires.\n";
+    std::cerr << " [--fallback_action=<action>]              Enables the "
+                     "watchdog even when disabled via the dbus interface. "
+                     "Perform this action when the fallback expires.\n";
+    std::cerr << " [--fallback_interval=<interval in ms>]    Enables the "
+                     "watchdog even when disabled via the dbus interface. "
+                     "Waits for this interval before performing the fallback "
+                     "action.\n";
     std::cerr << " [--continue]                              Continue daemon "
                      "after watchdog timeout.\n";
 }
diff --git a/mainapp.cpp b/mainapp.cpp
index d935bfb..365fd50 100644
--- a/mainapp.cpp
+++ b/mainapp.cpp
@@ -15,6 +15,7 @@
  */
 
 #include <iostream>
+#include <experimental/optional>
 #include <phosphor-logging/log.hpp>
 #include <phosphor-logging/elog.hpp>
 #include <phosphor-logging/elog-errors.hpp>
@@ -133,6 +134,47 @@
     }
     printActionTargets(actionTargets);
 
+    // Parse out the fallback settings for the watchdog. Note that we require
+    // both of the fallback arguments to do anything here, but having a fallback
+    // is entirely optional.
+    auto fallbackActionParam = (options)["fallback_action"];
+    auto fallbackIntervalParam = (options)["fallback_interval"];
+    if (fallbackActionParam.empty() ^ fallbackIntervalParam.empty())
+    {
+        exitWithError("Only one of the fallback options was specified.", argv);
+    }
+    if (fallbackActionParam.size() > 1 || fallbackIntervalParam.size() > 1)
+    {
+        exitWithError("Multiple fallbacks specified.", argv);
+    }
+    std::experimental::optional<Watchdog::Fallback> fallback;
+    if (!fallbackActionParam.empty())
+    {
+        Watchdog::Action action;
+        try
+        {
+            action = Watchdog::convertActionFromString(
+                    fallbackActionParam.back());
+        }
+        catch (const sdbusplus::exception::InvalidEnumString &)
+        {
+            exitWithError("Bad action specified.", argv);
+        }
+        uint64_t interval;
+        try
+        {
+            interval = std::stoull(fallbackIntervalParam.back());
+        }
+        catch (const std::logic_error &)
+        {
+            exitWithError("Failed to convert fallback interval to integer.", argv);
+        }
+        fallback = Watchdog::Fallback{
+            .action = action,
+            .interval = interval,
+        };
+    }
+
     sd_event* event = nullptr;
     auto r = sd_event_default(&event);
     if (r < 0)
@@ -155,7 +197,9 @@
     try
     {
         // Create a watchdog object
-        Watchdog watchdog(bus, path.c_str(), eventP, std::move(actionTargets));
+        Watchdog watchdog(bus, path.c_str(), eventP, std::move(actionTargets),
+            std::move(fallback));
+
         // Claim the bus
         bus.request_name(service.c_str());
 
diff --git a/test/watchdog_test.cpp b/test/watchdog_test.cpp
index a749435..af7c3ed 100644
--- a/test/watchdog_test.cpp
+++ b/test/watchdog_test.cpp
@@ -1,4 +1,6 @@
 #include <chrono>
+#include <memory>
+#include <utility>
 
 #include "watchdog_test.hpp"
 
@@ -170,3 +172,136 @@
     EXPECT_TRUE(wdog->timerExpired());
     EXPECT_FALSE(wdog->timerEnabled());
 }
+
+/** @brief Make sure the watchdog is started and enabled with a fallback
+ *         Wait through the initial trip and ensure the fallback is observed
+ *         Make sure that fallback runs to completion and ensure the watchdog
+ *         is disabled
+ */
+TEST_F(WdogTest, enableWdogWithFallbackTillEnd)
+{
+    auto primaryInterval = 5s;
+    auto primaryIntervalMs = milliseconds(primaryInterval).count();
+    auto fallbackInterval = primaryInterval * 2;
+    auto fallbackIntervalMs = milliseconds(fallbackInterval).count();
+
+    // We need to make a wdog with the right fallback options
+    // The interval is set to be noticeably different from the default
+    // so we can always tell the difference
+    Watchdog::Fallback fallback{
+        .action = Watchdog::Action::PowerOff,
+        .interval = static_cast<uint64_t>(fallbackIntervalMs),
+    };
+    std::map<Watchdog::Action, Watchdog::TargetName> emptyActionTargets;
+    wdog = std::make_unique<Watchdog>(bus, TEST_PATH, eventP,
+                    std::move(emptyActionTargets), std::move(fallback));
+    EXPECT_EQ(primaryInterval, milliseconds(wdog->interval(primaryIntervalMs)));
+    EXPECT_FALSE(wdog->enabled());
+    EXPECT_EQ(0, wdog->timeRemaining());
+
+    // Enable and then verify
+    EXPECT_TRUE(wdog->enabled(true));
+
+    // Waiting default expiration
+    EXPECT_EQ(primaryInterval - 1s, waitForWatchdog(primaryInterval));
+
+    // We should now have entered the fallback once the primary expires
+    EXPECT_FALSE(wdog->enabled());
+    auto remaining = milliseconds(wdog->timeRemaining());
+    EXPECT_GE(fallbackInterval, remaining);
+    EXPECT_LT(primaryInterval, remaining);
+    EXPECT_FALSE(wdog->timerExpired());
+    EXPECT_TRUE(wdog->timerEnabled());
+
+    // We should still be ticking in fallback when setting action or interval
+    auto newInterval = primaryInterval - 1s;
+    auto newIntervalMs = milliseconds(newInterval).count();
+    EXPECT_EQ(newInterval, milliseconds(wdog->interval(newIntervalMs)));
+    EXPECT_EQ(Watchdog::Action::None,
+            wdog->expireAction(Watchdog::Action::None));
+
+    EXPECT_FALSE(wdog->enabled());
+    EXPECT_GE(remaining, milliseconds(wdog->timeRemaining()));
+    EXPECT_LT(primaryInterval, milliseconds(wdog->timeRemaining()));
+    EXPECT_FALSE(wdog->timerExpired());
+    EXPECT_TRUE(wdog->timerEnabled());
+
+    // Test that setting the timeRemaining always resets the timer to the
+    // fallback interval
+    EXPECT_EQ(fallback.interval, wdog->timeRemaining(primaryInterval.count()));
+    EXPECT_FALSE(wdog->enabled());
+
+    remaining = milliseconds(wdog->timeRemaining());
+    EXPECT_GE(fallbackInterval, remaining);
+    EXPECT_LE(fallbackInterval - defaultDrift, remaining);
+    EXPECT_FALSE(wdog->timerExpired());
+    EXPECT_TRUE(wdog->timerEnabled());
+
+    // Waiting fallback expiration
+    EXPECT_EQ(fallbackInterval - 1s, waitForWatchdog(fallbackInterval));
+
+    // We should now have disabled the watchdog after the fallback expires
+    EXPECT_FALSE(wdog->enabled());
+    EXPECT_EQ(0, wdog->timeRemaining());
+    EXPECT_TRUE(wdog->timerExpired());
+    EXPECT_FALSE(wdog->timerEnabled());
+
+    // Make sure enabling the watchdog again works
+    EXPECT_TRUE(wdog->enabled(true));
+
+    // We should have re-entered the primary
+    EXPECT_TRUE(wdog->enabled());
+    EXPECT_GE(primaryInterval, milliseconds(wdog->timeRemaining()));
+    EXPECT_FALSE(wdog->timerExpired());
+    EXPECT_TRUE(wdog->timerEnabled());
+}
+
+/** @brief Make sure the watchdog is started and enabled with a fallback
+ *         Wait through the initial trip and ensure the fallback is observed
+ *         Make sure that we can re-enable the watchdog during fallback
+ */
+TEST_F(WdogTest, enableWdogWithFallbackReEnable)
+{
+    auto primaryInterval = 5s;
+    auto primaryIntervalMs = milliseconds(primaryInterval).count();
+    auto fallbackInterval = primaryInterval * 2;
+    auto fallbackIntervalMs = milliseconds(fallbackInterval).count();
+
+    // We need to make a wdog with the right fallback options
+    // The interval is set to be noticeably different from the default
+    // so we can always tell the difference
+    Watchdog::Fallback fallback{
+        .action = Watchdog::Action::PowerOff,
+        .interval = static_cast<uint64_t>(fallbackIntervalMs),
+    };
+    std::map<Watchdog::Action, Watchdog::TargetName> emptyActionTargets;
+    wdog = std::make_unique<Watchdog>(bus, TEST_PATH, eventP,
+                    std::move(emptyActionTargets), std::move(fallback));
+    EXPECT_EQ(primaryInterval, milliseconds(wdog->interval(primaryIntervalMs)));
+    EXPECT_FALSE(wdog->enabled());
+    EXPECT_EQ(0, wdog->timeRemaining());
+    EXPECT_FALSE(wdog->timerExpired());
+    EXPECT_FALSE(wdog->timerEnabled());
+
+    // Enable and then verify
+    EXPECT_TRUE(wdog->enabled(true));
+
+    // Waiting default expiration
+    EXPECT_EQ(primaryInterval - 1s, waitForWatchdog(primaryInterval));
+
+    // We should now have entered the fallback once the primary expires
+    EXPECT_FALSE(wdog->enabled());
+    auto remaining = milliseconds(wdog->timeRemaining());
+    EXPECT_GE(fallbackInterval, remaining);
+    EXPECT_LT(primaryInterval, remaining);
+    EXPECT_FALSE(wdog->timerExpired());
+    EXPECT_TRUE(wdog->timerEnabled());
+
+    EXPECT_TRUE(wdog->enabled(true));
+
+    // We should have re-entered the primary
+    EXPECT_TRUE(wdog->enabled());
+    EXPECT_GE(primaryInterval, milliseconds(wdog->timeRemaining()));
+    EXPECT_FALSE(wdog->timerExpired());
+    EXPECT_TRUE(wdog->timerEnabled());
+}
diff --git a/watchdog.cpp b/watchdog.cpp
index 24e5416..b0c7b6f 100644
--- a/watchdog.cpp
+++ b/watchdog.cpp
@@ -19,8 +19,12 @@
 {
     if (!value)
     {
-        // Attempt to disable our timer if needed
-        tryDisable();
+        // Make sure we accurately reflect our enabled state to the
+        // tryFallbackOrDisable() call
+        WatchdogInherits::enabled(value);
+
+        // Attempt to fallback or disable our timer if needed
+        tryFallbackOrDisable();
 
         return false;
     }
@@ -78,6 +82,13 @@
         return 0;
     }
 
+    if (!this->enabled())
+    {
+        // Having a timer but not displaying an enabled value means we
+        // are inside of the fallback
+        value = fallback->interval;
+    }
+
     // Update new expiration
     auto usec = duration_cast<microseconds>(milliseconds(value));
     timer.start(usec);
@@ -89,9 +100,13 @@
 // Optional callback function on timer expiration
 void Watchdog::timeOutHandler()
 {
-    auto action = expireAction();
-    auto target = actionTargets.find(action);
+    Action action = expireAction();
+    if (!this->enabled())
+    {
+        action = fallback->action;
+    }
 
+    auto target = actionTargets.find(action);
     if (target == actionTargets.end())
     {
         log<level::INFO>("watchdog: Timed out with no target",
@@ -112,12 +127,26 @@
         bus.call_noreply(method);
     }
 
-    tryDisable();
+    tryFallbackOrDisable();
 }
 
-void Watchdog::tryDisable()
+void Watchdog::tryFallbackOrDisable()
 {
-    if (timerEnabled())
+    // We only re-arm the watchdog if we were already enabled and have
+    // a possible fallback
+    if (fallback && this->enabled())
+    {
+        auto interval_ms = fallback->interval;
+        auto interval_us = duration_cast<microseconds>(milliseconds(interval_ms));
+
+        timer.clearExpired();
+        timer.start(interval_us);
+        timer.setEnabled<std::true_type>();
+
+        log<level::INFO>("watchdog: falling back",
+                entry("INTERVAL=%llu", interval_ms));
+    }
+    else if (timerEnabled())
     {
         timer.setEnabled<std::false_type>();
 
diff --git a/watchdog.hpp b/watchdog.hpp
index ee5e65d..60bad4e 100644
--- a/watchdog.hpp
+++ b/watchdog.hpp
@@ -1,6 +1,7 @@
 #pragma once
 
 #include <systemd/sd-event.h>
+#include <experimental/optional>
 #include <sdbusplus/bus.hpp>
 #include <sdbusplus/server/object.hpp>
 #include <xyz/openbmc_project/State/Watchdog/server.hpp>
@@ -31,21 +32,32 @@
          */
         using TargetName = std::string;
 
+        /** @brief Type used to specify the parameters of a fallback watchdog
+         */
+        struct Fallback {
+            Action action;
+            uint64_t interval;
+        };
+
         /** @brief Constructs the Watchdog object
          *
          *  @param[in] bus            - DBus bus to attach to.
          *  @param[in] objPath        - Object path to attach to.
          *  @param[in] event          - reference to sd_event unique pointer
          *  @param[in] actionTargets  - map of systemd targets called on timeout
+         *  @param[in] fallback
          */
         Watchdog(sdbusplus::bus::bus& bus,
                 const char* objPath,
                 EventPtr& event,
                 std::map<Action, TargetName>&& actionTargets =
-                    std::map<Action, TargetName>()) :
+                    std::map<Action, TargetName>(),
+                std::experimental::optional<Fallback>&&
+                    fallback = std::experimental::nullopt) :
             WatchdogInherits(bus, objPath),
             bus(bus),
             actionTargets(std::move(actionTargets)),
+            fallback(std::move(fallback)),
             timer(event, std::bind(&Watchdog::timeOutHandler, this))
         {
             // Nothing
@@ -106,14 +118,17 @@
         /** @brief Map of systemd units to be started when the timer expires */
         std::map<Action, TargetName> actionTargets;
 
+        /** @brief Fallback timer options */
+        std::experimental::optional<Fallback> fallback;
+
         /** @brief Contained timer object */
         Timer timer;
 
         /** @brief Optional Callback handler on timer expirartion */
         void timeOutHandler();
 
-        /** @brief Attempt to disable the watchdog if needed */
-        void tryDisable();
+        /** @brief Attempt to enter the fallback watchdog or disables it */
+        void tryFallbackOrDisable();
 };
 
 } // namespace watchdog