app/watchdog: Only log internal failures once

Logging internal failures to the phosphor-logger is a slow process that
can take up to 5 seconds. If we do this for each watchdog reset for a
relatively fast watchdog we will clog up the channel and fill the ESEL
log with errors. We probably don't want that kind of information to
build up. Instead, only report errors to the ESEL when the state of the
internal watchdog transitions from HEALTHY -> UNHEALTHY.

Tested:
    Works on zaius when booting and terminating the watchdog daemon
    while it is ticking. Produces the expected single error in the log
    and only delays the IPMI channel once.

Change-Id: I8110ba66c4a85e188666a34cb6d055bdbec30622
Signed-off-by: William A. Kennington III <wak@google.com>
diff --git a/app/watchdog.cpp b/app/watchdog.cpp
index ad167df..f6818cb 100644
--- a/app/watchdog.cpp
+++ b/app/watchdog.cpp
@@ -14,11 +14,32 @@
 
 #include "host-ipmid/ipmid-api.h"
 
+using phosphor::logging::commit;
 using phosphor::logging::level;
 using phosphor::logging::log;
-using phosphor::logging::report;
 using sdbusplus::xyz::openbmc_project::Common::Error::InternalFailure;
 
+static bool lastCallSuccessful = false;
+
+void reportError()
+{
+    // We don't want to fill the SEL with errors if the daemon dies and doesn't
+    // come back but the watchdog keeps on ticking. Instead, we only report the
+    // error if we haven't reported one since the last successful call
+    if (!lastCallSuccessful)
+    {
+        return;
+    }
+    lastCallSuccessful = false;
+
+    // TODO: This slow down the end of the IPMI transaction waiting
+    // for the commit to finish. commit<>() can take at least 5 seconds
+    // to complete. 5s is very slow for an IPMI command and ends up
+    // congesting the IPMI channel needlessly, especially if the watchdog
+    // is ticking fairly quickly and we have some transient issues.
+    commit<InternalFailure>();
+}
+
 ipmi_ret_t ipmi_app_watchdog_reset(ipmi_netfn_t netfn, ipmi_cmd_t cmd,
                                    ipmi_request_t request,
                                    ipmi_response_t response,
@@ -36,29 +57,31 @@
         // so it can configure actions and timeouts
         if (!wd_service.getInitialized())
         {
+            lastCallSuccessful = true;
             return IPMI_WDOG_CC_NOT_INIT;
         }
 
         // The ipmi standard dictates we enable the watchdog during reset
         wd_service.resetTimeRemaining(true);
+        lastCallSuccessful = true;
         return IPMI_CC_OK;
     }
     catch (const InternalFailure& e)
     {
-        report<InternalFailure>();
+        reportError();
         return IPMI_CC_UNSPECIFIED_ERROR;
     }
     catch (const std::exception& e)
     {
         const std::string e_str = std::string("wd_reset: ") + e.what();
         log<level::ERR>(e_str.c_str());
-        report<InternalFailure>();
+        reportError();
         return IPMI_CC_UNSPECIFIED_ERROR;
     }
     catch (...)
     {
         log<level::ERR>("wd_reset: Unknown Error");
-        report<InternalFailure>();
+        reportError();
         return IPMI_CC_UNSPECIFIED_ERROR;
     }
 }
@@ -156,6 +179,7 @@
         // Mark as initialized so that future resets behave correctly
         wd_service.setInitialized(true);
 
+        lastCallSuccessful = true;
         return IPMI_CC_OK;
     }
     catch (const std::domain_error&)
@@ -164,20 +188,20 @@
     }
     catch (const InternalFailure& e)
     {
-        report<InternalFailure>();
+        reportError();
         return IPMI_CC_UNSPECIFIED_ERROR;
     }
     catch (const std::exception& e)
     {
         const std::string e_str = std::string("wd_set: ") + e.what();
         log<level::ERR>(e_str.c_str());
-        report<InternalFailure>();
+        reportError();
         return IPMI_CC_UNSPECIFIED_ERROR;
     }
     catch (...)
     {
         log<level::ERR>("wd_set: Unknown Error");
-        report<InternalFailure>();
+        reportError();
         return IPMI_CC_UNSPECIFIED_ERROR;
     }
 }
@@ -264,24 +288,25 @@
 
         memcpy(response, &res, sizeof(res));
         *data_len = sizeof(res);
+        lastCallSuccessful = true;
         return IPMI_CC_OK;
     }
     catch (const InternalFailure& e)
     {
-        report<InternalFailure>();
+        reportError();
         return IPMI_CC_UNSPECIFIED_ERROR;
     }
     catch (const std::exception& e)
     {
         const std::string e_str = std::string("wd_get: ") + e.what();
         log<level::ERR>(e_str.c_str());
-        report<InternalFailure>();
+        reportError();
         return IPMI_CC_UNSPECIFIED_ERROR;
     }
     catch (...)
     {
         log<level::ERR>("wd_get: Unknown Error");
-        report<InternalFailure>();
+        reportError();
         return IPMI_CC_UNSPECIFIED_ERROR;
     }
 }