Watchdog timeout support in SBE boot window
Added support to handle SBE boot failure when the watchdog
times out in the SBE boot window. FFDC information from the SBE
is captured using the libphal-provided API, and an SBE-specific
PEL is created when the FFDC is valid. If the error is related
to an SBE timeout, or there is no FFDC data, an SBE dump is
initiated to capture additional debug data.
Tested: verified PEL log
root@p10bmc:~# peltool -l
{
"0x50000332": {
"SRC": "BD123504",
"Message": "timeout reported during SBE boot
process",
"PLID": "0x50000332",
"CreatorID": "BMC",
"Subsystem": "Processor Chip Cache",
"Commit Time": "10/04/2021 18:25:27",
"Sev": "Unrecoverable Error",
"CompID": "0x3500"
}
}
- Verified SBE dump was collected
Steps used:
1. obmcutil poweroff
2. istep -s0
3. systemctl start org.open_power.Dump.Manager.service
4. systemctl start openpower-debug-collector-watchdog@0.service
5. Check journal log to see SBE dump requested, dump entry created
and the dump is completed
journalctl -f -t watchdog_timeout
6. Verify the SBE dump:
ls /var/lib/phosphor-debug-collector/sbedump/<dump-entry-id>
- Verified Hostboot dump was collected
Steps Used:
1. obmcutil poweroff
2. istep -s0..6
3. systemctl start org.open_power.Dump.Manager.service
4. systemctl start openpower-debug-collector-watchdog@0.service
5. Check journal log to see Hostboot dump requested, dump entry
created and the dump is completed
journalctl -f -t watchdog_timeout
6. Verify the Hostboot dump:
ls /var/lib/phosphor-debug-collector/hostbootdump/<dump-entry-id>
Signed-off-by: Shantappa Teekappanavar <sbteeks@yahoo.com>
Change-Id: Ibfe7cc6619cd99f303c6106e617bc636632d0940
diff --git a/watchdog_timeout.cpp b/watchdog_timeout.cpp
index ca22ba3..5c374ef 100644
--- a/watchdog_timeout.cpp
+++ b/watchdog_timeout.cpp
@@ -2,7 +2,19 @@
#include <CLI/CLI.hpp>
-#ifdef HOSTBOOT_DUMP_COLLECTION
+#ifdef WATCHDOG_DUMP_COLLECTION
+extern "C"
+{
+#include <libpdbg.h>
+#include <libpdbg_sbe.h>
+}
+
+#include <fmt/format.h>
+#include <libphal.H>
+
+#include <phosphor-logging/log.hpp>
+#include <watchdog/watchdog_common.hpp>
+#include <watchdog/watchdog_dbus.hpp>
#include <watchdog/watchdog_main.hpp>
#else
#include "org/open_power/Host/Boot/error.hpp"
@@ -15,18 +27,61 @@
{
CLI::App app{"Hostboot dump collector for watchdog timeout"};
-#ifdef HOSTBOOT_DUMP_COLLECTION
- uint32_t timeoutInterval = 1500; // in seconds
- app.add_option("-t,--timeout", timeoutInterval,
+#ifdef WATCHDOG_DUMP_COLLECTION
+ constexpr uint32_t dumpTimeout = 1500; // in seconds
+ uint32_t timeout = dumpTimeout;
+ app.add_option("-t,--timeout", timeout,
"Set timeout interval for watchdog timeout in seconds");
#endif
CLI11_PARSE(app, argc, argv);
-#ifdef HOSTBOOT_DUMP_COLLECTION
+#ifdef WATCHDOG_DUMP_COLLECTION
+ using namespace phosphor::logging;
using namespace watchdog::dump;
- // TODO: trigger SBE dump if in SBE window otherwise hostboot dump
- triggerHostbootDump(timeoutInterval);
+
+ log<level::INFO>("Host did not respond within watchdog timeout interval");
+ try
+ {
+ using namespace openpower::phal;
+
+ // Initialize pdbg library, default parameters are used for init()
+ pdbg::init();
+
+ // Get Primary Proc
+ struct pdbg_target* procTarget = pdbg::getPrimaryProc();
+
+ // Check Primary IPL done
+ bool primaryIplDone = sbe::isPrimaryIplDone();
+ if (primaryIplDone)
+ {
+ // SBE boot done, Need to collect hostboot dump
+ log<level::INFO>("Handle Hostboot boot failure");
+ triggerHostbootDump(timeout);
+ }
+ else
+ {
+ // SBE boot window, handle SBE boot failure
+ log<level::INFO>("Handle SBE boot failure");
+ handleSbeBootError(procTarget, timeout);
+ }
+ }
+ catch (const std::exception& e)
+ {
+ log<level::ERR>(fmt::format("Exception {} occurred", e.what()).c_str());
+ std::string eventType =
+ "org.open_power.Processor.Error.WatchdogTimeout";
+ auto ffdc = std::vector<FFDCTuple>{};
+ std::map<std::string, std::string> additionalData;
+
+ if (!createPel(eventType, additionalData, ffdc))
+ {
+ log<level::ERR>("Failed to create PEL");
+ }
+
+ return EXIT_SUCCESS;
+ }
+
#else
using namespace phosphor::logging;
using error =
@@ -34,5 +89,5 @@
report<error>();
#endif
- return 0;
+ return EXIT_SUCCESS;
}