Watchdog timeout support in SBE boot window
Added support to handle an SBE boot failure when the watchdog
times out in the SBE boot window. FFDC information from the SBE
is captured using the libphal-provided API, and the SBE-specific
PEL for a valid FFDC is created. If the error is an SBE timeout
or there is no FFDC data, an SBE dump is initiated to capture
additional debug data.
Tested: verified PEL log
root@p10bmc:~# peltool -l
{
"0x50000332": {
"SRC": "BD123504",
"Message": "timeout reported during SBE boot process",
"PLID": "0x50000332",
"CreatorID": "BMC",
"Subsystem": "Processor Chip Cache",
"Commit Time": "10/04/2021 18:25:27",
"Sev": "Unrecoverable Error",
"CompID": "0x3500"
}
}
- Verified SBE dump was collected
Steps used:
1. obmcutil poweroff
2. istep -s0
3. systemctl start org.open_power.Dump.Manager.service
4. systemctl start openpower-debug-collector-watchdog@0.service
5. Check journal log to see SBE dump requested, dump entry created
and the dump is completed
journalctl -f -t watchdog_timeout
6. Verify the SBE dump:
ls /var/lib/phosphor-debug-collector/sbedump/<dump-entry-id>
- Verified Hostboot dump was collected
Steps Used:
1. obmcutil poweroff
2. istep -s0..6
3. systemctl start org.open_power.Dump.Manager.service
4. systemctl start openpower-debug-collector-watchdog@0.service
5. Check journal log to see Hostboot dump requested, dump entry
created and the dump is completed
journalctl -f -t watchdog_timeout
6. Verify the Hostboot dump:
ls /var/lib/phosphor-debug-collector/hostbootdump/<dump-entry-id>
Signed-off-by: Shantappa Teekappanavar <sbteeks@yahoo.com>
Change-Id: Ibfe7cc6619cd99f303c6106e617bc636632d0940
diff --git a/watchdog/watchdog_main.cpp b/watchdog/watchdog_main.cpp
index f503674..274867a 100644
--- a/watchdog/watchdog_main.cpp
+++ b/watchdog/watchdog_main.cpp
@@ -1,4 +1,16 @@
+#include <fmt/format.h>
+extern "C"
+{
+#include <libpdbg.h>
+#include <libpdbg_sbe.h>
+}
+
+#include <libphal.H>
+
+#include <phosphor-logging/log.hpp>
#include <watchdog_common.hpp>
+#include <watchdog_dbus.hpp>
+#include <watchdog_handler.hpp>
#include <watchdog_logging.hpp>
namespace watchdog
@@ -6,6 +18,8 @@
namespace dump
{
+using namespace phosphor::logging;
+
void triggerHostbootDump(const uint32_t timeout)
{
constexpr auto HOST_STATE_DIAGNOSTIC_MODE =
@@ -21,5 +35,99 @@
transitionHost(HOST_STATE_QUIESCE_TGT);
}
+/**
+ * @brief Handle an SBE boot failure seen on a watchdog timeout
+ *
+ * Attempts to capture SBE FFDC for the given processor target, creates a PEL
+ * for the failure (attaching the FFDC file when a descriptor is available)
+ * and, when the capture threw, reported no FFDC data, or reported a command
+ * timeout, requests an SBE dump.
+ *
+ * @param procTarget - pdbg target of the processor to collect FFDC from
+ * @param timeout - timeout value passed through to the dump request
+ */
+void handleSbeBootError(struct pdbg_target* procTarget, const uint32_t timeout)
+{
+    using namespace openpower::phal;
+
+    sbeError_t sbeError;
+    bool dumpIsRequired = false;
+
+    try
+    {
+        // Capture FFDC information on primary processor
+        sbeError = sbe::captureFFDC(procTarget);
+    }
+    catch (const std::exception& e)
+    {
+        // Failed to collect FFDC information; fall back to collecting a dump
+        log<level::ERR>(
+            fmt::format("captureFFDC: Exception {}", e.what()).c_str());
+        dumpIsRequired = true;
+    }
+
+    // event type
+    std::string event;
+    if ((sbeError.errType() == exception::SBE_FFDC_NO_DATA) ||
+        (sbeError.errType() == exception::SBE_CMD_TIMEOUT) || (dumpIsRequired))
+    {
+        // No usable FFDC (or the capture itself failed): report a boot
+        // timeout and request a dump below.
+        log<level::INFO>("No FFDC data");
+        event = "org.open_power.Processor.Error.SbeBootTimeout";
+        dumpIsRequired = true;
+    }
+    else
+    {
+        log<level::ERR>("SBE Boot failure");
+        event = "org.open_power.Processor.Error.SbeBootFailure";
+    }
+
+    // Additional data
+    std::map<std::string, std::string> additionalData;
+
+    // SRC6 : [0:15] chip position
+    uint32_t index = pdbg_target_index(procTarget);
+    additionalData.emplace("SRC6", std::to_string(index << 16));
+    additionalData.emplace("SBE_ERR_MSG", sbeError.what());
+
+    // FFDC
+    auto ffdc = std::vector<FFDCTuple>{};
+    // get SBE ffdc file descriptor
+    auto fd = sbeError.getFd();
+
+    // Log error with additional ffdc if fd is valid
+    if (fd > 0)
+    {
+        ffdc.push_back(
+            std::make_tuple(sdbusplus::xyz::openbmc_project::Logging::server::
+                                Create::FFDCFormat::Custom,
+                            static_cast<uint8_t>(0xCB),
+                            static_cast<uint8_t>(0x01), fd));
+    }
+
+    auto pelId = createPel(event, additionalData, ffdc);
+
+    if (dumpIsRequired)
+    {
+        try
+        {
+            // Check SBE dump collection allowed
+            bool dumpAllowed = sbe::isDumpAllowed(procTarget);
+            if (!dumpAllowed)
+            {
+                // Possibly another collection in progress, skip dump collection
+                log<level::INFO>("Another collection is in progress, skipping "
+                                 "dump collection");
+                return;
+            }
+        }
+        catch (const std::exception& e)
+        {
+            // Could not determine whether a dump is allowed; do not request one
+            log<level::ERR>(
+                fmt::format("Exception {} occurred", e.what()).c_str());
+            return;
+        }
+
+        // Tie the dump to the PEL created above and to this processor
+        DumpParameters dumpParameters;
+        dumpParameters.logId = pelId;
+        dumpParameters.unitId = index;
+        dumpParameters.timeout = timeout;
+        dumpParameters.dumpType = DumpType::SBE;
+
+        // will not return until dump is complete or timeout
+        requestDump(dumpParameters);
+    }
+}
+
} // namespace dump
} // namespace watchdog