Watchdog timeout support in SBE boot window
Add support for handling an SBE boot failure when the watchdog
times out in the SBE boot window. FFDC information from the SBE
is captured using the libphal-provided API, and an SBE-specific
PEL is created when valid FFDC is available. If the error is an
SBE timeout, or there is no FFDC data, an SBE dump is initiated
to capture additional debug data.
Tested: verified PEL log
root@p10bmc:~# peltool -l
{
"0x50000332": {
"SRC": "BD123504",
"Message": "timeout reported during SBE boot
process",
"PLID": "0x50000332",
"CreatorID": "BMC",
"Subsystem": "Processor Chip Cache",
"Commit Time": "10/04/2021 18:25:27",
"Sev": "Unrecoverable Error",
"CompID": "0x3500"
}
}
- Verified SBE dump was collected
Steps used:
1. obmcutil poweroff
2. istep -s0
3. systemctl start org.open_power.Dump.Manager.service
4. systemctl start openpower-debug-collector-watchdog@0.service
5. Check journal log to see SBE dump requested, dump entry created
and the dump is completed
journalctl -f -t watchdog_timeout
6. Verify the SBE dump:
ls /var/lib/phosphor-debug-collector/sbedump/<dump-entry-id>
- Verified Hostboot dump was collected
Steps Used:
1. obmcutil poweroff
2. istep -s0..6
3. systemctl start org.open_power.Dump.Manager.service
4. systemctl start openpower-debug-collector-watchdog@0.service
5. Check journal log to see Hostboot dump requested, dump entry
created and the dump is completed
journalctl -f -t watchdog_timeout
6. Verify the Hostboot dump:
ls /var/lib/phosphor-debug-collector/hostbootdump/<dump-entry-id>
Signed-off-by: Shantappa Teekappanavar <sbteeks@yahoo.com>
Change-Id: Ibfe7cc6619cd99f303c6106e617bc636632d0940
diff --git a/meson.build b/meson.build
index b03d995..4f3415c 100644
--- a/meson.build
+++ b/meson.build
@@ -44,10 +44,6 @@
fmt_dep = fmt_proj.dependency('fmt')
endif
-deps = [
- systemd, sdbusplus, phosphorlogging, fmt_dep
-]
-
realpath_prog = find_program('realpath')
selected_subdirs = []
@@ -86,15 +82,30 @@
endforeach
endforeach
+cxx = meson.get_compiler('cpp')
conf_data = configuration_data()
if get_option('hostboot-dump-collection').enabled()
- conf_data.set('HOSTBOOT_DUMP_COLLECTION', true)
+ conf_data.set('WATCHDOG_DUMP_COLLECTION', true)
+ extra_deps = [
+ cxx.find_library('pdbg'),
+ cxx.find_library('libdt-api'),
+ cxx.find_library('phal')
+ ]
subdir('watchdog')
else
- conf_data.set('HOSTBOOT_DUMP_COLLECTION', false)
+ conf_data.set('WATCHDOG_DUMP_COLLECTION', false)
watchdog_lib = []
+ extra_deps = []
endif
+deps = [
+ systemd,
+ sdbusplus,
+ phosphorlogging,
+ fmt_dep,
+ extra_deps
+]
+
executable('watchdog_timeout',
'watchdog_timeout.cpp',
configure_file(output: 'config.h', configuration: conf_data),
diff --git a/watchdog/watchdog_common.cpp b/watchdog/watchdog_common.cpp
index d8e64b4..4d9acd9 100644
--- a/watchdog/watchdog_common.cpp
+++ b/watchdog/watchdog_common.cpp
@@ -1,5 +1,3 @@
-#include <libpdbg.h>
-
#include <phosphor-logging/log.hpp>
#include <sdbusplus/bus.hpp>
#include <watchdog_common.hpp>
diff --git a/watchdog/watchdog_handler.cpp b/watchdog/watchdog_handler.cpp
index 093bf0a..cb22446 100644
--- a/watchdog/watchdog_handler.cpp
+++ b/watchdog/watchdog_handler.cpp
@@ -1,3 +1,5 @@
+#include <fmt/format.h>
+
#include <phosphor-logging/log.hpp>
#include <sdbusplus/bus.hpp>
#include <sdbusplus/bus/match.hpp>
@@ -17,11 +19,11 @@
*
* @param msg - dbus message from the dbus match infrastructure
* @param path - the object path we are monitoring
- * @param inProgress - used to break out of our dbus wait loop
+ * @param progressStatus - dump progress status
* @return Always non-zero indicating no error, no cascading callbacks
*/
uint dumpStatusChanged(sdbusplus::message::message& msg, std::string path,
- bool& inProgress)
+ DumpProgressStatus& progressStatus)
{
// reply (msg) will be a property change message
std::string interface;
@@ -40,10 +42,22 @@
if ((nullptr != status) && ("xyz.openbmc_project.Common.Progress."
"OperationStatus.InProgress" != *status))
{
- // dump is done, trace some info and change in progress flag
+ // dump is not in InProgress state, trace some info and change in
+ // progress status
log<level::INFO>(path.c_str());
log<level::INFO>((*status).c_str());
- inProgress = false;
+
+ if ("xyz.openbmc_project.Common.Progress.OperationStatus."
+ "Completed" == *status)
+ {
+ // Dump completed successfully
+ progressStatus = DumpProgressStatus::Completed;
+ }
+ else
+ {
+ // Dump Failed
+ progressStatus = DumpProgressStatus::Failed;
+ }
}
}
@@ -58,7 +72,8 @@
*/
void monitorDump(const std::string& path, const uint32_t timeout)
{
- bool inProgress = true; // callback will update this
+ // callback will update progressStatus
+ DumpProgressStatus progressStatus = DumpProgressStatus::InProgress;
// setup the signal match rules and callback
std::string matchInterface = "xyz.openbmc_project.Common.Progress";
@@ -70,15 +85,15 @@
sdbusplus::bus::match::rules::propertiesChanged(
path.c_str(), matchInterface.c_str()),
[&](auto& msg) {
- return dumpStatusChanged(msg, path, inProgress);
+ return dumpStatusChanged(msg, path, progressStatus);
});
// wait for dump status to be completed (complete == true)
// or until timeout interval
- log<level::INFO>("hbdump requested");
+
bool timedOut = false;
uint32_t secondsCount = 0;
- while ((true == inProgress) && !timedOut)
+ while ((DumpProgressStatus::InProgress == progressStatus) && !timedOut)
{
bus.wait(std::chrono::seconds(1));
bus.process_discard();
@@ -91,16 +106,20 @@
if (timedOut)
{
- log<level::ERR>("hbdump dump progress status did not change to "
+ log<level::ERR>("Dump progress status did not change to "
"complete within the timeout interval, exiting...");
}
+ else if (DumpProgressStatus::Completed == progressStatus)
+ {
+ log<level::INFO>("dump collection completed");
+ }
else
{
- log<level::INFO>("hbdump completed");
+ log<level::INFO>("dump collection failed");
}
}
-void requestDump(const uint32_t logId, const uint32_t timeout)
+void requestDump(const DumpParameters& dumpParameters)
{
constexpr auto path = "/org/openpower/dump";
constexpr auto interface = "xyz.openbmc_project.Dump.Create";
@@ -115,10 +134,24 @@
// dbus call arguments
std::map<std::string, std::variant<std::string, uint64_t>>
createParams;
- createParams["com.ibm.Dump.Create.CreateParameters.DumpType"] =
- "com.ibm.Dump.Create.DumpType.Hostboot";
createParams["com.ibm.Dump.Create.CreateParameters.ErrorLogId"] =
- uint64_t(logId);
+ uint64_t(dumpParameters.logId);
+ if (DumpType::Hostboot == dumpParameters.dumpType)
+ {
+ log<level::INFO>("hostboot dump requested");
+ createParams["com.ibm.Dump.Create.CreateParameters.DumpType"] =
+ "com.ibm.Dump.Create.DumpType.Hostboot";
+ }
+ else if (DumpType::SBE == dumpParameters.dumpType)
+ {
+ log<level::INFO>("SBE dump requested");
+ createParams["com.ibm.Dump.Create.CreateParameters.DumpType"] =
+ "com.ibm.Dump.Create.DumpType.SBE";
+ createParams
+ ["com.ibm.Dump.Create.CreateParameters.FailingUnitId"] =
+ dumpParameters.unitId;
+ }
+
method.append(createParams);
// using system dbus
@@ -130,12 +163,29 @@
response.read(reply);
// monitor dump progress
- monitorDump(reply, timeout);
+ monitorDump(reply, dumpParameters.timeout);
}
catch (const sdbusplus::exception::exception& e)
{
- log<level::ERR>("Error in requestDump",
- entry("ERROR=%s", e.what()));
+            constexpr auto ERROR_DUMP_DISABLED =
+                "xyz.openbmc_project.Dump.Create.Error.Disabled";
+            if (e.name() && (std::string(e.name()) == ERROR_DUMP_DISABLED))
+            {
+                // Error name matched by content: dump is disabled, skip it.
+                log<level::INFO>(
+                    fmt::format(
+                        "Dump is disabled on({}), skipping dump collection",
+                        dumpParameters.unitId)
+                        .c_str());
+            }
+            else
+            {
+                log<level::ERR>(
+                    fmt::format("D-Bus call createDump exception "
+                                "OBJPATH={}, INTERFACE={}, EXCEPTION={}", path,
+                                interface, e.what())
+                        .c_str());
+            }
}
}
}
diff --git a/watchdog/watchdog_handler.hpp b/watchdog/watchdog_handler.hpp
index 57bbdf8..e3ff2a0 100644
--- a/watchdog/watchdog_handler.hpp
+++ b/watchdog/watchdog_handler.hpp
@@ -3,7 +3,7 @@
#include <stdint.h>
/**
- * @brief Hostboot dump collector handler
+ * @brief dump collection handler
*
* Handle collection due to host going down
*/
@@ -13,17 +13,40 @@
namespace dump
{
+/** @brief Dump types supported by dump request */
+enum class DumpType
+{
+ Hostboot,
+ SBE
+};
+
+/** @brief Structure for dump request parameters */
+struct DumpParameters
+{
+ uint32_t logId;
+ uint32_t unitId;
+ uint32_t timeout;
+ DumpType dumpType;
+};
+
+/** @brief Dump progress states */
+enum class DumpProgressStatus
+{
+ InProgress,
+ Completed,
+ Failed
+};
+
/**
* @brief Request a dump from the dump manager
*
* Request a dump from the dump manager and register a monitor for observing
* the dump progress.
*
- * @param logId - the id of the event log associated with this dump request
- * @param timeout - timeout interval in seconds
+ * @param dumpParameters - parameters for the dump request
*
*/
-void requestDump(const uint32_t logId, const uint32_t timeout);
+void requestDump(const DumpParameters&);
} // namespace dump
} // namespace watchdog
diff --git a/watchdog/watchdog_logging.cpp b/watchdog/watchdog_logging.cpp
index 47024f6..1e993c0 100644
--- a/watchdog/watchdog_logging.cpp
+++ b/watchdog/watchdog_logging.cpp
@@ -28,8 +28,15 @@
// Create PEL with additional data.
auto pelId = createPel(eventName, additional, emptyFfdc);
- // will not return until dump is complete or times out
- requestDump(pelId, timeout);
+ // Collect Hostboot dump if auto reboot is enabled
+ DumpParameters dumpParameters;
+ dumpParameters.logId = pelId;
+ dumpParameters.unitId = 0; // Not used for Hostboot dump
+ dumpParameters.timeout = timeout;
+ dumpParameters.dumpType = DumpType::Hostboot;
+
+ // will not return until dump is complete or timeout
+ requestDump(dumpParameters);
}
void eventWatchdogTimeout(const uint32_t timeout)
diff --git a/watchdog/watchdog_main.cpp b/watchdog/watchdog_main.cpp
index f503674..274867a 100644
--- a/watchdog/watchdog_main.cpp
+++ b/watchdog/watchdog_main.cpp
@@ -1,4 +1,16 @@
+#include <fmt/format.h>
+extern "C"
+{
+#include <libpdbg.h>
+#include <libpdbg_sbe.h>
+}
+
+#include <libphal.H>
+
+#include <phosphor-logging/log.hpp>
#include <watchdog_common.hpp>
+#include <watchdog_dbus.hpp>
+#include <watchdog_handler.hpp>
#include <watchdog_logging.hpp>
namespace watchdog
@@ -6,6 +18,8 @@
namespace dump
{
+using namespace phosphor::logging;
+
void triggerHostbootDump(const uint32_t timeout)
{
constexpr auto HOST_STATE_DIAGNOSTIC_MODE =
@@ -21,5 +35,99 @@
transitionHost(HOST_STATE_QUIESCE_TGT);
}
+void handleSbeBootError(struct pdbg_target* procTarget, const uint32_t timeout)
+{
+    using namespace openpower::phal;
+
+    sbeError_t sbeError;
+    bool dumpIsRequired = false;
+
+    try
+    {
+        // Capture FFDC information from the SBE of the given processor
+        sbeError = sbe::captureFFDC(procTarget);
+    }
+    catch (const std::exception& e)
+    {
+        // FFDC capture failed: log it and fall back to requesting a dump
+        log<level::ERR>(
+            fmt::format("captureFFDC: Exception {}", e.what()).c_str());
+        dumpIsRequired = true;
+    }
+
+    // Pick the PEL event; no-FFDC, timeout and capture-failure request a dump
+    std::string event;
+    if ((sbeError.errType() == exception::SBE_FFDC_NO_DATA) ||
+        (sbeError.errType() == exception::SBE_CMD_TIMEOUT) || (dumpIsRequired))
+    {
+        log<level::INFO>("No FFDC data");
+        event = "org.open_power.Processor.Error.SbeBootTimeout";
+        dumpIsRequired = true;
+    }
+    else
+    {
+        log<level::ERR>("SBE Boot failure");
+        event = "org.open_power.Processor.Error.SbeBootFailure";
+    }
+
+    // Additional data attached to the PEL
+    std::map<std::string, std::string> additionalData;
+
+    // SRC6 : [0:15] chip position
+    uint32_t index = pdbg_target_index(procTarget);
+    additionalData.emplace("SRC6", std::to_string(index << 16));
+    additionalData.emplace("SBE_ERR_MSG", sbeError.what());
+
+    // FFDC entries passed to createPel (left empty when there is no valid fd)
+    auto ffdc = std::vector<FFDCTuple>{};
+    // get SBE ffdc file descriptor
+    auto fd = sbeError.getFd();
+
+    // Log error with additional ffdc if fd is valid
+    if (fd > 0)
+    {
+        ffdc.push_back(
+            std::make_tuple(sdbusplus::xyz::openbmc_project::Logging::server::
+                                Create::FFDCFormat::Custom,
+                            static_cast<uint8_t>(0xCB),
+                            static_cast<uint8_t>(0x01), fd));
+    }
+
+    auto pelId = createPel(event, additionalData, ffdc);
+
+    if (dumpIsRequired)
+    {
+        try
+        {
+            // An exception from isDumpAllowed() is logged and skips the dump
+
+            // Check SBE dump collection allowed
+            bool dumpAllowed = sbe::isDumpAllowed(procTarget);
+            if (!dumpAllowed)
+            {
+                // Possibly another collection in progress, skip dump collection
+                log<level::INFO>("Another collection is in progress, skipping "
+                                 "dump collection");
+                return;
+            }
+        }
+        catch (const std::exception& e)
+        {
+            log<level::ERR>(
+                fmt::format("Exception {} occurred", e.what()).c_str());
+            return;
+        }
+
+        DumpParameters dumpParameters;
+        dumpParameters.logId = pelId;
+        dumpParameters.unitId = index;
+        dumpParameters.timeout = timeout;
+        dumpParameters.dumpType = DumpType::SBE;
+
+        // will not return until dump is complete or timeout
+        requestDump(dumpParameters);
+    }
+}
+
} // namespace dump
} // namespace watchdog
diff --git a/watchdog/watchdog_main.hpp b/watchdog/watchdog_main.hpp
index 1b1e5a6..16cae7c 100644
--- a/watchdog/watchdog_main.hpp
+++ b/watchdog/watchdog_main.hpp
@@ -19,5 +19,13 @@
*/
void triggerHostbootDump(const uint32_t timeout);
+/**
+ * @brief Handle SBE Boot Error
+ *
+ * @param procTarget - Processor target
+ * @param timeout - timeout interval in seconds
+ */
+void handleSbeBootError(struct pdbg_target* procTarget, const uint32_t timeout);
+
} // namespace dump
} // namespace watchdog
diff --git a/watchdog_timeout.cpp b/watchdog_timeout.cpp
index ca22ba3..5c374ef 100644
--- a/watchdog_timeout.cpp
+++ b/watchdog_timeout.cpp
@@ -2,7 +2,19 @@
#include <CLI/CLI.hpp>
-#ifdef HOSTBOOT_DUMP_COLLECTION
+#ifdef WATCHDOG_DUMP_COLLECTION
+extern "C"
+{
+#include <libpdbg.h>
+#include <libpdbg_sbe.h>
+}
+
+#include <fmt/format.h>
+#include <libphal.H>
+
+#include <phosphor-logging/log.hpp>
+#include <watchdog/watchdog_common.hpp>
+#include <watchdog/watchdog_dbus.hpp>
#include <watchdog/watchdog_main.hpp>
#else
#include "org/open_power/Host/Boot/error.hpp"
@@ -15,18 +27,61 @@
{
CLI::App app{"Hostboot dump collector for watchdog timeout"};
-#ifdef HOSTBOOT_DUMP_COLLECTION
- uint32_t timeoutInterval = 1500; // in seconds
- app.add_option("-t,--timeout", timeoutInterval,
+#ifdef WATCHDOG_DUMP_COLLECTION
+ constexpr uint32_t dumpTimeout = 1500; // in seconds
+ uint32_t timeout = dumpTimeout;
+ app.add_option("-t,--timeout", timeout,
"Set timeout interval for watchdog timeout in seconds");
#endif
CLI11_PARSE(app, argc, argv);
-#ifdef HOSTBOOT_DUMP_COLLECTION
+#ifdef WATCHDOG_DUMP_COLLECTION
+ using namespace phosphor::logging;
using namespace watchdog::dump;
- // TODO: trigger SBE dump if in SBE window otherwise hostboot dump
- triggerHostbootDump(timeoutInterval);
+
+ log<level::INFO>("Host did not respond within watchdog timeout interval");
+ try
+ {
+ using namespace openpower::phal;
+
+ // Initialize pdbg library, default parameters are used for init()
+ pdbg::init();
+
+ // Get Primary Proc
+ struct pdbg_target* procTarget = pdbg::getPrimaryProc();
+
+ // Check Primary IPL done
+ bool primaryIplDone = sbe::isPrimaryIplDone();
+ if (primaryIplDone)
+ {
+ // SBE boot done, Need to collect hostboot dump
+ log<level::INFO>("Handle Hostboot boot failure");
+ triggerHostbootDump(timeout);
+ }
+ else
+ {
+ // SBE boot window, handle SBE boot failure
+ log<level::INFO>("Handle SBE boot failure");
+ handleSbeBootError(procTarget, timeout);
+ }
+ }
+ catch (const std::exception& e)
+ {
+ log<level::ERR>(fmt::format("Exception {} occurred", e.what()).c_str());
+ std::string eventType =
+ "org.open_power.Processor.Error.WatchdogTimeout";
+ auto ffdc = std::vector<FFDCTuple>{};
+ std::map<std::string, std::string> additionalData;
+
+ if (!createPel(eventType, additionalData, ffdc))
+ {
+ log<level::ERR>("Failed to create PEL");
+ }
+
+ return EXIT_SUCCESS;
+ }
+
#else
using namespace phosphor::logging;
using error =
@@ -34,5 +89,5 @@
report<error>();
#endif
- return 0;
+ return EXIT_SUCCESS;
}