Update SBE dump collection process and error handling
This commit adds dump collection from the SBE. It introduces the
capability to invoke the getdump chip operation on the SBE and
adds traces for any failures encountered during the operation.
In addition to the above, this update brings in a new utility function
designed to write the collected dump data into a specified file format
and path.
Furthermore, the commit incorporates a logic check function to determine
the necessity of fastarray data collection when performing the dump from
SBE. This addition optimizes the dump collection process by selectively
gathering fastarray data only when it's deemed necessary.
The dump-collect application reports a failure if the destination path
remains empty post-operation, indicating that no dump files were
successfully collected from any of the SBEs.
Tests:
- Collect hardware dump
- Collect hostboot dump
Signed-off-by: Dhruvaraj Subhashchandran <dhruvaraj@in.ibm.com>
Change-Id: Ia65142818ff7723721b78dae19f0d13afc1a33fc
diff --git a/dump/sbe_dump_collector.cpp b/dump/sbe_dump_collector.cpp
index 5f0f83f..119fdff 100644
--- a/dump/sbe_dump_collector.cpp
+++ b/dump/sbe_dump_collector.cpp
@@ -11,10 +11,17 @@
#include <sys/wait.h>
#include <unistd.h>
+#include <phosphor-logging/elog-errors.hpp>
#include <phosphor-logging/lg2.hpp>
#include <phosphor-logging/log.hpp>
+#include <sbe_consts.hpp>
+#include <xyz/openbmc_project/Common/File/error.hpp>
+#include <xyz/openbmc_project/Common/error.hpp>
+#include <cstdint>
+#include <filesystem>
#include <format>
+#include <fstream>
#include <stdexcept>
namespace openpower::dump::sbe_chipop
@@ -57,7 +64,15 @@
// Wait for all asynchronous tasks to complete
for (auto& future : futures)
{
- future.wait();
+ try
+ {
+ future.wait();
+ }
+ catch (const std::exception& e)
+ {
+ lg2::error("Failed to collect dump from SBE ErrorMsg({ERROR})",
+ "ERROR", e);
+ }
}
lg2::info(
"Dump collection completed for clock state({CSTATE}): type({TYPE}) "
@@ -65,7 +80,11 @@
"CSTATE", cstate, "TYPE", type, "ID", id, "FAILINGUNIT",
failingUnit, "PATH", path.string());
}
-
+ if (std::filesystem::is_empty(path))
+ {
+ lg2::error("Failed to collect the dump");
+ throw std::runtime_error("Failed to collect the dump");
+ }
lg2::info("Dump collection completed");
}
@@ -89,12 +108,20 @@
continue;
}
- // Launch an asynchronous task instead of forking
auto future =
std::async(std::launch::async,
[this, target, path, id, type, cstate, failingUnit]() {
- this->collectDumpFromSBE(target, path, id, type, cstate,
- failingUnit);
+                try
+                {
+                    this->collectDumpFromSBE(target, path, id, type, cstate,
+                                             failingUnit);
+                }
+                catch (const std::exception& e) // logged here, not rethrown
+                {
+                    lg2::error("Failed to collect dump from SBE on "
+                               "Proc-({PROCINDEX}) ErrorMsg({ERROR})", "PROCINDEX",
+                               pdbg_target_index(target), "ERROR", e);
+                }
});
futures.push_back(std::move(future));
@@ -115,6 +142,102 @@
"type({TYPE}) clockState({CLOCKSTATE}) failingUnit({FAILINGUNIT})",
"PROC", chipPos, "PATH", path.string(), "ID", id, "TYPE", type,
"CLOCKSTATE", clockState, "FAILINGUNIT", failingUnit);
+
+ util::DumpDataPtr dataPtr;
+ uint32_t len = 0;
+ uint8_t collectFastArray =
+ checkFastarrayCollectionNeeded(clockState, type, failingUnit, chipPos);
+
+ try
+ {
+ openpower::phal::sbe::getDump(chip, type, clockState, collectFastArray,
+ dataPtr.getPtr(), &len);
+ }
+ catch (const openpower::phal::sbeError_t& sbeError)
+ {
+ if (sbeError.errType() ==
+ openpower::phal::exception::SBE_CHIPOP_NOT_ALLOWED)
+ {
+ // SBE is not ready to accept chip-ops,
+ // Skip the request, no additional error handling required.
+ lg2::info("Collect dump: Skipping ({ERROR}) dump({TYPE}) "
+ "on proc({PROC}) clock state({CLOCKSTATE})",
+ "ERROR", sbeError, "TYPE", type, "PROC", chipPos,
+ "CLOCKSTATE", clockState);
+ return;
+ }
+
+ lg2::error("Error in collecting dump dump type({TYPE}), "
+ "clockstate({CLOCKSTATE}), proc position({PROC}), "
+ "collectFastArray({COLLECTFASTARRAY}) error({ERROR})",
+ "TYPE", type, "CLOCKSTATE", clockState, "PROC", chipPos,
+ "COLLECTFASTARRAY", collectFastArray, "ERROR", sbeError);
+
+ return;
+ }
+ writeDumpFile(path, id, clockState, 0, "proc", chipPos, dataPtr, len);
+}
+
+// Writes one collected SBE dump buffer to a new file under path, named
+// <id zero-padded to 8>.SbeDataClocks<On|Off>.node<nodeNum>.<chipName><chipPos>.
+// Open and write failures are logged and reported rather than propagated.
+void SbeDumpCollector::writeDumpFile(
+    const std::filesystem::path& path, const uint32_t id,
+    const uint8_t clockState, const uint8_t nodeNum, std::string chipName,
+    const uint8_t chipPos, util::DumpDataPtr& dataPtr, const uint32_t len)
+{
+    namespace fileError = sdbusplus::xyz::openbmc_project::Common::File::Error;
+
+    // Construct the filename
+    std::ostringstream filenameBuilder;
+    filenameBuilder << std::setw(8) << std::setfill('0') << id
+                    << ".SbeDataClocks"
+                    << (clockState == SBE_CLOCK_ON ? "On" : "Off") << ".node"
+                    << static_cast<int>(nodeNum) << "." << chipName
+                    << static_cast<int>(chipPos);
+
+    auto dumpPath = path / filenameBuilder.str();
+
+    // Attempt to open the file
+    std::ofstream outfile(dumpPath, std::ios::out | std::ios::binary);
+    if (!outfile)
+    {
+        using metadata = xyz::openbmc_project::Common::File::Open;
+        auto err = errno;
+        lg2::error("Error opening file to write dump, "
+                   "errno({ERRNO}), filepath({FILEPATH})",
+                   "ERRNO", err, "FILEPATH", dumpPath.string());
+        report<fileError::Open>(metadata::ERRNO(err),
+                                metadata::PATH(dumpPath.c_str()));
+        // Just return here, so that the dumps collected from other
+        // SBEs can be packaged.
+        return;
+    }
+
+    try
+    {
+        // Streams do not throw unless asked to; without this the catch below
+        // is unreachable and a failed write would be logged as a success.
+        outfile.exceptions(std::ofstream::failbit | std::ofstream::badbit);
+        outfile.write(reinterpret_cast<const char*>(dataPtr.getData()), len);
+        outfile.flush(); // push buffered data out so late errors surface here
+
+        lg2::info("Successfully wrote dump file "
+                  "path=({PATH}) size=({SIZE})",
+                  "PATH", dumpPath.string(), "SIZE", len);
+    }
+    catch (const std::ofstream::failure& oe)
+    {
+        using metadata = xyz::openbmc_project::Common::File::Write;
+        lg2::error(
+            "Failed to write to dump file, "
+            "errorMsg({ERROR}), error({ERRORCODE}), filepath({FILEPATH})",
+            "ERROR", oe, "ERRORCODE", oe.code().value(), "FILEPATH",
+            dumpPath.string());
+        report<fileError::Write>(metadata::ERRNO(oe.code().value()),
+                                 metadata::PATH(dumpPath.c_str()));
+        // Returning normally lets dumps from other SBEs still be packaged.
+    }
+}
} // namespace openpower::dump::sbe_chipop
diff --git a/dump/sbe_dump_collector.hpp b/dump/sbe_dump_collector.hpp
index 000022c..93ecdd4 100644
--- a/dump/sbe_dump_collector.hpp
+++ b/dump/sbe_dump_collector.hpp
@@ -6,6 +6,9 @@
#include <libpdbg_sbe.h>
}
+#include "dump_utils.hpp"
+#include "sbe_consts.hpp"
+
#include <cstdint>
#include <filesystem>
#include <future>
@@ -114,6 +117,49 @@
uint8_t type, uint32_t id, const std::filesystem::path& path,
uint64_t failingUnit, uint8_t cstate,
const std::vector<struct pdbg_target*>& targets);
+
+ /** @brief Creates a new dump file, named in the dump file name
+ * format, and writes the given contents into it.
+ * @param path - Directory in which the dump file is created
+ * @param id - A unique id assigned to dump to be collected
+ * @param clockState - Clock state, ON or Off
+ * @param nodeNum - Node containing the chip
+ * @param chipName - Name of the chip
+ * @param chipPos - Position of the chip; follows chipName in the file name
+ * @param dataPtr - Content to write to file
+ * @param len - Length of the content in bytes
+ */
+ void writeDumpFile(const std::filesystem::path& path, const uint32_t id,
+ const uint8_t clockState, const uint8_t nodeNum,
+ std::string chipName, const uint8_t chipPos,
+ util::DumpDataPtr& dataPtr, const uint32_t len);
+
+ /**
+ * @brief Determines if fastarray collection is needed based on dump type
+ * and unit.
+ *
+ * @param clockState The current state of the clock.
+ * @param type The type of the dump being collected.
+ * @param failingUnit The ID of the failing unit.
+ * @param chipPos The position of the chip for which the dump is being
+ * collected.
+ *
+ * @return uint8_t - Returns 1 if fastarray collection is needed, 0
+ * otherwise.
+ */
+    inline uint8_t checkFastarrayCollectionNeeded(const uint8_t clockState,
+                                                  const uint8_t type,
+                                                  uint64_t failingUnit,
+                                                  const uint8_t chipPos) const
+    {
+        using namespace openpower::dump::SBE;
+        // A hardware dump qualifies only on the chip matching failingUnit.
+        const bool hwOnFailingChip = (type == SBE_DUMP_TYPE_HARDWARE) &&
+                                     (chipPos == failingUnit);
+        const bool typeQualifies = (type == SBE_DUMP_TYPE_HOSTBOOT) ||
+                                   hwOnFailingChip;
+        return (clockState == SBE_CLOCK_OFF && typeQualifies) ? 1 : 0;
+    }
};
} // namespace openpower::dump::sbe_chipop