Handle Exceptions and Uncorrectable Errors
We are seeing process crashes in the fleet, which we want to avoid.
Handle exceptions gracefully by reinitializing the buffer instead of
relying on systemd to restart the process after a crash.
Also, ensure that we read the Uncorrectable Error (UE) region first,
before reading the regular error logs.
The logic should be:
```
1. BIOS_switch ^ BMC_switch == 0 → reserved region has nothing to read
a. CONTINUE
2. BIOS_switch ^ BMC_switch == 1 → reserved region contains unread log
a. Read the Uncorrectable Error log
b. If corruption is detected (log is not parsable)
i. Go through corruption handling flow (reinit)
c. Else toggle the BMC Uncorrectable Error log flag
```
Tested: Added unit test
Signed-off-by: Brandon Kim <brandonkim@google.com>
Change-Id: I8212476be11ea7f13f68e42ff440c2bdd2fc8c2a
diff --git a/src/buffer.cpp b/src/buffer.cpp
index 20427cb..0254cac 100644
--- a/src/buffer.cpp
+++ b/src/buffer.cpp
@@ -6,6 +6,7 @@
#include <boost/endian/arithmetic.hpp>
#include <boost/endian/conversion.hpp>
+#include <stdplus/print.hpp>
#include <algorithm>
#include <array>
@@ -222,6 +223,52 @@
return *reinterpret_cast<struct QueueEntryHeader*>(bytesRead.data());
}
+std::vector<uint8_t> BufferImpl::readUeLogFromReservedRegion()
+{
+    // Returns the unread UE log, or an empty vector when there is none.
+    // Throws std::runtime_error if fewer bytes than expected are read.
+    // Refresh cachedBufferHeader first so the checks use current values.
+    readBufferHeader();
+
+    const uint16_t ueSize =
+        boost::endian::little_to_native(cachedBufferHeader.ueRegionSize);
+    if (ueSize == 0)
+    {
+        stdplus::print(stderr,
+                       "[readUeLogFromReservedRegion] UE Region size is 0\n");
+        return {};
+    }
+
+    // The log is unread only while the BIOS-side and BMC-side ueSwitch bits
+    // differ, i.e. the XOR of both flag words has that bit set.
+    const uint32_t biosWord =
+        boost::endian::little_to_native(cachedBufferHeader.biosFlags);
+    const uint32_t bmcWord =
+        boost::endian::little_to_native(cachedBufferHeader.bmcFlags);
+    const uint32_t ueSwitchBit = static_cast<uint32_t>(BufferFlags::ueSwitch);
+    if (((biosWord ^ bmcWord) & ueSwitchBit) == 0)
+    {
+        return {};
+    }
+
+    // The UE region starts right after the header (0x30) and spans ueSize
+    // bytes, as specified in the header.
+    std::vector<uint8_t> ueBytes =
+        dataInterface->read(sizeof(struct CircularBufferHeader), ueSize);
+    if (ueBytes.size() != ueSize)
+    {
+        stdplus::print(stderr,
+                       "[readUeLogFromReservedRegion] Failed to read "
+                       "full UE log. Expected {}, got {}\n",
+                       ueSize, ueBytes.size());
+        // A short read is reported by throwing so the main loop can re-init.
+        throw std::runtime_error(
+            std::format("Failed to read full UE log. Expected {}, got {}",
+                        ueSize, ueBytes.size()));
+    }
+    return ueBytes;
+}
+
EntryPair BufferImpl::readEntry()
{
struct QueueEntryHeader entryHeader = readEntryHeader();
diff --git a/src/main.cpp b/src/main.cpp
index 60fc8b5..4465185 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -46,20 +46,113 @@
stdplus::print(stderr, "Async wait failed {}\n", error.message());
return;
}
- std::vector<EntryPair> entryPairs = bufferInterface->readErrorLogs();
- for (const auto& [entryHeader, entry] : entryPairs)
+
+ try
{
- rde::RdeDecodeStatus rdeDecodeStatus =
- rdeCommandHandler->decodeRdeCommand(
- entry,
- static_cast<rde::RdeCommandType>(entryHeader.rdeCommandType));
- if (rdeDecodeStatus == rde::RdeDecodeStatus::RdeStopFlagReceived)
+ std::vector<uint8_t> ueLog =
+ bufferInterface->readUeLogFromReservedRegion();
+ if (!ueLog.empty())
{
+ stdplus::print(
+ stdout,
+ "UE log found in reserved region, attempting to process\n");
+
+ // UE log is BEJ encoded data, requiring RdeOperationInitRequest
+ rde::RdeDecodeStatus ueDecodeStatus =
+ rdeCommandHandler->decodeRdeCommand(
+ ueLog, rde::RdeCommandType::RdeOperationInitRequest);
+
+ if (ueDecodeStatus != rde::RdeDecodeStatus::RdeOk &&
+ ueDecodeStatus != rde::RdeDecodeStatus::RdeStopFlagReceived)
+ {
+ throw std::runtime_error(std::format(
+ "Corruption detected processing UE log from reserved region. RDE decode status: {}",
+ static_cast<int>(ueDecodeStatus)));
+ }
+ stdplus::print(stdout, "UE log processed successfully.\n");
+ // Successfully processed. Toggle BMC's view of ueSwitch flag.
auto bufferHeader = bufferInterface->getCachedBufferHeader();
- auto newbmcFlags =
- boost::endian::little_to_native(bufferHeader.bmcFlags) |
- static_cast<uint32_t>(BmcFlags::ready);
- bufferInterface->updateBmcFlags(newbmcFlags);
+ uint32_t bmcSideFlags =
+ boost::endian::little_to_native(bufferHeader.bmcFlags);
+ uint32_t newBmcFlags =
+ bmcSideFlags ^ static_cast<uint32_t>(BufferFlags::ueSwitch);
+ bufferInterface->updateBmcFlags(newBmcFlags);
+ }
+
+ std::vector<EntryPair> entryPairs = bufferInterface->readErrorLogs();
+ for (const auto& [entryHeader, entry] : entryPairs)
+ {
+ rde::RdeDecodeStatus rdeDecodeStatus =
+ rdeCommandHandler->decodeRdeCommand(
+ entry, static_cast<rde::RdeCommandType>(
+ entryHeader.rdeCommandType));
+ if (rdeDecodeStatus == rde::RdeDecodeStatus::RdeStopFlagReceived)
+ {
+ auto bufferHeader = bufferInterface->getCachedBufferHeader();
+ auto newbmcFlags =
+ boost::endian::little_to_native(bufferHeader.bmcFlags) |
+ static_cast<uint32_t>(BmcFlags::ready);
+ bufferInterface->updateBmcFlags(newbmcFlags);
+ }
+ }
+ }
+ catch (const std::exception& e)
+ {
+ stdplus::print(
+ stderr,
+ "Error during log processing (std::exception): {}. Attempting to reinitialize buffer.\n",
+ e.what());
+ try
+ {
+ bufferInterface->initialize(bmcInterfaceVersion, queueSize,
+ ueRegionSize, magicNumber);
+ stdplus::print(
+ stdout,
+ "Buffer reinitialized successfully after std::exception.\n");
+ }
+ catch (const std::exception& reinit_e)
+ {
+ stdplus::print(
+ stderr,
+ "CRITICAL: Failed to reinitialize buffer (std::exception): {}. Terminating read loop.\n",
+ reinit_e.what());
+ return;
+ }
+ catch (...)
+ {
+ stdplus::print(
+ stderr,
+ "CRITICAL: Failed to reinitialize buffer (unknown exception). Terminating read loop.\n");
+ return;
+ }
+ }
+ catch (...)
+ {
+ stdplus::print(
+ stderr,
+ "Unknown error during log processing. Attempting to reinitialize buffer.\n");
+ try
+ {
+ bufferInterface->initialize(bmcInterfaceVersion, queueSize,
+ ueRegionSize, magicNumber);
+ stdplus::print(
+ stdout,
+ "Buffer reinitialized successfully after unknown error.\n");
+ }
+ catch (const std::exception& reinit_e)
+ {
+ stdplus::print(
+ stderr,
+ "CRITICAL: Failed to reinitialize buffer (std::exception): {}. Terminating read loop.\n",
+ reinit_e.what());
+ return;
+ }
+ catch (...)
+ {
+ stdplus::print(
+ stderr,
+ "CRITICAL: Failed to reinitialize buffer (unknown exception). Terminating read loop.\n");
+ return;
}
}