Handle Exceptions and Uncorrectable Errors
We are seeing process crashes in the fleet, which we want to avoid.
Handle exceptions gracefully by reinitializing the buffer in-process
instead of relying on systemd to recover (crash and restart).
Also, ensure that the Uncorrectable Error (UE) region is read first,
before the regular error logs are processed.
The logic should be:
```
1. BIOS_switch ^ BMC_switch == 0 → reserved region has nothing to read
   a. CONTINUE
2. BIOS_switch ^ BMC_switch == 1 → reserved region contains unread log
   a. Read the Uncorrectable Error log
   b. If corruption is detected (log is not parsable)
      i. Go through corruption handling flow (reinit)
   c. Else toggle the BMC Uncorrectable Error log flag
```
Tested: Added unit test
Signed-off-by: Brandon Kim <brandonkim@google.com>
Change-Id: I8212476be11ea7f13f68e42ff440c2bdd2fc8c2a
diff --git a/src/main.cpp b/src/main.cpp
index 60fc8b5..4465185 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -46,20 +46,113 @@
stdplus::print(stderr, "Async wait failed {}\n", error.message());
return;
}
- std::vector<EntryPair> entryPairs = bufferInterface->readErrorLogs();
- for (const auto& [entryHeader, entry] : entryPairs)
+
+ try
{
- rde::RdeDecodeStatus rdeDecodeStatus =
- rdeCommandHandler->decodeRdeCommand(
- entry,
- static_cast<rde::RdeCommandType>(entryHeader.rdeCommandType));
- if (rdeDecodeStatus == rde::RdeDecodeStatus::RdeStopFlagReceived)
+ std::vector<uint8_t> ueLog =
+ bufferInterface->readUeLogFromReservedRegion();
+ if (!ueLog.empty())
{
+ stdplus::print(
+ stdout,
+ "UE log found in reserved region, attempting to process\n");
+
+ // UE log is BEJ encoded data, requiring RdeOperationInitRequest
+ rde::RdeDecodeStatus ueDecodeStatus =
+ rdeCommandHandler->decodeRdeCommand(
+ ueLog, rde::RdeCommandType::RdeOperationInitRequest);
+
+ if (ueDecodeStatus != rde::RdeDecodeStatus::RdeOk &&
+ ueDecodeStatus != rde::RdeDecodeStatus::RdeStopFlagReceived)
+ {
+ throw std::runtime_error(std::format(
+ "Corruption detected processing UE log from reserved region. RDE decode status: {}",
+ static_cast<int>(ueDecodeStatus)));
+ }
+ stdplus::print(stdout, "UE log processed successfully.\n");
+ // Successfully processed. Toggle BMC's view of ueSwitch flag.
auto bufferHeader = bufferInterface->getCachedBufferHeader();
- auto newbmcFlags =
- boost::endian::little_to_native(bufferHeader.bmcFlags) |
- static_cast<uint32_t>(BmcFlags::ready);
- bufferInterface->updateBmcFlags(newbmcFlags);
+ uint32_t bmcSideFlags =
+ boost::endian::little_to_native(bufferHeader.bmcFlags);
+ uint32_t newBmcFlags =
+ bmcSideFlags ^ static_cast<uint32_t>(BufferFlags::ueSwitch);
+ bufferInterface->updateBmcFlags(newBmcFlags);
+ }
+
+ std::vector<EntryPair> entryPairs = bufferInterface->readErrorLogs();
+ for (const auto& [entryHeader, entry] : entryPairs)
+ {
+ rde::RdeDecodeStatus rdeDecodeStatus =
+ rdeCommandHandler->decodeRdeCommand(
+ entry, static_cast<rde::RdeCommandType>(
+ entryHeader.rdeCommandType));
+ if (rdeDecodeStatus == rde::RdeDecodeStatus::RdeStopFlagReceived)
+ {
+ auto bufferHeader = bufferInterface->getCachedBufferHeader();
+ auto newbmcFlags =
+ boost::endian::little_to_native(bufferHeader.bmcFlags) |
+ static_cast<uint32_t>(BmcFlags::ready);
+ bufferInterface->updateBmcFlags(newbmcFlags);
+ }
+ }
+ }
+ catch (const std::exception& e)
+ {
+ stdplus::print(
+ stderr,
+ "Error during log processing (std::exception): {}. Attempting to reinitialize buffer.\n",
+ e.what());
+ try
+ {
+ bufferInterface->initialize(bmcInterfaceVersion, queueSize,
+ ueRegionSize, magicNumber);
+ stdplus::print(
+ stdout,
+ "Buffer reinitialized successfully after std::exception.\n");
+ }
+ catch (const std::exception& reinit_e)
+ {
+ stdplus::print(
+ stderr,
+ "CRITICAL: Failed to reinitialize buffer (std::exception): {}. Terminating read loop.\n",
+ reinit_e.what());
+ return;
+ }
+ catch (...)
+ {
+ stdplus::print(
+ stderr,
+ "CRITICAL: Failed to reinitialize buffer (unknown exception). Terminating read loop.\n");
+ return;
+ }
+ }
+ catch (...)
+ {
+ stdplus::print(
+ stderr,
+ "Unknown error during log processing. Attempting to reinitialize buffer.\n");
+ try
+ {
+ bufferInterface->initialize(bmcInterfaceVersion, queueSize,
+ ueRegionSize, magicNumber);
+ stdplus::print(
+ stdout,
+ "Buffer reinitialized successfully after unknown error.\n");
+ }
+ catch (const std::exception& reinit_e)
+ {
+ stdplus::print(
+ stderr,
+ "CRITICAL: Failed to reinitialize buffer (std::exception): {}. Terminating read loop.\n",
+ reinit_e.what());
+ return;
+ }
+ catch (...)
+ {
+ stdplus::print(
+ stderr,
+ "CRITICAL: Failed to reinitialize buffer (unknown exception). Terminating read loop.\n");
+ return;
}
}