Add IERR logging to the CATERR/IERR handler
This change attempts to determine which CPU asserted the CATERR
signal and the type of CATERR to include in the log.
Tested:
Manually triggered an IERR and confirmed that the correct data
was logged.
Change-Id: I8e9ad26889c093392254ae1d70af3dde2c62a519
Signed-off-by: Jason M. Bills <jason.m.bills@intel.com>
diff --git a/src/host_error_monitor.cpp b/src/host_error_monitor.cpp
index b2c485c..2d5d900 100644
--- a/src/host_error_monitor.cpp
+++ b/src/host_error_monitor.cpp
@@ -51,6 +51,31 @@
static gpiod::line pchThermtripLine;
static boost::asio::posix::stream_descriptor pchThermtripEvent(io);
+// Records an IERR in the journal when it is not attributed to a particular
+// CPU. The entry carries the OpenBMC.0.1.CPUError Redfish message ID with
+// "IERR" as its single message argument.
+static void cpuIERRLog()
+{
+    constexpr const char* redfishMessageId = "OpenBMC.0.1.CPUError";
+    constexpr const char* redfishMessageArg = "IERR";
+
+    // The argument list of sd_journal_send is terminated by NULL.
+    sd_journal_send("MESSAGE=HostError: IERR", "PRIORITY=%i", LOG_INFO,
+                    "REDFISH_MESSAGE_ID=%s", redfishMessageId,
+                    "REDFISH_MESSAGE_ARGS=%s", redfishMessageArg, NULL);
+}
+
+// Records an IERR attributed to a single CPU.
+//
+// cpuNum: CPU index; the logged text uses cpuNum + 1, so index 0 is reported
+//         as "CPU 1".
+//
+// The same "IERR on CPU <n>" text is used for both the journal MESSAGE
+// (prefixed with "HostError: ") and the Redfish message argument.
+static void cpuIERRLog(const int cpuNum)
+{
+    std::string description("IERR on CPU ");
+    description += std::to_string(cpuNum + 1);
+    const char* text = description.c_str();
+
+    sd_journal_send("MESSAGE=HostError: %s", text, "PRIORITY=%i", LOG_INFO,
+                    "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError",
+                    "REDFISH_MESSAGE_ARGS=%s", text, NULL);
+}
+
+// Records an IERR attributed to a single CPU together with its cause.
+//
+// cpuNum: CPU index; the logged text uses cpuNum + 1, so index 0 is reported
+//         as "CPU 1".
+// type:   short description of the IERR cause, placed in front of
+//         " IERR on CPU <n>".
+//
+// The resulting text is used for both the journal MESSAGE (prefixed with
+// "HostError: ") and the Redfish message argument.
+static void cpuIERRLog(const int cpuNum, const std::string& type)
+{
+    std::string description(type);
+    description += " IERR on CPU ";
+    description += std::to_string(cpuNum + 1);
+    const char* text = description.c_str();
+
+    sd_journal_send("MESSAGE=HostError: %s", text, "PRIORITY=%i", LOG_INFO,
+                    "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError",
+                    "REDFISH_MESSAGE_ARGS=%s", text, NULL);
+}
+
static void cpuERR2Log()
{
sd_journal_send("MESSAGE=HostError: ERR2 Timeout", "PRIORITY=%i", LOG_INFO,
@@ -245,9 +270,208 @@
"com.intel.crashdump.Stored", "GenerateStoredLog");
}
+// Extracts the MSEC field (bits 31:24) of an IA32_MC4_STATUS value so that it
+// can be compared against individual error codes.
+static uint8_t getMC4StatusMSEC(const uint64_t mc4Status)
+{
+    return static_cast<uint8_t>((mc4Status >> 24) & 0xFF);
+}
+
+// Reads CORE_FIVR_ERR_LOG from the CPU at PECI address addr and sets fault to
+// true when the register content is non-zero.
+//
+// Returns false when the model is not handled or a PECI read fails; fault is
+// left unmodified in that case.
+static bool readCoreFIVRFault(const CPUModel model, const int addr,
+                              bool& fault)
+{
+    uint8_t cc = 0;
+    switch (model)
+    {
+        case skx:
+        {
+            // CORE_FIVR_ERR_LOG (B(1) D30 F2 offset 80h)
+            uint32_t coreFIVRErrLog = 0;
+            if (peci_RdPCIConfigLocal(
+                    addr, 1, 30, 2, 0x80, sizeof(uint32_t),
+                    reinterpret_cast<uint8_t*>(&coreFIVRErrLog),
+                    &cc) != PECI_CC_SUCCESS)
+            {
+                return false;
+            }
+            fault = (coreFIVRErrLog != 0);
+            return true;
+        }
+        case icx:
+        {
+            // CORE_FIVR_ERR_LOG (B(31) D30 F2 offsets C0h and C4h)
+            // (Note: Bus 31 is accessed on PECI as bus 14)
+            uint32_t coreFIVRErrLog0 = 0;
+            uint32_t coreFIVRErrLog1 = 0;
+            if (peci_RdEndPointConfigPciLocal(
+                    addr, 0, 14, 30, 2, 0xC0, sizeof(uint32_t),
+                    reinterpret_cast<uint8_t*>(&coreFIVRErrLog0),
+                    &cc) != PECI_CC_SUCCESS)
+            {
+                return false;
+            }
+            if (peci_RdEndPointConfigPciLocal(
+                    addr, 0, 14, 30, 2, 0xC4, sizeof(uint32_t),
+                    reinterpret_cast<uint8_t*>(&coreFIVRErrLog1),
+                    &cc) != PECI_CC_SUCCESS)
+            {
+                return false;
+            }
+            fault = (coreFIVRErrLog0 != 0) || (coreFIVRErrLog1 != 0);
+            return true;
+        }
+        default:
+            return false;
+    }
+}
+
+// Reads UNCORE_FIVR_ERR_LOG from the CPU at PECI address addr and sets fault
+// to true when the register content is non-zero.
+//
+// Returns false when the model is not handled or the PECI read fails; fault
+// is left unmodified in that case.
+static bool readUncoreFIVRFault(const CPUModel model, const int addr,
+                                bool& fault)
+{
+    uint8_t cc = 0;
+    uint32_t uncoreFIVRErrLog = 0;
+    switch (model)
+    {
+        case skx:
+            // UNCORE_FIVR_ERR_LOG (B(1) D30 F2 offset 84h)
+            if (peci_RdPCIConfigLocal(
+                    addr, 1, 30, 2, 0x84, sizeof(uint32_t),
+                    reinterpret_cast<uint8_t*>(&uncoreFIVRErrLog),
+                    &cc) != PECI_CC_SUCCESS)
+            {
+                return false;
+            }
+            break;
+        case icx:
+            // UNCORE_FIVR_ERR_LOG (B(31) D30 F2 offset 84h)
+            // (Note: Bus 31 is accessed on PECI as bus 14)
+            if (peci_RdEndPointConfigPciLocal(
+                    addr, 0, 14, 30, 2, 0x84, sizeof(uint32_t),
+                    reinterpret_cast<uint8_t*>(&uncoreFIVRErrLog),
+                    &cc) != PECI_CC_SUCCESS)
+            {
+                return false;
+            }
+            break;
+        default:
+            return false;
+    }
+    fault = (uncoreFIVRErrLog != 0);
+    return true;
+}
+
+// Classifies and logs the IERR of a CPU that has already been identified as
+// an IERR source. Exactly one log entry is written per call: a typed entry
+// when the cause can be determined, otherwise the plain per-CPU entry. The
+// plain entry is also used when a PECI read needed for classification fails,
+// so that an identified IERR is never dropped from the log.
+static void logIERRCause(const CPUModel model, const int cpu, const int addr)
+{
+    uint8_t cc = 0;
+
+    // Check if it's a CPU/VR mismatch by reading the IA32_MC4_STATUS MSR
+    // (0x411)
+    uint64_t mc4Status = 0;
+    if (peci_RdIAMSR(addr, 0, 0x411, &mc4Status, &cc) != PECI_CC_SUCCESS)
+    {
+        cpuIERRLog(cpu);
+        return;
+    }
+
+    // MSEC is an error code, not a bit mask, so the whole field must be
+    // compared. Testing individual bits would, for example, report 0x51 as a
+    // CPU/VR mismatch because it shares bit 6 with 0x40.
+    // TODO: Update MSEC/MSCOD_31_24 check for icx
+    const uint8_t msec = getMC4StatusMSEC(mc4Status);
+
+    // MCA_SVID_VCCIN_VR_ICC_MAX_FAILURE (0x40),
+    // MCA_SVID_VCCIN_VR_VOUT_FAILURE (0x42), or
+    // MCA_SVID_CPU_VR_CAPABILITY_ERROR (0x43)
+    if (msec == 0x40 || msec == 0x42 || msec == 0x43)
+    {
+        cpuIERRLog(cpu, "CPU/VR Mismatch");
+        return;
+    }
+
+    // Next check if it's a Core FIVR fault by looking for a non-zero value of
+    // CORE_FIVR_ERR_LOG
+    bool coreFIVRFault = false;
+    if (!readCoreFIVRFault(model, addr, coreFIVRFault))
+    {
+        cpuIERRLog(cpu);
+        return;
+    }
+    if (coreFIVRFault)
+    {
+        cpuIERRLog(cpu, "Core FIVR Fault");
+        return;
+    }
+
+    // Next check if it's an Uncore FIVR fault by looking for a non-zero value
+    // of UNCORE_FIVR_ERR_LOG
+    bool uncoreFIVRFault = false;
+    if (!readUncoreFIVRFault(model, addr, uncoreFIVRFault))
+    {
+        cpuIERRLog(cpu);
+        return;
+    }
+
+    // Last, if CORE_FIVR_ERR_LOG and UNCORE_FIVR_ERR_LOG are both zero, but
+    // MSEC has either MCA_FIVR_CATAS_OVERVOL_FAULT (0x51) or
+    // MCA_FIVR_CATAS_OVERCUR_FAULT (0x52), it is also logged as an uncore
+    // FIVR fault
+    if (uncoreFIVRFault || msec == 0x51 || msec == 0x52)
+    {
+        cpuIERRLog(cpu, "Uncore FIVR Fault");
+        return;
+    }
+
+    cpuIERRLog(cpu);
+}
+
+// Walks every PECI client address from MIN_CLIENT_ADDR to MAX_CLIENT_ADDR and
+// logs an IERR entry for each skx or icx CPU whose MCA_ERR_SRC_LOG reports an
+// internal MSMI or IERR.
+//
+// Returns true if at least one such CPU was found (and therefore logged),
+// false otherwise. CPUs whose CPUID or MCA_ERR_SRC_LOG cannot be read, and
+// CPU models other than skx and icx, are skipped.
+static bool checkIERRCPUs()
+{
+    // MSMI_INTERNAL (20) and IERR_INTERNAL (27) in MCA_ERR_SRC_LOG
+    constexpr uint32_t internalErrMask = (1u << 20) | (1u << 27);
+
+    bool cpuIERRFound = false;
+    for (int cpu = 0, addr = MIN_CLIENT_ADDR; addr <= MAX_CLIENT_ADDR;
+         cpu++, addr++)
+    {
+        uint8_t cc = 0;
+        CPUModel model{};
+        uint8_t stepping = 0;
+        if (peci_GetCPUID(addr, &model, &stepping, &cc) != PECI_CC_SUCCESS)
+        {
+            std::cerr << "Cannot get CPUID!\n";
+            continue;
+        }
+        if (model != skx && model != icx)
+        {
+            continue;
+        }
+
+        // First check the MCA_ERR_SRC_LOG to see if this is the CPU that
+        // caused the IERR
+        uint32_t mcaErrSrcLog = 0;
+        if (peci_RdPkgConfig(addr, 0, 5, 4,
+                             reinterpret_cast<uint8_t*>(&mcaErrSrcLog),
+                             &cc) != PECI_CC_SUCCESS)
+        {
+            continue;
+        }
+        if ((mcaErrSrcLog & internalErrMask) == 0)
+        {
+            continue;
+        }
+
+        // TODO: Light the CPU fault LED?
+        cpuIERRFound = true;
+        logIERRCause(model, cpu, addr);
+    }
+    return cpuIERRFound;
+}
+
+
static void caterrAssertHandler()
{
- std::cout << "CPU CATERR detected, starting timer\n";
caterrAssertTimer.expires_after(std::chrono::milliseconds(caterrTimeoutMs));
caterrAssertTimer.async_wait([](const boost::system::error_code ec) {
if (ec)
@@ -259,10 +483,14 @@
std::cerr << "caterr timeout async_wait failed: "
<< ec.message() << "\n";
}
- std::cout << "CATERR assert timer canceled\n";
return;
}
- std::cout << "CATERR asset timer completed\n";
+ std::cerr << "CATERR asserted for " << std::to_string(caterrTimeoutMs)
+ << " ms\n";
+ if (!checkIERRCPUs())
+ {
+ cpuIERRLog();
+ }
conn->async_method_call(
[](boost::system::error_code ec,
const std::variant<bool>& property) {