// blob: e7431773a778309e2cac80db954011636fac0dc6 [file] [log] [blame]
/*
// Copyright (c) 2021 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
*/
#pragma once
#include <systemd/sd-journal.h>
#include <error_monitors/base_gpio_poll_monitor.hpp>
#include <host_error_monitor.hpp>
#include <sdbusplus/asio/object_server.hpp>
namespace host_error_monitor::ierr_monitor
{
// Compile-time debug switch for the IERR monitor. Nothing in this header
// reads it.
// NOTE(review): it is left enabled (true) here - confirm that is intended.
static constexpr bool debug = true;
class IERRMonitor :
public host_error_monitor::base_gpio_poll_monitor::BaseGPIOPollMonitor
{
const static host_error_monitor::base_gpio_poll_monitor::AssertValue
assertValue =
host_error_monitor::base_gpio_poll_monitor::AssertValue::lowAssert;
const static constexpr size_t ierrPollingTimeMs = 100;
const static constexpr size_t ierrTimeoutMs = 2000;
const static constexpr size_t ierrTimeoutMsMax =
600000; // 10 minutes maximum
const static constexpr uint8_t beepCPUIERR = 4;
std::shared_ptr<sdbusplus::asio::dbus_interface> associationIERR;
std::shared_ptr<sdbusplus::asio::dbus_interface> hostErrorTimeoutIface;
static const constexpr char* callbackMgrPath =
"/xyz/openbmc_project/CallbackManager";
void logEvent()
{
if (!checkIERRCPUs())
{
cpuIERRLog();
}
}
void cpuIERRLog()
{
sd_journal_send("MESSAGE=HostError: IERR", "PRIORITY=%i", LOG_INFO,
"REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError",
"REDFISH_MESSAGE_ARGS=%s", "IERR", NULL);
}
void cpuIERRLog(const int cpuNum)
{
std::string msg = "IERR on CPU " + std::to_string(cpuNum + 1);
sd_journal_send("MESSAGE=HostError: %s", msg.c_str(), "PRIORITY=%i",
LOG_INFO, "REDFISH_MESSAGE_ID=%s",
"OpenBMC.0.1.CPUError", "REDFISH_MESSAGE_ARGS=%s",
msg.c_str(), NULL);
}
void cpuIERRLog(const int cpuNum, const std::string& type)
{
std::string msg = type + " IERR on CPU " + std::to_string(cpuNum + 1);
sd_journal_send("MESSAGE=HostError: %s", msg.c_str(), "PRIORITY=%i",
LOG_INFO, "REDFISH_MESSAGE_ID=%s",
"OpenBMC.0.1.CPUError", "REDFISH_MESSAGE_ARGS=%s",
msg.c_str(), NULL);
}
bool checkIERRCPUs()
{
bool cpuIERRFound = false;
for (size_t cpu = 0, addr = MIN_CLIENT_ADDR; addr <= MAX_CLIENT_ADDR;
cpu++, addr++)
{
EPECIStatus peciStatus = PECI_CC_SUCCESS;
uint8_t cc = 0;
CPUModel model{};
uint8_t stepping = 0;
if (peci_GetCPUID(addr, &model, &stepping, &cc) != PECI_CC_SUCCESS)
{
std::cerr << "Cannot get CPUID!\n";
continue;
}
switch (model)
{
case skx:
{
// First check the MCA_ERR_SRC_LOG to see if this is the CPU
// that caused the IERR
uint32_t mcaErrSrcLog = 0;
peciStatus = peci_RdPkgConfig(addr, 0, 5, 4,
(uint8_t*)&mcaErrSrcLog, &cc);
if (peciError(peciStatus, cc))
{
printPECIError("MCA_ERR_SRC_LOG", addr, peciStatus, cc);
continue;
}
// Check MSMI_INTERNAL (20) and IERR_INTERNAL (27)
if ((mcaErrSrcLog & (1 << 20)) ||
(mcaErrSrcLog & (1 << 27)))
{
// TODO: Light the CPU fault LED?
cpuIERRFound = true;
incrementCPUErrorCount(cpu);
// Next check if it's a CPU/VR mismatch by reading the
// IA32_MC4_STATUS MSR (0x411)
uint64_t mc4Status = 0;
peciStatus =
peci_RdIAMSR(addr, 0, 0x411, &mc4Status, &cc);
if (peciError(peciStatus, cc))
{
printPECIError("IA32_MC4_STATUS", addr, peciStatus,
cc);
continue;
}
// Check MSEC bits 31:24 for
// MCA_SVID_VCCIN_VR_ICC_MAX_FAILURE (0x40),
// MCA_SVID_VCCIN_VR_VOUT_FAILURE (0x42), or
// MCA_SVID_CPU_VR_CAPABILITY_ERROR (0x43)
uint64_t msec = (mc4Status >> 24) & 0xFF;
if (msec == 0x40 || msec == 0x42 || msec == 0x43)
{
cpuIERRLog(cpu, "CPU/VR Mismatch");
continue;
}
// Next check if it's a Core FIVR fault by looking for a
// non-zero value of CORE_FIVR_ERR_LOG (B(1) D30 F2
// offset 80h)
uint32_t coreFIVRErrLog = 0;
peciStatus = peci_RdPCIConfigLocal(
addr, 1, 30, 2, 0x80, sizeof(uint32_t),
(uint8_t*)&coreFIVRErrLog, &cc);
if (peciError(peciStatus, cc))
{
printPECIError("CORE_FIVR_ERR_LOG", addr,
peciStatus, cc);
continue;
}
if (coreFIVRErrLog)
{
cpuIERRLog(cpu, "Core FIVR Fault");
continue;
}
// Next check if it's an Uncore FIVR fault by looking
// for a non-zero value of UNCORE_FIVR_ERR_LOG (B(1) D30
// F2 offset 84h)
uint32_t uncoreFIVRErrLog = 0;
peciStatus = peci_RdPCIConfigLocal(
addr, 1, 30, 2, 0x84, sizeof(uint32_t),
(uint8_t*)&uncoreFIVRErrLog, &cc);
if (peciError(peciStatus, cc))
{
printPECIError("UNCORE_FIVR_ERR_LOG", addr,
peciStatus, cc);
continue;
}
if (uncoreFIVRErrLog)
{
cpuIERRLog(cpu, "Uncore FIVR Fault");
continue;
}
// Last if CORE_FIVR_ERR_LOG and UNCORE_FIVR_ERR_LOG are
// both zero, but MSEC bits 31:24 have either
// MCA_FIVR_CATAS_OVERVOL_FAULT (0x51) or
// MCA_FIVR_CATAS_OVERCUR_FAULT (0x52), then log it as
// an uncore FIVR fault
if (!coreFIVRErrLog && !uncoreFIVRErrLog &&
(msec == 0x51 || msec == 0x52))
{
cpuIERRLog(cpu, "Uncore FIVR Fault");
continue;
}
cpuIERRLog(cpu);
}
break;
}
case icx:
{
// First check the MCA_ERR_SRC_LOG to see if this is the CPU
// that caused the IERR
uint32_t mcaErrSrcLog = 0;
peciStatus = peci_RdPkgConfig(addr, 0, 5, 4,
(uint8_t*)&mcaErrSrcLog, &cc);
if (peciError(peciStatus, cc))
{
printPECIError("MCA_ERR_SRC_LOG", addr, peciStatus, cc);
continue;
}
// Check MSMI_INTERNAL (20) and IERR_INTERNAL (27)
if ((mcaErrSrcLog & (1 << 20)) ||
(mcaErrSrcLog & (1 << 27)))
{
// TODO: Light the CPU fault LED?
cpuIERRFound = true;
incrementCPUErrorCount(cpu);
// Next check if it's a CPU/VR mismatch by reading the
// IA32_MC4_STATUS MSR (0x411)
uint64_t mc4Status = 0;
peciStatus =
peci_RdIAMSR(addr, 0, 0x411, &mc4Status, &cc);
if (peciError(peciStatus, cc))
{
printPECIError("IA32_MC4_STATUS", addr, peciStatus,
cc);
continue;
}
// Check MSEC bits 31:24 for
// MCA_SVID_VCCIN_VR_ICC_MAX_FAILURE (0x40),
// MCA_SVID_VCCIN_VR_VOUT_FAILURE (0x42), or
// MCA_SVID_CPU_VR_CAPABILITY_ERROR (0x43)
uint64_t msec = (mc4Status >> 24) & 0xFF;
if (msec == 0x40 || msec == 0x42 || msec == 0x43)
{
cpuIERRLog(cpu, "CPU/VR Mismatch");
continue;
}
// Next check if it's a Core FIVR fault by looking for a
// non-zero value of CORE_FIVR_ERR_LOG (B(31) D30 F2
// offsets C0h and C4h) (Note: Bus 31 is accessed on
// PECI as bus 14)
uint32_t coreFIVRErrLog0 = 0;
uint32_t coreFIVRErrLog1 = 0;
peciStatus = peci_RdEndPointConfigPciLocal(
addr, 0, 14, 30, 2, 0xC0, sizeof(uint32_t),
(uint8_t*)&coreFIVRErrLog0, &cc);
if (peciError(peciStatus, cc))
{
printPECIError("CORE_FIVR_ERR_LOG_0", addr,
peciStatus, cc);
continue;
}
peciStatus = peci_RdEndPointConfigPciLocal(
addr, 0, 14, 30, 2, 0xC4, sizeof(uint32_t),
(uint8_t*)&coreFIVRErrLog1, &cc);
if (peciError(peciStatus, cc))
{
printPECIError("CORE_FIVR_ERR_LOG_1", addr,
peciStatus, cc);
continue;
}
if (coreFIVRErrLog0 || coreFIVRErrLog1)
{
cpuIERRLog(cpu, "Core FIVR Fault");
continue;
}
// Next check if it's an Uncore FIVR fault by looking
// for a non-zero value of UNCORE_FIVR_ERR_LOG (B(31)
// D30 F2 offset 84h) (Note: Bus 31 is accessed on PECI
// as bus 14)
uint32_t uncoreFIVRErrLog = 0;
peciStatus = peci_RdEndPointConfigPciLocal(
addr, 0, 14, 30, 2, 0x84, sizeof(uint32_t),
(uint8_t*)&uncoreFIVRErrLog, &cc);
if (peciError(peciStatus, cc))
{
printPECIError("UNCORE_FIVR_ERR_LOG", addr,
peciStatus, cc);
continue;
}
if (uncoreFIVRErrLog)
{
cpuIERRLog(cpu, "Uncore FIVR Fault");
continue;
}
// TODO: Update MSEC/MSCOD_31_24 check
// Last if CORE_FIVR_ERR_LOG and UNCORE_FIVR_ERR_LOG are
// both zero, but MSEC bits 31:24 have either
// MCA_FIVR_CATAS_OVERVOL_FAULT (0x51) or
// MCA_FIVR_CATAS_OVERCUR_FAULT (0x52), then log it as
// an uncore FIVR fault
if (!coreFIVRErrLog0 && !coreFIVRErrLog1 &&
!uncoreFIVRErrLog && (msec == 0x51 || msec == 0x52))
{
cpuIERRLog(cpu, "Uncore FIVR Fault");
continue;
}
cpuIERRLog(cpu);
}
break;
}
}
}
return cpuIERRFound;
}
void incrementCPUErrorCount(int cpuNum)
{
std::string propertyName = "ErrorCountCPU" + std::to_string(cpuNum + 1);
// Get the current count
conn->async_method_call(
[this, propertyName](boost::system::error_code ec,
const std::variant<uint8_t>& property) {
if (ec)
{
std::cerr << "Failed to read " << propertyName << ": "
<< ec.message() << "\n";
return;
}
const uint8_t* errorCountVariant =
std::get_if<uint8_t>(&property);
if (errorCountVariant == nullptr)
{
std::cerr << propertyName << " invalid\n";
return;
}
uint8_t errorCount = *errorCountVariant;
if (errorCount == std::numeric_limits<uint8_t>::max())
{
std::cerr << "Maximum error count reached\n";
return;
}
// Increment the count
errorCount++;
conn->async_method_call(
[propertyName](boost::system::error_code ec) {
if (ec)
{
std::cerr << "Failed to set " << propertyName
<< ": " << ec.message() << "\n";
}
},
"xyz.openbmc_project.Settings",
"/xyz/openbmc_project/control/processor_error_config",
"org.freedesktop.DBus.Properties", "Set",
"xyz.openbmc_project.Control.Processor.ErrConfig",
propertyName, std::variant<uint8_t>{errorCount});
},
"xyz.openbmc_project.Settings",
"/xyz/openbmc_project/control/processor_error_config",
"org.freedesktop.DBus.Properties", "Get",
"xyz.openbmc_project.Control.Processor.ErrConfig", propertyName);
}
void assertHandler() override
{
host_error_monitor::base_gpio_poll_monitor::BaseGPIOPollMonitor::
assertHandler();
setLED();
beep(conn, beepCPUIERR);
conn->async_method_call(
[this](boost::system::error_code ec,
const std::variant<bool>& property) {
if (ec)
{
return;
}
const bool* reset = std::get_if<bool>(&property);
if (reset == nullptr)
{
std::cerr << "Unable to read reset on CATERR value\n";
return;
}
startCrashdumpAndRecovery(conn, *reset, "IERR");
},
"xyz.openbmc_project.Settings",
"/xyz/openbmc_project/control/processor_error_config",
"org.freedesktop.DBus.Properties", "Get",
"xyz.openbmc_project.Control.Processor.ErrConfig", "ResetOnCATERR");
}
void deassertHandler() override
{
unsetLED();
}
void setLED()
{
std::vector<Association> associations;
associations.emplace_back(
"", "critical", "/xyz/openbmc_project/host_error_monitor/ierr");
associations.emplace_back("", "critical", callbackMgrPath);
associationIERR->set_property("Associations", associations);
}
void unsetLED()
{
std::vector<Association> associations;
associations.emplace_back("", "", "");
associationIERR->set_property("Associations", associations);
}
public:
IERRMonitor(boost::asio::io_service& io,
std::shared_ptr<sdbusplus::asio::connection> conn,
const std::string& signalName) :
BaseGPIOPollMonitor(io, conn, signalName, assertValue,
ierrPollingTimeMs, ierrTimeoutMs)
{
// Associations interface for led status
std::vector<host_error_monitor::Association> associations;
associations.emplace_back("", "", "");
sdbusplus::asio::object_server server =
sdbusplus::asio::object_server(conn);
associationIERR =
server.add_interface("/xyz/openbmc_project/host_error_monitor/ierr",
"xyz.openbmc_project.Association.Definitions");
associationIERR->register_property("Associations", associations);
associationIERR->initialize();
hostErrorTimeoutIface = server.add_interface(
"/xyz/openbmc_project/host_error_monitor",
"xyz.openbmc_project.HostErrorMonitor.Timeout");
hostErrorTimeoutIface->register_property(
"IERRTimeoutMs", ierrTimeoutMs,
[this](const std::size_t& requested, std::size_t& resp) {
if (requested > ierrTimeoutMsMax)
{
std::cerr << "IERRTimeoutMs update to " << requested
<< "ms rejected. Cannot be greater than "
<< ierrTimeoutMsMax << "ms.\n";
return 0;
}
std::cerr << "IERRTimeoutMs updated to " << requested << "ms\n";
setTimeoutMs(requested);
resp = requested;
return 1;
},
[this](std::size_t& resp) { return getTimeoutMs(); });
hostErrorTimeoutIface->initialize();
if (valid)
{
startPolling();
}
}
};
} // namespace host_error_monitor::ierr_monitor