Move IERR monitor to the new architecture

Add an error monitor for CPU_CATERR that polls the signal and logs the
event if it stays asserted for the whole IERR timeout.  If the signal is
ever found deasserted, the monitor stops polling and waits for an
interrupt before polling again.

Change-Id: I43c03ece0f706e82aa352505869654b18adddd06
Signed-off-by: Jason M. Bills <jason.m.bills@intel.com>
diff --git a/include/error_monitors/ierr_monitor.hpp b/include/error_monitors/ierr_monitor.hpp
new file mode 100644
index 0000000..e743177
--- /dev/null
+++ b/include/error_monitors/ierr_monitor.hpp
@@ -0,0 +1,456 @@
+/*
+// Copyright (c) 2021 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+*/
+#pragma once
+#include <systemd/sd-journal.h>
+
+#include <error_monitors/base_gpio_poll_monitor.hpp>
+#include <host_error_monitor.hpp>
+#include <sdbusplus/asio/object_server.hpp>
+
+namespace host_error_monitor::ierr_monitor
+{
+static constexpr bool debug = false; // keep debug off in committed code
+
+class IERRMonitor :
+    public host_error_monitor::base_gpio_poll_monitor::BaseGPIOPollMonitor
+{
+    // Polarity, poll period and default timeout passed to the base monitor.
+    static constexpr host_error_monitor::base_gpio_poll_monitor::AssertValue
+        assertValue =
+            host_error_monitor::base_gpio_poll_monitor::AssertValue::lowAssert;
+    static constexpr size_t ierrPollingTimeMs = 100;
+    static constexpr size_t ierrTimeoutMs = 2000;
+    static constexpr size_t ierrTimeoutMsMax = 600000; // 10 minutes maximum
+
+    static constexpr uint8_t beepCPUIERR = 4;
+
+    std::shared_ptr<sdbusplus::asio::dbus_interface> associationIERR;
+    std::shared_ptr<sdbusplus::asio::dbus_interface> hostErrorTimeoutIface;
+
+    static constexpr const char* callbackMgrPath =
+        "/xyz/openbmc_project/CallbackManager";
+
+    // Overrides the base monitor's logEvent(). checkIERRCPUs() handles the CPUs
+    // it flags, so the generic entry is written only when it flagged none.
+    void logEvent() override
+    {
+        const bool cpuFlagged = checkIERRCPUs();
+        if (!cpuFlagged) { cpuIERRLog(); }
+    }
+
+    // Journals msg as a HostError entry with Redfish ID OpenBMC.0.1.CPUError.
+    void journalIERR(const std::string& msg)
+    {
+        sd_journal_send("MESSAGE=HostError: %s", msg.c_str(), "PRIORITY=%i",
+                        LOG_INFO, "REDFISH_MESSAGE_ID=%s",
+                        "OpenBMC.0.1.CPUError", "REDFISH_MESSAGE_ARGS=%s",
+                        msg.c_str(), nullptr); // nullptr: pointer-sized sentinel
+    }
+
+    // Generic entry, for when no specific CPU was identified.
+    void cpuIERRLog()
+    {
+        journalIERR("IERR");
+    }
+
+    // Entry naming the CPU; cpuNum is zero-based, the text is one-based.
+    void cpuIERRLog(const int cpuNum)
+    {
+        journalIERR("IERR on CPU " + std::to_string(cpuNum + 1));
+    }
+
+    // As above, with the fault classification in type prepended.
+    void cpuIERRLog(const int cpuNum, const std::string& type)
+    {
+        journalIERR(type + " IERR on CPU " + std::to_string(cpuNum + 1));
+    }
+
+    // Scans every PECI client address (MIN_CLIENT_ADDR..MAX_CLIENT_ADDR). For
+    // each skx/icx CPU whose MCA_ERR_SRC_LOG flags an internal error, bumps
+    // that CPU's error count and logs an IERR entry classified as CPU/VR
+    // mismatch, core FIVR, uncore FIVR, or unclassified. Returns true if any
+    // CPU was flagged. NOTE(review): a PECI read failure after a CPU is flagged
+    // skips its log entry yet still returns true - confirm this is intended.
+    bool checkIERRCPUs()
+    {
+        bool cpuIERRFound = false;
+        for (size_t cpu = 0, addr = MIN_CLIENT_ADDR; addr <= MAX_CLIENT_ADDR;
+             cpu++, addr++)
+        {
+            EPECIStatus peciStatus = PECI_CC_SUCCESS;
+            uint8_t cc = 0;
+            CPUModel model{};
+            uint8_t stepping = 0;
+            if (peci_GetCPUID(addr, &model, &stepping, &cc) != PECI_CC_SUCCESS)
+            {
+                std::cerr << "Cannot get CPUID!\n";
+                continue;
+            }
+
+            switch (model)
+            {
+                case skx:
+                {
+                    // MCA_ERR_SRC_LOG tells whether this CPU caused the IERR
+                    uint32_t mcaErrSrcLog = 0;
+                    peciStatus = peci_RdPkgConfig(addr, 0, 5, 4,
+                                                  (uint8_t*)&mcaErrSrcLog, &cc);
+                    if (peciError(peciStatus, cc))
+                    {
+                        printPECIError("MCA_ERR_SRC_LOG", addr, peciStatus, cc);
+                        continue;
+                    }
+                    // Check MSMI_INTERNAL (20) and IERR_INTERNAL (27)
+                    if ((mcaErrSrcLog & (1 << 20)) ||
+                        (mcaErrSrcLog & (1 << 27)))
+                    {
+                        // TODO: Light the CPU fault LED?
+                        cpuIERRFound = true;
+                        incrementCPUErrorCount(cpu);
+                        // CPU/VR mismatch: read IA32_MC4_STATUS MSR (0x411)
+                        uint64_t mc4Status = 0;
+                        peciStatus =
+                            peci_RdIAMSR(addr, 0, 0x411, &mc4Status, &cc);
+                        if (peciError(peciStatus, cc))
+                        {
+                            printPECIError("IA32_MC4_STATUS", addr, peciStatus,
+                                           cc);
+                            continue;
+                        }
+                        // Check MSEC bits 31:24 for
+                        // MCA_SVID_VCCIN_VR_ICC_MAX_FAILURE (0x40),
+                        // MCA_SVID_VCCIN_VR_VOUT_FAILURE (0x42), or
+                        // MCA_SVID_CPU_VR_CAPABILITY_ERROR (0x43)
+                        uint64_t msec = (mc4Status >> 24) & 0xFF;
+                        if (msec == 0x40 || msec == 0x42 || msec == 0x43)
+                        {
+                            cpuIERRLog(cpu, "CPU/VR Mismatch");
+                            continue;
+                        }
+
+                        // Next check if it's a Core FIVR fault by looking for a
+                        // non-zero value of CORE_FIVR_ERR_LOG (B(1) D30 F2
+                        // offset 80h)
+                        uint32_t coreFIVRErrLog = 0;
+                        peciStatus = peci_RdPCIConfigLocal(
+                            addr, 1, 30, 2, 0x80, sizeof(uint32_t),
+                            (uint8_t*)&coreFIVRErrLog, &cc);
+                        if (peciError(peciStatus, cc))
+                        {
+                            printPECIError("CORE_FIVR_ERR_LOG", addr,
+                                           peciStatus, cc);
+                            continue;
+                        }
+                        if (coreFIVRErrLog)
+                        {
+                            cpuIERRLog(cpu, "Core FIVR Fault");
+                            continue;
+                        }
+
+                        // Next check if it's an Uncore FIVR fault by looking
+                        // for a non-zero value of UNCORE_FIVR_ERR_LOG (B(1) D30
+                        // F2 offset 84h)
+                        uint32_t uncoreFIVRErrLog = 0;
+                        peciStatus = peci_RdPCIConfigLocal(
+                            addr, 1, 30, 2, 0x84, sizeof(uint32_t),
+                            (uint8_t*)&uncoreFIVRErrLog, &cc);
+                        if (peciError(peciStatus, cc))
+                        {
+                            printPECIError("UNCORE_FIVR_ERR_LOG", addr,
+                                           peciStatus, cc);
+                            continue;
+                        }
+                        if (uncoreFIVRErrLog)
+                        {
+                            cpuIERRLog(cpu, "Uncore FIVR Fault");
+                            continue;
+                        }
+
+                        // Last: both FIVR error logs read zero here. If MSEC
+                        // bits 31:24 hold MCA_FIVR_CATAS_OVERVOL_FAULT (0x51)
+                        // or MCA_FIVR_CATAS_OVERCUR_FAULT (0x52), log it as an
+                        // uncore FIVR fault; otherwise log a plain IERR.
+                        if (!coreFIVRErrLog && !uncoreFIVRErrLog &&
+                            (msec == 0x51 || msec == 0x52))
+                        {
+                            cpuIERRLog(cpu, "Uncore FIVR Fault");
+                            continue;
+                        }
+                        cpuIERRLog(cpu);
+                    }
+                    break;
+                }
+                case icx:
+                {
+                    // MCA_ERR_SRC_LOG tells whether this CPU caused the IERR
+                    uint32_t mcaErrSrcLog = 0;
+                    peciStatus = peci_RdPkgConfig(addr, 0, 5, 4,
+                                                  (uint8_t*)&mcaErrSrcLog, &cc);
+                    if (peciError(peciStatus, cc))
+                    {
+                        printPECIError("MCA_ERR_SRC_LOG", addr, peciStatus, cc);
+                        continue;
+                    }
+                    // Check MSMI_INTERNAL (20) and IERR_INTERNAL (27)
+                    if ((mcaErrSrcLog & (1 << 20)) ||
+                        (mcaErrSrcLog & (1 << 27)))
+                    {
+                        // TODO: Light the CPU fault LED?
+                        cpuIERRFound = true;
+                        incrementCPUErrorCount(cpu);
+                        // CPU/VR mismatch: read IA32_MC4_STATUS MSR (0x411)
+                        uint64_t mc4Status = 0;
+                        peciStatus =
+                            peci_RdIAMSR(addr, 0, 0x411, &mc4Status, &cc);
+                        if (peciError(peciStatus, cc))
+                        {
+                            printPECIError("IA32_MC4_STATUS", addr, peciStatus,
+                                           cc);
+                            continue;
+                        }
+                        // Check MSEC bits 31:24 for
+                        // MCA_SVID_VCCIN_VR_ICC_MAX_FAILURE (0x40),
+                        // MCA_SVID_VCCIN_VR_VOUT_FAILURE (0x42), or
+                        // MCA_SVID_CPU_VR_CAPABILITY_ERROR (0x43)
+                        uint64_t msec = (mc4Status >> 24) & 0xFF;
+                        if (msec == 0x40 || msec == 0x42 || msec == 0x43)
+                        {
+                            cpuIERRLog(cpu, "CPU/VR Mismatch");
+                            continue;
+                        }
+
+                        // Next check if it's a Core FIVR fault by looking for a
+                        // non-zero value of CORE_FIVR_ERR_LOG (B(31) D30 F2
+                        // offsets C0h and C4h) (Note: Bus 31 is accessed on
+                        // PECI as bus 14)
+                        uint32_t coreFIVRErrLog0 = 0;
+                        uint32_t coreFIVRErrLog1 = 0;
+                        peciStatus = peci_RdEndPointConfigPciLocal(
+                            addr, 0, 14, 30, 2, 0xC0, sizeof(uint32_t),
+                            (uint8_t*)&coreFIVRErrLog0, &cc);
+                        if (peciError(peciStatus, cc))
+                        {
+                            printPECIError("CORE_FIVR_ERR_LOG_0", addr,
+                                           peciStatus, cc);
+                            continue;
+                        }
+                        peciStatus = peci_RdEndPointConfigPciLocal(
+                            addr, 0, 14, 30, 2, 0xC4, sizeof(uint32_t),
+                            (uint8_t*)&coreFIVRErrLog1, &cc);
+                        if (peciError(peciStatus, cc))
+                        {
+                            printPECIError("CORE_FIVR_ERR_LOG_1", addr,
+                                           peciStatus, cc);
+                            continue;
+                        }
+                        if (coreFIVRErrLog0 || coreFIVRErrLog1)
+                        {
+                            cpuIERRLog(cpu, "Core FIVR Fault");
+                            continue;
+                        }
+
+                        // Next check if it's an Uncore FIVR fault by looking
+                        // for a non-zero value of UNCORE_FIVR_ERR_LOG (B(31)
+                        // D30 F2 offset 84h) (Note: Bus 31 is accessed on PECI
+                        // as bus 14)
+                        uint32_t uncoreFIVRErrLog = 0;
+                        peciStatus = peci_RdEndPointConfigPciLocal(
+                            addr, 0, 14, 30, 2, 0x84, sizeof(uint32_t),
+                            (uint8_t*)&uncoreFIVRErrLog, &cc);
+                        if (peciError(peciStatus, cc))
+                        {
+                            printPECIError("UNCORE_FIVR_ERR_LOG", addr,
+                                           peciStatus, cc);
+                            continue;
+                        }
+                        if (uncoreFIVRErrLog)
+                        {
+                            cpuIERRLog(cpu, "Uncore FIVR Fault");
+                            continue;
+                        }
+
+                        // TODO: Update MSEC/MSCOD_31_24 check
+                        // Last: all FIVR error logs read zero here. If MSEC
+                        // bits 31:24 hold MCA_FIVR_CATAS_OVERVOL_FAULT (0x51)
+                        // or MCA_FIVR_CATAS_OVERCUR_FAULT (0x52), log it as an
+                        // uncore FIVR fault; otherwise log a plain IERR.
+                        if (!coreFIVRErrLog0 && !coreFIVRErrLog1 &&
+                            !uncoreFIVRErrLog && (msec == 0x51 || msec == 0x52))
+                        {
+                            cpuIERRLog(cpu, "Uncore FIVR Fault");
+                            continue;
+                        }
+                        cpuIERRLog(cpu);
+                    }
+                    break;
+                }
+            }
+        }
+        return cpuIERRFound;
+    }
+
+    // Asynchronously increments the ErrorCountCPU<cpuNum + 1> property held by
+    // the Settings service: a Get of the current value followed by a Set.
+    void incrementCPUErrorCount(int cpuNum)
+    {
+        std::string propertyName = "ErrorCountCPU" + std::to_string(cpuNum + 1);
+
+        conn->async_method_call(
+            [this, propertyName](boost::system::error_code getErr,
+                                 const std::variant<uint8_t>& property) {
+                if (getErr)
+                {
+                    std::cerr << "Failed to read " << propertyName << ": "
+                              << getErr.message() << "\n";
+                    return;
+                }
+                const uint8_t* current = std::get_if<uint8_t>(&property);
+                if (!current)
+                {
+                    std::cerr << propertyName << " invalid\n";
+                    return;
+                }
+                // Stop at the uint8_t maximum instead of wrapping around.
+                if (*current == std::numeric_limits<uint8_t>::max())
+                {
+                    std::cerr << "Maximum error count reached\n";
+                    return;
+                }
+                const auto incremented = static_cast<uint8_t>(*current + 1);
+                conn->async_method_call(
+                    [propertyName](boost::system::error_code setErr) {
+                        if (!setErr)
+                        {
+                            return;
+                        }
+                        std::cerr << "Failed to set " << propertyName << ": "
+                                  << setErr.message() << "\n";
+                    },
+                    "xyz.openbmc_project.Settings",
+                    "/xyz/openbmc_project/control/processor_error_config",
+                    "org.freedesktop.DBus.Properties", "Set",
+                    "xyz.openbmc_project.Control.Processor.ErrConfig",
+                    propertyName, std::variant<uint8_t>{incremented});
+            },
+            "xyz.openbmc_project.Settings",
+            "/xyz/openbmc_project/control/processor_error_config",
+            "org.freedesktop.DBus.Properties", "Get",
+            "xyz.openbmc_project.Control.Processor.ErrConfig", propertyName);
+    }
+
+    // Base assert handling, LED + beep, then crashdump per ResetOnCATERR.
+    void assertHandler() override
+    {
+        host_error_monitor::base_gpio_poll_monitor::BaseGPIOPollMonitor::
+            assertHandler();
+        setLED();
+        beep(conn, beepCPUIERR);
+        conn->async_method_call(
+            [this](boost::system::error_code ec,
+                   const std::variant<bool>& property) {
+                if (ec) // recovery is skipped, so at least report why
+                {
+                    std::cerr << "Failed to read ResetOnCATERR: "
+                              << ec.message() << "\n";
+                    return;
+                }
+                const bool* reset = std::get_if<bool>(&property);
+                if (reset == nullptr)
+                {
+                    std::cerr << "Unable to read reset on CATERR value\n";
+                    return;
+                }
+                startCrashdumpAndRecovery(conn, *reset, "IERR");
+            },
+            "xyz.openbmc_project.Settings",
+            "/xyz/openbmc_project/control/processor_error_config",
+            "org.freedesktop.DBus.Properties", "Get",
+            "xyz.openbmc_project.Control.Processor.ErrConfig", "ResetOnCATERR");
+    }
+
+    void deassertHandler() override
+    {
+        unsetLED(); // clear the associations that setLED() published on assert
+    }
+
+    // Publishes "critical" associations to this monitor's own object path and
+    // to the callback manager through the Associations property.
+    void setLED()
+    {
+        std::vector<Association> critical;
+        critical.emplace_back(
+            "", "critical", "/xyz/openbmc_project/host_error_monitor/ierr");
+        critical.emplace_back("", "critical", callbackMgrPath);
+        associationIERR->set_property("Associations", critical);
+    }
+
+    // Replaces the Associations property with a single empty association.
+    void unsetLED()
+    {
+        std::vector<Association> cleared;
+        cleared.emplace_back("", "", "");
+
+        associationIERR->set_property("Associations", cleared);
+    }
+
+  public:
+    // Exposes the D-Bus properties below, then starts polling if valid is set.
+    IERRMonitor(boost::asio::io_service& io,
+                std::shared_ptr<sdbusplus::asio::connection> conn,
+                const std::string& signalName) :
+        BaseGPIOPollMonitor(io, conn, signalName, assertValue,
+                            ierrPollingTimeMs, ierrTimeoutMs)
+    {
+        sdbusplus::asio::object_server server(conn);
+        // Associations interface for led status, starting with an empty entry
+        std::vector<host_error_monitor::Association> noAssociations;
+        noAssociations.emplace_back("", "", "");
+        associationIERR =
+            server.add_interface("/xyz/openbmc_project/host_error_monitor/ierr",
+                                 "xyz.openbmc_project.Association.Definitions");
+        associationIERR->register_property("Associations", noAssociations);
+        associationIERR->initialize();
+
+        // Writable timeout; values above ierrTimeoutMsMax are rejected
+        hostErrorTimeoutIface = server.add_interface(
+            "/xyz/openbmc_project/host_error_monitor",
+            "xyz.openbmc_project.HostErrorMonitor.Timeout");
+        hostErrorTimeoutIface->register_property(
+            "IERRTimeoutMs", ierrTimeoutMs,
+            [this](const std::size_t& requested, std::size_t& resp) {
+                if (requested <= ierrTimeoutMsMax)
+                {
+                    std::cerr << "IERRTimeoutMs updated to " << requested
+                              << "ms\n";
+                    setTimeoutMs(requested);
+                    resp = requested;
+                    return 1;
+                }
+                std::cerr << "IERRTimeoutMs update to " << requested
+                          << "ms rejected. Cannot be greater than "
+                          << ierrTimeoutMsMax << "ms.\n";
+                return 0;
+            },
+            [this](std::size_t& resp) { return getTimeoutMs(); });
+        hostErrorTimeoutIface->initialize();
+
+        if (valid)
+        {
+            startPolling();
+        }
+    }
+};
+} // namespace host_error_monitor::ierr_monitor