Add ERR2 timeout monitoring and logging

This adds ERR2 timeout monitoring to the host error monitor.  When
the ERR2 signal stays asserted for more than 90 seconds, the BMC logs
which CPU asserted ERR2 (or a generic ERR2 timeout if no CPU can be
identified), triggers a crashdump, and resets the system if the
ResetOnERR2 setting is enabled.

Tested:
Manually triggered an ERR2 timeout and confirmed that the event is
handled and logged correctly.

Change-Id: I779cd02c649603f41fba6a93c1187b5be008af4f
Signed-off-by: Jason M. Bills <jason.m.bills@intel.com>
diff --git a/CMakeLists.txt b/CMakeLists.txt
index ce7dbc1..43f9510 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -7,7 +7,7 @@
 
 target_include_directories (host-error-monitor PRIVATE ${CMAKE_SOURCE_DIR})
 
-target_link_libraries (host-error-monitor sdbusplus -lsystemd gpiodcxx)
+target_link_libraries (host-error-monitor sdbusplus -lsystemd gpiodcxx peci)
 
 include_directories (${CMAKE_CURRENT_SOURCE_DIR}/include)
 
diff --git a/src/host_error_monitor.cpp b/src/host_error_monitor.cpp
index 3d1b7e7..b2c485c 100644
--- a/src/host_error_monitor.cpp
+++ b/src/host_error_monitor.cpp
@@ -13,8 +13,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 */
+#include <peci.h>
 #include <systemd/sd-journal.h>
 
+#include <bitset>
 #include <boost/asio/posix/stream_descriptor.hpp>
 #include <gpiod.hpp>
 #include <iostream>
@@ -28,15 +30,20 @@
 static bool hostOff = true;
 
 const static constexpr size_t caterrTimeoutMs = 2000;
+const static constexpr size_t err2TimeoutMs = 90000; // 90 s ERR2 assert limit
 const static constexpr size_t crashdumpTimeoutS = 300;
 
 // Timers
 // Timer for CATERR asserted
 static boost::asio::steady_timer caterrAssertTimer(io);
+// Timer for ERR2 asserted
+static boost::asio::steady_timer err2AssertTimer(io);
 
 // GPIO Lines and Event Descriptors
 static gpiod::line caterrLine;
 static boost::asio::posix::stream_descriptor caterrEvent(io);
+static gpiod::line err2Line;
+static boost::asio::posix::stream_descriptor err2Event(io);
 //----------------------------------
 // PCH_BMC_THERMTRIP function related definition
 //----------------------------------
@@ -44,6 +51,22 @@
 static gpiod::line pchThermtripLine;
 static boost::asio::posix::stream_descriptor pchThermtripEvent(io);
 
+static void cpuERR2Log() // ERR2 timeout when no specific CPU was identified
+{
+    sd_journal_send("MESSAGE=HostError: ERR2 Timeout", "PRIORITY=%i", LOG_INFO,
+                    "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError",
+                    "REDFISH_MESSAGE_ARGS=%s", "ERR2 Timeout", NULL);
+}
+
+// Log an ERR2 timeout for one CPU; cpuNum is zero-based, logged as cpuNum + 1
+static void cpuERR2Log(const int cpuNum)
+{
+    std::string text = "ERR2 Timeout on CPU " + std::to_string(cpuNum + 1);
+    sd_journal_send("MESSAGE=HostError: %s", text.c_str(), "PRIORITY=%i",
+                    LOG_INFO, "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError",
+                    "REDFISH_MESSAGE_ARGS=%s", text.c_str(), NULL);
+}
+
 static void initializeErrorState();
 static void initializeHostState()
 {
@@ -102,6 +125,7 @@
             if (hostOff)
             {
                 caterrAssertTimer.cancel();
+                err2AssertTimer.cancel();
             }
         });
 }
@@ -319,6 +343,143 @@
         });
 }
 
+// Ask every PECI client address which CPUs currently report ERR2 in their
+// ERRPINSTS register; bit N of the result is set for the Nth client address.
+static std::bitset<MAX_CPUS> checkERR2CPUs()
+{
+    std::bitset<MAX_CPUS> asserted;
+    int cpu = 0;
+    for (int addr = MIN_CLIENT_ADDR; addr <= MAX_CLIENT_ADDR; addr++, cpu++)
+    {
+        // Addresses that do not answer a ping are skipped
+        if (peci_Ping(addr) != PECI_CC_SUCCESS)
+        {
+            continue;
+        }
+
+        uint8_t cc = 0;
+        CPUModel model{};
+        uint8_t stepping = 0;
+        if (peci_GetCPUID(addr, &model, &stepping, &cc) != PECI_CC_SUCCESS)
+        {
+            std::cerr << "Cannot get CPUID!\n";
+            continue;
+        }
+
+        // ERRPINSTS location differs per CPU model; other models are ignored
+        uint32_t errpinsts = 0;
+        auto* buf = reinterpret_cast<uint8_t*>(&errpinsts);
+        bool readOk = false;
+        switch (model)
+        {
+            case skx:
+                // B(0) D8 F0 offset 210h
+                readOk = peci_RdPCIConfigLocal(addr, 0, 8, 0, 0x210,
+                                               sizeof(errpinsts), buf,
+                                               &cc) == PECI_CC_SUCCESS;
+                break;
+            case icx:
+                // B(30) D0 F3 offset 274h (Note: Bus 30 is accessed on PECI
+                // as bus 13)
+                readOk = peci_RdEndPointConfigPciLocal(
+                             addr, 0, 13, 0, 3, 0x274, sizeof(errpinsts), buf,
+                             &cc) == PECI_CC_SUCCESS;
+                break;
+        }
+
+        if (readOk)
+        {
+            asserted[cpu] = (errpinsts & err2Sts) != 0;
+        }
+    }
+    return asserted;
+}
+
+// Snapshot which CPUs assert ERR2 and arm the ERR2 timer; on expiry, log the
+// timeout and pass the ResetOnERR2 setting to startCrashdumpAndRecovery.
+static void err2AssertHandler()
+{
+    // ERR2 status is not guaranteed through the timeout, so save which
+    // CPUs have asserted ERR2 now
+    std::bitset<MAX_CPUS> err2CPUs = checkERR2CPUs();
+    err2AssertTimer.expires_after(std::chrono::milliseconds(err2TimeoutMs));
+    err2AssertTimer.async_wait([err2CPUs](const boost::system::error_code ec) {
+        if (ec)
+        {
+            // operation_aborted is expected if timer is canceled before
+            // completion.
+            if (ec != boost::asio::error::operation_aborted)
+            {
+                std::cerr << "err2 timeout async_wait failed: " << ec.message()
+                          << "\n";
+            }
+            return;
+        }
+        std::cerr << "ERR2 asserted for " << err2TimeoutMs << " ms\n";
+        if (err2CPUs.none())
+        {
+            cpuERR2Log();
+        }
+        for (size_t i = 0; i < err2CPUs.size(); i++)
+        {
+            if (err2CPUs[i])
+            {
+                cpuERR2Log(static_cast<int>(i));
+            }
+        }
+        conn->async_method_call(
+            [](boost::system::error_code ec,
+               const std::variant<bool>& property) {
+                if (ec)
+                {
+                    std::cerr << "Unable to get ResetOnERR2: " << ec.message()
+                              << "\n";
+                    return;
+                }
+                const bool* reset = std::get_if<bool>(&property);
+                if (reset == nullptr)
+                {
+                    std::cerr << "Unable to read reset on ERR2 value\n";
+                    return;
+                }
+                startCrashdumpAndRecovery(*reset);
+            },
+            "xyz.openbmc_project.Settings",
+            "/xyz/openbmc_project/control/processor_error_config",
+            "org.freedesktop.DBus.Properties", "Get",
+            "xyz.openbmc_project.Control.Processor.ErrConfig", "ResetOnERR2");
+    });
+}
+
+// GPIO event callback for CPU_ERR2: starts the timeout on a falling edge,
+// cancels it on any other edge, then re-arms the wait for the next event.
+static void err2Handler()
+{
+    // Always drain the event so a stale edge isn't read back after power-on
+    gpiod::line_event gpioLineEvent = err2Line.event_read();
+    if (!hostOff)
+    {
+        if (gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE)
+        {
+            err2AssertHandler();
+        }
+        else
+        {
+            err2AssertTimer.cancel();
+        }
+    }
+    err2Event.async_wait(
+        boost::asio::posix::stream_descriptor::wait_read,
+        [](const boost::system::error_code ec) {
+            if (ec)
+            {
+                std::cerr << "err2 handler error: " << ec.message() << "\n";
+                return;
+            }
+            err2Handler();
+        });
+}
+
 static void initializeErrorState()
 {
     // Handle CPU_CATERR if it's asserted now
@@ -326,6 +487,12 @@
     {
         caterrAssertHandler();
     }
+
+    // Handle CPU_ERR2 if it's asserted now
+    if (err2Line.get_value() == 0)
+    {
+        err2AssertHandler();
+    }
 }
 } // namespace host_error_monitor
 
@@ -356,6 +523,14 @@
         return -1;
     }
 
+    // Request CPU_ERR2 GPIO events
+    if (!host_error_monitor::requestGPIOEvents(
+            "CPU_ERR2", host_error_monitor::err2Handler,
+            host_error_monitor::err2Line, host_error_monitor::err2Event))
+    {
+        return -1;
+    }
+
     // Request PCH_BMC_THERMTRIP GPIO events
     if (!host_error_monitor::requestGPIOEvents(
             "PCH_BMC_THERMTRIP", host_error_monitor::pchThermtripHandler,