Hardware analysis support of TOD failures

Signed-off-by: Zane Shelley <zshelle@us.ibm.com>
Change-Id: I8e2659cc832113e05c99b8dd091757bcab3dcdea
diff --git a/analyzer/plugins/p10-tod-plugins.cpp b/analyzer/plugins/p10-tod-plugins.cpp
index 843e954..35e306b 100644
--- a/analyzer/plugins/p10-tod-plugins.cpp
+++ b/analyzer/plugins/p10-tod-plugins.cpp
@@ -1,5 +1,6 @@
 
 #include <analyzer/plugins/plugin.hpp>
+#include <hei_main.hpp>
 #include <util/pdbg.hpp>
 #include <util/trace.hpp>
 
@@ -22,6 +23,13 @@
     BACKUP = 1,
 };
 
+/** Each topology can be configured as either primary or secondary. */
+enum class Configuration
+{
+    PRIMARY,   ///< Selects the primary status bits and TOD_PRI_PORT_x_CTRL.
+    SECONDARY, ///< Selects the secondary status bits and TOD_SEC_PORT_x_CTRL.
+};
+
 class Data
 {
   public:
@@ -107,13 +115,188 @@
     }
 };
 
+enum class Register // addresses handed to util::pdbg::getScom()
+{
+    TOD_ERROR           = 0x00040030, // step check fault bits
+    TOD_PSS_MSS_STATUS  = 0x00040008, // topology config and path selects
+    TOD_PRI_PORT_0_CTRL = 0x00040001, // bits 0:2 = IOHS position
+    TOD_PRI_PORT_1_CTRL = 0x00040002, // bits 0:2 = IOHS position
+    TOD_SEC_PORT_0_CTRL = 0x00040003, // bits 0:2 = IOHS position
+    TOD_SEC_PORT_1_CTRL = 0x00040004, // bits 0:2 = IOHS position
+};
+
+bool readRegister(pdbg_target* i_chip, Register i_addr,
+                  libhei::BitStringBuffer& o_val)
+{
+    assert(64 == o_val.getBitLen()); // a SCOM value is copied in as 64 bits
+
+    uint64_t scomValue;
+    if (util::pdbg::getScom(i_chip, static_cast<uint64_t>(i_addr), scomValue))
+    {
+        trace::err("Register read failed: addr=0x%08x chip=%s",
+                   static_cast<uint32_t>(i_addr), util::pdbg::getPath(i_chip));
+        return true; // SCOM failed, o_val is left untouched
+    }
+
+    o_val.setFieldRight(0, 64, scomValue);
+
+    return false; // no failures
+}
+
+pdbg_target* getChipSourcingClock(pdbg_target* i_chipReportingError,
+                                  unsigned int i_iohsPos)
+{
+    using namespace util::pdbg;
+
+    pdbg_target* chipSourcingClock = nullptr; // stays null if a lookup fails
+
+    // Given the chip reporting the error and the IOHS position within that
+    // chip, we must get
+    //  - The associated IOHS target on this chip.
+    //  - Next, the IOHS target on the other side of the bus.
+    //  - Finally, the chip containing the IOHS target on the other side of the
+    //    bus.
+
+    auto iohsUnit = getChipUnit(i_chipReportingError, TYPE_IOHS, i_iohsPos);
+    if (nullptr != iohsUnit)
+    {
+        auto clockSourceUnit =
+            getConnectedTarget(iohsUnit, callout::BusType::SMP_BUS);
+        if (nullptr != clockSourceUnit)
+        {
+            chipSourcingClock = getParentChip(clockSourceUnit);
+        }
+    }
+
+    return chipSourcingClock; // nullptr when no source chip was found
+}
+
 /**
  * @brief Collects TOD fault data for each processor chip.
  */
-void collectTodFaultData(pdbg_target*, Data&)
+void collectTodFaultData(pdbg_target* i_chip, Data& o_data)
 {
-    // TODO: Need to query hardware for this chip and add to Data if any
-    // faults found.
+    // TODO: We should use a register cache captured by the isolator so that
+    //       this code is using the same values the isolator used.  However, at
+    //       the moment the isolator does not have a register cache. Instead,
+    //       we'll have to manually SCOM the registers we need.  Fortunately,
+    //       for a checkstop attention the hardware should freeze and the
+    //       values will never change. Unfortunately, we don't have that same
+    //       guarantee for TIs, but at the time of this writing, all TOD errors
+    //       will trigger a checkstop attention anyway. So the TI case is not as
+    //       important.
+
+    libhei::BitStringBuffer errorReg{64};
+    if (readRegister(i_chip, Register::TOD_ERROR, errorReg))
+    {
+        return; // cannot continue on this chip
+    }
+
+    libhei::BitStringBuffer statusReg{64};
+    if (readRegister(i_chip, Register::TOD_PSS_MSS_STATUS, statusReg))
+    {
+        return; // cannot continue on this chip
+    }
+
+    // Determine which topology is configured primary or secondary.
+    std::map<Topology, Configuration> topConfig;
+
+    if (0 == statusReg.getFieldRight(0, 3))
+    {
+        // TOD_PSS_MSS_STATUS[0:2] == 0b000 means active topology is primary.
+        topConfig[Topology::ACTIVE] = Configuration::PRIMARY;
+        topConfig[Topology::BACKUP] = Configuration::SECONDARY;
+    }
+    else
+    {
+        // TOD_PSS_MSS_STATUS[0:2] == 0b111 means active topology is secondary.
+        topConfig[Topology::ACTIVE] = Configuration::SECONDARY;
+        topConfig[Topology::BACKUP] = Configuration::PRIMARY;
+    }
+
+    for (const auto top : {Topology::ACTIVE, Topology::BACKUP})
+    {
+        // Bit positions in some registers are dependent on this topology's
+        // configuration.
+        bool isPriTop = (Configuration::PRIMARY == topConfig[top]);
+
+        // Determine if this is the MDMT chip.
+        bool isMasterTod    = statusReg.isBitSet(isPriTop ? 13 : 17);
+        bool isMasterDrawer = statusReg.isBitSet(isPriTop ? 14 : 18);
+
+        if (isMasterDrawer && isMasterTod)
+        {
+            // The master path selects are sourced from the oscillator reference
+            // clocks. So, we'll need to determine which one was used at the
+            // time of the failure.
+            unsigned int masterPathSelect =
+                statusReg.getFieldRight(isPriTop ? 12 : 16, 1); // logged as %u
+
+            // Determine if there is a step check fault for this path select.
+            if (errorReg.isBitSet((0 == masterPathSelect) ? 14 : 15))
+            {
+                trace::inf(
+                    "TOD MDMT fault found: top=%u config=%u path=%u chip=%s",
+                    static_cast<unsigned int>(top),
+                    static_cast<unsigned int>(topConfig[top]), masterPathSelect,
+                    util::pdbg::getPath(i_chip));
+
+                o_data.setMdmtFault(top, i_chip);
+            }
+        }
+        else // not the MDMT on this topology
+        {
+            // The slave path selects are sourced from other processor chips.
+            // So, we'll need to determine which one was used at the time of the
+            // failure.
+            unsigned int slavePathSelect =
+                statusReg.getFieldRight(isPriTop ? 15 : 19, 1); // logged as %u
+
+            // Determine if there is a step check fault for this path select.
+            if (errorReg.isBitSet((0 == slavePathSelect) ? 16 : 21))
+            {
+                // Get the IOHS unit position on this chip that is connected to
+                // the clock source chip.
+                auto addr = (0 == slavePathSelect)
+                                ? (isPriTop ? Register::TOD_PRI_PORT_0_CTRL
+                                            : Register::TOD_SEC_PORT_0_CTRL)
+                                : (isPriTop ? Register::TOD_PRI_PORT_1_CTRL
+                                            : Register::TOD_SEC_PORT_1_CTRL);
+
+                libhei::BitStringBuffer portCtrl{64};
+                if (readRegister(i_chip, addr, portCtrl))
+                {
+                    continue; // try the other topology
+                }
+
+                unsigned int iohsPos   = portCtrl.getFieldRight(0, 3);
+                auto chipSourcingClock = getChipSourcingClock(i_chip, iohsPos);
+
+                if (nullptr != chipSourcingClock)
+                {
+                    trace::inf("TOD network fault found: top=%u config=%u "
+                               "path=%u chip=%s iohs=%u clockSrc=%s",
+                               static_cast<unsigned int>(top),
+                               static_cast<unsigned int>(topConfig[top]),
+                               slavePathSelect, util::pdbg::getPath(i_chip),
+                               iohsPos, util::pdbg::getPath(chipSourcingClock));
+
+                    o_data.setNetworkFault(top, chipSourcingClock);
+                }
+            }
+        }
+
+        // Check for any internal path errors in the active topology only.
+        if (Topology::ACTIVE == top && errorReg.isBitSet(17))
+        {
+            trace::inf("TOD internal fault found: top=%u config=%u chip=%s",
+                       static_cast<unsigned int>(top),
+                       static_cast<unsigned int>(topConfig[top]),
+                       util::pdbg::getPath(i_chip));
+
+            o_data.setInternalFault(top, i_chip);
+        }
+    }
 }
 
 } // namespace tod
diff --git a/test/test-tod-step-check-fault.cpp b/test/test-tod-step-check-fault.cpp
index 99ecc28..e91736d 100644
--- a/test/test-tod-step-check-fault.cpp
+++ b/test/test-tod-step-check-fault.cpp
@@ -2,6 +2,7 @@
 
 #include <analyzer/plugins/plugin.hpp>
 #include <hei_util.hpp>
+#include <test/sim-hw-access.hpp>
 #include <util/pdbg.hpp>
 #include <util/trace.hpp>
 
@@ -12,24 +13,59 @@
 static const auto nodeId =
     static_cast<libhei::NodeId_t>(libhei::hash<libhei::NodeId_t>("TOD_ERROR"));
 
-TEST(TodStepCheckFault, TestSet1)
+TEST(TodStepCheckFault, MdmtFault)
 {
     pdbg_targets_init(nullptr);
 
-    libhei::Chip chip1{util::pdbg::getTrgt("/proc1"), P10_20};
+    auto proc0 = util::pdbg::getTrgt("/proc0");
+    auto proc1 = util::pdbg::getTrgt("/proc1");
 
-    // TOD_ERROR(0)[16]
-    libhei::Signature sig{chip1, nodeId, 0, 16, libhei::ATTN_TYPE_CHECKSTOP};
+    libhei::Chip chip0{proc0, P10_20};
+    libhei::Chip chip1{proc1, P10_20};
 
+    sim::ScomAccess& scom = sim::ScomAccess::getSingleton();
+    scom.flush();
+
+    // TOD_ERROR[14]    = 0b1   step check on primary config master select 0
+    scom.add(proc0, 0x00040030, 0x0002000000000000); // TOD_ERROR
+
+    // TOD_PSS_MSS_STATUS[0:2] = 0b000  primary config is active
+    // TOD_PSS_MSS_STATUS[12]  = 0b0    primary config master select 0
+    // TOD_PSS_MSS_STATUS[13]  = 0b1    primary config master TOD
+    // TOD_PSS_MSS_STATUS[14]  = 0b1    primary config master drawer
+    scom.add(proc0, 0x00040008, 0x0006000000000000);
+
+    // TOD_ERROR[17]    = 0b1   internal step check
+    // TOD_ERROR[21]    = 0b1   step check on primary config slave select 1
+    scom.add(proc1, 0x00040030, 0x0000440000000000); // TOD_ERROR
+
+    // TOD_PSS_MSS_STATUS[0:2] = 0b000  primary config is active
+    // TOD_PSS_MSS_STATUS[15]  = 0b1    primary config slave path select 1
+    scom.add(proc1, 0x00040008, 0x0001000000000000);
+
+    // TOD_PRI_PORT_1_CTRL[0:2] = 0b001  IOHS 1
+    scom.add(proc1, 0x00040002, 0x2000000000000000);
+
+    // TOD_ERROR(0)[14] step check error on master select 0
+    libhei::Signature sig0{chip0, nodeId, 0, 14, libhei::ATTN_TYPE_CHECKSTOP};
+
+    // TOD_ERROR(0)[17] internal step check error
+    libhei::Signature sig1{chip1, nodeId, 0, 17, libhei::ATTN_TYPE_CHECKSTOP};
+
+    // TOD_ERROR(0)[21] step check error on slave select 1
+    libhei::Signature sig2{chip1, nodeId, 0, 21, libhei::ATTN_TYPE_CHECKSTOP};
+
+    libhei::IsolationData isoData{};
+    isoData.addSignature(sig0);
+    isoData.addSignature(sig1);
+    isoData.addSignature(sig2);
+    ServiceData sd{sig1, AnalysisType::SYSTEM_CHECKSTOP, isoData};
+
+    // Call the plugin.
     auto plugin =
         PluginMap::getSingleton().get(chip1.getType(), "tod_step_check_fault");
 
-    libhei::IsolationData isoData{};
-    isoData.addSignature(sig);
-    ServiceData sd{sig, AnalysisType::SYSTEM_CHECKSTOP, isoData};
-
-    // Call the plugin.
-    plugin(1, chip1, sd);
+    plugin(0, chip1, sd);
 
     nlohmann::json j{};
     std::string s{};
@@ -39,10 +75,16 @@
     s = R"([
     {
         "Deconfigured": false,
+        "Guarded": false,
+        "LocationCode": "P0",
+        "Priority": "M"
+    },
+    {
+        "Deconfigured": false,
         "EntityPath": [],
         "GuardType": "GARD_Unrecoverable",
         "Guarded": true,
-        "LocationCode": "/proc1",
+        "LocationCode": "/proc0",
         "Priority": "M"
     }
 ])";
@@ -52,10 +94,15 @@
     j = sd.getCalloutFFDC();
     s = R"([
     {
+        "Callout Type": "Clock Callout",
+        "Clock Type": "TOD_CLOCK",
+        "Priority": "medium"
+    },
+    {
         "Callout Type": "Hardware Callout",
         "Guard": true,
         "Priority": "medium",
-        "Target": "/proc1"
+        "Target": "/proc0"
     }
 ])";
     EXPECT_EQ(s, j.dump(4));