Hardware analysis support of TOD failures
Signed-off-by: Zane Shelley <zshelle@us.ibm.com>
Change-Id: I8e2659cc832113e05c99b8dd091757bcab3dcdea
diff --git a/analyzer/plugins/p10-tod-plugins.cpp b/analyzer/plugins/p10-tod-plugins.cpp
index 843e954..35e306b 100644
--- a/analyzer/plugins/p10-tod-plugins.cpp
+++ b/analyzer/plugins/p10-tod-plugins.cpp
@@ -1,5 +1,6 @@
#include <analyzer/plugins/plugin.hpp>
+#include <hei_main.hpp>
#include <util/pdbg.hpp>
#include <util/trace.hpp>
@@ -22,6 +23,13 @@
BACKUP = 1,
};
+/**
+ * Each topology can be configured as either primary or secondary.
+ *
+ * collectTodFaultData() derives the configuration of each topology from
+ * TOD_PSS_MSS_STATUS[0:2] and uses it to choose the configuration-dependent
+ * status bit positions and port control registers.
+ */
+enum class Configuration
+{
+ PRIMARY,
+ SECONDARY,
+};
+
class Data
{
public:
@@ -107,13 +115,188 @@
}
};
+/**
+ * SCOM addresses of the TOD registers read by this plugin. readRegister()
+ * casts these values to uint64_t for the SCOM access.
+ */
+enum class Register
+{
+ // Error bits checked for step check and internal path faults.
+ TOD_ERROR = 0x00040030,
+ // Topology configuration, master TOD/drawer flags, and path selects.
+ TOD_PSS_MSS_STATUS = 0x00040008,
+ // Port control registers. collectTodFaultData() reads bits [0:2] of the
+ // selected register as the IOHS unit position.
+ TOD_PRI_PORT_0_CTRL = 0x00040001,
+ TOD_PRI_PORT_1_CTRL = 0x00040002,
+ TOD_SEC_PORT_0_CTRL = 0x00040003,
+ TOD_SEC_PORT_1_CTRL = 0x00040004,
+};
+
+/**
+ * @brief Reads a 64-bit TOD register from the given chip via SCOM.
+ *
+ * @param i_chip The chip target to read from.
+ * @param i_addr The register to read.
+ * @param o_val  Buffer that receives the register value. It must be exactly
+ *               64 bits long (enforced by assert) and is left untouched if
+ *               the read fails.
+ *
+ * @return True if the SCOM access failed, false if the read succeeded.
+ */
+bool readRegister(pdbg_target* i_chip, Register i_addr,
+                  libhei::BitStringBuffer& o_val)
+{
+    assert(64 == o_val.getBitLen());
+
+    uint64_t scomValue = 0;
+    if (util::pdbg::getScom(i_chip, static_cast<uint64_t>(i_addr), scomValue))
+    {
+        // "%08x" consumes an unsigned int. Passing a 64-bit value would not
+        // match the conversion and could corrupt the "%s" argument that
+        // follows, so the address is narrowed for tracing only. All Register
+        // values fit in 32 bits.
+        trace::err("Register read failed: addr=0x%08x chip=%s",
+                   static_cast<unsigned int>(i_addr),
+                   util::pdbg::getPath(i_chip));
+        return true; // SCOM failed
+    }
+
+    o_val.setFieldRight(0, 64, scomValue);
+
+    return false; // no failures
+}
+
+/**
+ * @brief Finds the chip on the far side of the bus attached to an IOHS unit.
+ *
+ * @param i_chipReportingError The chip reporting the error.
+ * @param i_iohsPos            Position of the IOHS unit within that chip.
+ *
+ * @return The chip containing the IOHS unit on the other side of the bus, or
+ *         nullptr if either IOHS unit could not be found.
+ */
+pdbg_target* getChipSourcingClock(pdbg_target* i_chipReportingError,
+                                  unsigned int i_iohsPos)
+{
+    using namespace util::pdbg;
+
+    // First, the IOHS unit at the given position on the reporting chip.
+    auto localIohs = getChipUnit(i_chipReportingError, TYPE_IOHS, i_iohsPos);
+    if (nullptr == localIohs)
+    {
+        return nullptr;
+    }
+
+    // Next, the IOHS unit on the other side of the bus.
+    auto remoteIohs = getConnectedTarget(localIohs, callout::BusType::SMP_BUS);
+    if (nullptr == remoteIohs)
+    {
+        return nullptr;
+    }
+
+    // Finally, the chip containing that remote IOHS unit.
+    return getParentChip(remoteIohs);
+}
+
/**
 * @brief Collects TOD fault data for each processor chip.
+ *
+ * Reads TOD_ERROR and TOD_PSS_MSS_STATUS from the chip and, for both the
+ * active and backup topologies, records any MDMT, network, or internal fault
+ * found. If either of those two registers cannot be read, nothing is recorded
+ * for this chip.
+ *
+ * @param i_chip The processor chip to query.
+ * @param o_data Receives any faults found on this chip.
 */
-void collectTodFaultData(pdbg_target*, Data&)
+void collectTodFaultData(pdbg_target* i_chip, Data& o_data)
{
- // TODO: Need to query hardware for this chip and add to Data if any
- // faults found.
+    // TODO: We should use a register cache captured by the isolator so that
+    //       this code is using the same values the isolator used. However, at
+    //       the moment the isolator does not have a register cache. Instead,
+    //       we'll have to manually SCOM the registers we need. Fortunately,
+    //       for a checkstop attention the hardware should freeze and the
+    //       values will never change. Unfortunately, we don't have that same
+    //       guarantee for TIs, but at the time of this writing, all TOD errors
+    //       will trigger a checkstop attention right away. So the TI case is
+    //       not as important.
+
+    libhei::BitStringBuffer errorReg{64};
+    if (readRegister(i_chip, Register::TOD_ERROR, errorReg))
+    {
+        return; // cannot continue on this chip
+    }
+
+    libhei::BitStringBuffer statusReg{64};
+    if (readRegister(i_chip, Register::TOD_PSS_MSS_STATUS, statusReg))
+    {
+        return; // cannot continue on this chip
+    }
+
+    // Determine which topology is configured primary or secondary.
+    std::map<Topology, Configuration> topConfig;
+
+    if (0 == statusReg.getFieldRight(0, 3))
+    {
+        // TOD_PSS_MSS_STATUS[0:2] == 0b000 means active topology is primary.
+        topConfig[Topology::ACTIVE] = Configuration::PRIMARY;
+        topConfig[Topology::BACKUP] = Configuration::SECONDARY;
+    }
+    else
+    {
+        // TOD_PSS_MSS_STATUS[0:2] == 0b111 means active topology is secondary.
+        topConfig[Topology::ACTIVE] = Configuration::SECONDARY;
+        topConfig[Topology::BACKUP] = Configuration::PRIMARY;
+    }
+
+    for (const auto top : {Topology::ACTIVE, Topology::BACKUP})
+    {
+        // Bit positions in some registers are dependent on this topology's
+        // configuration.
+        bool isPriTop = (Configuration::PRIMARY == topConfig[top]);
+
+        // Determine if this is the MDMT chip.
+        bool isMasterTod    = statusReg.isBitSet(isPriTop ? 13 : 17);
+        bool isMasterDrawer = statusReg.isBitSet(isPriTop ? 14 : 18);
+
+        if (isMasterDrawer && isMasterTod)
+        {
+            // The master path selects are sourced from the oscillator
+            // reference clocks. So, we'll need to determine which one was used
+            // at the time of the failure.
+            // The single-bit field is narrowed to unsigned int so it matches
+            // the "%u" conversion in the trace below.
+            const auto masterPathSelect = static_cast<unsigned int>(
+                statusReg.getFieldRight(isPriTop ? 12 : 16, 1));
+
+            // Determine if there is a step check fault for this path select.
+            if (errorReg.isBitSet((0 == masterPathSelect) ? 14 : 15))
+            {
+                trace::inf(
+                    "TOD MDMT fault found: top=%u config=%u path=%u chip=%s",
+                    static_cast<unsigned int>(top),
+                    static_cast<unsigned int>(topConfig[top]), masterPathSelect,
+                    util::pdbg::getPath(i_chip));
+
+                o_data.setMdmtFault(top, i_chip);
+            }
+        }
+        else // not the MDMT on this topology
+        {
+            // The slave path selects are sourced from other processor chips.
+            // So, we'll need to determine which one was used at the time of the
+            // failure.
+            // Narrowed to unsigned int to match the "%u" conversion below.
+            const auto slavePathSelect = static_cast<unsigned int>(
+                statusReg.getFieldRight(isPriTop ? 15 : 19, 1));
+
+            // Determine if there is a step check fault for this path select.
+            if (errorReg.isBitSet((0 == slavePathSelect) ? 16 : 21))
+            {
+                // Get the IOHS unit position on this chip that is connected to
+                // the clock source chip.
+                auto addr = (0 == slavePathSelect)
+                                ? (isPriTop ? Register::TOD_PRI_PORT_0_CTRL
+                                            : Register::TOD_SEC_PORT_0_CTRL)
+                                : (isPriTop ? Register::TOD_PRI_PORT_1_CTRL
+                                            : Register::TOD_SEC_PORT_1_CTRL);
+
+                // If this read fails, no network fault can be recorded for
+                // this topology. Do not skip the rest of the iteration,
+                // though: the internal fault check below only needs TOD_ERROR,
+                // which was already read successfully.
+                libhei::BitStringBuffer portCtrl{64};
+                if (!readRegister(i_chip, addr, portCtrl))
+                {
+                    // The 3-bit position is narrowed to unsigned int to match
+                    // both getChipSourcingClock() and the "%u" conversion.
+                    const auto iohsPos = static_cast<unsigned int>(
+                        portCtrl.getFieldRight(0, 3));
+                    auto chipSourcingClock =
+                        getChipSourcingClock(i_chip, iohsPos);
+
+                    if (nullptr != chipSourcingClock)
+                    {
+                        trace::inf("TOD network fault found: top=%u config=%u "
+                                   "path=%u chip=%s iohs=%u clockSrc=%s",
+                                   static_cast<unsigned int>(top),
+                                   static_cast<unsigned int>(topConfig[top]),
+                                   slavePathSelect, util::pdbg::getPath(i_chip),
+                                   iohsPos,
+                                   util::pdbg::getPath(chipSourcingClock));
+
+                        o_data.setNetworkFault(top, chipSourcingClock);
+                    }
+                }
+            }
+        }
+
+        // Check for any internal path errors in the active topology only.
+        if (Topology::ACTIVE == top && errorReg.isBitSet(17))
+        {
+            trace::inf("TOD internal fault found: top=%u config=%u chip=%s",
+                       static_cast<unsigned int>(top),
+                       static_cast<unsigned int>(topConfig[top]),
+                       util::pdbg::getPath(i_chip));
+
+            o_data.setInternalFault(top, i_chip);
+        }
+    }
}
} // namespace tod
diff --git a/test/test-tod-step-check-fault.cpp b/test/test-tod-step-check-fault.cpp
index 99ecc28..e91736d 100644
--- a/test/test-tod-step-check-fault.cpp
+++ b/test/test-tod-step-check-fault.cpp
@@ -2,6 +2,7 @@
#include <analyzer/plugins/plugin.hpp>
#include <hei_util.hpp>
+#include <test/sim-hw-access.hpp>
#include <util/pdbg.hpp>
#include <util/trace.hpp>
@@ -12,24 +13,59 @@
static const auto nodeId =
static_cast<libhei::NodeId_t>(libhei::hash<libhei::NodeId_t>("TOD_ERROR"));
-TEST(TodStepCheckFault, TestSet1)
+TEST(TodStepCheckFault, MdmtFault)
{
pdbg_targets_init(nullptr);
- libhei::Chip chip1{util::pdbg::getTrgt("/proc1"), P10_20};
+ auto proc0 = util::pdbg::getTrgt("/proc0");
+ auto proc1 = util::pdbg::getTrgt("/proc1");
- // TOD_ERROR(0)[16]
- libhei::Signature sig{chip1, nodeId, 0, 16, libhei::ATTN_TYPE_CHECKSTOP};
+ libhei::Chip chip0{proc0, P10_20};
+ libhei::Chip chip1{proc1, P10_20};
+ sim::ScomAccess& scom = sim::ScomAccess::getSingleton();
+ scom.flush();
+
+ // TOD_ERROR[14] = 0b1 step check on primary config master select 0
+ scom.add(proc0, 0x00040030, 0x0002000000000000); // TOD_ERROR
+
+ // TOD_PSS_MSS_STATUS[0:2] = 0b000 primary config is active
+ // TOD_PSS_MSS_STATUS[12] = 0b0 primary config master select 0
+ // TOD_PSS_MSS_STATUS[13] = 0b1 primary config master TOD
+ // TOD_PSS_MSS_STATUS[14] = 0b1 primary config master drawer
+ scom.add(proc0, 0x00040008, 0x0006000000000000);
+
+ // TOD_ERROR[17] = 0b1 internal step check
+ // TOD_ERROR[21] = 0b1 step check on primary config slave select 1
+ scom.add(proc1, 0x00040030, 0x0000440000000000); // TOD_ERROR
+
+ // TOD_PSS_MSS_STATUS[0:2] = 0b000 primary config is active
+ // TOD_PSS_MSS_STATUS[15] = 0b1 primary config slave path select 1
+ scom.add(proc1, 0x00040008, 0x0001000000000000);
+
+ // TOD_PRI_PORT_1_CTRL[0:2] = 0b001 IOHS 1
+ scom.add(proc1, 0x00040002, 0x2000000000000000);
+
+ // TOD_ERROR(0)[14] step check error on master select 0
+ libhei::Signature sig0{chip0, nodeId, 0, 14, libhei::ATTN_TYPE_CHECKSTOP};
+
+ // TOD_ERROR(0)[17] internal step check error
+ libhei::Signature sig1{chip1, nodeId, 0, 17, libhei::ATTN_TYPE_CHECKSTOP};
+
+ // TOD_ERROR(0)[21] step check error on slave select 1
+ libhei::Signature sig2{chip1, nodeId, 0, 21, libhei::ATTN_TYPE_CHECKSTOP};
+
+ libhei::IsolationData isoData{};
+ isoData.addSignature(sig0);
+ isoData.addSignature(sig1);
+ isoData.addSignature(sig2);
+ ServiceData sd{sig1, AnalysisType::SYSTEM_CHECKSTOP, isoData};
+
+ // Call the plugin.
auto plugin =
PluginMap::getSingleton().get(chip1.getType(), "tod_step_check_fault");
- libhei::IsolationData isoData{};
- isoData.addSignature(sig);
- ServiceData sd{sig, AnalysisType::SYSTEM_CHECKSTOP, isoData};
-
- // Call the plugin.
- plugin(1, chip1, sd);
+ plugin(0, chip1, sd);
nlohmann::json j{};
std::string s{};
@@ -39,10 +75,16 @@
s = R"([
{
"Deconfigured": false,
+ "Guarded": false,
+ "LocationCode": "P0",
+ "Priority": "M"
+ },
+ {
+ "Deconfigured": false,
"EntityPath": [],
"GuardType": "GARD_Unrecoverable",
"Guarded": true,
- "LocationCode": "/proc1",
+ "LocationCode": "/proc0",
"Priority": "M"
}
])";
@@ -52,10 +94,15 @@
j = sd.getCalloutFFDC();
s = R"([
{
+ "Callout Type": "Clock Callout",
+ "Clock Type": "TOD_CLOCK",
+ "Priority": "medium"
+ },
+ {
"Callout Type": "Hardware Callout",
"Guard": true,
"Priority": "medium",
- "Target": "/proc1"
+ "Target": "/proc0"
}
])";
EXPECT_EQ(s, j.dump(4));