blob: 19429f46bd1208e7fe4165e9162f18e8e6d4a711 [file] [log] [blame]
Zane Shelleyd84ed6e2020-06-08 13:41:48 -05001#include <assert.h>
Zane Shelley9fb73932020-09-15 13:34:57 -05002#include <unistd.h>
Ben Tyner87eabc62020-05-14 17:56:54 -05003
Zane Shelleyebff0d32021-11-21 10:52:07 -06004#include <analyzer/analyzer_main.hpp>
Zane Shelleya9b44342021-08-08 17:15:52 -05005#include <analyzer/ras-data/ras-data-parser.hpp>
Zane Shelley4ed4be52021-02-15 17:53:40 -06006#include <analyzer/service_data.hpp>
Ben Tyner7029e522021-08-09 19:18:24 -05007#include <attn/attn_dump.hpp>
Ben Tyner0205f3b2020-02-24 10:24:47 -06008#include <hei_main.hpp>
Zane Shelleyf4bd5ff2020-11-05 22:26:04 -06009#include <util/pdbg.hpp>
Zane Shelleyd84ed6e2020-06-08 13:41:48 -050010#include <util/trace.hpp>
Ben Tyner0205f3b2020-02-24 10:24:47 -060011
12namespace analyzer
13{
Zane Shelleyf4bd5ff2020-11-05 22:26:04 -060014//------------------------------------------------------------------------------
Ben Tynerb1ebfcb2020-05-08 18:52:48 -050015
Zane Shelleyf4bd5ff2020-11-05 22:26:04 -060016// Forward references for externally defined functions.
Ben Tyner87eabc62020-05-14 17:56:54 -050017
Zane Shelleyd3b9bac2020-11-17 21:59:12 -060018/**
 * @brief Will get the list of active chips and initialize the isolator.
20 * @param o_chips The returned list of active chips.
21 */
Zane Shelley171a2e02020-11-13 13:56:13 -060022void initializeIsolator(std::vector<libhei::Chip>& o_chips);
Ben Tynerb1ebfcb2020-05-08 18:52:48 -050023
Zane Shelleyd3b9bac2020-11-17 21:59:12 -060024/**
 * @brief Will filter the isolation data for the root cause attention.
Zane Shelleyec227c22021-12-09 15:54:40 -060026 * @param i_type The type of analysis to perform. See enum for details.
Zane Shelley65fefb22021-10-18 15:35:26 -050027 * @param i_isoData The data gathered during isolation (for FFDC).
28 * @param o_rootCause The returned root cause signature.
Caleb Palmer1a4f0e72022-11-07 15:08:01 -060029 * @param i_rasData The RAS data parser.
Zane Shelley65fefb22021-10-18 15:35:26 -050030 * @return True, if root cause has been found. False, otherwise.
31 */
Zane Shelleyec227c22021-12-09 15:54:40 -060032bool filterRootCause(AnalysisType i_type,
33 const libhei::IsolationData& i_isoData,
Caleb Palmer1a4f0e72022-11-07 15:08:01 -060034 libhei::Signature& o_rootCause,
35 const RasDataParser& i_rasData);
Zane Shelley65fefb22021-10-18 15:35:26 -050036
37/**
Zane Shelleyd3b9bac2020-11-17 21:59:12 -060038 * @brief Will create and submit a PEL using the given data.
Zane Shelley4ed4be52021-02-15 17:53:40 -060039 * @param i_servData Data regarding service actions gathered during analysis.
Zane Shelley611b3442021-11-19 16:02:01 -060040 * @return The platform log ID. Will return zero if no PEL is generated.
Zane Shelleyd3b9bac2020-11-17 21:59:12 -060041 */
Ben Tynerc1e1c002022-02-16 15:09:31 -060042uint32_t commitPel(const ServiceData& i_servData);
Zane Shelleyd3b9bac2020-11-17 21:59:12 -060043
Zane Shelleyd84ed6e2020-06-08 13:41:48 -050044//------------------------------------------------------------------------------
45
Zane Shelleyebff0d32021-11-21 10:52:07 -060046const char* __attn(libhei::AttentionType_t i_type)
Zane Shelley2f263182020-07-10 21:41:21 -050047{
48 const char* str = "";
Zane Shelleyebff0d32021-11-21 10:52:07 -060049 switch (i_type)
Zane Shelley2f263182020-07-10 21:41:21 -050050 {
51 case libhei::ATTN_TYPE_CHECKSTOP:
52 str = "CHECKSTOP";
53 break;
54 case libhei::ATTN_TYPE_UNIT_CS:
55 str = "UNIT_CS";
56 break;
57 case libhei::ATTN_TYPE_RECOVERABLE:
58 str = "RECOVERABLE";
59 break;
60 case libhei::ATTN_TYPE_SP_ATTN:
61 str = "SP_ATTN";
62 break;
63 case libhei::ATTN_TYPE_HOST_ATTN:
64 str = "HOST_ATTN";
65 break;
66 default:
Zane Shelleyebff0d32021-11-21 10:52:07 -060067 trace::err("Unsupported attention type: %u", i_type);
Zane Shelley2f263182020-07-10 21:41:21 -050068 assert(0);
69 }
70 return str;
71}
72
Zane Shelley2f263182020-07-10 21:41:21 -050073//------------------------------------------------------------------------------
74
Zane Shelleyebff0d32021-11-21 10:52:07 -060075const char* __analysisType(AnalysisType i_type)
76{
77 const char* str = "";
78 switch (i_type)
79 {
80 case AnalysisType::SYSTEM_CHECKSTOP:
81 str = "SYSTEM_CHECKSTOP";
82 break;
83 case AnalysisType::TERMINATE_IMMEDIATE:
84 str = "TERMINATE_IMMEDIATE";
85 break;
86 case AnalysisType::MANUAL:
87 str = "MANUAL";
88 break;
89 default:
90 trace::err("Unsupported analysis type: %u", i_type);
91 assert(0);
92 }
93 return str;
94}
95
96//------------------------------------------------------------------------------
97
98uint32_t analyzeHardware(AnalysisType i_type, attn::DumpParameters& o_dump)
Zane Shelley9fb73932020-09-15 13:34:57 -050099{
Zane Shelley611b3442021-11-19 16:02:01 -0600100 uint32_t o_plid = 0; // default, zero indicates PEL was not created
Zane Shelley9fb73932020-09-15 13:34:57 -0500101
Zane Shelleye5411f02021-08-04 22:41:35 -0500102 if (!util::pdbg::queryHardwareAnalysisSupported())
103 {
104 trace::err("Hardware error analysis is not supported on this system");
Zane Shelley611b3442021-11-19 16:02:01 -0600105 return o_plid;
Zane Shelleye5411f02021-08-04 22:41:35 -0500106 }
107
Zane Shelleyebff0d32021-11-21 10:52:07 -0600108 trace::inf(">>> enter analyzeHardware(%s)", __analysisType(i_type));
Zane Shelleye5411f02021-08-04 22:41:35 -0500109
110 // Initialize the isolator and get all of the chips to be analyzed.
111 trace::inf("Initializing the isolator...");
112 std::vector<libhei::Chip> chips;
113 initializeIsolator(chips);
114
115 // Isolate attentions.
116 trace::inf("Isolating errors: # of chips=%u", chips.size());
117 libhei::IsolationData isoData{};
118 libhei::isolate(chips, isoData);
119
Zane Shelley65fefb22021-10-18 15:35:26 -0500120 // For debug, trace out the original list of signatures before filtering.
121 for (const auto& sig : isoData.getSignatureList())
122 {
123 trace::inf("Signature: %s 0x%0" PRIx32 " %s",
124 util::pdbg::getPath(sig.getChip()), sig.toUint32(),
125 __attn(sig.getAttnType()));
126 }
127
Zane Shelleye5411f02021-08-04 22:41:35 -0500128 // Filter for root cause attention.
Zane Shelleycb457382020-11-02 20:55:06 -0600129 libhei::Signature rootCause{};
Caleb Palmer1a4f0e72022-11-07 15:08:01 -0600130 RasDataParser rasData{};
131 bool attnFound = filterRootCause(i_type, isoData, rootCause, rasData);
Zane Shelleycb457382020-11-02 20:55:06 -0600132
Zane Shelleyb7879d32021-12-06 18:02:03 -0600133 // If a root cause attention was found, or if this was a system checkstop,
134 // generate a PEL.
135 if (attnFound || AnalysisType::SYSTEM_CHECKSTOP == i_type)
Zane Shelley9fb73932020-09-15 13:34:57 -0500136 {
Zane Shelleyb7879d32021-12-06 18:02:03 -0600137 if (attnFound)
138 {
139 trace::inf("Root cause attention: %s 0x%0" PRIx32 " %s",
140 util::pdbg::getPath(rootCause.getChip()),
141 rootCause.toUint32(), __attn(rootCause.getAttnType()));
142 }
143 else
144 {
145 // This is bad. Analysis should have found a root cause attention
146 // for a system checkstop. Issues could range from code bugs to SCOM
147 // errors. Regardless, generate a PEL with FFDC to assist with
148 // debug.
149 trace::err("System checkstop with no root cause attention");
150 rootCause = libhei::Signature{}; // just in case
151 }
Zane Shelleycb457382020-11-02 20:55:06 -0600152
Zane Shelleyb7879d32021-12-06 18:02:03 -0600153 // Start building the service data.
Zane Shelley62adf5c2022-01-18 21:06:50 -0600154 ServiceData servData{rootCause, i_type, isoData};
Zane Shelleyb7879d32021-12-06 18:02:03 -0600155
156 // Apply any service actions, if needed. Note that there are no
157 // resolutions for manual analysis.
158 if (AnalysisType::MANUAL != i_type)
159 {
160 if (attnFound)
161 {
Zane Shelley2fbd2672022-02-03 13:56:35 -0600162 try
163 {
164 // Resolve the root cause attention.
Zane Shelley2fbd2672022-02-03 13:56:35 -0600165 rasData.getResolution(rootCause)->resolve(servData);
166 }
167 catch (const std::exception& e)
168 {
169 trace::err("Exception caught during root cause analysis");
170 trace::err(e.what());
171
172 // We'll still want to create a PEL for the FFDC, but
173 // since the analysis failed, we need to callout Level 2
174 // Support.
175 servData.calloutProcedure(callout::Procedure::NEXTLVL,
176 callout::Priority::HIGH);
177 }
Zane Shelleyb7879d32021-12-06 18:02:03 -0600178 }
179 else
180 {
Zane Shelley2fbd2672022-02-03 13:56:35 -0600181 // Analysis failed so callout the Level 2 Support.
Zane Shelley8af56852022-01-28 15:07:35 -0600182 servData.calloutProcedure(callout::Procedure::NEXTLVL,
183 callout::Priority::HIGH);
Zane Shelleyb7879d32021-12-06 18:02:03 -0600184 }
185 }
Zane Shelleyd3b9bac2020-11-17 21:59:12 -0600186
187 // Create and commit a PEL.
Ben Tynerc1e1c002022-02-16 15:09:31 -0600188 o_plid = commitPel(servData);
Ben Tyner7029e522021-08-09 19:18:24 -0500189
Zane Shelley611b3442021-11-19 16:02:01 -0600190 if (0 == o_plid)
191 {
192 trace::err("Failed to create PEL");
193 }
194 else
195 {
196 trace::inf("PEL created: PLID=0x%0" PRIx32, o_plid);
Zane Shelleybf3326f2021-11-12 13:41:39 -0600197
Zane Shelley611b3442021-11-19 16:02:01 -0600198 // Gather/return information needed for dump. A hardware dump will
199 // always be used for system checkstop attenions. Software dumps
200 // will be reserved for MP-IPLs during TI analysis.
201 // TODO: Need ID from root cause. At the moment, HUID does not exist
202 // in devtree. Will need a better ID definition.
Zane Shelleyebff0d32021-11-21 10:52:07 -0600203 o_dump.unitId = 0;
204 o_dump.dumpType = attn::DumpType::Hardware;
Zane Shelley611b3442021-11-19 16:02:01 -0600205 }
Zane Shelley9fb73932020-09-15 13:34:57 -0500206 }
Zane Shelleyb7879d32021-12-06 18:02:03 -0600207 else
208 {
209 // It is possible for TI handling, or manually initiated analysis via
210 // the command line, that there will not be an active attention. In
211 // which case, we will do nothing and let the caller of this function
212 // determine if this is the expected behavior.
213 trace::inf("No active attentions found");
214 }
Zane Shelley9fb73932020-09-15 13:34:57 -0500215
Zane Shelleye5411f02021-08-04 22:41:35 -0500216 // All done, clean up the isolator.
217 trace::inf("Uninitializing isolator...");
218 libhei::uninitialize();
Ben Tyner87eabc62020-05-14 17:56:54 -0500219
Zane Shelley2f263182020-07-10 21:41:21 -0500220 trace::inf("<<< exit analyzeHardware()");
221
Zane Shelley611b3442021-11-19 16:02:01 -0600222 return o_plid;
Ben Tyner0205f3b2020-02-24 10:24:47 -0600223}
224
Ben Tynereea45422021-04-15 10:54:14 -0500225//------------------------------------------------------------------------------
226
Ben Tyner0205f3b2020-02-24 10:24:47 -0600227} // namespace analyzer