blob: 5c4edf92339e4588d9a2c5d751ec96896117fdca [file] [log] [blame]
Zane Shelley65fefb22021-10-18 15:35:26 -05001#include <assert.h>
2
Caleb Palmer1a4f0e72022-11-07 15:08:01 -06003#include <analyzer/analyzer_main.hpp>
4#include <analyzer/ras-data/ras-data-parser.hpp>
Zane Shelley65fefb22021-10-18 15:35:26 -05005#include <hei_main.hpp>
Zane Shelley19df3702021-12-16 22:32:54 -06006#include <hei_util.hpp>
Zane Shelleyf4792d62021-10-28 18:08:22 -05007#include <util/pdbg.hpp>
Zane Shelley65fefb22021-10-18 15:35:26 -05008
9#include <algorithm>
10#include <limits>
11#include <string>
12
13namespace analyzer
14{
Zane Shelley65fefb22021-10-18 15:35:26 -050015//------------------------------------------------------------------------------
16
Zane Shelleya7369f82021-10-18 16:52:21 -050017bool __findRcsOscError(const std::vector<libhei::Signature>& i_list,
18 libhei::Signature& o_rootCause)
19{
20 // TODO: Consider returning all of them instead of one as root cause.
21 auto itr = std::find_if(i_list.begin(), i_list.end(), [&](const auto& t) {
Zane Shelley19df3702021-12-16 22:32:54 -060022 return (libhei::hash<libhei::NodeId_t>("TP_LOCAL_FIR") == t.getId() &&
Zane Shelleya7369f82021-10-18 16:52:21 -050023 (42 == t.getBit() || 43 == t.getBit()));
24 });
25
26 if (i_list.end() != itr)
27 {
28 o_rootCause = *itr;
29 return true;
30 }
31
32 return false;
33}
34
35//------------------------------------------------------------------------------
36
37bool __findPllUnlock(const std::vector<libhei::Signature>& i_list,
38 libhei::Signature& o_rootCause)
39{
Zane Shelleyc62813d2023-08-22 16:52:19 -050040 using namespace util::pdbg;
41
Zane Shelleya7369f82021-10-18 16:52:21 -050042 // TODO: Consider returning all of them instead of one as root cause.
Zane Shelleyc62813d2023-08-22 16:52:19 -050043
44 auto nodeId = libhei::hash<libhei::NodeId_t>("PLL_UNLOCK");
45
46 // First, look for any PLL unlock attentions reported by a processsor chip.
47 auto itr1 = std::find_if(i_list.begin(), i_list.end(), [&](const auto& t) {
48 return (nodeId == t.getId() &&
49 TYPE_PROC == getTrgtType(getTrgt(t.getChip())));
Zane Shelleya7369f82021-10-18 16:52:21 -050050 });
51
Zane Shelleyc62813d2023-08-22 16:52:19 -050052 if (i_list.end() != itr1)
Zane Shelleya7369f82021-10-18 16:52:21 -050053 {
Zane Shelleyc62813d2023-08-22 16:52:19 -050054 o_rootCause = *itr1;
55 return true;
56 }
57
58 // Then, look for any PLL unlock attentions reported by an OCMB chip. This
59 // is specifically for Odyssey, which are the only OCMBs that would report
60 // PLL unlock attentions.
61 auto itr2 = std::find_if(i_list.begin(), i_list.end(), [&](const auto& t) {
62 return (nodeId == t.getId() &&
63 TYPE_OCMB == getTrgtType(getTrgt(t.getChip())));
64 });
65
66 if (i_list.end() != itr2)
67 {
68 o_rootCause = *itr2;
Zane Shelleya7369f82021-10-18 16:52:21 -050069 return true;
70 }
71
72 return false;
73}
74
75//------------------------------------------------------------------------------
76
Zane Shelleyf4792d62021-10-28 18:08:22 -050077bool __findMemoryChannelFailure(const std::vector<libhei::Signature>& i_list,
Caleb Palmer1a4f0e72022-11-07 15:08:01 -060078 libhei::Signature& o_rootCause,
79 const RasDataParser& i_rasData)
Zane Shelleyf4792d62021-10-28 18:08:22 -050080{
81 using namespace util::pdbg;
82
Patrick Williams27dd6362023-05-10 07:51:20 -050083 using func = libhei::NodeId_t (*)(const std::string& i_str);
Zane Shelley19df3702021-12-16 22:32:54 -060084 func __hash = libhei::hash<libhei::NodeId_t>;
85
Patrick Williams27dd6362023-05-10 07:51:20 -050086 static const auto mc_dstl_fir = __hash("MC_DSTL_FIR");
87 static const auto mc_ustl_fir = __hash("MC_USTL_FIR");
Zane Shelley19df3702021-12-16 22:32:54 -060088 static const auto mc_omi_dl_err_rpt = __hash("MC_OMI_DL_ERR_RPT");
Zane Shelleyf4792d62021-10-28 18:08:22 -050089
Zane Shelleyadda0542023-04-06 16:38:02 -050090 // First, look for any chip checkstops from the connected OCMBs.
91 for (const auto& s : i_list)
Zane Shelleyf4792d62021-10-28 18:08:22 -050092 {
Zane Shelleyadda0542023-04-06 16:38:02 -050093 if (TYPE_OCMB != getTrgtType(getTrgt(s.getChip())))
Zane Shelleyf4792d62021-10-28 18:08:22 -050094 {
Zane Shelleyadda0542023-04-06 16:38:02 -050095 continue; // OCMBs only
96 }
97
98 // TODO: The chip data for Explorer chips currently report chip
99 // checkstops as unit checkstops. Once the chip data has been
100 // updated, the check for unit checkstops here will need to be
101 // removed.
102 if (libhei::ATTN_TYPE_CHIP_CS == s.getAttnType() ||
103 libhei::ATTN_TYPE_UNIT_CS == s.getAttnType())
104 {
Zane Shelley93b001c2023-03-24 17:45:04 -0500105 o_rootCause = s;
Zane Shelleyadda0542023-04-06 16:38:02 -0500106 return true;
Zane Shelley93b001c2023-03-24 17:45:04 -0500107 }
Zane Shelleyadda0542023-04-06 16:38:02 -0500108 }
109
110 // Now, look for any channel failure attentions on the processor side of the
111 // memory bus.
112 for (const auto& s : i_list)
113 {
114 if (TYPE_PROC != getTrgtType(getTrgt(s.getChip())))
115 {
116 continue; // processors only
117 }
118
119 // Any unit checkstop attentions that originated from the MC_DSTL_FIR or
120 // MC_USTLFIR are considered a channel failure attention.
121 // TODO: The "channel failure" designation is actually configurable via
122 // other registers. We just happen to expect anything that is
123 // configured to channel failure to also be configured to unit
124 // checkstop. Eventually, we will need some mechanism to check the
125 // configuration registers for a more accurate analysis.
126 if (libhei::ATTN_TYPE_UNIT_CS == s.getAttnType() &&
127 (mc_dstl_fir == s.getId() || mc_ustl_fir == s.getId()) &&
128 !i_rasData.isFlagSet(s,
129 RasDataParser::RasDataFlags::ATTN_FROM_OCMB))
130 {
131 o_rootCause = s;
132 return true;
133 }
134 // Any signatures from MC_OMI_DL_ERR_RPT feed into the only bits in
135 // MC_OMI_DL_FIR that are hardwired to channel failure.
Zane Shelley93b001c2023-03-24 17:45:04 -0500136 else if (mc_omi_dl_err_rpt == s.getId())
137 {
138 o_rootCause = s;
139 return true;
Zane Shelleyf4792d62021-10-28 18:08:22 -0500140 }
141 }
142
143 return false; // default, nothing found
144}
145
146//------------------------------------------------------------------------------
147
148// Will query if a signature is a potential system checkstop root cause.
149// attention. Note that this function excludes memory channel failure attentions
Zane Shelleyed3ab8f2022-05-24 21:08:21 -0500150// which are checked in __findMemoryChannelFailure().
Caleb Palmer1a4f0e72022-11-07 15:08:01 -0600151bool __findCsRootCause(const libhei::Signature& i_signature,
152 const RasDataParser& i_rasData)
Zane Shelleyf4792d62021-10-28 18:08:22 -0500153{
Zane Shelley93b001c2023-03-24 17:45:04 -0500154 // Check if the input signature has the CS_POSSIBLE or SUE_SOURCE flag set.
155 if (i_rasData.isFlagSet(i_signature,
156 RasDataParser::RasDataFlags::CS_POSSIBLE) ||
157 i_rasData.isFlagSet(i_signature,
158 RasDataParser::RasDataFlags::SUE_SOURCE))
Zane Shelleyf4792d62021-10-28 18:08:22 -0500159 {
Caleb Palmer1a4f0e72022-11-07 15:08:01 -0600160 return true;
Zane Shelleyf4792d62021-10-28 18:08:22 -0500161 }
162
163 return false; // default, nothing found
164}
165
166//------------------------------------------------------------------------------
167
168bool __findCsRootCause_RE(const std::vector<libhei::Signature>& i_list,
Caleb Palmer1a4f0e72022-11-07 15:08:01 -0600169 libhei::Signature& o_rootCause,
170 const RasDataParser& i_rasData)
Zane Shelleyf4792d62021-10-28 18:08:22 -0500171{
Zane Shelleyadda0542023-04-06 16:38:02 -0500172 for (const auto& s : i_list)
Zane Shelleyf4792d62021-10-28 18:08:22 -0500173 {
174 // Only looking for recoverable attentions.
175 if (libhei::ATTN_TYPE_RECOVERABLE != s.getAttnType())
176 {
177 continue;
178 }
179
Caleb Palmer1a4f0e72022-11-07 15:08:01 -0600180 if (__findCsRootCause(s, i_rasData))
Zane Shelleyf4792d62021-10-28 18:08:22 -0500181 {
182 o_rootCause = s;
183 return true;
184 }
185 }
186
187 return false; // default, nothing found
188}
189
190//------------------------------------------------------------------------------
191
192bool __findCsRootCause_UCS(const std::vector<libhei::Signature>& i_list,
Caleb Palmer1a4f0e72022-11-07 15:08:01 -0600193 libhei::Signature& o_rootCause,
194 const RasDataParser& i_rasData)
Zane Shelleyf4792d62021-10-28 18:08:22 -0500195{
Zane Shelleyadda0542023-04-06 16:38:02 -0500196 for (const auto& s : i_list)
Zane Shelleyf4792d62021-10-28 18:08:22 -0500197 {
198 // Only looking for unit checkstop attentions.
199 if (libhei::ATTN_TYPE_UNIT_CS != s.getAttnType())
200 {
201 continue;
202 }
203
Caleb Palmer1a4f0e72022-11-07 15:08:01 -0600204 if (__findCsRootCause(s, i_rasData))
Zane Shelleyf4792d62021-10-28 18:08:22 -0500205 {
206 o_rootCause = s;
207 return true;
208 }
209 }
210
211 return false; // default, nothing found
212}
213
214//------------------------------------------------------------------------------
215
Caleb Palmer51f82022023-02-22 16:09:09 -0600216bool __findOcmbAttnBits(const std::vector<libhei::Signature>& i_list,
217 libhei::Signature& o_rootCause,
218 const RasDataParser& i_rasData)
219{
220 using namespace util::pdbg;
221
222 // If we have any attentions from an OCMB, assume isolation to the OCMBs
223 // was successful and the ATTN_FROM_OCMB flag does not need to be checked.
Zane Shelleyadda0542023-04-06 16:38:02 -0500224 for (const auto& s : i_list)
Caleb Palmer51f82022023-02-22 16:09:09 -0600225 {
226 if (TYPE_OCMB == getTrgtType(getTrgt(s.getChip())))
227 {
228 return false;
229 }
230 }
231
Zane Shelleyadda0542023-04-06 16:38:02 -0500232 for (const auto& s : i_list)
Caleb Palmer51f82022023-02-22 16:09:09 -0600233 {
Zane Shelley93b001c2023-03-24 17:45:04 -0500234 if (i_rasData.isFlagSet(s, RasDataParser::RasDataFlags::ATTN_FROM_OCMB))
Caleb Palmer51f82022023-02-22 16:09:09 -0600235 {
236 o_rootCause = s;
237 return true;
238 }
239 }
240
241 return false; // default, nothing found
242}
243
244//------------------------------------------------------------------------------
245
Zane Shelleyf4792d62021-10-28 18:08:22 -0500246bool __findNonExternalCs(const std::vector<libhei::Signature>& i_list,
247 libhei::Signature& o_rootCause)
248{
249 using namespace util::pdbg;
250
Zane Shelley19df3702021-12-16 22:32:54 -0600251 static const auto pb_ext_fir = libhei::hash<libhei::NodeId_t>("PB_EXT_FIR");
Zane Shelleyf4792d62021-10-28 18:08:22 -0500252
Zane Shelleyadda0542023-04-06 16:38:02 -0500253 for (const auto& s : i_list)
Zane Shelleyf4792d62021-10-28 18:08:22 -0500254 {
255 const auto targetType = getTrgtType(getTrgt(s.getChip()));
Patrick Williams27dd6362023-05-10 07:51:20 -0500256 const auto id = s.getId();
257 const auto attnType = s.getAttnType();
Zane Shelleyf4792d62021-10-28 18:08:22 -0500258
Zane Shelleyadda0542023-04-06 16:38:02 -0500259 // Find any processor with chip checkstop attention that did not
Zane Shelleyf4792d62021-10-28 18:08:22 -0500260 // originate from the PB_EXT_FIR.
261 if ((TYPE_PROC == targetType) &&
Zane Shelleyadda0542023-04-06 16:38:02 -0500262 (libhei::ATTN_TYPE_CHIP_CS == attnType) && (pb_ext_fir != id))
Zane Shelleyf4792d62021-10-28 18:08:22 -0500263 {
264 o_rootCause = s;
265 return true;
266 }
267 }
268
269 return false; // default, nothing found
270}
271
272//------------------------------------------------------------------------------
273
Zane Shelleybaec7c02022-03-17 11:05:20 -0500274bool __findTiRootCause(const std::vector<libhei::Signature>& i_list,
275 libhei::Signature& o_rootCause)
276{
277 using namespace util::pdbg;
278
Patrick Williams27dd6362023-05-10 07:51:20 -0500279 using func = libhei::NodeId_t (*)(const std::string& i_str);
Zane Shelleybaec7c02022-03-17 11:05:20 -0500280 func __hash = libhei::hash<libhei::NodeId_t>;
281
282 // PROC registers
Patrick Williams27dd6362023-05-10 07:51:20 -0500283 static const auto tp_local_fir = __hash("TP_LOCAL_FIR");
284 static const auto occ_fir = __hash("OCC_FIR");
285 static const auto pbao_fir = __hash("PBAO_FIR");
286 static const auto n0_local_fir = __hash("N0_LOCAL_FIR");
287 static const auto int_cq_fir = __hash("INT_CQ_FIR");
288 static const auto nx_cq_fir = __hash("NX_CQ_FIR");
289 static const auto nx_dma_eng_fir = __hash("NX_DMA_ENG_FIR");
290 static const auto vas_fir = __hash("VAS_FIR");
291 static const auto n1_local_fir = __hash("N1_LOCAL_FIR");
292 static const auto mcd_fir = __hash("MCD_FIR");
Zane Shelleybaec7c02022-03-17 11:05:20 -0500293 static const auto pb_station_fir_en_1 = __hash("PB_STATION_FIR_EN_1");
294 static const auto pb_station_fir_en_2 = __hash("PB_STATION_FIR_EN_2");
295 static const auto pb_station_fir_en_3 = __hash("PB_STATION_FIR_EN_3");
296 static const auto pb_station_fir_en_4 = __hash("PB_STATION_FIR_EN_4");
297 static const auto pb_station_fir_es_1 = __hash("PB_STATION_FIR_ES_1");
298 static const auto pb_station_fir_es_2 = __hash("PB_STATION_FIR_ES_2");
299 static const auto pb_station_fir_es_3 = __hash("PB_STATION_FIR_ES_3");
300 static const auto pb_station_fir_es_4 = __hash("PB_STATION_FIR_ES_4");
Patrick Williams27dd6362023-05-10 07:51:20 -0500301 static const auto pb_station_fir_eq = __hash("PB_STATION_FIR_EQ");
302 static const auto psihb_fir = __hash("PSIHB_FIR");
303 static const auto pbaf_fir = __hash("PBAF_FIR");
304 static const auto lpc_fir = __hash("LPC_FIR");
305 static const auto eq_core_fir = __hash("EQ_CORE_FIR");
306 static const auto eq_l2_fir = __hash("EQ_L2_FIR");
307 static const auto eq_l3_fir = __hash("EQ_L3_FIR");
308 static const auto eq_ncu_fir = __hash("EQ_NCU_FIR");
309 static const auto eq_local_fir = __hash("EQ_LOCAL_FIR");
310 static const auto eq_qme_fir = __hash("EQ_QME_FIR");
311 static const auto iohs_local_fir = __hash("IOHS_LOCAL_FIR");
312 static const auto iohs_dlp_fir_oc = __hash("IOHS_DLP_FIR_OC");
313 static const auto iohs_dlp_fir_smp = __hash("IOHS_DLP_FIR_SMP");
314 static const auto mc_local_fir = __hash("MC_LOCAL_FIR");
315 static const auto mc_fir = __hash("MC_FIR");
316 static const auto mc_dstl_fir = __hash("MC_DSTL_FIR");
317 static const auto mc_ustl_fir = __hash("MC_USTL_FIR");
318 static const auto nmmu_cq_fir = __hash("NMMU_CQ_FIR");
319 static const auto nmmu_fir = __hash("NMMU_FIR");
320 static const auto mc_omi_dl = __hash("MC_OMI_DL");
321 static const auto pau_local_fir = __hash("PAU_LOCAL_FIR");
322 static const auto pau_ptl_fir = __hash("PAU_PTL_FIR");
323 static const auto pau_phy_fir = __hash("PAU_PHY_FIR");
324 static const auto pau_fir_0 = __hash("PAU_FIR_0");
325 static const auto pau_fir_2 = __hash("PAU_FIR_2");
326 static const auto pci_local_fir = __hash("PCI_LOCAL_FIR");
327 static const auto pci_iop_fir = __hash("PCI_IOP_FIR");
328 static const auto pci_nest_fir = __hash("PCI_NEST_FIR");
Zane Shelleybaec7c02022-03-17 11:05:20 -0500329
330 // OCMB registers
331 static const auto ocmb_lfir = __hash("OCMB_LFIR");
Patrick Williams27dd6362023-05-10 07:51:20 -0500332 static const auto mmiofir = __hash("MMIOFIR");
333 static const auto srqfir = __hash("SRQFIR");
334 static const auto rdffir = __hash("RDFFIR");
335 static const auto tlxfir = __hash("TLXFIR");
336 static const auto omi_dl = __hash("OMI_DL");
Zane Shelleybaec7c02022-03-17 11:05:20 -0500337
338 for (const auto& signature : i_list)
339 {
340 const auto targetType = getTrgtType(getTrgt(signature.getChip()));
Patrick Williams27dd6362023-05-10 07:51:20 -0500341 const auto attnType = signature.getAttnType();
342 const auto id = signature.getId();
343 const auto bit = signature.getBit();
Zane Shelleybaec7c02022-03-17 11:05:20 -0500344
345 // Only looking for recoverable or unit checkstop attentions.
346 if (libhei::ATTN_TYPE_RECOVERABLE != attnType &&
347 libhei::ATTN_TYPE_UNIT_CS != attnType)
348 {
349 continue;
350 }
351
352 // Ignore attentions that should not be blamed as root cause of a TI.
353 // This would include informational only FIRs or correctable errors.
354 if (TYPE_PROC == targetType)
355 {
356 if (tp_local_fir == id &&
357 (0 == bit || 1 == bit || 2 == bit || 3 == bit || 4 == bit ||
358 5 == bit || 7 == bit || 8 == bit || 9 == bit || 10 == bit ||
359 11 == bit || 20 == bit || 22 == bit || 23 == bit ||
360 24 == bit || 38 == bit || 40 == bit || 41 == bit ||
361 46 == bit || 47 == bit || 48 == bit || 55 == bit ||
362 56 == bit || 57 == bit || 58 == bit || 59 == bit))
363 {
364 continue;
365 }
366
367 if (occ_fir == id &&
368 (9 == bit || 10 == bit || 15 == bit || 20 == bit || 21 == bit ||
369 22 == bit || 23 == bit || 32 == bit || 33 == bit ||
370 34 == bit || 36 == bit || 42 == bit || 43 == bit ||
371 46 == bit || 47 == bit || 48 == bit || 51 == bit ||
372 52 == bit || 53 == bit || 54 == bit || 57 == bit))
373 {
374 continue;
375 }
376
377 if (pbao_fir == id &&
378 (0 == bit || 1 == bit || 2 == bit || 8 == bit || 11 == bit ||
379 13 == bit || 15 == bit || 16 == bit || 17 == bit))
380 {
381 continue;
382 }
383
384 if ((n0_local_fir == id || n1_local_fir == id ||
385 iohs_local_fir == id || mc_local_fir == id ||
386 pau_local_fir == id || pci_local_fir == id) &&
387 (0 == bit || 1 == bit || 2 == bit || 3 == bit || 4 == bit ||
388 5 == bit || 6 == bit || 7 == bit || 8 == bit || 9 == bit ||
389 10 == bit || 11 == bit || 20 == bit || 21 == bit))
390 {
391 continue;
392 }
393
394 if (int_cq_fir == id &&
395 (0 == bit || 3 == bit || 5 == bit || 7 == bit || 36 == bit ||
Caleb Palmerecde53f2022-12-13 15:11:47 -0600396 47 == bit || 48 == bit || 49 == bit || 50 == bit ||
Zane Shelleybaec7c02022-03-17 11:05:20 -0500397 58 == bit || 59 == bit || 60 == bit))
398 {
399 continue;
400 }
401
402 if (nx_cq_fir == id &&
403 (1 == bit || 4 == bit || 18 == bit || 32 == bit || 33 == bit))
404 {
405 continue;
406 }
407
408 if (nx_dma_eng_fir == id &&
409 (4 == bit || 6 == bit || 9 == bit || 10 == bit || 11 == bit ||
410 34 == bit || 35 == bit || 36 == bit || 37 == bit || 39 == bit))
411 {
412 continue;
413 }
414
415 if (vas_fir == id &&
416 (8 == bit || 9 == bit || 11 == bit || 12 == bit || 13 == bit))
417 {
418 continue;
419 }
420
421 if (mcd_fir == id && (0 == bit))
422 {
423 continue;
424 }
425
426 if ((pb_station_fir_en_1 == id || pb_station_fir_en_2 == id ||
427 pb_station_fir_en_3 == id || pb_station_fir_en_4 == id ||
428 pb_station_fir_es_1 == id || pb_station_fir_es_2 == id ||
429 pb_station_fir_es_3 == id || pb_station_fir_es_4 == id ||
430 pb_station_fir_eq == id) &&
431 (9 == bit))
432 {
433 continue;
434 }
435
436 if (psihb_fir == id && (0 == bit || 23 == bit))
437 {
438 continue;
439 }
440
441 if (pbaf_fir == id &&
442 (0 == bit || 1 == bit || 3 == bit || 4 == bit || 5 == bit ||
443 6 == bit || 7 == bit || 8 == bit || 9 == bit || 10 == bit ||
444 11 == bit || 19 == bit || 20 == bit || 21 == bit ||
445 28 == bit || 29 == bit || 30 == bit || 31 == bit ||
446 32 == bit || 33 == bit || 34 == bit || 35 == bit || 36 == bit))
447 {
448 continue;
449 }
450
451 if (lpc_fir == id && (5 == bit))
452 {
453 continue;
454 }
455
456 if (eq_core_fir == id &&
457 (0 == bit || 2 == bit || 4 == bit || 7 == bit || 9 == bit ||
458 11 == bit || 13 == bit || 18 == bit || 21 == bit ||
459 24 == bit || 29 == bit || 31 == bit || 37 == bit ||
460 43 == bit || 56 == bit || 57 == bit))
461 {
462 continue;
463 }
464
465 if (eq_l2_fir == id &&
466 (0 == bit || 6 == bit || 11 == bit || 19 == bit || 36 == bit))
467 {
468 continue;
469 }
470
471 if (eq_l3_fir == id &&
472 (3 == bit || 4 == bit || 7 == bit || 10 == bit || 13 == bit))
473 {
474 continue;
475 }
476
477 if (eq_ncu_fir == id && (9 == bit))
478 {
479 continue;
480 }
481
482 if (eq_local_fir == id &&
483 (0 == bit || 1 == bit || 2 == bit || 3 == bit || 5 == bit ||
484 6 == bit || 7 == bit || 8 == bit || 9 == bit || 10 == bit ||
485 11 == bit || 12 == bit || 13 == bit || 14 == bit ||
486 15 == bit || 16 == bit || 20 == bit || 21 == bit ||
487 22 == bit || 23 == bit || 24 == bit || 25 == bit ||
488 26 == bit || 27 == bit || 28 == bit || 29 == bit ||
489 30 == bit || 31 == bit || 32 == bit || 33 == bit ||
490 34 == bit || 35 == bit || 36 == bit || 37 == bit ||
491 38 == bit || 39 == bit))
492 {
493 continue;
494 }
495
496 if (eq_qme_fir == id && (7 == bit || 25 == bit))
497 {
498 continue;
499 }
500
501 if (iohs_dlp_fir_oc == id &&
502 (6 == bit || 7 == bit || 8 == bit || 9 == bit || 10 == bit ||
503 48 == bit || 49 == bit || 52 == bit || 53 == bit))
504 {
505 continue;
506 }
507
508 if (iohs_dlp_fir_smp == id &&
509 (6 == bit || 7 == bit || 14 == bit || 15 == bit || 16 == bit ||
510 17 == bit || 38 == bit || 39 == bit || 44 == bit ||
511 45 == bit || 50 == bit || 51 == bit))
512 {
513 continue;
514 }
515
516 if (mc_fir == id &&
517 (5 == bit || 8 == bit || 15 == bit || 16 == bit))
518 {
519 continue;
520 }
521
522 if (mc_dstl_fir == id &&
523 (0 == bit || 1 == bit || 2 == bit || 3 == bit || 4 == bit ||
524 5 == bit || 6 == bit || 7 == bit || 14 == bit || 15 == bit))
525 {
526 continue;
527 }
528
529 if (mc_ustl_fir == id &&
530 (6 == bit || 20 == bit || 33 == bit || 34 == bit))
531 {
532 continue;
533 }
534
535 if (nmmu_cq_fir == id && (8 == bit || 11 == bit || 14 == bit))
536 {
537 continue;
538 }
539
540 if (nmmu_fir == id &&
541 (0 == bit || 3 == bit || 8 == bit || 9 == bit || 10 == bit ||
542 11 == bit || 12 == bit || 13 == bit || 14 == bit ||
543 15 == bit || 30 == bit || 31 == bit || 41 == bit))
544 {
545 continue;
546 }
547
548 if (mc_omi_dl == id && (2 == bit || 3 == bit || 6 == bit ||
549 7 == bit || 9 == bit || 10 == bit))
550 {
551 continue;
552 }
553
554 if (pau_ptl_fir == id && (5 == bit || 9 == bit))
555 {
556 continue;
557 }
558
559 if (pau_phy_fir == id &&
560 (2 == bit || 3 == bit || 6 == bit || 7 == bit || 15 == bit))
561 {
562 continue;
563 }
564
565 if (pau_fir_0 == id && (13 == bit || 30 == bit || 41 == bit))
566 {
567 continue;
568 }
569
570 if (pau_fir_2 == id && (19 == bit || 46 == bit || 49 == bit))
571 {
572 continue;
573 }
574
575 if (pci_iop_fir == id &&
576 (0 == bit || 2 == bit || 4 == bit || 6 == bit || 7 == bit ||
577 8 == bit || 10 == bit))
578 {
579 continue;
580 }
581
582 if (pci_nest_fir == id && (2 == bit || 5 == bit))
583 {
584 continue;
585 }
586 }
587 else if (TYPE_OCMB == targetType)
588 {
589 if (ocmb_lfir == id &&
590 (0 == bit || 1 == bit || 2 == bit || 8 == bit || 23 == bit ||
591 37 == bit || 63 == bit))
592 {
593 continue;
594 }
595
596 if (mmiofir == id && (2 == bit))
597 {
598 continue;
599 }
600
601 if (srqfir == id &&
602 (2 == bit || 4 == bit || 14 == bit || 15 == bit || 23 == bit ||
603 25 == bit || 28 == bit))
604 {
605 continue;
606 }
607
608 if (rdffir == id &&
609 (0 == bit || 1 == bit || 2 == bit || 3 == bit || 4 == bit ||
610 5 == bit || 6 == bit || 7 == bit || 8 == bit || 9 == bit ||
611 18 == bit || 38 == bit || 40 == bit || 41 == bit ||
612 45 == bit || 46 == bit))
613 {
614 continue;
615 }
616
617 if (tlxfir == id && (0 == bit || 9 == bit || 26 == bit))
618 {
619 continue;
620 }
621
622 if (omi_dl == id && (2 == bit || 3 == bit || 6 == bit || 7 == bit ||
623 9 == bit || 10 == bit))
624 {
625 continue;
626 }
627 }
628
629 // At this point, the attention has not been explicitly ignored. So
630 // return this signature and exit.
631 o_rootCause = signature;
632 return true;
633 }
634
635 return false; // default, nothing found
636}
637
638//------------------------------------------------------------------------------
639
Caleb Palmerc3038c02023-09-11 10:20:56 -0500640bool findRootCause(AnalysisType i_type, const libhei::IsolationData& i_isoData,
641 libhei::Signature& o_rootCause,
642 const RasDataParser& i_rasData)
Zane Shelley65fefb22021-10-18 15:35:26 -0500643{
644 // We'll need to make a copy of the list so that the original list is
Zane Shelleyec227c22021-12-09 15:54:40 -0600645 // maintained for the PEL.
Zane Shelley65fefb22021-10-18 15:35:26 -0500646 std::vector<libhei::Signature> list{i_isoData.getSignatureList()};
647
648 // START WORKAROUND
649 // TODO: Filtering should be data driven. Until that support is available,
650 // use the following isolation rules.
651
Zane Shelleyec227c22021-12-09 15:54:40 -0600652 // Ensure the list is not empty before continuing.
Zane Shelleyf4792d62021-10-28 18:08:22 -0500653 if (list.empty())
654 {
Zane Shelleyec227c22021-12-09 15:54:40 -0600655 return false; // nothing more to do
Zane Shelleyf4792d62021-10-28 18:08:22 -0500656 }
657
658 // First, look for any RCS OSC errors. This must always be first because
659 // they can cause downstream PLL unlock attentions.
660 if (__findRcsOscError(list, o_rootCause))
Zane Shelleya7369f82021-10-18 16:52:21 -0500661 {
662 return true;
663 }
664
Zane Shelleyf4792d62021-10-28 18:08:22 -0500665 // Second, look for any PLL unlock attentions. This must always be second
666 // because PLL unlock attentions can cause any number of downstream
667 // attentions, including a system checkstop.
668 if (__findPllUnlock(list, o_rootCause))
669 {
670 return true;
671 }
672
Zane Shelleyec227c22021-12-09 15:54:40 -0600673 // Regardless of the analysis type, always look for anything that could be
674 // blamed as the root cause of a system checkstop.
675
Zane Shelleyf4792d62021-10-28 18:08:22 -0500676 // Memory channel failure attentions will produce SUEs and likely cause
677 // downstream attentions, including a system checkstop.
Caleb Palmer1a4f0e72022-11-07 15:08:01 -0600678 if (__findMemoryChannelFailure(list, o_rootCause, i_rasData))
Zane Shelleyf4792d62021-10-28 18:08:22 -0500679 {
680 return true;
681 }
682
683 // Look for any recoverable attentions that have been identified as a
684 // potential root cause of a system checkstop attention. These would include
685 // any attention that would generate an SUE. Note that is it possible for
686 // recoverables to generate unit checkstop attentions so we must check them
687 // first.
Caleb Palmer1a4f0e72022-11-07 15:08:01 -0600688 if (__findCsRootCause_RE(list, o_rootCause, i_rasData))
Zane Shelleyf4792d62021-10-28 18:08:22 -0500689 {
690 return true;
691 }
692
693 // Look for any unit checkstop attentions (other than memory channel
694 // failures) that have been identified as a potential root cause of a
695 // system checkstop attention. These would include any attention that would
696 // generate an SUE.
Caleb Palmer1a4f0e72022-11-07 15:08:01 -0600697 if (__findCsRootCause_UCS(list, o_rootCause, i_rasData))
Zane Shelleyf4792d62021-10-28 18:08:22 -0500698 {
699 return true;
700 }
701
Zane Shelley93b001c2023-03-24 17:45:04 -0500702 // If no other viable root cause has been found, check for any signatures
703 // with the ATTN_FROM_OCMB flag in case there was an attention from an
704 // inaccessible OCMB.
Caleb Palmer51f82022023-02-22 16:09:09 -0600705 if (__findOcmbAttnBits(list, o_rootCause, i_rasData))
706 {
707 return true;
708 }
709
Zane Shelleyf4792d62021-10-28 18:08:22 -0500710 // Look for any system checkstop attentions that originated from within the
711 // chip that reported the attention. In other words, no external checkstop
712 // attentions.
713 if (__findNonExternalCs(list, o_rootCause))
714 {
715 return true;
716 }
717
Zane Shelleyec227c22021-12-09 15:54:40 -0600718 if (AnalysisType::SYSTEM_CHECKSTOP != i_type)
Zane Shelley65fefb22021-10-18 15:35:26 -0500719 {
Zane Shelleyec227c22021-12-09 15:54:40 -0600720 // No system checkstop root cause attentions were found. Next, look for
721 // any recoverable or unit checkstop attentions that could be associated
Zane Shelleybaec7c02022-03-17 11:05:20 -0500722 // with a TI.
723 if (__findTiRootCause(list, o_rootCause))
Zane Shelleyec227c22021-12-09 15:54:40 -0600724 {
Zane Shelleyec227c22021-12-09 15:54:40 -0600725 return true;
726 }
727
728 if (AnalysisType::TERMINATE_IMMEDIATE != i_type)
729 {
730 // No attentions associated with a system checkstop or TI were
731 // found. Simply, return the first entry in the list.
732 o_rootCause = list.front();
733 return true;
734 }
Zane Shelley65fefb22021-10-18 15:35:26 -0500735 }
736
737 // END WORKAROUND
738
739 return false; // default, no active attentions found.
740}
741
742//------------------------------------------------------------------------------
743
Caleb Palmerc3038c02023-09-11 10:20:56 -0500744bool __findIueTh(const std::vector<libhei::Signature>& i_list,
745 libhei::Signature& o_rootCause)
746{
747 auto itr = std::find_if(i_list.begin(), i_list.end(), [&](const auto& t) {
748 return (libhei::hash<libhei::NodeId_t>("RDFFIR") == t.getId() &&
749 (17 == t.getBit() || 37 == t.getBit())) ||
750 (libhei::hash<libhei::NodeId_t>("RDF_FIR") == t.getId() &&
751 (18 == t.getBit() || 38 == t.getBit()));
752 });
753
754 if (i_list.end() != itr)
755 {
756 o_rootCause = *itr;
757 return true;
758 }
759
760 return false;
761}
762
763//------------------------------------------------------------------------------
764
765void rootCauseSpecialCases(const libhei::IsolationData& i_isoData,
766 libhei::Signature& o_rootCause,
767 const RasDataParser& i_rasData)
768{
769 using func = libhei::NodeId_t (*)(const std::string& i_str);
770 func __hash = libhei::hash<libhei::NodeId_t>;
771
772 // Check for any special cases that exist for specific FIR bits.
773
774 // If the channel fail was specifically a firmware initiated channel fail
775 // (SRQFIR[25] for Explorer OCMBs, SRQ_FIR[46] for Odyssey OCMBs) check for
776 // any IUE bits that are on that would have caused the channel fail
777 // (RDFFIR[17,37] for Explorer OCMBs, RDF_FIR_0[18,38] or RDF_FIR_1[18,38]
778 // for Odyssey OCMBs).
779
780 // Explorer SRQFIR
781 static const auto srqfir = __hash("SRQFIR");
782 // Odyssey SRQ_FIR
783 static const auto srq_fir = __hash("SRQ_FIR");
784
785 std::vector<libhei::Signature> list{i_isoData.getSignatureList()};
786
787 if (((srqfir == o_rootCause.getId() && 25 == o_rootCause.getBit()) ||
788 (srq_fir == o_rootCause.getId() && 46 == o_rootCause.getBit())) &&
789 __findIueTh(list, o_rootCause))
790 {
791 // If __findIueTh returned true, o_rootCause was updated, return.
792 return;
793 }
794
795 // Check if the root cause found was a potential side effect of an
796 // ODP data corruption error. If it was, check if any other signature
797 // in the signature list was a potential root cause.
798 auto OdpSide = RasDataParser::RasDataFlags::ODP_DATA_CORRUPT_SIDE_EFFECT;
799 auto OdpRoot = RasDataParser::RasDataFlags::ODP_DATA_CORRUPT_ROOT_CAUSE;
800 if (i_rasData.isFlagSet(o_rootCause, OdpSide))
801 {
802 for (const auto& s : list)
803 {
804 if (i_rasData.isFlagSet(s, OdpRoot))
805 {
806 // ODP data corruption root cause found, return.
807 o_rootCause = s;
808 return;
809 }
810 }
811 }
812}
813
814//------------------------------------------------------------------------------
815
816bool filterRootCause(AnalysisType i_type,
817 const libhei::IsolationData& i_isoData,
818 libhei::Signature& o_rootCause,
819 const RasDataParser& i_rasData)
820{
821 // Find the initial root cause attention based on common rules for FIR
822 // isolation.
823 bool rc = findRootCause(i_type, i_isoData, o_rootCause, i_rasData);
824
825 // If some root cause was found, handle any special cases for specific FIR
826 // bits that require additional logic to determine the root cause.
827 if (true == rc)
828 {
829 rootCauseSpecialCases(i_isoData, o_rootCause, i_rasData);
830 }
831
832 return rc;
833}
834
835//------------------------------------------------------------------------------
836
Zane Shelley65fefb22021-10-18 15:35:26 -0500837} // namespace analyzer