blob: 6455f27dec6dae92fca1838193ce79cc3427c797 [file] [log] [blame]
Zane Shelley65fefb22021-10-18 15:35:26 -05001#include <assert.h>
2
Zane Shelleyec227c22021-12-09 15:54:40 -06003#include <analyzer_main.hpp>
Zane Shelley65fefb22021-10-18 15:35:26 -05004#include <hei_main.hpp>
Zane Shelleyf4792d62021-10-28 18:08:22 -05005#include <util/pdbg.hpp>
Zane Shelley65fefb22021-10-18 15:35:26 -05006
7#include <algorithm>
8#include <limits>
9#include <string>
10
11namespace analyzer
12{
13
14//------------------------------------------------------------------------------
15
// Computes a simple positional checksum of a string and truncates it to the
// requested number of bytes.
//
// The string is split into chunks of i_bytes characters. Each chunk is packed
// big-endian into an integer (the final chunk is padded with null characters)
// and the result is "n*s[0] + (n-1)*s[1] + ... + s[n-1]", where s[i] is the
// i-th chunk and n is the number of chunks.
//
// i_bytes  Width of the hash in bytes. Must be in the range 1-8; this is only
//          enforced by an assert.
// i_str    The string to hash. An empty string hashes to 0.
//
// Returns the hash value, with everything above the low i_bytes bytes cleared.
uint64_t __hash(unsigned int i_bytes, const std::string& i_str)
{
    // Currently only supporting 1-8 byte hashes.
    assert(1 <= i_bytes && i_bytes <= sizeof(uint64_t));

    // Running sums: sumA is the sum of all chunks seen so far and sumB is the
    // sum of every intermediate sumA, which yields the weighted sum above.
    uint64_t sumA = 0;
    uint64_t sumB = 0;

    const std::string::size_type length = i_str.size();

    // Iterate one chunk at a time.
    for (std::string::size_type i = 0; i < length; i += i_bytes)
    {
        // Combine each chunk into a single integer value. If we reach the end
        // of the string, pad with null characters.
        uint64_t chunk = 0;
        for (unsigned int j = 0; j < i_bytes; j++)
        {
            chunk <<= 8;
            if (i + j < length)
            {
                // Go through unsigned char so that characters with the high
                // bit set are not sign-extended over the bytes that have
                // already been shifted into the chunk.
                chunk |= static_cast<unsigned char>(i_str[i + j]);
            }
        }

        // Apply the simple hash.
        sumA += chunk;
        sumB += sumA;
    }

    // Mask off everything except the target number of bytes.
    const auto mask = std::numeric_limits<uint64_t>::max();
    return sumB & (mask >> ((sizeof(uint64_t) - i_bytes) * 8));
}
51
52//------------------------------------------------------------------------------
53
54bool __findRcsOscError(const std::vector<libhei::Signature>& i_list,
55 libhei::Signature& o_rootCause)
56{
57 // TODO: Consider returning all of them instead of one as root cause.
58 auto itr = std::find_if(i_list.begin(), i_list.end(), [&](const auto& t) {
59 return (__hash(2, "TP_LOCAL_FIR") == t.getId() &&
60 (42 == t.getBit() || 43 == t.getBit()));
61 });
62
63 if (i_list.end() != itr)
64 {
65 o_rootCause = *itr;
66 return true;
67 }
68
69 return false;
70}
71
72//------------------------------------------------------------------------------
73
74bool __findPllUnlock(const std::vector<libhei::Signature>& i_list,
75 libhei::Signature& o_rootCause)
76{
77 // TODO: Consider returning all of them instead of one as root cause.
78 auto itr = std::find_if(i_list.begin(), i_list.end(), [&](const auto& t) {
79 return (__hash(2, "PLL_UNLOCK") == t.getId() &&
80 (0 == t.getBit() || 1 == t.getBit()));
81 });
82
83 if (i_list.end() != itr)
84 {
85 o_rootCause = *itr;
86 return true;
87 }
88
89 return false;
90}
91
92//------------------------------------------------------------------------------
93
Zane Shelleyf4792d62021-10-28 18:08:22 -050094bool __findMemoryChannelFailure(const std::vector<libhei::Signature>& i_list,
95 libhei::Signature& o_rootCause)
96{
97 using namespace util::pdbg;
98
99 static const auto mc_dstl_fir = __hash(2, "MC_DSTL_FIR");
100 static const auto mc_ustl_fir = __hash(2, "MC_USTL_FIR");
101 static const auto mc_omi_dl_err_rpt = __hash(2, "MC_OMI_DL_ERR_RPT");
102
103 for (const auto s : i_list)
104 {
105 const auto targetType = getTrgtType(getTrgt(s.getChip()));
106 const auto id = s.getId();
107 const auto bit = s.getBit();
108 const auto attnType = s.getAttnType();
109
110 // Look for any unit checkstop attentions from OCMBs.
111 if (TYPE_OCMB == targetType)
112 {
113 // Any unit checkstop attentions will trigger a channel failure.
114 if (libhei::ATTN_TYPE_UNIT_CS == attnType)
115 {
116 o_rootCause = s;
117 return true;
118 }
119 }
120 // Look for channel failure attentions on processors.
121 else if (TYPE_PROC == targetType)
122 {
123 // TODO: All of these channel failure bits are configurable.
124 // Eventually, we will need some mechanism to check that
125 // config registers for a more accurate analysis. For now,
126 // simply check for all bits that could potentially be
127 // configured to channel failure.
128
129 // Any unit checkstop bit in the MC_DSTL_FIR or MC_USTL_FIR could
130 // be a channel failure.
131 if (libhei::ATTN_TYPE_UNIT_CS == attnType)
132 {
133 // Ignore bits MC_DSTL_FIR[0:7] because they simply indicate
134 // attentions occurred on the attached OCMBs.
135 if ((mc_dstl_fir == id && 8 <= bit) || (mc_ustl_fir == id))
136 {
137 o_rootCause = s;
138 return true;
139 }
140 }
141
142 // All bits in MC_OMI_DL_ERR_RPT eventually feed into
143 // MC_OMI_DL_FIR[0,20] which are configurable to channel failure.
144 if (mc_omi_dl_err_rpt == id)
145 {
146 o_rootCause = s;
147 return true;
148 }
149 }
150 }
151
152 return false; // default, nothing found
153}
154
155//------------------------------------------------------------------------------
156
157// Will query if a signature is a potential system checkstop root cause.
158// attention. Note that this function excludes memory channel failure attentions
159// and core unit checkstop attentions.
160bool __findCsRootCause(const libhei::Signature& i_signature)
161{
162 using namespace util::pdbg;
163
164 // PROC registers
165 static const auto eq_core_fir = __hash(2, "EQ_CORE_FIR");
166 static const auto eq_l2_fir = __hash(2, "EQ_L2_FIR");
167 static const auto eq_l3_fir = __hash(2, "EQ_L3_FIR");
168 static const auto eq_ncu_fir = __hash(2, "EQ_NCU_FIR");
169 static const auto iohs_dlp_fir_oc = __hash(2, "IOHS_DLP_FIR_OC");
170 static const auto iohs_dlp_fir_smp = __hash(2, "IOHS_DLP_FIR_SMP");
171 static const auto nx_cq_fir = __hash(2, "NX_CQ_FIR");
172 static const auto nx_dma_eng_fir = __hash(2, "NX_DMA_ENG_FIR");
173 static const auto pau_fir_0 = __hash(2, "PAU_FIR_0");
174 static const auto pau_fir_1 = __hash(2, "PAU_FIR_1");
175 static const auto pau_fir_2 = __hash(2, "PAU_FIR_2");
176 static const auto pau_ptl_fir = __hash(2, "PAU_PTL_FIR");
177
178 // OCMB registers
179 static const auto rdffir = __hash(2, "RDFFIR");
180
181 const auto targetType = getTrgtType(getTrgt(i_signature.getChip()));
182 const auto id = i_signature.getId();
183 const auto bit = i_signature.getBit();
184
185 if (TYPE_PROC == targetType)
186 {
187 if (eq_core_fir == id &&
188 (3 == bit || 5 == bit || 8 == bit || 12 == bit || 22 == bit ||
189 25 == bit || 32 == bit || 36 == bit || 38 == bit || 46 == bit ||
190 47 == bit || 57 == bit))
191 {
192 return true;
193 }
194
195 if (eq_l2_fir == id &&
196 (1 == bit || 12 == bit || 13 == bit || 17 == bit || 18 == bit ||
197 20 == bit || 27 == bit))
198 {
199 return true;
200 }
201
202 if (eq_l3_fir == id &&
203 (2 == bit || 5 == bit || 8 == bit || 11 == bit || 17 == bit))
204 {
205 return true;
206 }
207
208 if (eq_ncu_fir == id && (3 == bit || 4 == bit || 5 == bit || 7 == bit ||
209 8 == bit || 10 == bit || 17 == bit))
210 {
211 return true;
212 }
213
214 if (iohs_dlp_fir_oc == id && (54 <= bit && bit <= 61))
215 {
216 return true;
217 }
218
219 if (iohs_dlp_fir_smp == id && (54 <= bit && bit <= 61))
220 {
221 return true;
222 }
223
224 if (nx_cq_fir == id && (7 == bit || 16 == bit || 21 == bit))
225 {
226 return true;
227 }
228
229 if (nx_dma_eng_fir == id && (0 == bit))
230 {
231 return true;
232 }
233
234 if (pau_fir_0 == id &&
235 (15 == bit || 18 == bit || 19 == bit || 25 == bit || 26 == bit ||
236 29 == bit || 33 == bit || 34 == bit || 35 == bit || 40 == bit ||
237 42 == bit || 44 == bit || 45 == bit))
238 {
239 return true;
240 }
241
242 if (pau_fir_1 == id &&
243 (13 == bit || 14 == bit || 15 == bit || 37 == bit || 39 == bit ||
244 40 == bit || 41 == bit || 42 == bit))
245 {
246 return true;
247 }
248
249 if (pau_fir_2 == id &&
250 ((4 <= bit && bit <= 18) || (20 <= bit && bit <= 31) ||
251 (36 <= bit && bit <= 41) || 45 == bit || 47 == bit || 48 == bit ||
252 50 == bit || 51 == bit || 52 == bit))
253 {
254 return true;
255 }
256
257 if (pau_ptl_fir == id && (4 == bit || 8 == bit))
258 {
259 return true;
260 }
261 }
262 else if (TYPE_OCMB == targetType)
263 {
264 if (rdffir == id && (14 == bit || 15 == bit || 17 == bit || 37 == bit))
265 {
266 return true;
267 }
268 }
269
270 return false; // default, nothing found
271}
272
273//------------------------------------------------------------------------------
274
275bool __findCsRootCause_RE(const std::vector<libhei::Signature>& i_list,
276 libhei::Signature& o_rootCause)
277{
278 for (const auto s : i_list)
279 {
280 // Only looking for recoverable attentions.
281 if (libhei::ATTN_TYPE_RECOVERABLE != s.getAttnType())
282 {
283 continue;
284 }
285
286 if (__findCsRootCause(s))
287 {
288 o_rootCause = s;
289 return true;
290 }
291 }
292
293 return false; // default, nothing found
294}
295
296//------------------------------------------------------------------------------
297
298bool __findCsRootCause_UCS(const std::vector<libhei::Signature>& i_list,
299 libhei::Signature& o_rootCause)
300{
301 for (const auto s : i_list)
302 {
303 // Only looking for unit checkstop attentions.
304 if (libhei::ATTN_TYPE_UNIT_CS != s.getAttnType())
305 {
306 continue;
307 }
308
309 if (__findCsRootCause(s))
310 {
311 o_rootCause = s;
312 return true;
313 }
314 }
315
316 return false; // default, nothing found
317}
318
319//------------------------------------------------------------------------------
320
321bool __findNonExternalCs(const std::vector<libhei::Signature>& i_list,
322 libhei::Signature& o_rootCause)
323{
324 using namespace util::pdbg;
325
326 static const auto pb_ext_fir = __hash(2, "PB_EXT_FIR");
327
328 for (const auto s : i_list)
329 {
330 const auto targetType = getTrgtType(getTrgt(s.getChip()));
331 const auto id = s.getId();
332 const auto attnType = s.getAttnType();
333
334 // Find any processor with system checkstop attention that did not
335 // originate from the PB_EXT_FIR.
336 if ((TYPE_PROC == targetType) &&
337 (libhei::ATTN_TYPE_CHECKSTOP == attnType) && (pb_ext_fir != id))
338 {
339 o_rootCause = s;
340 return true;
341 }
342 }
343
344 return false; // default, nothing found
345}
346
347//------------------------------------------------------------------------------
348
Zane Shelleyec227c22021-12-09 15:54:40 -0600349bool filterRootCause(AnalysisType i_type,
350 const libhei::IsolationData& i_isoData,
Zane Shelley65fefb22021-10-18 15:35:26 -0500351 libhei::Signature& o_rootCause)
352{
353 // We'll need to make a copy of the list so that the original list is
Zane Shelleyec227c22021-12-09 15:54:40 -0600354 // maintained for the PEL.
Zane Shelley65fefb22021-10-18 15:35:26 -0500355 std::vector<libhei::Signature> list{i_isoData.getSignatureList()};
356
357 // START WORKAROUND
358 // TODO: Filtering should be data driven. Until that support is available,
359 // use the following isolation rules.
360
Zane Shelleyec227c22021-12-09 15:54:40 -0600361 // Ensure the list is not empty before continuing.
Zane Shelleyf4792d62021-10-28 18:08:22 -0500362 if (list.empty())
363 {
Zane Shelleyec227c22021-12-09 15:54:40 -0600364 return false; // nothing more to do
Zane Shelleyf4792d62021-10-28 18:08:22 -0500365 }
366
367 // First, look for any RCS OSC errors. This must always be first because
368 // they can cause downstream PLL unlock attentions.
369 if (__findRcsOscError(list, o_rootCause))
Zane Shelleya7369f82021-10-18 16:52:21 -0500370 {
371 return true;
372 }
373
Zane Shelleyf4792d62021-10-28 18:08:22 -0500374 // Second, look for any PLL unlock attentions. This must always be second
375 // because PLL unlock attentions can cause any number of downstream
376 // attentions, including a system checkstop.
377 if (__findPllUnlock(list, o_rootCause))
378 {
379 return true;
380 }
381
Zane Shelleyec227c22021-12-09 15:54:40 -0600382 // Regardless of the analysis type, always look for anything that could be
383 // blamed as the root cause of a system checkstop.
384
Zane Shelleyf4792d62021-10-28 18:08:22 -0500385 // Memory channel failure attentions will produce SUEs and likely cause
386 // downstream attentions, including a system checkstop.
387 if (__findMemoryChannelFailure(list, o_rootCause))
388 {
389 return true;
390 }
391
392 // Look for any recoverable attentions that have been identified as a
393 // potential root cause of a system checkstop attention. These would include
394 // any attention that would generate an SUE. Note that is it possible for
395 // recoverables to generate unit checkstop attentions so we must check them
396 // first.
397 if (__findCsRootCause_RE(list, o_rootCause))
398 {
399 return true;
400 }
401
402 // Look for any unit checkstop attentions (other than memory channel
403 // failures) that have been identified as a potential root cause of a
404 // system checkstop attention. These would include any attention that would
405 // generate an SUE.
406 if (__findCsRootCause_UCS(list, o_rootCause))
407 {
408 return true;
409 }
410
411 // Look for any system checkstop attentions that originated from within the
412 // chip that reported the attention. In other words, no external checkstop
413 // attentions.
414 if (__findNonExternalCs(list, o_rootCause))
415 {
416 return true;
417 }
418
Zane Shelleyec227c22021-12-09 15:54:40 -0600419 if (AnalysisType::SYSTEM_CHECKSTOP != i_type)
Zane Shelley65fefb22021-10-18 15:35:26 -0500420 {
Zane Shelleyec227c22021-12-09 15:54:40 -0600421 // No system checkstop root cause attentions were found. Next, look for
422 // any recoverable or unit checkstop attentions that could be associated
423 // with a TI.
Zane Shelleyf4792d62021-10-28 18:08:22 -0500424
Zane Shelleyec227c22021-12-09 15:54:40 -0600425 auto itr = std::find_if(list.begin(), list.end(), [&](const auto& t) {
426 return (libhei::ATTN_TYPE_RECOVERABLE == t.getAttnType() ||
427 libhei::ATTN_TYPE_UNIT_CS == t.getAttnType());
Zane Shelleyf4792d62021-10-28 18:08:22 -0500428 });
429
Zane Shelleyec227c22021-12-09 15:54:40 -0600430 if (list.end() != itr)
431 {
432 o_rootCause = *itr;
433 return true;
434 }
435
436 if (AnalysisType::TERMINATE_IMMEDIATE != i_type)
437 {
438 // No attentions associated with a system checkstop or TI were
439 // found. Simply, return the first entry in the list.
440 o_rootCause = list.front();
441 return true;
442 }
Zane Shelley65fefb22021-10-18 15:35:26 -0500443 }
444
445 // END WORKAROUND
446
447 return false; // default, no active attentions found.
448}
449
450//------------------------------------------------------------------------------
451
452} // namespace analyzer