blob: 75331740255c9d99dc5c35b724358cc9c90ba46c [file] [log] [blame]
Zane Shelley65fefb22021-10-18 15:35:26 -05001#include <assert.h>
2
3#include <hei_main.hpp>
Zane Shelleyf4792d62021-10-28 18:08:22 -05004#include <util/pdbg.hpp>
Zane Shelley65fefb22021-10-18 15:35:26 -05005
6#include <algorithm>
7#include <limits>
8#include <string>
9
10namespace analyzer
11{
12
13//------------------------------------------------------------------------------
14
// Computes a simple position-weighted checksum of a string, truncated to the
// requested number of bytes.
//
// The string is split into chunks of i_bytes characters. Each chunk is read as
// an unsigned integer with the first character in the most significant
// position; the final chunk is padded with null characters if the string ends
// early. With n chunks s[0] ... s[n-1], the result is
//     n*s[0] + (n-1)*s[1] + ... + 1*s[n-1]
// reduced to the low i_bytes bytes.
//
// i_bytes  Width of the hash in bytes. Must be 1-8 (enforced with assert).
// i_str    The string to hash. An empty string hashes to 0.
// return   The hash value. Only the low i_bytes bytes can be non-zero.
uint64_t __hash(unsigned int i_bytes, const std::string& i_str)
{
    // Currently only supporting 1-8 byte hashes.
    assert(1 <= i_bytes && i_bytes <= sizeof(uint64_t));

    const std::string::size_type length = i_str.size();

    uint64_t sumA = 0; // running sum of the chunks
    uint64_t sumB = 0; // running sum of sumA, which produces the weighting

    // Iterate one chunk at a time.
    for (std::string::size_type i = 0; i < length; i += i_bytes)
    {
        // Combine each chunk into a single integer value. Positions past the
        // end of the string contribute a zero byte (null padding).
        uint64_t chunk = 0;
        for (unsigned int j = 0; j < i_bytes; j++)
        {
            chunk <<= 8;
            if (i + j < length)
            {
                // Go through unsigned char so that a character with the high
                // bit set is not sign-extended over the bytes already shifted
                // into the chunk when plain char is a signed type.
                chunk |= static_cast<unsigned char>(i_str[i + j]);
            }
        }

        // Apply the simple hash.
        sumA += chunk;
        sumB += sumA;
    }

    // Mask off everything except the target number of bytes.
    const auto mask = std::numeric_limits<uint64_t>::max();
    return sumB & (mask >> ((sizeof(uint64_t) - i_bytes) * 8));
}
50
51//------------------------------------------------------------------------------
52
53bool __findRcsOscError(const std::vector<libhei::Signature>& i_list,
54 libhei::Signature& o_rootCause)
55{
56 // TODO: Consider returning all of them instead of one as root cause.
57 auto itr = std::find_if(i_list.begin(), i_list.end(), [&](const auto& t) {
58 return (__hash(2, "TP_LOCAL_FIR") == t.getId() &&
59 (42 == t.getBit() || 43 == t.getBit()));
60 });
61
62 if (i_list.end() != itr)
63 {
64 o_rootCause = *itr;
65 return true;
66 }
67
68 return false;
69}
70
71//------------------------------------------------------------------------------
72
73bool __findPllUnlock(const std::vector<libhei::Signature>& i_list,
74 libhei::Signature& o_rootCause)
75{
76 // TODO: Consider returning all of them instead of one as root cause.
77 auto itr = std::find_if(i_list.begin(), i_list.end(), [&](const auto& t) {
78 return (__hash(2, "PLL_UNLOCK") == t.getId() &&
79 (0 == t.getBit() || 1 == t.getBit()));
80 });
81
82 if (i_list.end() != itr)
83 {
84 o_rootCause = *itr;
85 return true;
86 }
87
88 return false;
89}
90
91//------------------------------------------------------------------------------
92
Zane Shelleyf4792d62021-10-28 18:08:22 -050093bool __findMemoryChannelFailure(const std::vector<libhei::Signature>& i_list,
94 libhei::Signature& o_rootCause)
95{
96 using namespace util::pdbg;
97
98 static const auto mc_dstl_fir = __hash(2, "MC_DSTL_FIR");
99 static const auto mc_ustl_fir = __hash(2, "MC_USTL_FIR");
100 static const auto mc_omi_dl_err_rpt = __hash(2, "MC_OMI_DL_ERR_RPT");
101
102 for (const auto s : i_list)
103 {
104 const auto targetType = getTrgtType(getTrgt(s.getChip()));
105 const auto id = s.getId();
106 const auto bit = s.getBit();
107 const auto attnType = s.getAttnType();
108
109 // Look for any unit checkstop attentions from OCMBs.
110 if (TYPE_OCMB == targetType)
111 {
112 // Any unit checkstop attentions will trigger a channel failure.
113 if (libhei::ATTN_TYPE_UNIT_CS == attnType)
114 {
115 o_rootCause = s;
116 return true;
117 }
118 }
119 // Look for channel failure attentions on processors.
120 else if (TYPE_PROC == targetType)
121 {
122 // TODO: All of these channel failure bits are configurable.
123 // Eventually, we will need some mechanism to check that
124 // config registers for a more accurate analysis. For now,
125 // simply check for all bits that could potentially be
126 // configured to channel failure.
127
128 // Any unit checkstop bit in the MC_DSTL_FIR or MC_USTL_FIR could
129 // be a channel failure.
130 if (libhei::ATTN_TYPE_UNIT_CS == attnType)
131 {
132 // Ignore bits MC_DSTL_FIR[0:7] because they simply indicate
133 // attentions occurred on the attached OCMBs.
134 if ((mc_dstl_fir == id && 8 <= bit) || (mc_ustl_fir == id))
135 {
136 o_rootCause = s;
137 return true;
138 }
139 }
140
141 // All bits in MC_OMI_DL_ERR_RPT eventually feed into
142 // MC_OMI_DL_FIR[0,20] which are configurable to channel failure.
143 if (mc_omi_dl_err_rpt == id)
144 {
145 o_rootCause = s;
146 return true;
147 }
148 }
149 }
150
151 return false; // default, nothing found
152}
153
154//------------------------------------------------------------------------------
155
156// Will query if a signature is a potential system checkstop root cause.
157// attention. Note that this function excludes memory channel failure attentions
158// and core unit checkstop attentions.
159bool __findCsRootCause(const libhei::Signature& i_signature)
160{
161 using namespace util::pdbg;
162
163 // PROC registers
164 static const auto eq_core_fir = __hash(2, "EQ_CORE_FIR");
165 static const auto eq_l2_fir = __hash(2, "EQ_L2_FIR");
166 static const auto eq_l3_fir = __hash(2, "EQ_L3_FIR");
167 static const auto eq_ncu_fir = __hash(2, "EQ_NCU_FIR");
168 static const auto iohs_dlp_fir_oc = __hash(2, "IOHS_DLP_FIR_OC");
169 static const auto iohs_dlp_fir_smp = __hash(2, "IOHS_DLP_FIR_SMP");
170 static const auto nx_cq_fir = __hash(2, "NX_CQ_FIR");
171 static const auto nx_dma_eng_fir = __hash(2, "NX_DMA_ENG_FIR");
172 static const auto pau_fir_0 = __hash(2, "PAU_FIR_0");
173 static const auto pau_fir_1 = __hash(2, "PAU_FIR_1");
174 static const auto pau_fir_2 = __hash(2, "PAU_FIR_2");
175 static const auto pau_ptl_fir = __hash(2, "PAU_PTL_FIR");
176
177 // OCMB registers
178 static const auto rdffir = __hash(2, "RDFFIR");
179
180 const auto targetType = getTrgtType(getTrgt(i_signature.getChip()));
181 const auto id = i_signature.getId();
182 const auto bit = i_signature.getBit();
183
184 if (TYPE_PROC == targetType)
185 {
186 if (eq_core_fir == id &&
187 (3 == bit || 5 == bit || 8 == bit || 12 == bit || 22 == bit ||
188 25 == bit || 32 == bit || 36 == bit || 38 == bit || 46 == bit ||
189 47 == bit || 57 == bit))
190 {
191 return true;
192 }
193
194 if (eq_l2_fir == id &&
195 (1 == bit || 12 == bit || 13 == bit || 17 == bit || 18 == bit ||
196 20 == bit || 27 == bit))
197 {
198 return true;
199 }
200
201 if (eq_l3_fir == id &&
202 (2 == bit || 5 == bit || 8 == bit || 11 == bit || 17 == bit))
203 {
204 return true;
205 }
206
207 if (eq_ncu_fir == id && (3 == bit || 4 == bit || 5 == bit || 7 == bit ||
208 8 == bit || 10 == bit || 17 == bit))
209 {
210 return true;
211 }
212
213 if (iohs_dlp_fir_oc == id && (54 <= bit && bit <= 61))
214 {
215 return true;
216 }
217
218 if (iohs_dlp_fir_smp == id && (54 <= bit && bit <= 61))
219 {
220 return true;
221 }
222
223 if (nx_cq_fir == id && (7 == bit || 16 == bit || 21 == bit))
224 {
225 return true;
226 }
227
228 if (nx_dma_eng_fir == id && (0 == bit))
229 {
230 return true;
231 }
232
233 if (pau_fir_0 == id &&
234 (15 == bit || 18 == bit || 19 == bit || 25 == bit || 26 == bit ||
235 29 == bit || 33 == bit || 34 == bit || 35 == bit || 40 == bit ||
236 42 == bit || 44 == bit || 45 == bit))
237 {
238 return true;
239 }
240
241 if (pau_fir_1 == id &&
242 (13 == bit || 14 == bit || 15 == bit || 37 == bit || 39 == bit ||
243 40 == bit || 41 == bit || 42 == bit))
244 {
245 return true;
246 }
247
248 if (pau_fir_2 == id &&
249 ((4 <= bit && bit <= 18) || (20 <= bit && bit <= 31) ||
250 (36 <= bit && bit <= 41) || 45 == bit || 47 == bit || 48 == bit ||
251 50 == bit || 51 == bit || 52 == bit))
252 {
253 return true;
254 }
255
256 if (pau_ptl_fir == id && (4 == bit || 8 == bit))
257 {
258 return true;
259 }
260 }
261 else if (TYPE_OCMB == targetType)
262 {
263 if (rdffir == id && (14 == bit || 15 == bit || 17 == bit || 37 == bit))
264 {
265 return true;
266 }
267 }
268
269 return false; // default, nothing found
270}
271
272//------------------------------------------------------------------------------
273
274bool __findCsRootCause_RE(const std::vector<libhei::Signature>& i_list,
275 libhei::Signature& o_rootCause)
276{
277 for (const auto s : i_list)
278 {
279 // Only looking for recoverable attentions.
280 if (libhei::ATTN_TYPE_RECOVERABLE != s.getAttnType())
281 {
282 continue;
283 }
284
285 if (__findCsRootCause(s))
286 {
287 o_rootCause = s;
288 return true;
289 }
290 }
291
292 return false; // default, nothing found
293}
294
295//------------------------------------------------------------------------------
296
297bool __findCsRootCause_UCS(const std::vector<libhei::Signature>& i_list,
298 libhei::Signature& o_rootCause)
299{
300 for (const auto s : i_list)
301 {
302 // Only looking for unit checkstop attentions.
303 if (libhei::ATTN_TYPE_UNIT_CS != s.getAttnType())
304 {
305 continue;
306 }
307
308 if (__findCsRootCause(s))
309 {
310 o_rootCause = s;
311 return true;
312 }
313 }
314
315 return false; // default, nothing found
316}
317
318//------------------------------------------------------------------------------
319
320bool __findNonExternalCs(const std::vector<libhei::Signature>& i_list,
321 libhei::Signature& o_rootCause)
322{
323 using namespace util::pdbg;
324
325 static const auto pb_ext_fir = __hash(2, "PB_EXT_FIR");
326
327 for (const auto s : i_list)
328 {
329 const auto targetType = getTrgtType(getTrgt(s.getChip()));
330 const auto id = s.getId();
331 const auto attnType = s.getAttnType();
332
333 // Find any processor with system checkstop attention that did not
334 // originate from the PB_EXT_FIR.
335 if ((TYPE_PROC == targetType) &&
336 (libhei::ATTN_TYPE_CHECKSTOP == attnType) && (pb_ext_fir != id))
337 {
338 o_rootCause = s;
339 return true;
340 }
341 }
342
343 return false; // default, nothing found
344}
345
346//------------------------------------------------------------------------------
347
Zane Shelley65fefb22021-10-18 15:35:26 -0500348bool filterRootCause(const libhei::IsolationData& i_isoData,
349 libhei::Signature& o_rootCause)
350{
351 // We'll need to make a copy of the list so that the original list is
352 // maintained for the log.
353 std::vector<libhei::Signature> list{i_isoData.getSignatureList()};
354
355 // START WORKAROUND
356 // TODO: Filtering should be data driven. Until that support is available,
357 // use the following isolation rules.
358
359 // Special and host attentions are not supported by this user application.
360 auto itr = std::remove_if(list.begin(), list.end(), [&](const auto& t) {
361 return (libhei::ATTN_TYPE_SP_ATTN == t.getAttnType() ||
362 libhei::ATTN_TYPE_HOST_ATTN == t.getAttnType());
363 });
364 list.resize(std::distance(list.begin(), itr));
365
Zane Shelleyf4792d62021-10-28 18:08:22 -0500366 if (list.empty())
367 {
368 return false; // the list is empty, nothing more to do
369 }
370
371 // First, look for any RCS OSC errors. This must always be first because
372 // they can cause downstream PLL unlock attentions.
373 if (__findRcsOscError(list, o_rootCause))
Zane Shelleya7369f82021-10-18 16:52:21 -0500374 {
375 return true;
376 }
377
Zane Shelleyf4792d62021-10-28 18:08:22 -0500378 // Second, look for any PLL unlock attentions. This must always be second
379 // because PLL unlock attentions can cause any number of downstream
380 // attentions, including a system checkstop.
381 if (__findPllUnlock(list, o_rootCause))
382 {
383 return true;
384 }
385
386 // Memory channel failure attentions will produce SUEs and likely cause
387 // downstream attentions, including a system checkstop.
388 if (__findMemoryChannelFailure(list, o_rootCause))
389 {
390 return true;
391 }
392
393 // Look for any recoverable attentions that have been identified as a
394 // potential root cause of a system checkstop attention. These would include
395 // any attention that would generate an SUE. Note that is it possible for
396 // recoverables to generate unit checkstop attentions so we must check them
397 // first.
398 if (__findCsRootCause_RE(list, o_rootCause))
399 {
400 return true;
401 }
402
403 // Look for any unit checkstop attentions (other than memory channel
404 // failures) that have been identified as a potential root cause of a
405 // system checkstop attention. These would include any attention that would
406 // generate an SUE.
407 if (__findCsRootCause_UCS(list, o_rootCause))
408 {
409 return true;
410 }
411
412 // Look for any system checkstop attentions that originated from within the
413 // chip that reported the attention. In other words, no external checkstop
414 // attentions.
415 if (__findNonExternalCs(list, o_rootCause))
416 {
417 return true;
418 }
419
Zane Shelley65fefb22021-10-18 15:35:26 -0500420 if (!list.empty())
421 {
Zane Shelleyf4792d62021-10-28 18:08:22 -0500422 // TODO: At this point, we have not found any known errors that could be
423 // attributed to a system checkstop attention. This would be an
424 // isolation error if this function is called specifically for
425 // checkstop analysis, but this function currently is called for
426 // TIs and manual analysis as well. For now, we'll just sort the
427 // remaining list (recoverable, unit checkstop, and then system
428 // checkstop) and return the first element in the list. Later,
429 // we'll change this to properly handle error path scenarios.
430
431 // Fortunately, we just need to sort the list by the greater attention
432 // type value.
433 std::sort(list.begin(), list.end(), [&](const auto& a, const auto& b) {
434 return a.getAttnType() > b.getAttnType();
435 });
436
Zane Shelley65fefb22021-10-18 15:35:26 -0500437 // The entry at the front of the list will be the root cause.
438 o_rootCause = list.front();
439 return true;
440 }
441
442 // END WORKAROUND
443
444 return false; // default, no active attentions found.
445}
446
447//------------------------------------------------------------------------------
448
449} // namespace analyzer