blob: 8d69d9bfddfccdcb1038eee4b0e8744430477517 [file] [log] [blame]
Zane Shelleyd84ed6e2020-06-08 13:41:48 -05001#include <assert.h>
Ben Tyner87eabc62020-05-14 17:56:54 -05002#include <libpdbg.h>
Zane Shelley9fb73932020-09-15 13:34:57 -05003#include <unistd.h>
Ben Tyner87eabc62020-05-14 17:56:54 -05004
Zane Shelley4ed4be52021-02-15 17:53:40 -06005#include <analyzer/service_data.hpp>
Ben Tyner0205f3b2020-02-24 10:24:47 -06006#include <hei_main.hpp>
Zane Shelley9fb73932020-09-15 13:34:57 -05007#include <phosphor-logging/log.hpp>
Zane Shelleyf4bd5ff2020-11-05 22:26:04 -06008#include <util/pdbg.hpp>
Zane Shelleyd84ed6e2020-06-08 13:41:48 -05009#include <util/trace.hpp>
Ben Tyner0205f3b2020-02-24 10:24:47 -060010
Zane Shelleyd84ed6e2020-06-08 13:41:48 -050011#include <algorithm>
Ben Tyner87eabc62020-05-14 17:56:54 -050012#include <fstream>
13#include <iostream>
Ben Tynerb1ebfcb2020-05-08 18:52:48 -050014#include <map>
15#include <string>
16
Ben Tyner0205f3b2020-02-24 10:24:47 -060017namespace analyzer
18{
19
Zane Shelleyf4bd5ff2020-11-05 22:26:04 -060020//------------------------------------------------------------------------------
Ben Tynerb1ebfcb2020-05-08 18:52:48 -050021
Zane Shelleyf4bd5ff2020-11-05 22:26:04 -060022// Forward references for externally defined functions.
Ben Tyner87eabc62020-05-14 17:56:54 -050023
Zane Shelleyd3b9bac2020-11-17 21:59:12 -060024/**
25 * @brief Will get the list of active chip and initialize the isolator.
26 * @param o_chips The returned list of active chips.
27 */
Zane Shelley171a2e02020-11-13 13:56:13 -060028void initializeIsolator(std::vector<libhei::Chip>& o_chips);
Ben Tynerb1ebfcb2020-05-08 18:52:48 -050029
Zane Shelleyd3b9bac2020-11-17 21:59:12 -060030/**
Zane Shelleye5411f02021-08-04 22:41:35 -050031 * @brief Apply any RAS actions required by the given data.
32 * @param i_servData Data regarding service actions gathered during analysis.
33 */
34void applyRasActions(ServiceData& io_servData);
35
36/**
Zane Shelleyd3b9bac2020-11-17 21:59:12 -060037 * @brief Will create and submit a PEL using the given data.
Zane Shelleyd3b9bac2020-11-17 21:59:12 -060038 * @param i_isoData The data gathered during isolation (for FFDC).
Zane Shelley4ed4be52021-02-15 17:53:40 -060039 * @param i_servData Data regarding service actions gathered during analysis.
Zane Shelleyd3b9bac2020-11-17 21:59:12 -060040 */
Zane Shelley8af9e462021-03-11 10:44:28 -060041void createPel(const libhei::IsolationData& i_isoData,
Zane Shelley4ed4be52021-02-15 17:53:40 -060042 const ServiceData& i_servData);
Zane Shelleyd3b9bac2020-11-17 21:59:12 -060043
Zane Shelleyd84ed6e2020-06-08 13:41:48 -050044//------------------------------------------------------------------------------
45
Zane Shelley2f263182020-07-10 21:41:21 -050046const char* __attn(libhei::AttentionType_t i_attnType)
47{
48 const char* str = "";
49 switch (i_attnType)
50 {
51 case libhei::ATTN_TYPE_CHECKSTOP:
52 str = "CHECKSTOP";
53 break;
54 case libhei::ATTN_TYPE_UNIT_CS:
55 str = "UNIT_CS";
56 break;
57 case libhei::ATTN_TYPE_RECOVERABLE:
58 str = "RECOVERABLE";
59 break;
60 case libhei::ATTN_TYPE_SP_ATTN:
61 str = "SP_ATTN";
62 break;
63 case libhei::ATTN_TYPE_HOST_ATTN:
64 str = "HOST_ATTN";
65 break;
66 default:
67 trace::err("Unsupported attention type: %u", i_attnType);
68 assert(0);
69 }
70 return str;
71}
72
Zane Shelley2f263182020-07-10 21:41:21 -050073//------------------------------------------------------------------------------
74
Zane Shelleycb457382020-11-02 20:55:06 -060075bool __filterRootCause(const libhei::IsolationData& i_isoData,
76 libhei::Signature& o_signature)
Zane Shelley097a71a2020-06-08 15:55:29 -050077{
Zane Shelleycb457382020-11-02 20:55:06 -060078 // We'll need to make a copy of the list so that the original list is
79 // maintained for the log.
80 std::vector<libhei::Signature> sigList{i_isoData.getSignatureList()};
81
Zane Shelley2f263182020-07-10 21:41:21 -050082 // For debug, trace out the original list of signatures before filtering.
Zane Shelleycb457382020-11-02 20:55:06 -060083 for (const auto& sig : sigList)
Zane Shelley2f263182020-07-10 21:41:21 -050084 {
Zane Shelleyf4bd5ff2020-11-05 22:26:04 -060085 trace::inf("Signature: %s 0x%0" PRIx32 " %s",
Zane Shelleycb457382020-11-02 20:55:06 -060086 util::pdbg::getPath(sig.getChip()), sig.toUint32(),
Zane Shelleyf4bd5ff2020-11-05 22:26:04 -060087 __attn(sig.getAttnType()));
Zane Shelley2f263182020-07-10 21:41:21 -050088 }
89
Zane Shelley097a71a2020-06-08 15:55:29 -050090 // Special and host attentions are not supported by this user application.
91 auto newEndItr =
Zane Shelleycb457382020-11-02 20:55:06 -060092 std::remove_if(sigList.begin(), sigList.end(), [&](const auto& t) {
Zane Shelley097a71a2020-06-08 15:55:29 -050093 return (libhei::ATTN_TYPE_SP_ATTN == t.getAttnType() ||
94 libhei::ATTN_TYPE_HOST_ATTN == t.getAttnType());
95 });
96
97 // Shrink the vector, if needed.
Zane Shelleycb457382020-11-02 20:55:06 -060098 sigList.resize(std::distance(sigList.begin(), newEndItr));
Zane Shelley097a71a2020-06-08 15:55:29 -050099
100 // START WORKAROUND
101 // TODO: Filtering should be determined by the RAS Data Files provided by
102 // the host firmware via the PNOR (similar to the Chip Data Files).
103 // Until that support is available, use a rudimentary filter that
104 // first looks for any recoverable attention, then any unit checkstop,
105 // and then any system checkstop. This is built on the premise that
106 // recoverable errors could be the root cause of an system checkstop
107 // attentions. Fortunately, we just need to sort the list by the
108 // greater attention type value.
Zane Shelleycb457382020-11-02 20:55:06 -0600109 std::sort(sigList.begin(), sigList.end(),
Zane Shelley097a71a2020-06-08 15:55:29 -0500110 [&](const auto& a, const auto& b) {
111 return a.getAttnType() > b.getAttnType();
112 });
113 // END WORKAROUND
Zane Shelleycb457382020-11-02 20:55:06 -0600114
115 // Check if a root cause attention was found.
116 if (!sigList.empty())
117 {
118 // The entry at the front of the list will be the root cause.
119 o_signature = sigList.front();
120 return true;
121 }
122
123 return false; // default, no active attentions found.
Zane Shelley097a71a2020-06-08 15:55:29 -0500124}
125
126//------------------------------------------------------------------------------
127
Zane Shelleye5411f02021-08-04 22:41:35 -0500128bool analyzeHardware()
Zane Shelley9fb73932020-09-15 13:34:57 -0500129{
130 bool attnFound = false;
131
Zane Shelleye5411f02021-08-04 22:41:35 -0500132 if (!util::pdbg::queryHardwareAnalysisSupported())
133 {
134 trace::err("Hardware error analysis is not supported on this system");
135 return attnFound;
136 }
137
138 trace::inf(">>> enter analyzeHardware()");
139
140 // Initialize the isolator and get all of the chips to be analyzed.
141 trace::inf("Initializing the isolator...");
142 std::vector<libhei::Chip> chips;
143 initializeIsolator(chips);
144
145 // Isolate attentions.
146 trace::inf("Isolating errors: # of chips=%u", chips.size());
147 libhei::IsolationData isoData{};
148 libhei::isolate(chips, isoData);
149
150 // Filter for root cause attention.
Zane Shelleycb457382020-11-02 20:55:06 -0600151 libhei::Signature rootCause{};
Zane Shelleye5411f02021-08-04 22:41:35 -0500152 attnFound = __filterRootCause(isoData, rootCause);
Zane Shelleycb457382020-11-02 20:55:06 -0600153
154 if (!attnFound)
Zane Shelley9fb73932020-09-15 13:34:57 -0500155 {
Zane Shelleye5411f02021-08-04 22:41:35 -0500156 // It is possible for TI handling, or manually initiated analysis via
157 // the command line, that there will not be an active attention. In
158 // which case, we will do nothing and let the caller of this function
159 // determine if this is the expected behavior.
Zane Shelley9fb73932020-09-15 13:34:57 -0500160 trace::inf("No active attentions found");
161 }
162 else
163 {
Zane Shelley9fb73932020-09-15 13:34:57 -0500164 trace::inf("Root cause attention: %s 0x%0" PRIx32 " %s",
Zane Shelleycb457382020-11-02 20:55:06 -0600165 util::pdbg::getPath(rootCause.getChip()),
166 rootCause.toUint32(), __attn(rootCause.getAttnType()));
167
Zane Shelleye5411f02021-08-04 22:41:35 -0500168 // Perform service actions based on the root cause.
Zane Shelley8af9e462021-03-11 10:44:28 -0600169 ServiceData servData{rootCause};
Zane Shelleye5411f02021-08-04 22:41:35 -0500170 applyRasActions(servData);
Zane Shelleyd3b9bac2020-11-17 21:59:12 -0600171
172 // Create and commit a PEL.
Zane Shelleye5411f02021-08-04 22:41:35 -0500173 createPel(isoData, servData);
Zane Shelley9fb73932020-09-15 13:34:57 -0500174 }
175
Zane Shelleye5411f02021-08-04 22:41:35 -0500176 // All done, clean up the isolator.
177 trace::inf("Uninitializing isolator...");
178 libhei::uninitialize();
Ben Tyner87eabc62020-05-14 17:56:54 -0500179
Zane Shelley2f263182020-07-10 21:41:21 -0500180 trace::inf("<<< exit analyzeHardware()");
181
Zane Shelley097a71a2020-06-08 15:55:29 -0500182 return attnFound;
Ben Tyner0205f3b2020-02-24 10:24:47 -0600183}
184
Ben Tynereea45422021-04-15 10:54:14 -0500185//------------------------------------------------------------------------------
186
187/**
188 * @brief Get error isolator build information
189 *
190 * @return Pointer to build information
191 */
192const char* getBuildInfo()
193{
194 return libhei::getBuildInfo();
195}
196
Ben Tyner0205f3b2020-02-24 10:24:47 -0600197} // namespace analyzer