blob: d374fb6a011508604cedc5411ba3ed5af51e8797 [file] [log] [blame]
Zane Shelleyd84ed6e2020-06-08 13:41:48 -05001#include <assert.h>
Ben Tyner87eabc62020-05-14 17:56:54 -05002#include <libpdbg.h>
3
Ben Tyner0205f3b2020-02-24 10:24:47 -06004#include <hei_main.hpp>
Zane Shelleyd84ed6e2020-06-08 13:41:48 -05005#include <util/trace.hpp>
Ben Tyner0205f3b2020-02-24 10:24:47 -06006
Zane Shelleyd84ed6e2020-06-08 13:41:48 -05007#include <algorithm>
Ben Tyner87eabc62020-05-14 17:56:54 -05008#include <fstream>
9#include <iostream>
Ben Tynerb1ebfcb2020-05-08 18:52:48 -050010#include <map>
11#include <string>
12
Ben Tyner0205f3b2020-02-24 10:24:47 -060013namespace analyzer
14{
15
Ben Tyner87eabc62020-05-14 17:56:54 -050016/**
17 * @brief send chip data file to isolator
18 *
19 * Read a chip data file into memory and then send it to the isolator via
20 * the initialize interface.
21 *
22 * @param i_filePath The file path and name to read into memory
23 *
24 * @return Returns true if the isolator was successfully initialized with
25 * a single chip data file. Returns false otherwise.
26 *
27 */
Zane Shelley2e994bc2020-06-08 14:38:14 -050028void initWithFile(const char* i_filePath)
Ben Tyner0205f3b2020-02-24 10:24:47 -060029{
Ben Tyner87eabc62020-05-14 17:56:54 -050030 // open the file and seek to the end to get length
31 std::ifstream fileStream(i_filePath, std::ios::binary | std::ios::ate);
Ben Tynerb1ebfcb2020-05-08 18:52:48 -050032
Zane Shelley2e994bc2020-06-08 14:38:14 -050033 if (!fileStream.good())
Ben Tynerb1ebfcb2020-05-08 18:52:48 -050034 {
Zane Shelley2e994bc2020-06-08 14:38:14 -050035 trace::err("Unable to open file: %s", i_filePath);
36 assert(0);
Ben Tynerb1ebfcb2020-05-08 18:52:48 -050037 }
38 else
39 {
Ben Tyner87eabc62020-05-14 17:56:54 -050040 // get file size based on seek position
Zane Shelley2e994bc2020-06-08 14:38:14 -050041 fileStream.seekg(0, std::ios::end);
Ben Tyner87eabc62020-05-14 17:56:54 -050042 std::ifstream::pos_type fileSize = fileStream.tellg();
43
44 // create a buffer large enough to hold the entire file
45 std::vector<char> fileBuffer(fileSize);
46
47 // seek to the beginning of the file
48 fileStream.seekg(0, std::ios::beg);
49
50 // read the entire file into the buffer
51 fileStream.read(fileBuffer.data(), fileSize);
52
53 // done with the file
54 fileStream.close();
55
Zane Shelley2e994bc2020-06-08 14:38:14 -050056 // initialize the isolator with the chip data
57 libhei::initialize(fileBuffer.data(), fileSize);
Ben Tynerb1ebfcb2020-05-08 18:52:48 -050058 }
Ben Tyner87eabc62020-05-14 17:56:54 -050059}
Ben Tynerb1ebfcb2020-05-08 18:52:48 -050060
Zane Shelleyd84ed6e2020-06-08 13:41:48 -050061//------------------------------------------------------------------------------
62
Zane Shelley2f263182020-07-10 21:41:21 -050063uint8_t __attrType(pdbg_target* i_trgt)
64{
65 uint8_t attr = 0;
66 pdbg_target_get_attribute(i_trgt, "ATTR_TYPE", 1, 1, &attr);
67 return attr;
68}
69
70uint32_t __attrFapiPos(pdbg_target* i_trgt)
71{
72 uint32_t attr = 0;
73 pdbg_target_get_attribute(i_trgt, "ATTR_FAPI_POS", 4, 1, &attr);
74 return attr;
75}
76
77//------------------------------------------------------------------------------
78
79const char* __path(const libhei::Chip& i_chip)
80{
81 return pdbg_target_path((pdbg_target*)i_chip.getChip());
82}
83
84const char* __attn(libhei::AttentionType_t i_attnType)
85{
86 const char* str = "";
87 switch (i_attnType)
88 {
89 case libhei::ATTN_TYPE_CHECKSTOP:
90 str = "CHECKSTOP";
91 break;
92 case libhei::ATTN_TYPE_UNIT_CS:
93 str = "UNIT_CS";
94 break;
95 case libhei::ATTN_TYPE_RECOVERABLE:
96 str = "RECOVERABLE";
97 break;
98 case libhei::ATTN_TYPE_SP_ATTN:
99 str = "SP_ATTN";
100 break;
101 case libhei::ATTN_TYPE_HOST_ATTN:
102 str = "HOST_ATTN";
103 break;
104 default:
105 trace::err("Unsupported attention type: %u", i_attnType);
106 assert(0);
107 }
108 return str;
109}
110
111uint32_t __trgt(const libhei::Signature& i_sig)
112{
113 auto trgt = (pdbg_target*)i_sig.getChip().getChip();
114
115 uint8_t type = __attrType(trgt);
116 uint32_t pos = __attrFapiPos(trgt);
117
118 // Technically, the FapiPos attribute is 32-bit, but not likely to ever go
119 // over 24-bit.
120
121 return type << 24 | (pos & 0xffffff);
122}
123
124uint32_t __sig(const libhei::Signature& i_sig)
125{
126 return i_sig.getId() << 16 | i_sig.getInstance() << 8 | i_sig.getBit();
127}
128
129//------------------------------------------------------------------------------
130
Zane Shelleyd84ed6e2020-06-08 13:41:48 -0500131// Returns the chip model/level of the given target. Also, adds the chip
132// model/level to the list of type types needed to initialize the isolator.
133libhei::ChipType_t __getChipType(pdbg_target* i_trgt,
134 std::vector<libhei::ChipType_t>& o_types)
135{
136 libhei::ChipType_t type;
137
138 // START WORKAROUND
139 // TODO: Will need to grab the model/level from the target attributes when
140 // they are available. For now, use ATTR_TYPE to determine which
141 // currently supported value to use supported.
Zane Shelley2f263182020-07-10 21:41:21 -0500142 uint8_t attrType = __attrType(i_trgt);
143 switch (attrType)
Zane Shelleyd84ed6e2020-06-08 13:41:48 -0500144 {
145 case 0x05: // PROC
146 type = 0x120DA049;
147 break;
148
149 case 0x4b: // OCMB_CHIP
150 type = 0x160D2000;
151 break;
152
153 default:
Zane Shelley2f263182020-07-10 21:41:21 -0500154 trace::err("Unsupported ATTR_TYPE value: 0x%02x", attrType);
Zane Shelleyd84ed6e2020-06-08 13:41:48 -0500155 assert(0);
156 }
Zane Shelleyd84ed6e2020-06-08 13:41:48 -0500157 // END WORKAROUND
158
159 o_types.push_back(type);
160
161 return type;
162}
163
164//------------------------------------------------------------------------------
165
166// Gathers list of active chips to analyze. Also, returns the list of chip types
167// needed to initialize the isolator.
168void __getActiveChips(std::vector<libhei::Chip>& o_chips,
169 std::vector<libhei::ChipType_t>& o_types)
170{
171 // Iterate each processor.
172 pdbg_target* procTrgt;
173 pdbg_for_each_class_target("proc", procTrgt)
174 {
175 // Active processors only.
176 if (PDBG_TARGET_ENABLED != pdbg_target_probe(procTrgt))
177 continue;
178
179 // Add the processor to the list.
180 o_chips.emplace_back(procTrgt, __getChipType(procTrgt, o_types));
181
182 // Iterate the connected OCMBs, if they exist.
183 pdbg_target* ocmbTrgt;
184 pdbg_for_each_target("ocmb_chip", procTrgt, ocmbTrgt)
185 {
186 // Active OCMBs only.
187 if (PDBG_TARGET_ENABLED != pdbg_target_probe(ocmbTrgt))
188 continue;
189
190 // Add the OCMB to the list.
191 o_chips.emplace_back(ocmbTrgt, __getChipType(ocmbTrgt, o_types));
192 }
193 }
194
Zane Shelley2f263182020-07-10 21:41:21 -0500195 // For debug, trace out all of the chips found.
196 for (const auto& chip : o_chips)
197 {
198 trace::inf("chip:%s type:0x%0" PRIx32, __path(chip), chip.getType());
199 }
200
201 // Make sure the model/level list contains unique values only.
Zane Shelleyd84ed6e2020-06-08 13:41:48 -0500202 auto itr = std::unique(o_types.begin(), o_types.end());
203 o_types.resize(std::distance(o_types.begin(), itr));
204}
205
206//------------------------------------------------------------------------------
207
Zane Shelley2e994bc2020-06-08 14:38:14 -0500208// Initializes the isolator for each specified chip type.
209void __initializeIsolator(const std::vector<libhei::ChipType_t>& i_types)
210{
211 // START WORKAROUND
212 // TODO: The chip data will eventually come from the CHIPDATA section of the
213 // PNOR. Until that support is available, we'll use temporary chip
214 // data files.
215 for (const auto& type : i_types)
216 {
217 switch (type)
218 {
219 case 0x120DA049: // PROC
220 initWithFile(
221 "/usr/share/openpower-hw-diags/chip_data_proc.cdb");
222 break;
223
224 case 0x160D2000: // OCMB_CHIP
225 initWithFile(
226 "/usr/share/openpower-hw-diags/chip_data_ocmb.cdb");
227 break;
228
229 default:
230 trace::err("Unsupported ChipType_t value: 0x%0" PRIx32, type);
231 assert(0);
232 }
233 }
234 // END WORKAROUND
235}
236
237//------------------------------------------------------------------------------
238
Zane Shelley097a71a2020-06-08 15:55:29 -0500239// Takes a signature list that will be filtered and sorted. The first entry in
240// the returned list will be the root cause. If the returned list is empty,
241// analysis failed.
242void __filterRootCause(std::vector<libhei::Signature>& io_list)
243{
Zane Shelley2f263182020-07-10 21:41:21 -0500244 // For debug, trace out the original list of signatures before filtering.
245 for (const auto& sig : io_list)
246 {
247 trace::inf("Signature: %s 0x%0" PRIx32 " %s", __path(sig.getChip()),
248 __sig(sig), __attn(sig.getAttnType()));
249 }
250
Zane Shelley097a71a2020-06-08 15:55:29 -0500251 // Special and host attentions are not supported by this user application.
252 auto newEndItr =
253 std::remove_if(io_list.begin(), io_list.end(), [&](const auto& t) {
254 return (libhei::ATTN_TYPE_SP_ATTN == t.getAttnType() ||
255 libhei::ATTN_TYPE_HOST_ATTN == t.getAttnType());
256 });
257
258 // Shrink the vector, if needed.
259 io_list.resize(std::distance(io_list.begin(), newEndItr));
260
261 // START WORKAROUND
262 // TODO: Filtering should be determined by the RAS Data Files provided by
263 // the host firmware via the PNOR (similar to the Chip Data Files).
264 // Until that support is available, use a rudimentary filter that
265 // first looks for any recoverable attention, then any unit checkstop,
266 // and then any system checkstop. This is built on the premise that
267 // recoverable errors could be the root cause of an system checkstop
268 // attentions. Fortunately, we just need to sort the list by the
269 // greater attention type value.
270 std::sort(io_list.begin(), io_list.end(),
271 [&](const auto& a, const auto& b) {
272 return a.getAttnType() > b.getAttnType();
273 });
274 // END WORKAROUND
275}
276
277//------------------------------------------------------------------------------
278
Ben Tyner87eabc62020-05-14 17:56:54 -0500279bool analyzeHardware(std::map<std::string, std::string>& o_errors)
280{
Zane Shelley097a71a2020-06-08 15:55:29 -0500281 bool attnFound = false;
Ben Tyner87eabc62020-05-14 17:56:54 -0500282
Zane Shelley2f263182020-07-10 21:41:21 -0500283 trace::inf(">>> enter analyzeHardware()");
284
Zane Shelleyd84ed6e2020-06-08 13:41:48 -0500285 // Get the active chips to be analyzed and their types.
286 std::vector<libhei::Chip> chipList;
287 std::vector<libhei::ChipType_t> chipTypes;
288 __getActiveChips(chipList, chipTypes);
Ben Tyner87eabc62020-05-14 17:56:54 -0500289
Zane Shelley2e994bc2020-06-08 14:38:14 -0500290 // Initialize the isolator for all chip types.
Zane Shelley2f263182020-07-10 21:41:21 -0500291 trace::inf("Initializing isolator: # of types=%u", chipTypes.size());
Zane Shelley2e994bc2020-06-08 14:38:14 -0500292 __initializeIsolator(chipTypes);
293
Zane Shelley097a71a2020-06-08 15:55:29 -0500294 // Isolate attentions.
Zane Shelley2f263182020-07-10 21:41:21 -0500295 trace::inf("Isolating errors: # of chips=%u", chipList.size());
Zane Shelley097a71a2020-06-08 15:55:29 -0500296 libhei::IsolationData isoData{};
297 libhei::isolate(chipList, isoData);
Ben Tyner87eabc62020-05-14 17:56:54 -0500298
Zane Shelley2f263182020-07-10 21:41:21 -0500299 // Filter signatures to determine root cause. We'll need to make a copy of
300 // the list so that the original list is maintained for the log.
Zane Shelley097a71a2020-06-08 15:55:29 -0500301 std::vector<libhei::Signature> sigList{isoData.getSignatureList()};
302 __filterRootCause(sigList);
303
304 if (sigList.empty())
Ben Tyner87eabc62020-05-14 17:56:54 -0500305 {
Zane Shelley097a71a2020-06-08 15:55:29 -0500306 // Don't throw an error here because it could happen for during TI
307 // analysis. Attention Handler will need to determine if this is an
308 // actual problem.
309 trace::inf("No active attentions found");
310 }
311 else
312 {
313 attnFound = true;
314 trace::inf("Active attentions found: %d", sigList.size());
Ben Tyner87eabc62020-05-14 17:56:54 -0500315
Zane Shelley097a71a2020-06-08 15:55:29 -0500316 libhei::Signature root = sigList.front();
Zane Shelley2f263182020-07-10 21:41:21 -0500317 trace::inf("Root cause attention: %p 0x%04x%02x%02x %s",
Zane Shelley097a71a2020-06-08 15:55:29 -0500318 root.getChip().getChip(), root.getId(), root.getInstance(),
Zane Shelley2f263182020-07-10 21:41:21 -0500319 root.getBit(), __attn(root.getAttnType()));
Ben Tyner87eabc62020-05-14 17:56:54 -0500320
Zane Shelley097a71a2020-06-08 15:55:29 -0500321 // TODO: generate log information
322 }
Ben Tyner87eabc62020-05-14 17:56:54 -0500323
Zane Shelley097a71a2020-06-08 15:55:29 -0500324 // All done, clean up the isolator.
Zane Shelley2f263182020-07-10 21:41:21 -0500325 trace::inf("Uninitializing isolator");
Zane Shelley097a71a2020-06-08 15:55:29 -0500326 libhei::uninitialize();
Ben Tyner87eabc62020-05-14 17:56:54 -0500327
Zane Shelley2f263182020-07-10 21:41:21 -0500328 trace::inf("<<< exit analyzeHardware()");
329
Zane Shelley097a71a2020-06-08 15:55:29 -0500330 return attnFound;
Ben Tyner0205f3b2020-02-24 10:24:47 -0600331}
332
333} // namespace analyzer