blob: 11cd9270f6949cc7390562b63d11823aa8bd5917 [file] [log] [blame]
Dhruvaraj Subhashchandran858d1aa2021-10-27 03:26:06 -05001extern "C"
2{
3#include <libpdbg.h>
4#include <libpdbg_sbe.h>
5}
6
Dhruvaraj Subhashchandran6feeebd2021-10-19 05:03:59 -05007#include "create_pel.hpp"
Dhruvaraj Subhashchandran858d1aa2021-10-27 03:26:06 -05008#include "sbe_consts.hpp"
9#include "sbe_dump_collector.hpp"
Dhruvaraj Subhashchandran6feeebd2021-10-19 05:03:59 -050010#include "sbe_type.hpp"
Dhruvaraj Subhashchandran858d1aa2021-10-27 03:26:06 -050011
Dhruvaraj Subhashchandranf2298892024-04-21 04:42:55 -050012#include <ekb/hwpf/fapi2/include/target_types.H>
Dhruvaraj Subhashchandran858d1aa2021-10-27 03:26:06 -050013#include <libphal.H>
Dhruvaraj Subhashchandran6feeebd2021-10-19 05:03:59 -050014#include <phal_exception.H>
Dhruvaraj Subhashchandran858d1aa2021-10-27 03:26:06 -050015
Dhruvaraj Subhashchandrana699e312021-10-27 07:20:34 -050016#include <phosphor-logging/elog-errors.hpp>
Dhruvaraj Subhashchandran858d1aa2021-10-27 03:26:06 -050017#include <phosphor-logging/lg2.hpp>
18#include <phosphor-logging/log.hpp>
Dhruvaraj Subhashchandrana699e312021-10-27 07:20:34 -050019#include <sbe_consts.hpp>
20#include <xyz/openbmc_project/Common/File/error.hpp>
21#include <xyz/openbmc_project/Common/error.hpp>
Dhruvaraj Subhashchandran858d1aa2021-10-27 03:26:06 -050022
Dhruvaraj Subhashchandrana699e312021-10-27 07:20:34 -050023#include <cstdint>
24#include <filesystem>
Dhruvaraj Subhashchandran858d1aa2021-10-27 03:26:06 -050025#include <format>
Dhruvaraj Subhashchandrana699e312021-10-27 07:20:34 -050026#include <fstream>
Dhruvaraj Subhashchandran858d1aa2021-10-27 03:26:06 -050027#include <stdexcept>
28
29namespace openpower::dump::sbe_chipop
30{
31
32using namespace phosphor::logging;
33using namespace openpower::dump::SBE;
Dhruvaraj Subhashchandranf2298892024-04-21 04:42:55 -050034using Severity = sdbusplus::xyz::openbmc_project::Logging::server::Entry::Level;
Dhruvaraj Subhashchandran858d1aa2021-10-27 03:26:06 -050035
36void SbeDumpCollector::collectDump(uint8_t type, uint32_t id,
37 uint64_t failingUnit,
38 const std::filesystem::path& path)
39{
40 lg2::error("Starting dump collection: type:{TYPE} id:{ID} "
41 "failingUnit:{FAILINGUNIT}, path:{PATH}",
42 "TYPE", type, "ID", id, "FAILINGUNIT", failingUnit, "PATH",
43 path.string());
44
45 initializePdbg();
46
Dhruvaraj Subhashchandrane74e9162024-04-01 09:53:13 -050047 TargetMap targets;
Dhruvaraj Subhashchandran858d1aa2021-10-27 03:26:06 -050048
49 struct pdbg_target* target = nullptr;
50 pdbg_for_each_class_target("proc", target)
51 {
52 if (pdbg_target_probe(target) != PDBG_TARGET_ENABLED ||
53 !openpower::phal::pdbg::isTgtFunctional(target))
54 {
55 continue;
56 }
57
Dhruvaraj Subhashchandranf9f65b82022-10-13 06:46:43 -050058 bool includeTarget = true;
59 // if the dump type is hostboot then call stop instructions
60 if (type == SBE_DUMP_TYPE_HOSTBOOT)
61 {
62 includeTarget = executeThreadStop(target);
63 }
64 if (includeTarget)
65 {
Dhruvaraj Subhashchandrane74e9162024-04-01 09:53:13 -050066 targets[target] = std::vector<struct pdbg_target*>();
67
68 // Hardware dump needs OCMB data if present
69 if (type == openpower::dump::SBE::SBE_DUMP_TYPE_HARDWARE)
70 {
71 struct pdbg_target* ocmbTarget;
72 pdbg_for_each_target("ocmb", target, ocmbTarget)
73 {
74 if (!is_ody_ocmb_chip(ocmbTarget))
75 {
76 continue;
77 }
78
79 if (pdbg_target_probe(ocmbTarget) != PDBG_TARGET_ENABLED)
80 {
81 continue;
82 }
83
84 if (!openpower::phal::pdbg::isTgtFunctional(ocmbTarget))
85 {
86 continue;
87 }
Dhruvaraj Subhashchandrane74e9162024-04-01 09:53:13 -050088 targets[target].push_back(ocmbTarget);
89 }
90 }
Dhruvaraj Subhashchandranf9f65b82022-10-13 06:46:43 -050091 }
Dhruvaraj Subhashchandran858d1aa2021-10-27 03:26:06 -050092 }
93
94 std::vector<uint8_t> clockStates = {SBE_CLOCK_ON, SBE_CLOCK_OFF};
95 for (auto cstate : clockStates)
96 {
Dhruvaraj Subhashchandran9098d8c2022-12-01 00:40:20 -060097 // Skip collection for performance dump if clock state is not ON
98 if (type == SBE_DUMP_TYPE_PERFORMANCE && cstate != SBE_CLOCK_ON)
99 {
100 continue;
101 }
Dhruvaraj Subhashchandran858d1aa2021-10-27 03:26:06 -0500102 auto futures = spawnDumpCollectionProcesses(type, id, path, failingUnit,
103 cstate, targets);
104
105 // Wait for all asynchronous tasks to complete
106 for (auto& future : futures)
107 {
Dhruvaraj Subhashchandrana699e312021-10-27 07:20:34 -0500108 try
109 {
110 future.wait();
111 }
112 catch (const std::exception& e)
113 {
114 lg2::error("Failed to collect dump from SBE ErrorMsg({ERROR})",
115 "ERROR", e);
116 }
Dhruvaraj Subhashchandran858d1aa2021-10-27 03:26:06 -0500117 }
118 lg2::info(
119 "Dump collection completed for clock state({CSTATE}): type({TYPE}) "
120 "id({ID}) failingUnit({FAILINGUNIT}), path({PATH})",
121 "CSTATE", cstate, "TYPE", type, "ID", id, "FAILINGUNIT",
122 failingUnit, "PATH", path.string());
123 }
Dhruvaraj Subhashchandrana699e312021-10-27 07:20:34 -0500124 if (std::filesystem::is_empty(path))
125 {
126 lg2::error("Failed to collect the dump");
127 throw std::runtime_error("Failed to collect the dump");
128 }
Dhruvaraj Subhashchandran858d1aa2021-10-27 03:26:06 -0500129 lg2::info("Dump collection completed");
130}
131
132void SbeDumpCollector::initializePdbg()
133{
134 openpower::phal::pdbg::init();
135}
136
137std::vector<std::future<void>> SbeDumpCollector::spawnDumpCollectionProcesses(
138 uint8_t type, uint32_t id, const std::filesystem::path& path,
Dhruvaraj Subhashchandrane74e9162024-04-01 09:53:13 -0500139 uint64_t failingUnit, uint8_t cstate, const TargetMap& targetMap)
Dhruvaraj Subhashchandran858d1aa2021-10-27 03:26:06 -0500140{
141 std::vector<std::future<void>> futures;
142
Dhruvaraj Subhashchandrane74e9162024-04-01 09:53:13 -0500143 for (const auto& [procTarget, ocmbTargets] : targetMap)
Dhruvaraj Subhashchandran858d1aa2021-10-27 03:26:06 -0500144 {
Dhruvaraj Subhashchandrane74e9162024-04-01 09:53:13 -0500145 auto future = std::async(std::launch::async,
146 [this, procTarget, ocmbTargets, path, id, type,
147 cstate, failingUnit]() {
Dhruvaraj Subhashchandrana699e312021-10-27 07:20:34 -0500148 try
149 {
Dhruvaraj Subhashchandrane74e9162024-04-01 09:53:13 -0500150 this->collectDumpFromSBE(procTarget, path, id, type, cstate,
Dhruvaraj Subhashchandrana699e312021-10-27 07:20:34 -0500151 failingUnit);
152 }
153 catch (const std::exception& e)
154 {
155 lg2::error(
Dhruvaraj Subhashchandrane74e9162024-04-01 09:53:13 -0500156 "Failed to collect dump from SBE on Proc-({PROCINDEX}) {ERROR}",
157 "PROCINDEX", pdbg_target_index(procTarget), "ERROR", e);
158 }
159
160 // Collect OCMBs only with clock on
161 if (cstate == SBE_CLOCK_ON)
162 {
163 // Handle OCMBs serially after handling the proc
164 for (auto ocmbTarget : ocmbTargets)
165 {
166 try
167 {
168 this->collectDumpFromSBE(ocmbTarget, path, id, type,
169 cstate, failingUnit);
170 }
171 catch (const std::exception& e)
172 {
173 lg2::error(
174 "Failed to collect dump from OCMB -({OCMBINDEX}) {ERROR}",
175 "OCMBINDEX", pdbg_target_index(ocmbTarget), "ERROR",
176 e);
177 }
178 }
Dhruvaraj Subhashchandrana699e312021-10-27 07:20:34 -0500179 }
Dhruvaraj Subhashchandran858d1aa2021-10-27 03:26:06 -0500180 });
181
182 futures.push_back(std::move(future));
183 }
184
185 return futures;
186}
187
/**
 * @brief Log an SBE chip-op error and create the matching PEL(s).
 *
 * Handling depends on the error type reported by @p sbeError:
 *  - SBE_CMD_TIMEOUT: a PEL is created with the common data only and an SBE
 *    dump is requested against it.
 *  - SBE_FFDC_NO_DATA: a PEL is created with the common data only.
 *  - SBE_INTERNAL_FFDC_DATA: the FFDC packets are processed; the chip-op
 *    itself is not treated as failed.
 *  - anything else: the FFDC packets are processed.
 *
 * Exceptions derived from std::exception are logged here and not propagated.
 *
 * @param[in] sbeError - Error raised by the chip-op.
 * @param[in] chipPos  - Position of the chip the chip-op ran on.
 * @param[in] sbeType  - Type of the SBE, used to select names and events.
 * @param[in] cmdClass - Command class of the chip-op, encoded into SRC6.
 * @param[in] cmdType  - Command type of the chip-op, encoded into SRC6.
 *
 * @return true when the error is to be treated as a failure of the chip-op,
 *         false when the error type is SBE_INTERNAL_FFDC_DATA.
 */
bool SbeDumpCollector::logErrorAndCreatePEL(
    const openpower::phal::sbeError_t& sbeError, uint64_t chipPos,
    SBETypes sbeType, uint32_t cmdClass, uint32_t cmdType)
{
    std::string chipName;
    std::string event;
    // A PEL built from the common data alone is needed for errors that carry
    // no FFDC packets (timeout, no-FFDC); only the timeout additionally
    // requests an SBE dump.
    bool pelIsRequired = false;
    bool dumpIsRequired = false;
    bool isDumpFailure = true;
    try
    {
        chipName = sbeTypeAttributes.at(sbeType).chipName;
        event = sbeTypeAttributes.at(sbeType).chipOpFailure;

        lg2::info("log error {CHIP} {POSITION}", "CHIP", chipName, "POSITION",
                  chipPos);

        // Common FFDC data
        openpower::dump::pel::FFDCData pelAdditionalData = {
            {"SRC6", std::format("{:X}{:X}", chipPos, (cmdClass | cmdType))}};

        if (sbeType == SBETypes::OCMB)
        {
            pelAdditionalData.emplace_back(
                "CHIP_TYPE", std::to_string(fapi2::TARGET_TYPE_OCMB_CHIP));
        }

        // Check the error type
        if (sbeError.errType() == openpower::phal::exception::SBE_CMD_TIMEOUT)
        {
            event = sbeTypeAttributes.at(sbeType).chipOpTimeout;
            pelIsRequired = true;
            dumpIsRequired = true;
            // For timeout, we do not expect any FFDC packets
        }
        else if (sbeError.errType() ==
                 openpower::phal::exception::SBE_FFDC_NO_DATA)
        {
            // We will create a PEL without FFDC with the common information we
            // added
            lg2::error("No FFDC data after a chip-op failure {CHIP} {POSITION}",
                       "CHIP", chipName, "POSITION", chipPos);
            event = sbeTypeAttributes.at(sbeType).noFfdc;
            // Without this the event selected above was never logged: PEL
            // creation used to be tied to the dump request alone.
            pelIsRequired = true;
        }
        else
        {
            if (sbeError.errType() ==
                openpower::phal::exception::SBE_INTERNAL_FFDC_DATA)
            {
                lg2::info(
                    "FFDC Not related to chip-op present {CHIP} {POSITION}",
                    "CHIP", chipName, "POSITION", chipPos);
                event = sbeTypeAttributes.at(sbeType).sbeInternalFFDCData;
                isDumpFailure = false;
            }
            else
            {
                lg2::error("Process FFDC {CHIP} {POSITION}", "CHIP", chipName,
                           "POSITION", chipPos);
            }
            // Process the FFDC packets carried by the error
            openpower::dump::pel::processFFDCPackets(sbeError, event,
                                                     pelAdditionalData);
        }

        if (pelIsRequired)
        {
            auto logId = openpower::dump::pel::createSbeErrorPEL(
                event, sbeError, pelAdditionalData);

            // If dump is required, request it
            if (dumpIsRequired)
            {
                util::requestSBEDump(chipPos, logId, sbeType);
            }
        }
    }
    catch (const std::out_of_range& e)
    {
        // Raised by the attribute lookups when sbeType has no entry.
        lg2::error("Unknown SBE Type({SBETYPE}) ErrorMsg({ERROR})", "SBETYPE",
                   sbeType, "ERROR", e);
    }
    catch (const std::exception& e)
    {
        lg2::error("SBE Dump request failed, chip type({CHIPTYPE}) "
                   "position({CHIPPOS}), Error: {ERROR}",
                   "CHIPTYPE", chipName, "CHIPPOS", chipPos, "ERROR", e);
    }

    return isDumpFailure;
}
275
Dhruvaraj Subhashchandran858d1aa2021-10-27 03:26:06 -0500276void SbeDumpCollector::collectDumpFromSBE(struct pdbg_target* chip,
277 const std::filesystem::path& path,
278 uint32_t id, uint8_t type,
279 uint8_t clockState,
280 uint64_t failingUnit)
281{
282 auto chipPos = pdbg_target_index(chip);
Dhruvaraj Subhashchandran6feeebd2021-10-19 05:03:59 -0500283 SBETypes sbeType = getSBEType(chip);
284 auto chipName = sbeTypeAttributes.at(sbeType).chipName;
Dhruvaraj Subhashchandran858d1aa2021-10-27 03:26:06 -0500285 lg2::info(
Dhruvaraj Subhashchandrane74e9162024-04-01 09:53:13 -0500286 "Collecting dump from ({CHIPTYPE}) ({POSITION}): path({PATH}) id({ID}) "
287 "type({TYPE}) clockState({CLOCKSTATE}) failingUnit({FAILINGUNIT})",
288 "CHIPTYPE", chipName, "POSITION", chipPos, "PATH", path.string(), "ID",
289 id, "TYPE", type, "CLOCKSTATE", clockState, "FAILINGUNIT", failingUnit);
Dhruvaraj Subhashchandrana699e312021-10-27 07:20:34 -0500290
291 util::DumpDataPtr dataPtr;
292 uint32_t len = 0;
293 uint8_t collectFastArray =
294 checkFastarrayCollectionNeeded(clockState, type, failingUnit, chipPos);
295
296 try
297 {
298 openpower::phal::sbe::getDump(chip, type, clockState, collectFastArray,
299 dataPtr.getPtr(), &len);
300 }
301 catch (const openpower::phal::sbeError_t& sbeError)
302 {
303 if (sbeError.errType() ==
304 openpower::phal::exception::SBE_CHIPOP_NOT_ALLOWED)
305 {
306 // SBE is not ready to accept chip-ops,
307 // Skip the request, no additional error handling required.
308 lg2::info("Collect dump: Skipping ({ERROR}) dump({TYPE}) "
309 "on proc({PROC}) clock state({CLOCKSTATE})",
310 "ERROR", sbeError, "TYPE", type, "PROC", chipPos,
311 "CLOCKSTATE", clockState);
312 return;
313 }
314
Dhruvaraj Subhashchandranf2298892024-04-21 04:42:55 -0500315 // If the FFDC is from actual chip-op failure this function will
316 // return true, if the chip-op is not failed but FFDC is present
317 // then create PELs with FFDC but write the dump contents to the
318 // file.
319 if (logErrorAndCreatePEL(sbeError, chipPos, sbeType,
320 SBEFIFO_CMD_CLASS_DUMP, SBEFIFO_CMD_GET_DUMP))
321 {
322 lg2::error("Error in collecting dump dump type({TYPE}), "
323 "clockstate({CLOCKSTATE}), chip type({CHIPTYPE}) "
324 "position({POSITION}), "
325 "collectFastArray({COLLECTFASTARRAY}) error({ERROR})",
326 "TYPE", type, "CLOCKSTATE", clockState, "CHIPTYPE",
327 chipName, "POSITION", chipPos, "COLLECTFASTARRAY",
328 collectFastArray, "ERROR", sbeError);
329 return;
330 }
Dhruvaraj Subhashchandrana699e312021-10-27 07:20:34 -0500331 }
Dhruvaraj Subhashchandran6feeebd2021-10-19 05:03:59 -0500332 writeDumpFile(path, id, clockState, 0, chipName, chipPos, dataPtr, len);
Dhruvaraj Subhashchandrana699e312021-10-27 07:20:34 -0500333}
334
335void SbeDumpCollector::writeDumpFile(
336 const std::filesystem::path& path, const uint32_t id,
Dhruvaraj Subhashchandran6feeebd2021-10-19 05:03:59 -0500337 const uint8_t clockState, const uint8_t nodeNum,
338 const std::string& chipName, const uint8_t chipPos,
339 util::DumpDataPtr& dataPtr, const uint32_t len)
Dhruvaraj Subhashchandrana699e312021-10-27 07:20:34 -0500340{
341 using namespace sdbusplus::xyz::openbmc_project::Common::Error;
342 namespace fileError = sdbusplus::xyz::openbmc_project::Common::File::Error;
343
344 // Construct the filename
345 std::ostringstream filenameBuilder;
346 filenameBuilder << std::setw(8) << std::setfill('0') << id
347 << ".SbeDataClocks"
348 << (clockState == SBE_CLOCK_ON ? "On" : "Off") << ".node"
349 << static_cast<int>(nodeNum) << "." << chipName
350 << static_cast<int>(chipPos);
351
352 auto dumpPath = path / filenameBuilder.str();
353
354 // Attempt to open the file
355 std::ofstream outfile(dumpPath, std::ios::out | std::ios::binary);
356 if (!outfile)
357 {
358 using namespace sdbusplus::xyz::openbmc_project::Common::File::Error;
359 using metadata = xyz::openbmc_project::Common::File::Open;
360 // Unable to open the file for writing
361 auto err = errno;
362 lg2::error("Error opening file to write dump, "
363 "errno({ERRNO}), filepath({FILEPATH})",
364 "ERRNO", err, "FILEPATH", dumpPath.string());
365
366 report<Open>(metadata::ERRNO(err), metadata::PATH(dumpPath.c_str()));
367 // Just return here, so that the dumps collected from other
368 // SBEs can be packaged.
369 return;
370 }
371
372 // Write to the file
373 try
374 {
375 outfile.write(reinterpret_cast<const char*>(dataPtr.getData()), len);
376
377 lg2::info("Successfully wrote dump file "
378 "path=({PATH}) size=({SIZE})",
379 "PATH", dumpPath.string(), "SIZE", len);
380 }
381 catch (const std::ofstream::failure& oe)
382 {
383 using namespace sdbusplus::xyz::openbmc_project::Common::File::Error;
384 using metadata = xyz::openbmc_project::Common::File::Write;
385
386 lg2::error(
387 "Failed to write to dump file, "
388 "errorMsg({ERROR}), error({ERRORCODE}), filepath({FILEPATH})",
389 "ERROR", oe, "ERRORCODE", oe.code().value(), "FILEPATH",
390 dumpPath.string());
391 report<Write>(metadata::ERRNO(oe.code().value()),
392 metadata::PATH(dumpPath.c_str()));
393 // Just return here so dumps collected from other SBEs can be
394 // packaged.
395 }
Dhruvaraj Subhashchandran858d1aa2021-10-27 03:26:06 -0500396}
397
Dhruvaraj Subhashchandranf9f65b82022-10-13 06:46:43 -0500398bool SbeDumpCollector::executeThreadStop(struct pdbg_target* target)
399{
400 try
401 {
402 openpower::phal::sbe::threadStopProc(target);
403 return true;
404 }
405 catch (const openpower::phal::sbeError_t& sbeError)
406 {
407 uint64_t chipPos = pdbg_target_index(target);
408 if (sbeError.errType() ==
409 openpower::phal::exception::SBE_CHIPOP_NOT_ALLOWED)
410 {
411 lg2::info("SBE is not ready to accept chip-op: Skipping "
412 "stop instruction on proc-({POSITION}) error({ERROR}) ",
413 "POSITION", chipPos, "ERROR", sbeError);
414 return false; // Do not include the target for dump collection
415 }
416
417 lg2::error("Stop instructions failed on "
418 "proc-({POSITION}) error({ERROR}) ",
419 "POSITION", chipPos, "ERROR", sbeError);
420
421 logErrorAndCreatePEL(sbeError, chipPos, SBETypes::PROC,
422 SBEFIFO_CMD_CLASS_INSTRUCTION,
423 SBEFIFO_CMD_CONTROL_INSN);
424 // For TIMEOUT, log the error and skip adding the processor for dump
425 // collection
426 if (sbeError.errType() == openpower::phal::exception::SBE_CMD_TIMEOUT)
427 {
428 return false;
429 }
430 }
431 // Include the target for dump collection for SBE_CMD_FAILED or any other
432 // non-critical errors
433 return true;
434}
435
Dhruvaraj Subhashchandran858d1aa2021-10-27 03:26:06 -0500436} // namespace openpower::dump::sbe_chipop