blob: 479c7897a06bcffafc4ceb3a1d90312bbd319ddb [file] [log] [blame]
Chau Lya743e382024-10-26 11:12:22 +00001#include "oem_event_manager.hpp"
2
Dung Cao72c8aa02023-11-22 02:31:41 +00003#include "libcper/Cper.h"
4
5#include "cper.hpp"
Chau Lya743e382024-10-26 11:12:22 +00006#include "requester/handler.hpp"
7#include "requester/request.hpp"
8
9#include <config.h>
10#include <libpldm/pldm.h>
11#include <libpldm/utils.h>
12#include <systemd/sd-journal.h>
13
14#include <phosphor-logging/lg2.hpp>
15#include <xyz/openbmc_project/Logging/Entry/server.hpp>
16
17#include <algorithm>
18#include <map>
Thu Nguyen79f9ff62024-11-22 03:36:27 +000019#include <set>
Chau Lya743e382024-10-26 11:12:22 +000020#include <sstream>
21#include <string>
22#include <unordered_map>
23
24namespace pldm
25{
26namespace oem_ampere
27{
Dung Cao4a503832025-01-08 03:45:17 +000028namespace fs = std::filesystem;
29using namespace std::chrono;
30
Chau Lya743e382024-10-26 11:12:22 +000031namespace boot_stage = boot::stage;
Chau Lycebf4762024-10-03 09:02:54 +000032namespace ddr_status = ddr::status;
33namespace dimm_status = dimm::status;
34namespace dimm_syndrome = dimm::training_failure::dimm_syndrome;
35namespace phy_syndrome = dimm::training_failure::phy_syndrome;
36namespace training_failure = dimm::training_failure;
Chau Lya743e382024-10-26 11:12:22 +000037
Chaul Ly198084b2024-12-13 09:02:52 +000038constexpr const char* ampereEventRegistry = "OpenBMC.0.1.AmpereEvent";
39constexpr const char* ampereWarningRegistry = "OpenBMC.0.1.AmpereWarning";
40constexpr const char* ampereCriticalRegistry = "OpenBMC.0.1.AmpereCritical";
Chau Lya743e382024-10-26 11:12:22 +000041constexpr const char* BIOSFWPanicRegistry =
Chaul Ly198084b2024-12-13 09:02:52 +000042 "OpenBMC.0.1.BIOSFirmwarePanicReason";
Chau Lya743e382024-10-26 11:12:22 +000043constexpr auto maxDIMMIdxBitNum = 24;
Chau Lycebf4762024-10-03 09:02:54 +000044constexpr auto maxDIMMInstantNum = 24;
Chau Lya743e382024-10-26 11:12:22 +000045
Thu Nguyen79f9ff62024-11-22 03:36:27 +000046const std::set<uint16_t> rasUESensorIDs = {CORE_UE, MCU_UE, PCIE_UE, SOC_UE};
47
/*
   Status suffixes for a generic boot stage.
   Indexed by byte 0 of the boot overall reading.
*/
std::array<std::string, 3> bootStatMsg{" booting", " completed", " failed"};

/*
   Status suffixes for the DDR training boot stage.
   Indexed by byte 0 of the boot overall reading.
*/
std::array<std::string, 3> ddrTrainingMsg{
    " progress started", " in-progress", " progress completed"};

/*
   PMIC temperature alert descriptions.
   Indexed by byte 0 of the DIMM status reading.
*/
std::array<std::string, 8> pmicTempAlertMsg{
    "Below 85°C", "85°C",  "95°C",  "105°C",
    "115°C",      "125°C", "135°C", "Equal or greater than 140°C"};
67
68/*
Chau Lya743e382024-10-26 11:12:22 +000069 In Ampere systems, BMC only directly communicates with MCTP/PLDM SoC
70 EPs through SMBus and PCIe. When host boots up, SMBUS interface
71 comes up first. In this interface, BMC is bus owner.
72
73 mctpd will set the EID 0x14 for S0 and 0x16 for S1 (if available).
74 pldmd will always use TID 1 for S0 and TID 2 for S1 (if available).
75*/
76EventToMsgMap_t tidToSocketNameMap = {{1, "SOCKET 0"}, {2, "SOCKET 1"}};
77
78/*
79 A map between sensor IDs and their names in string.
80 Using pldm::oem::sensor_ids
81*/
Chau Ly4cca3dc2024-10-03 09:07:09 +000082EventToMsgMap_t sensorIdToStrMap = {
Chau Lyef214b52024-10-16 09:40:38 +000083 {DDR_STATUS, "DDR_STATUS"},
84 {PCP_VR_STATE, "PCP_VR_STATE"},
85 {SOC_VR_STATE, "SOC_VR_STATE"},
86 {DPHY_VR1_STATE, "DPHY_VR1_STATE"},
87 {DPHY_VR2_STATE, "DPHY_VR2_STATE"},
88 {D2D_VR_STATE, "D2D_VR_STATE"},
89 {IOC_VR1_STATE, "IOC_VR1_STATE"},
90 {IOC_VR2_STATE, "IOC_VR2_STATE"},
91 {PCI_D_VR_STATE, "PCI_D_VR_STATE"},
92 {PCI_A_VR_STATE, "PCI_A_VR_STATE"},
93 {PCIE_HOT_PLUG, "PCIE_HOT_PLUG"},
94 {BOOT_OVERALL, "BOOT_OVERALL"},
Chau Lyb01357f2024-10-17 09:18:01 +000095 {SOC_HEALTH_AVAILABILITY, "SOC_HEALTH_AVAILABILITY"},
96 {WATCH_DOG, "WATCH_DOG"}};
Chau Lya743e382024-10-26 11:12:22 +000097
98/*
99 A map between the boot stages and logging strings.
100 Using pldm::oem::boot::stage::boot_stage
101*/
102EventToMsgMap_t bootStageToMsgMap = {
103 {boot_stage::SECPRO, "SECpro"},
104 {boot_stage::MPRO, "Mpro"},
105 {boot_stage::ATF_BL1, "ATF BL1"},
106 {boot_stage::ATF_BL2, "ATF BL2"},
107 {boot_stage::DDR_INITIALIZATION, "DDR initialization"},
108 {boot_stage::DDR_TRAINING, "DDR training"},
109 {boot_stage::S0_DDR_TRAINING_FAILURE, "DDR training failure"},
110 {boot_stage::ATF_BL31, "ATF BL31"},
111 {boot_stage::ATF_BL32, "ATF BL32"},
112 {boot_stage::S1_DDR_TRAINING_FAILURE, "DDR training failure"},
113 {boot_stage::UEFI_STATUS_CLASS_CODE_MIN,
114 "ATF BL33 (UEFI) booting status = "}};
115
116/*
Chau Lycebf4762024-10-03 09:02:54 +0000117 A map between DDR status and logging strings.
118 Using pldm::oem::ddr::status::ddr_status
119*/
120EventToMsgMap_t ddrStatusToMsgMap = {
121 {ddr_status::NO_SYSTEM_LEVEL_ERROR, "has no system level error"},
122 {ddr_status::ECC_INITIALIZATION_FAILURE, "has ECC initialization failure"},
123 {ddr_status::CONFIGURATION_FAILURE, "has configuration failure at DIMMs:"},
124 {ddr_status::TRAINING_FAILURE, "has training failure at DIMMs:"},
125 {ddr_status::OTHER_FAILURE, "has other failure"},
126 {ddr_status::BOOT_FAILURE_NO_VALID_CONFIG,
127 "has boot failure due to no configuration"},
128 {ddr_status::FAILSAFE_ACTIVATED_NEXT_BOOT_SUCCESS,
129 "failsafe activated but boot success with the next valid configuration"}};
130
131/*
132 A map between DIMM status and logging strings.
133 Using pldm::oem::dimm::status::dimm_status
134*/
135EventToMsgMap_t dimmStatusToMsgMap = {
136 {dimm_status::INSTALLED_NO_ERROR, "is installed and no error"},
137 {dimm_status::NOT_INSTALLED, "is not installed"},
138 {dimm_status::OTHER_FAILURE, "has other failure"},
139 {dimm_status::INSTALLED_BUT_DISABLED, "is installed but disabled"},
140 {dimm_status::TRAINING_FAILURE, "has training failure; "},
141 {dimm_status::PMIC_TEMP_ALERT, "has PMIC temperature alert"}};
142
143/*
144 A map between PHY training failure syndrome and logging strings.
145 Using
146 pldm::oem::dimm::training_faillure::phy_syndrome::phy_training_failure_syndrome
147*/
148EventToMsgMap_t phyTrainingFailureSyndromeToMsgMap = {
149 {phy_syndrome::NA, "(N/A)"},
150 {phy_syndrome::PHY_TRAINING_SETUP_FAILURE, "(PHY training setup failure)"},
151 {phy_syndrome::CA_LEVELING, "(CA leveling)"},
152 {phy_syndrome::PHY_WRITE_LEVEL_FAILURE,
153 "(PHY write level failure - see syndrome 1)"},
154 {phy_syndrome::PHY_READ_GATE_LEVELING_FAILURE,
155 "(PHY read gate leveling failure)"},
156 {phy_syndrome::PHY_READ_LEVEL_FAILURE, "(PHY read level failure)"},
157 {phy_syndrome::WRITE_DQ_LEVELING, "(Write DQ leveling)"},
158 {phy_syndrome::PHY_SW_TRAINING_FAILURE, "(PHY SW training failure)"}};
159
160/*
161 A map between DIMM training failure syndrome and logging strings.
162 Using
163 pldm::oem::dimm::training_faillure::dimm_syndrome::dimm_training_failure_syndrome
164*/
165EventToMsgMap_t dimmTrainingFailureSyndromeToMsgMap = {
166 {dimm_syndrome::NA, "(N/A)"},
167 {dimm_syndrome::DRAM_VREFDQ_TRAINING_FAILURE,
168 "(DRAM VREFDQ training failure)"},
169 {dimm_syndrome::LRDIMM_DB_TRAINING_FAILURE, "(LRDIMM DB training failure)"},
170 {dimm_syndrome::LRDRIMM_DB_SW_TRAINING_FAILURE,
171 "(LRDRIMM DB SW training failure)"}};
172
173/*
174 A map between DIMM training failure type and a pair of <logging strings -
175 syndrome map>. Using
176 pldm::oem::dimm::training_faillure::dimm_training_failure_type
177*/
178std::unordered_map<uint8_t, std::pair<std::string, EventToMsgMap_t>>
179 dimmTrainingFailureTypeMap = {
180 {training_failure::PHY_TRAINING_FAILURE_TYPE,
181 std::make_pair("PHY training failure",
182 phyTrainingFailureSyndromeToMsgMap)},
183 {training_failure::DIMM_TRAINING_FAILURE_TYPE,
184 std::make_pair("DIMM training failure",
185 dimmTrainingFailureSyndromeToMsgMap)}};
186
187/*
Chau Lya743e382024-10-26 11:12:22 +0000188 A map between log level and the registry used for Redfish SEL log
189 Using pldm::oem::log_level
190*/
191std::unordered_map<log_level, std::string> logLevelToRedfishMsgIdMap = {
Chau Ly3de0d942024-10-03 08:57:11 +0000192 {log_level::OK, ampereEventRegistry},
193 {log_level::WARNING, ampereWarningRegistry},
194 {log_level::CRITICAL, ampereCriticalRegistry},
Chau Lya743e382024-10-26 11:12:22 +0000195 {log_level::BIOSFWPANIC, BIOSFWPanicRegistry}};
196
Chau Lyef214b52024-10-16 09:40:38 +0000197std::unordered_map<
198 uint16_t,
199 std::vector<std::pair<
200 std::string,
201 std::unordered_map<uint8_t, std::pair<log_level, std::string>>>>>
202 stateSensorToMsgMap = {
203 {SOC_HEALTH_AVAILABILITY,
204 {{"SoC Health",
205 {{1, {log_level::OK, "Normal"}},
206 {2, {log_level::WARNING, "Non-Critical"}},
207 {3, {log_level::CRITICAL, "Critical"}},
208 {4, {log_level::CRITICAL, "Fatal"}}}},
209 {"SoC Availability",
210 {{1, {log_level::OK, "Enabled"}},
211 {2, {log_level::WARNING, "Disabled"}},
Chau Lyb01357f2024-10-17 09:18:01 +0000212 {3, {log_level::CRITICAL, "Shutdown"}}}}}},
213 {WATCH_DOG,
214 {{"Global Watch Dog",
215 {{1, {log_level::OK, "Normal"}},
216 {2, {log_level::CRITICAL, "Timer Expired"}}}},
217 {"Secure Watch Dog",
218 {{1, {log_level::OK, "Normal"}},
219 {2, {log_level::CRITICAL, "Timer Expired"}}}},
220 {"Non-secure Watch Dog",
221 {{1, {log_level::OK, "Normal"}},
222 {2, {log_level::CRITICAL, "Timer Expired"}}}}}}};
Chau Lyef214b52024-10-16 09:40:38 +0000223
Patrick Williams366507c2025-02-03 14:28:01 -0500224std::string OemEventManager::prefixMsgStrCreation(pldm_tid_t tid,
225 uint16_t sensorId)
Chau Lya743e382024-10-26 11:12:22 +0000226{
227 std::string description;
228 if (!tidToSocketNameMap.contains(tid))
229 {
230 description += "TID " + std::to_string(tid) + ": ";
231 }
232 else
233 {
234 description += tidToSocketNameMap[tid] + ": ";
235 }
236
237 if (!sensorIdToStrMap.contains(sensorId))
238 {
239 description += "Sensor ID " + std::to_string(sensorId) + ": ";
240 }
241 else
242 {
243 description += sensorIdToStrMap[sensorId] + ": ";
244 }
245
246 return description;
247}
248
249void OemEventManager::sendJournalRedfish(const std::string& description,
250 log_level& logLevel)
251{
252 if (description.empty())
253 {
254 return;
255 }
256
257 if (!logLevelToRedfishMsgIdMap.contains(logLevel))
258 {
259 lg2::error("Invalid {LEVEL} Description {DES}", "LEVEL", logLevel,
260 "DES", description);
261 return;
262 }
263 auto redfishMsgId = logLevelToRedfishMsgIdMap[logLevel];
264 lg2::info("MESSAGE={DES}", "DES", description, "REDFISH_MESSAGE_ID",
265 redfishMsgId, "REDFISH_MESSAGE_ARGS", description);
266}
267
268std::string OemEventManager::dimmIdxsToString(uint32_t dimmIdxs)
269{
270 std::string description;
271 for (const auto bitIdx : std::views::iota(0, maxDIMMIdxBitNum))
272 {
273 if (dimmIdxs & (static_cast<uint32_t>(1) << bitIdx))
274 {
275 description += " #" + std::to_string(bitIdx);
276 }
277 }
278 return description;
279}
280
Thu Nguyen93d0ca32024-11-14 23:46:40 +0000281uint8_t OemEventManager::sensorIdToDIMMIdx(const uint16_t& sensorId)
282{
283 uint8_t dimmIdx = maxDIMMInstantNum;
284 int sensorId_Off = sensorId - 4;
285 if ((sensorId_Off >= 0) && ((sensorId_Off % 2) == 0) &&
286 ((sensorId_Off / 2) < maxDIMMInstantNum))
287 {
288 dimmIdx = sensorId_Off / 2;
289 }
290 return dimmIdx;
291}
292
Chau Lya743e382024-10-26 11:12:22 +0000293void OemEventManager::handleBootOverallEvent(
294 pldm_tid_t /*tid*/, uint16_t /*sensorId*/, uint32_t presentReading)
295{
296 log_level logLevel{log_level::OK};
297 std::string description;
298 std::stringstream strStream;
299
300 uint8_t byte0 = (presentReading & 0x000000ff);
301 uint8_t byte1 = (presentReading & 0x0000ff00) >> 8;
302 uint8_t byte2 = (presentReading & 0x00ff0000) >> 16;
303 uint8_t byte3 = (presentReading & 0xff000000) >> 24;
304 /*
305 * Handle SECpro, Mpro, ATF BL1, ATF BL2, ATF BL31,
306 * ATF BL32 and DDR initialization
307 */
308 if (bootStageToMsgMap.contains(byte3))
309 {
310 // Boot stage adding
311 description += bootStageToMsgMap[byte3];
312
313 switch (byte3)
314 {
315 case boot_stage::DDR_TRAINING:
316 if (byte0 >= ddrTrainingMsg.size())
317 {
318 logLevel = log_level::BIOSFWPANIC;
319 description += " unknown status";
320 }
321 else
322 {
323 description += ddrTrainingMsg[byte0];
324 }
325 if (0x01 == byte0)
326 {
327 // Add complete percentage
328 description += " at " + std::to_string(byte1) + "%";
329 }
330 break;
331 case boot_stage::S0_DDR_TRAINING_FAILURE:
332 case boot_stage::S1_DDR_TRAINING_FAILURE:
333 // ddr_training_status_msg()
334 logLevel = log_level::BIOSFWPANIC;
335 description += " at DIMMs:";
336 // dimmIdxs = presentReading & 0x00ffffff;
337 description += dimmIdxsToString(presentReading & 0x00ffffff);
338 description += " of socket ";
339 description +=
340 (boot_stage::S0_DDR_TRAINING_FAILURE == byte3) ? "0" : "1";
341 break;
342 default:
343 if (byte0 >= bootStatMsg.size())
344 {
345 logLevel = log_level::BIOSFWPANIC;
346 description += " unknown status";
347 }
348 else
349 {
350 description += bootStatMsg[byte0];
351 }
352 break;
353 }
354
355 // Sensor report action is fail
356 if (boot::status::BOOT_STATUS_FAILURE == byte2)
357 {
358 logLevel = log_level::BIOSFWPANIC;
359 }
360 }
361 else
362 {
363 if (byte3 <= boot_stage::UEFI_STATUS_CLASS_CODE_MAX)
364 {
365 description +=
366 bootStageToMsgMap[boot_stage::UEFI_STATUS_CLASS_CODE_MIN];
367
368 strStream
369 << "Segment (0x" << std::setfill('0') << std::hex
370 << std::setw(8) << static_cast<uint32_t>(presentReading)
Chau Ly3de0d942024-10-03 08:57:11 +0000371 << "); Status Class (0x" << std::setw(2)
372 << static_cast<uint32_t>(byte3) << "); Status SubClass (0x"
Chau Lya743e382024-10-26 11:12:22 +0000373 << std::setw(2) << static_cast<uint32_t>(byte2)
Chau Ly3de0d942024-10-03 08:57:11 +0000374 << "); Operation Code (0x" << std::setw(4)
Chau Lya743e382024-10-26 11:12:22 +0000375 << static_cast<uint32_t>((presentReading & 0xffff0000) >> 16)
376 << ")" << std::dec;
377
378 description += strStream.str();
379 }
380 }
381
382 // Log to Redfish event
383 sendJournalRedfish(description, logLevel);
384}
385
386int OemEventManager::processNumericSensorEvent(
387 pldm_tid_t tid, uint16_t sensorId, const uint8_t* sensorData,
388 size_t sensorDataLength)
389{
390 uint8_t eventState = 0;
391 uint8_t previousEventState = 0;
392 uint8_t sensorDataSize = 0;
393 uint32_t presentReading;
394 auto rc = decode_numeric_sensor_data(
395 sensorData, sensorDataLength, &eventState, &previousEventState,
396 &sensorDataSize, &presentReading);
397 if (rc)
398 {
399 lg2::error(
400 "Failed to decode numericSensorState event for terminus ID {TID}, error {RC} ",
401 "TID", tid, "RC", rc);
402 return rc;
403 }
404
Chau Lycebf4762024-10-03 09:02:54 +0000405 // DIMMx_Status sensorID 4+2*index (index 0 -> maxDIMMInstantNum-1)
Thu Nguyen93d0ca32024-11-14 23:46:40 +0000406 if (auto dimmIdx = sensorIdToDIMMIdx(sensorId); dimmIdx < maxDIMMInstantNum)
Chau Lycebf4762024-10-03 09:02:54 +0000407 {
408 handleDIMMStatusEvent(tid, sensorId, presentReading);
409 return PLDM_SUCCESS;
410 }
411
Chau Lya743e382024-10-26 11:12:22 +0000412 switch (sensorId)
413 {
414 case BOOT_OVERALL:
415 handleBootOverallEvent(tid, sensorId, presentReading);
416 break;
Chau Ly3de0d942024-10-03 08:57:11 +0000417 case PCIE_HOT_PLUG:
418 handlePCIeHotPlugEvent(tid, sensorId, presentReading);
419 break;
Chau Lycebf4762024-10-03 09:02:54 +0000420 case DDR_STATUS:
421 handleDDRStatusEvent(tid, sensorId, presentReading);
422 break;
Chau Ly4cca3dc2024-10-03 09:07:09 +0000423 case PCP_VR_STATE:
424 case SOC_VR_STATE:
425 case DPHY_VR1_STATE:
426 case DPHY_VR2_STATE:
427 case D2D_VR_STATE:
428 case IOC_VR1_STATE:
429 case IOC_VR2_STATE:
430 case PCI_D_VR_STATE:
431 case PCI_A_VR_STATE:
432 handleVRDStatusEvent(tid, sensorId, presentReading);
433 break;
Chau Lyb01357f2024-10-17 09:18:01 +0000434 case WATCH_DOG:
435 handleNumericWatchdogEvent(tid, sensorId, presentReading);
436 break;
Chau Lya743e382024-10-26 11:12:22 +0000437 default:
438 std::string description;
439 std::stringstream strStream;
440 log_level logLevel = log_level::OK;
441
442 description += "SENSOR_EVENT : NUMERIC_SENSOR_STATE: ";
443 description += prefixMsgStrCreation(tid, sensorId);
444 strStream << std::setfill('0') << std::hex << "eventState 0x"
445 << std::setw(2) << static_cast<uint32_t>(eventState)
446 << " previousEventState 0x" << std::setw(2)
447 << static_cast<uint32_t>(previousEventState)
448 << " sensorDataSize 0x" << std::setw(2)
449 << static_cast<uint32_t>(sensorDataSize)
450 << " presentReading 0x" << std::setw(8)
451 << static_cast<uint32_t>(presentReading) << std::dec;
452 description += strStream.str();
453
454 sendJournalRedfish(description, logLevel);
455 break;
456 }
457 return PLDM_SUCCESS;
458}
459
460int OemEventManager::processStateSensorEvent(pldm_tid_t tid, uint16_t sensorId,
461 const uint8_t* sensorData,
462 size_t sensorDataLength)
463{
464 uint8_t sensorOffset = 0;
465 uint8_t eventState = 0;
466 uint8_t previousEventState = 0;
467
468 auto rc =
469 decode_state_sensor_data(sensorData, sensorDataLength, &sensorOffset,
470 &eventState, &previousEventState);
471 if (rc)
472 {
473 lg2::error(
474 "Failed to decode stateSensorState event for terminus ID {TID}, error {RC}",
475 "TID", tid, "RC", rc);
476 return rc;
477 }
478
479 std::string description;
Chau Lya743e382024-10-26 11:12:22 +0000480 log_level logLevel = log_level::OK;
481
Chau Lyef214b52024-10-16 09:40:38 +0000482 if (stateSensorToMsgMap.contains(sensorId))
483 {
484 description += prefixMsgStrCreation(tid, sensorId);
485 auto componentMap = stateSensorToMsgMap[sensorId];
486 if (sensorOffset < componentMap.size())
487 {
488 description += std::get<0>(componentMap[sensorOffset]);
489 auto stateMap = std::get<1>(componentMap[sensorOffset]);
490 if (stateMap.contains(eventState))
491 {
492 logLevel = std::get<0>(stateMap[eventState]);
493 description += " state : " + std::get<1>(stateMap[eventState]);
494 if (stateMap.contains(previousEventState))
495 {
496 description += "; previous state: " +
497 std::get<1>(stateMap[previousEventState]);
498 }
499 }
500 else
501 {
502 description += " sends unsupported event state: " +
503 std::to_string(eventState);
504 if (stateMap.contains(previousEventState))
505 {
506 description += "; previous state: " +
507 std::get<1>(stateMap[previousEventState]);
508 }
509 }
510 }
511 else
512 {
513 description += "sends unsupported component sensor offset " +
514 std::to_string(sensorOffset);
515 }
516 }
517 else
518 {
519 std::stringstream strStream;
520 description += "SENSOR_EVENT : STATE_SENSOR_STATE: ";
521 description += prefixMsgStrCreation(tid, sensorId);
522 strStream << std::setfill('0') << std::hex << "sensorOffset 0x"
523 << std::setw(2) << static_cast<uint32_t>(sensorOffset)
524 << "eventState 0x" << std::setw(2)
525 << static_cast<uint32_t>(eventState)
526 << " previousEventState 0x" << std::setw(2)
527 << static_cast<uint32_t>(previousEventState) << std::dec;
528 description += strStream.str();
529 }
Chau Lya743e382024-10-26 11:12:22 +0000530
531 sendJournalRedfish(description, logLevel);
532
533 return PLDM_SUCCESS;
534}
535
536int OemEventManager::processSensorOpStateEvent(
537 pldm_tid_t tid, uint16_t sensorId, const uint8_t* sensorData,
538 size_t sensorDataLength)
539{
540 uint8_t present_op_state = 0;
541 uint8_t previous_op_state = 0;
542
543 auto rc = decode_sensor_op_data(sensorData, sensorDataLength,
544 &present_op_state, &previous_op_state);
545 if (rc)
546 {
547 lg2::error(
548 "Failed to decode sensorOpState event for terminus ID {TID}, error {RC}",
549 "TID", tid, "RC", rc);
550 return rc;
551 }
552
553 std::string description;
554 std::stringstream strStream;
555 log_level logLevel = log_level::OK;
556
557 description += "SENSOR_EVENT : SENSOR_OP_STATE: ";
558 description += prefixMsgStrCreation(tid, sensorId);
559 strStream << std::setfill('0') << std::hex << "present_op_state 0x"
560 << std::setw(2) << static_cast<uint32_t>(present_op_state)
561 << "previous_op_state 0x" << std::setw(2)
562 << static_cast<uint32_t>(previous_op_state) << std::dec;
563 description += strStream.str();
564
565 sendJournalRedfish(description, logLevel);
566
567 return PLDM_SUCCESS;
568}
569
570int OemEventManager::handleSensorEvent(
571 const pldm_msg* request, size_t payloadLength, uint8_t /* formatVersion */,
572 pldm_tid_t tid, size_t eventDataOffset)
573{
574 /* This OEM event handler is only used for SoC terminus*/
575 if (!tidToSocketNameMap.contains(tid))
576 {
577 return PLDM_SUCCESS;
578 }
579 auto eventData =
580 reinterpret_cast<const uint8_t*>(request->payload) + eventDataOffset;
581 auto eventDataSize = payloadLength - eventDataOffset;
582
583 uint16_t sensorId = 0;
584 uint8_t sensorEventClassType = 0;
585 size_t eventClassDataOffset = 0;
586 auto rc =
587 decode_sensor_event_data(eventData, eventDataSize, &sensorId,
588 &sensorEventClassType, &eventClassDataOffset);
589 if (rc)
590 {
591 lg2::error("Failed to decode sensor event data return code {RC}.", "RC",
592 rc);
593 return rc;
594 }
595 const uint8_t* sensorData = eventData + eventClassDataOffset;
596 size_t sensorDataLength = eventDataSize - eventClassDataOffset;
597
598 switch (sensorEventClassType)
599 {
600 case PLDM_NUMERIC_SENSOR_STATE:
601 {
602 return processNumericSensorEvent(tid, sensorId, sensorData,
603 sensorDataLength);
604 }
605 case PLDM_STATE_SENSOR_STATE:
606 {
607 return processStateSensorEvent(tid, sensorId, sensorData,
608 sensorDataLength);
609 }
610 case PLDM_SENSOR_OP_STATE:
611 {
612 return processSensorOpStateEvent(tid, sensorId, sensorData,
613 sensorDataLength);
614 }
615 default:
616 std::string description;
617 std::stringstream strStream;
618 log_level logLevel = log_level::OK;
619
620 description += "SENSOR_EVENT : Unsupported Sensor Class " +
621 std::to_string(sensorEventClassType) + ": ";
622 description += prefixMsgStrCreation(tid, sensorId);
623 strStream << std::setfill('0') << std::hex
624 << std::setw(sizeof(sensorData) * 2) << "Sensor data: ";
625
626 auto dataPtr = sensorData;
627 for ([[maybe_unused]] const auto& i :
628 std::views::iota(0, (int)sensorDataLength))
629 {
630 strStream << "0x" << static_cast<uint32_t>(*dataPtr);
631 dataPtr += sizeof(sensorData);
632 }
633
634 description += strStream.str();
635
636 sendJournalRedfish(description, logLevel);
637 }
638 lg2::info("Unsupported class type {CLASSTYPE}", "CLASSTYPE",
639 sensorEventClassType);
640 return PLDM_ERROR;
641}
642
Chau Ly3de0d942024-10-03 08:57:11 +0000643void OemEventManager::handlePCIeHotPlugEvent(pldm_tid_t tid, uint16_t sensorId,
644 uint32_t presentReading)
645{
646 std::string description;
647 std::stringstream strStream;
648 PCIeHotPlugEventRecord_t record{presentReading};
649
650 std::string sAction = (!record.bits.action) ? "Insertion" : "Removal";
651 std::string sOpStatus = (!record.bits.opStatus) ? "Successful" : "Failed";
652 log_level logLevel =
653 (!record.bits.opStatus) ? log_level::OK : log_level::WARNING;
654
655 description += prefixMsgStrCreation(tid, sensorId);
656
657 strStream << "Segment (0x" << std::setfill('0') << std::hex << std::setw(2)
658 << static_cast<uint32_t>(record.bits.segment) << "); Bus (0x"
659 << std::setw(2) << static_cast<uint32_t>(record.bits.bus)
660 << "); Device (0x" << std::setw(2)
661 << static_cast<uint32_t>(record.bits.device) << "); Function (0x"
662 << std::setw(2) << static_cast<uint32_t>(record.bits.function)
663 << "); Action (" << sAction << "); Operation status ("
664 << sOpStatus << "); Media slot number (" << std::dec
665 << static_cast<uint32_t>(record.bits.mediaSlot) << ")";
666
667 description += strStream.str();
668
669 // Log to Redfish event
670 sendJournalRedfish(description, logLevel);
671}
672
Chau Lycebf4762024-10-03 09:02:54 +0000673std::string OemEventManager::dimmTrainingFailureToMsg(uint32_t failureInfo)
674{
675 std::string description;
676 DIMMTrainingFailure_t failure{failureInfo};
677
678 if (dimmTrainingFailureTypeMap.contains(failure.bits.type))
679 {
680 auto failureInfoMap = dimmTrainingFailureTypeMap[failure.bits.type];
681
682 description += std::get<0>(failureInfoMap);
683
684 description += "; MCU rank index " +
685 std::to_string(failure.bits.mcuRankIdx);
686
687 description += "; Slice number " +
688 std::to_string(failure.bits.sliceNum);
689
690 description += "; Upper nibble error status: ";
691 description += (!failure.bits.upperNibbStatErr)
692 ? "No error"
693 : "Found no rising edge";
694
695 description += "; Lower nibble error status: ";
696 description += (!failure.bits.lowerNibbStatErr)
697 ? "No error"
698 : "Found no rising edge";
699
700 description += "; Failure syndrome 0: ";
701
702 auto& syndromeMap = std::get<1>(failureInfoMap);
703 if (syndromeMap.contains(failure.bits.syndrome))
704 {
705 description += syndromeMap[failure.bits.syndrome];
706 }
707 else
708 {
709 description += "(Unknown syndrome)";
710 }
711 }
712 else
713 {
714 description += "Unknown training failure type " +
715 std::to_string(failure.bits.type);
716 }
717
718 return description;
719}
720
721void OemEventManager::handleDIMMStatusEvent(pldm_tid_t tid, uint16_t sensorId,
722 uint32_t presentReading)
723{
724 log_level logLevel{log_level::WARNING};
725 std::string description;
726 uint8_t byte3 = (presentReading & 0xff000000) >> 24;
727 uint32_t byte012 = presentReading & 0xffffff;
728
729 description += prefixMsgStrCreation(tid, sensorId);
730
Thu Nguyen93d0ca32024-11-14 23:46:40 +0000731 // DIMMx_Status sensorID 4+2*index (index 0 -> maxDIMMInstantNum-1)
732 auto dimmIdx = sensorIdToDIMMIdx(sensorId);
733 if (dimmIdx >= maxDIMMIdxBitNum)
734 {
735 return;
736 }
Chau Lycebf4762024-10-03 09:02:54 +0000737
738 description += "DIMM " + std::to_string(dimmIdx) + " ";
739
740 if (dimmStatusToMsgMap.contains(byte3))
741 {
742 if (byte3 == dimm_status::INSTALLED_NO_ERROR ||
743 byte3 == dimm_status::INSTALLED_BUT_DISABLED)
744 {
745 logLevel = log_level::OK;
746 }
747
748 description += dimmStatusToMsgMap[byte3];
749
750 if (byte3 == dimm_status::TRAINING_FAILURE)
751 {
752 description += "; " + dimmTrainingFailureToMsg(byte012);
753 }
754 else if (byte3 == dimm_status::PMIC_TEMP_ALERT)
755 {
756 uint8_t byte0 = (byte012 & 0xff);
757 if (byte0 < pmicTempAlertMsg.size())
758 {
759 description += ": " + pmicTempAlertMsg[byte0];
760 }
761 }
762 }
763 else
764 {
765 switch (byte3)
766 {
767 case dimm_status::PMIC_HIGH_TEMP:
768 if (byte012 == 0x01)
769 {
770 description += "has PMIC high temp condition";
771 }
772 break;
773 case dimm_status::TSx_HIGH_TEMP:
774 switch (byte012)
775 {
776 case 0x01:
777 description += "has TS0";
778 break;
779 case 0x02:
780 description += "has TS1";
781 break;
782 case 0x03:
783 description += "has TS0 and TS1";
784 break;
785 }
786 description += " exceeding their high temperature threshold";
787 break;
788 case dimm_status::SPD_HUB_HIGH_TEMP:
789 if (byte012 == 0x01)
790 {
791 description += "has SPD/HUB high temp condition";
792 }
793 break;
794 default:
795 description += "has unsupported status " +
796 std::to_string(byte3);
797 break;
798 }
799 }
800
801 // Log to Redfish event
802 sendJournalRedfish(description, logLevel);
803}
804
805void OemEventManager::handleDDRStatusEvent(pldm_tid_t tid, uint16_t sensorId,
806 uint32_t presentReading)
807{
808 log_level logLevel{log_level::WARNING};
809 std::string description;
810 uint8_t byte3 = (presentReading & 0xff000000) >> 24;
811 uint32_t byte012 = presentReading & 0xffffff;
812
813 description += prefixMsgStrCreation(tid, sensorId);
814
815 description += "DDR ";
816 if (ddrStatusToMsgMap.contains(byte3))
817 {
818 if (byte3 == ddr_status::NO_SYSTEM_LEVEL_ERROR)
819 {
820 logLevel = log_level::OK;
821 }
822
823 description += ddrStatusToMsgMap[byte3];
824
825 if (byte3 == ddr_status::CONFIGURATION_FAILURE ||
826 byte3 == ddr_status::TRAINING_FAILURE)
827 {
828 // List out failed DIMMs
829 description += dimmIdxsToString(byte012);
830 }
831 }
832 else
833 {
834 description += "has unsupported status " + std::to_string(byte3);
835 }
836
837 // Log to Redfish event
838 sendJournalRedfish(description, logLevel);
839}
840
Chau Ly4cca3dc2024-10-03 09:07:09 +0000841void OemEventManager::handleVRDStatusEvent(pldm_tid_t tid, uint16_t sensorId,
842 uint32_t presentReading)
843{
844 log_level logLevel{log_level::WARNING};
845 std::string description;
846 std::stringstream strStream;
847
848 description += prefixMsgStrCreation(tid, sensorId);
849
850 VRDStatus_t status{presentReading};
851
852 if (status.bits.warning && status.bits.critical)
853 {
854 description += "A VR warning and a VR critical";
855 logLevel = log_level::CRITICAL;
856 }
857 else
858 {
859 if (status.bits.warning)
860 {
861 description += "A VR warning";
862 }
863 else if (status.bits.critical)
864 {
865 description += "A VR critical";
866 logLevel = log_level::CRITICAL;
867 }
868 else
869 {
870 description += "No VR warning or critical";
871 logLevel = log_level::OK;
872 }
873 }
874 description += " condition observed";
875
876 strStream << "; VR status byte high is 0x" << std::setfill('0') << std::hex
877 << std::setw(2)
878 << static_cast<uint32_t>(status.bits.vr_status_byte_high)
879 << "; VR status byte low is 0x" << std::setw(2)
880 << static_cast<uint32_t>(status.bits.vr_status_byte_low)
881 << "; Reading is 0x" << std::setw(2)
882 << static_cast<uint32_t>(presentReading) << ";";
883
884 description += strStream.str();
885
886 // Log to Redfish event
887 sendJournalRedfish(description, logLevel);
888}
889
Chau Lyb01357f2024-10-17 09:18:01 +0000890void OemEventManager::handleNumericWatchdogEvent(
891 pldm_tid_t tid, uint16_t sensorId, uint32_t presentReading)
892{
893 std::string description;
894 log_level logLevel = log_level::CRITICAL;
895
896 description += prefixMsgStrCreation(tid, sensorId);
897
898 if (presentReading & 0x01)
899 {
900 description += "Global watchdog expired;";
901 }
902 if (presentReading & 0x02)
903 {
904 description += "Secure watchdog expired;";
905 }
906 if (presentReading & 0x04)
907 {
908 description += "Non-secure watchdog expired;";
909 }
910
911 // Log to Redfish event
912 sendJournalRedfish(description, logLevel);
913}
914
Dung Cao72c8aa02023-11-22 02:31:41 +0000915int OemEventManager::processOemMsgPollEvent(pldm_tid_t tid, uint16_t eventId,
916 const uint8_t* eventData,
917 size_t eventDataSize)
918{
919 EFI_AMPERE_ERROR_DATA ampHdr;
920
921 decodeCperRecord(eventData, eventDataSize, &ampHdr);
922
923 addCperSELLog(tid, eventId, &ampHdr);
924
Thu Nguyen4b537552024-11-19 08:43:23 +0000925 /* isBert at bit 12 of TypeId */
926 if (ampHdr.TypeId & 0x0800)
927 {
928 lg2::info("Ampere SoC BERT is triggered.");
929 std::variant<std::string> value(
930 "com.ampere.CrashCapture.Trigger.TriggerAction.Bert");
931 try
932 {
933 auto& bus = pldm::utils::DBusHandler::getBus();
934 auto method =
935 bus.new_method_call("com.ampere.CrashCapture.Trigger",
936 "/com/ampere/crashcapture/trigger",
937 pldm::utils::dbusProperties, "Set");
938 method.append("com.ampere.CrashCapture.Trigger", "TriggerActions",
939 value);
940 bus.call_noreply(method);
941 }
942 catch (const std::exception& e)
943 {
944 lg2::error("call BERT trigger error - {ERROR}", "ERROR", e);
945 }
946 }
947
Dung Cao72c8aa02023-11-22 02:31:41 +0000948 return PLDM_SUCCESS;
949}
950
Thu Nguyen79f9ff62024-11-22 03:36:27 +0000951int OemEventManager::handlepldmMessagePollEvent(
952 const pldm_msg* request, size_t payloadLength, uint8_t /* formatVersion */,
953 pldm_tid_t tid, size_t eventDataOffset)
954{
955 /* This OEM event handler is only used for SoC terminus*/
956 if (!tidToSocketNameMap.contains(tid))
957 {
958 return PLDM_SUCCESS;
959 }
960
961 auto eventData =
962 reinterpret_cast<const uint8_t*>(request->payload) + eventDataOffset;
963 auto eventDataSize = payloadLength - eventDataOffset;
964
965 pldm_message_poll_event poll_event{};
966 auto rc = decode_pldm_message_poll_event_data(eventData, eventDataSize,
967 &poll_event);
968 if (rc)
969 {
970 lg2::error("Failed to decode PldmMessagePollEvent event, error {RC} ",
971 "RC", rc);
972 return rc;
973 }
974
975 auto sensorID = poll_event.event_id;
976 /* The UE errors */
977 if (rasUESensorIDs.contains(sensorID))
978 {
979 pldm::utils::DBusMapping dbusMapping{
980 "/xyz/openbmc_project/led/groups/ras_ue_fault",
981 "xyz.openbmc_project.Led.Group", "Asserted", "bool"};
982 try
983 {
984 pldm::utils::DBusHandler().setDbusProperty(
985 dbusMapping, pldm::utils::PropertyValue{bool(true)});
986 }
987 catch (const std::exception& e)
988 {
989 lg2::error(
990 "Failed to set the RAS UE LED terminus ID {TID} sensor ID {SENSORID} - errors {ERROR}",
991 "TID", tid, "SENSORID", sensorID, "ERROR", e);
992 }
993 }
994
995 return PLDM_SUCCESS;
996}
997
Dung Cao4a503832025-01-08 03:45:17 +0000998exec::task<int> OemEventManager::oemPollForPlatformEvent(pldm_tid_t tid)
999{
1000 uint64_t t0 = 0;
1001
1002 /* This OEM event handler is only used for SoC terminus */
1003 if (!tidToSocketNameMap.contains(tid))
1004 {
1005 co_return PLDM_SUCCESS;
1006 }
1007
1008 if (!timeStampMap.contains(tid))
1009 {
1010 sd_event_now(event.get(), CLOCK_MONOTONIC, &t0);
1011 timeStampMap.emplace(std::make_pair(tid, t0));
1012 }
1013 else
1014 {
1015 sd_event_now(event.get(), CLOCK_MONOTONIC, &t0);
1016 uint64_t elapsed = t0 - timeStampMap[tid];
1017 if (elapsed >= NORMAL_EVENT_POLLING_TIME)
1018 {
1019 co_await manager->pollForPlatformEvent(tid, 0, 0);
1020 timeStampMap[tid] = t0;
1021 }
1022 }
1023
1024 co_return PLDM_SUCCESS;
1025}
Chau Lya743e382024-10-26 11:12:22 +00001026} // namespace oem_ampere
1027} // namespace pldm