blob: dcd3fa914e1a69775088eb93bae7edcef0a19d44 [file] [log] [blame]
Chau Lya743e382024-10-26 11:12:22 +00001#include "oem_event_manager.hpp"
2
Dung Cao72c8aa02023-11-22 02:31:41 +00003#include "libcper/Cper.h"
4
5#include "cper.hpp"
Chau Lya743e382024-10-26 11:12:22 +00006#include "requester/handler.hpp"
7#include "requester/request.hpp"
8
9#include <config.h>
10#include <libpldm/pldm.h>
11#include <libpldm/utils.h>
12#include <systemd/sd-journal.h>
13
14#include <phosphor-logging/lg2.hpp>
15#include <xyz/openbmc_project/Logging/Entry/server.hpp>
16
17#include <algorithm>
18#include <map>
Thu Nguyen79f9ff62024-11-22 03:36:27 +000019#include <set>
Chau Lya743e382024-10-26 11:12:22 +000020#include <sstream>
21#include <string>
22#include <unordered_map>
23
24namespace pldm
25{
26namespace oem_ampere
27{
28namespace boot_stage = boot::stage;
Chau Lycebf4762024-10-03 09:02:54 +000029namespace ddr_status = ddr::status;
30namespace dimm_status = dimm::status;
31namespace dimm_syndrome = dimm::training_failure::dimm_syndrome;
32namespace phy_syndrome = dimm::training_failure::phy_syndrome;
33namespace training_failure = dimm::training_failure;
Chau Lya743e382024-10-26 11:12:22 +000034
Chaul Ly198084b2024-12-13 09:02:52 +000035constexpr const char* ampereEventRegistry = "OpenBMC.0.1.AmpereEvent";
36constexpr const char* ampereWarningRegistry = "OpenBMC.0.1.AmpereWarning";
37constexpr const char* ampereCriticalRegistry = "OpenBMC.0.1.AmpereCritical";
Chau Lya743e382024-10-26 11:12:22 +000038constexpr const char* BIOSFWPanicRegistry =
Chaul Ly198084b2024-12-13 09:02:52 +000039 "OpenBMC.0.1.BIOSFirmwarePanicReason";
Chau Lya743e382024-10-26 11:12:22 +000040constexpr auto maxDIMMIdxBitNum = 24;
Chau Lycebf4762024-10-03 09:02:54 +000041constexpr auto maxDIMMInstantNum = 24;
Chau Lya743e382024-10-26 11:12:22 +000042
Thu Nguyen79f9ff62024-11-22 03:36:27 +000043const std::set<uint16_t> rasUESensorIDs = {CORE_UE, MCU_UE, PCIE_UE, SOC_UE};
44
/*
  Status text appended to a generic boot stage.
  Indexed by byte 0 of the boot code.
*/
std::array<std::string, 3> bootStatMsg{
    " booting",
    " completed",
    " failed",
};

/*
  Status text appended to the DDR training boot stage.
  Indexed by byte 0 of the boot code.
*/
std::array<std::string, 3> ddrTrainingMsg{
    " progress started",
    " in-progress",
    " progress completed",
};

/*
  Temperature text for a DIMM PMIC temperature alert.
  Indexed by byte 0 of the DIMM status reading.
*/
std::array<std::string, 8> pmicTempAlertMsg{
    "Below 85°C",
    "85°C",
    "95°C",
    "105°C",
    "115°C",
    "125°C",
    "135°C",
    "Equal or greater than 140°C",
};
64
65/*
Chau Lya743e382024-10-26 11:12:22 +000066 In Ampere systems, BMC only directly communicates with MCTP/PLDM SoC
67 EPs through SMBus and PCIe. When host boots up, SMBUS interface
68 comes up first. In this interface, BMC is bus owner.
69
70 mctpd will set the EID 0x14 for S0 and 0x16 for S1 (if available).
71 pldmd will always use TID 1 for S0 and TID 2 for S1 (if available).
72*/
73EventToMsgMap_t tidToSocketNameMap = {{1, "SOCKET 0"}, {2, "SOCKET 1"}};
74
75/*
76 A map between sensor IDs and their names in string.
77 Using pldm::oem::sensor_ids
78*/
Chau Ly4cca3dc2024-10-03 09:07:09 +000079EventToMsgMap_t sensorIdToStrMap = {
Chau Lyef214b52024-10-16 09:40:38 +000080 {DDR_STATUS, "DDR_STATUS"},
81 {PCP_VR_STATE, "PCP_VR_STATE"},
82 {SOC_VR_STATE, "SOC_VR_STATE"},
83 {DPHY_VR1_STATE, "DPHY_VR1_STATE"},
84 {DPHY_VR2_STATE, "DPHY_VR2_STATE"},
85 {D2D_VR_STATE, "D2D_VR_STATE"},
86 {IOC_VR1_STATE, "IOC_VR1_STATE"},
87 {IOC_VR2_STATE, "IOC_VR2_STATE"},
88 {PCI_D_VR_STATE, "PCI_D_VR_STATE"},
89 {PCI_A_VR_STATE, "PCI_A_VR_STATE"},
90 {PCIE_HOT_PLUG, "PCIE_HOT_PLUG"},
91 {BOOT_OVERALL, "BOOT_OVERALL"},
Chau Lyb01357f2024-10-17 09:18:01 +000092 {SOC_HEALTH_AVAILABILITY, "SOC_HEALTH_AVAILABILITY"},
93 {WATCH_DOG, "WATCH_DOG"}};
Chau Lya743e382024-10-26 11:12:22 +000094
95/*
96 A map between the boot stages and logging strings.
97 Using pldm::oem::boot::stage::boot_stage
98*/
99EventToMsgMap_t bootStageToMsgMap = {
100 {boot_stage::SECPRO, "SECpro"},
101 {boot_stage::MPRO, "Mpro"},
102 {boot_stage::ATF_BL1, "ATF BL1"},
103 {boot_stage::ATF_BL2, "ATF BL2"},
104 {boot_stage::DDR_INITIALIZATION, "DDR initialization"},
105 {boot_stage::DDR_TRAINING, "DDR training"},
106 {boot_stage::S0_DDR_TRAINING_FAILURE, "DDR training failure"},
107 {boot_stage::ATF_BL31, "ATF BL31"},
108 {boot_stage::ATF_BL32, "ATF BL32"},
109 {boot_stage::S1_DDR_TRAINING_FAILURE, "DDR training failure"},
110 {boot_stage::UEFI_STATUS_CLASS_CODE_MIN,
111 "ATF BL33 (UEFI) booting status = "}};
112
113/*
Chau Lycebf4762024-10-03 09:02:54 +0000114 A map between DDR status and logging strings.
115 Using pldm::oem::ddr::status::ddr_status
116*/
117EventToMsgMap_t ddrStatusToMsgMap = {
118 {ddr_status::NO_SYSTEM_LEVEL_ERROR, "has no system level error"},
119 {ddr_status::ECC_INITIALIZATION_FAILURE, "has ECC initialization failure"},
120 {ddr_status::CONFIGURATION_FAILURE, "has configuration failure at DIMMs:"},
121 {ddr_status::TRAINING_FAILURE, "has training failure at DIMMs:"},
122 {ddr_status::OTHER_FAILURE, "has other failure"},
123 {ddr_status::BOOT_FAILURE_NO_VALID_CONFIG,
124 "has boot failure due to no configuration"},
125 {ddr_status::FAILSAFE_ACTIVATED_NEXT_BOOT_SUCCESS,
126 "failsafe activated but boot success with the next valid configuration"}};
127
128/*
129 A map between DIMM status and logging strings.
130 Using pldm::oem::dimm::status::dimm_status
131*/
132EventToMsgMap_t dimmStatusToMsgMap = {
133 {dimm_status::INSTALLED_NO_ERROR, "is installed and no error"},
134 {dimm_status::NOT_INSTALLED, "is not installed"},
135 {dimm_status::OTHER_FAILURE, "has other failure"},
136 {dimm_status::INSTALLED_BUT_DISABLED, "is installed but disabled"},
137 {dimm_status::TRAINING_FAILURE, "has training failure; "},
138 {dimm_status::PMIC_TEMP_ALERT, "has PMIC temperature alert"}};
139
140/*
141 A map between PHY training failure syndrome and logging strings.
142 Using
143 pldm::oem::dimm::training_faillure::phy_syndrome::phy_training_failure_syndrome
144*/
145EventToMsgMap_t phyTrainingFailureSyndromeToMsgMap = {
146 {phy_syndrome::NA, "(N/A)"},
147 {phy_syndrome::PHY_TRAINING_SETUP_FAILURE, "(PHY training setup failure)"},
148 {phy_syndrome::CA_LEVELING, "(CA leveling)"},
149 {phy_syndrome::PHY_WRITE_LEVEL_FAILURE,
150 "(PHY write level failure - see syndrome 1)"},
151 {phy_syndrome::PHY_READ_GATE_LEVELING_FAILURE,
152 "(PHY read gate leveling failure)"},
153 {phy_syndrome::PHY_READ_LEVEL_FAILURE, "(PHY read level failure)"},
154 {phy_syndrome::WRITE_DQ_LEVELING, "(Write DQ leveling)"},
155 {phy_syndrome::PHY_SW_TRAINING_FAILURE, "(PHY SW training failure)"}};
156
157/*
158 A map between DIMM training failure syndrome and logging strings.
159 Using
160 pldm::oem::dimm::training_faillure::dimm_syndrome::dimm_training_failure_syndrome
161*/
162EventToMsgMap_t dimmTrainingFailureSyndromeToMsgMap = {
163 {dimm_syndrome::NA, "(N/A)"},
164 {dimm_syndrome::DRAM_VREFDQ_TRAINING_FAILURE,
165 "(DRAM VREFDQ training failure)"},
166 {dimm_syndrome::LRDIMM_DB_TRAINING_FAILURE, "(LRDIMM DB training failure)"},
167 {dimm_syndrome::LRDRIMM_DB_SW_TRAINING_FAILURE,
168 "(LRDRIMM DB SW training failure)"}};
169
170/*
171 A map between DIMM training failure type and a pair of <logging strings -
172 syndrome map>. Using
173 pldm::oem::dimm::training_faillure::dimm_training_failure_type
174*/
175std::unordered_map<uint8_t, std::pair<std::string, EventToMsgMap_t>>
176 dimmTrainingFailureTypeMap = {
177 {training_failure::PHY_TRAINING_FAILURE_TYPE,
178 std::make_pair("PHY training failure",
179 phyTrainingFailureSyndromeToMsgMap)},
180 {training_failure::DIMM_TRAINING_FAILURE_TYPE,
181 std::make_pair("DIMM training failure",
182 dimmTrainingFailureSyndromeToMsgMap)}};
183
184/*
Chau Lya743e382024-10-26 11:12:22 +0000185 A map between log level and the registry used for Redfish SEL log
186 Using pldm::oem::log_level
187*/
188std::unordered_map<log_level, std::string> logLevelToRedfishMsgIdMap = {
Chau Ly3de0d942024-10-03 08:57:11 +0000189 {log_level::OK, ampereEventRegistry},
190 {log_level::WARNING, ampereWarningRegistry},
191 {log_level::CRITICAL, ampereCriticalRegistry},
Chau Lya743e382024-10-26 11:12:22 +0000192 {log_level::BIOSFWPANIC, BIOSFWPanicRegistry}};
193
Chau Lyef214b52024-10-16 09:40:38 +0000194std::unordered_map<
195 uint16_t,
196 std::vector<std::pair<
197 std::string,
198 std::unordered_map<uint8_t, std::pair<log_level, std::string>>>>>
199 stateSensorToMsgMap = {
200 {SOC_HEALTH_AVAILABILITY,
201 {{"SoC Health",
202 {{1, {log_level::OK, "Normal"}},
203 {2, {log_level::WARNING, "Non-Critical"}},
204 {3, {log_level::CRITICAL, "Critical"}},
205 {4, {log_level::CRITICAL, "Fatal"}}}},
206 {"SoC Availability",
207 {{1, {log_level::OK, "Enabled"}},
208 {2, {log_level::WARNING, "Disabled"}},
Chau Lyb01357f2024-10-17 09:18:01 +0000209 {3, {log_level::CRITICAL, "Shutdown"}}}}}},
210 {WATCH_DOG,
211 {{"Global Watch Dog",
212 {{1, {log_level::OK, "Normal"}},
213 {2, {log_level::CRITICAL, "Timer Expired"}}}},
214 {"Secure Watch Dog",
215 {{1, {log_level::OK, "Normal"}},
216 {2, {log_level::CRITICAL, "Timer Expired"}}}},
217 {"Non-secure Watch Dog",
218 {{1, {log_level::OK, "Normal"}},
219 {2, {log_level::CRITICAL, "Timer Expired"}}}}}}};
Chau Lyef214b52024-10-16 09:40:38 +0000220
Chau Lya743e382024-10-26 11:12:22 +0000221std::string
222 OemEventManager::prefixMsgStrCreation(pldm_tid_t tid, uint16_t sensorId)
223{
224 std::string description;
225 if (!tidToSocketNameMap.contains(tid))
226 {
227 description += "TID " + std::to_string(tid) + ": ";
228 }
229 else
230 {
231 description += tidToSocketNameMap[tid] + ": ";
232 }
233
234 if (!sensorIdToStrMap.contains(sensorId))
235 {
236 description += "Sensor ID " + std::to_string(sensorId) + ": ";
237 }
238 else
239 {
240 description += sensorIdToStrMap[sensorId] + ": ";
241 }
242
243 return description;
244}
245
246void OemEventManager::sendJournalRedfish(const std::string& description,
247 log_level& logLevel)
248{
249 if (description.empty())
250 {
251 return;
252 }
253
254 if (!logLevelToRedfishMsgIdMap.contains(logLevel))
255 {
256 lg2::error("Invalid {LEVEL} Description {DES}", "LEVEL", logLevel,
257 "DES", description);
258 return;
259 }
260 auto redfishMsgId = logLevelToRedfishMsgIdMap[logLevel];
261 lg2::info("MESSAGE={DES}", "DES", description, "REDFISH_MESSAGE_ID",
262 redfishMsgId, "REDFISH_MESSAGE_ARGS", description);
263}
264
265std::string OemEventManager::dimmIdxsToString(uint32_t dimmIdxs)
266{
267 std::string description;
268 for (const auto bitIdx : std::views::iota(0, maxDIMMIdxBitNum))
269 {
270 if (dimmIdxs & (static_cast<uint32_t>(1) << bitIdx))
271 {
272 description += " #" + std::to_string(bitIdx);
273 }
274 }
275 return description;
276}
277
Thu Nguyen93d0ca32024-11-14 23:46:40 +0000278uint8_t OemEventManager::sensorIdToDIMMIdx(const uint16_t& sensorId)
279{
280 uint8_t dimmIdx = maxDIMMInstantNum;
281 int sensorId_Off = sensorId - 4;
282 if ((sensorId_Off >= 0) && ((sensorId_Off % 2) == 0) &&
283 ((sensorId_Off / 2) < maxDIMMInstantNum))
284 {
285 dimmIdx = sensorId_Off / 2;
286 }
287 return dimmIdx;
288}
289
Chau Lya743e382024-10-26 11:12:22 +0000290void OemEventManager::handleBootOverallEvent(
291 pldm_tid_t /*tid*/, uint16_t /*sensorId*/, uint32_t presentReading)
292{
293 log_level logLevel{log_level::OK};
294 std::string description;
295 std::stringstream strStream;
296
297 uint8_t byte0 = (presentReading & 0x000000ff);
298 uint8_t byte1 = (presentReading & 0x0000ff00) >> 8;
299 uint8_t byte2 = (presentReading & 0x00ff0000) >> 16;
300 uint8_t byte3 = (presentReading & 0xff000000) >> 24;
301 /*
302 * Handle SECpro, Mpro, ATF BL1, ATF BL2, ATF BL31,
303 * ATF BL32 and DDR initialization
304 */
305 if (bootStageToMsgMap.contains(byte3))
306 {
307 // Boot stage adding
308 description += bootStageToMsgMap[byte3];
309
310 switch (byte3)
311 {
312 case boot_stage::DDR_TRAINING:
313 if (byte0 >= ddrTrainingMsg.size())
314 {
315 logLevel = log_level::BIOSFWPANIC;
316 description += " unknown status";
317 }
318 else
319 {
320 description += ddrTrainingMsg[byte0];
321 }
322 if (0x01 == byte0)
323 {
324 // Add complete percentage
325 description += " at " + std::to_string(byte1) + "%";
326 }
327 break;
328 case boot_stage::S0_DDR_TRAINING_FAILURE:
329 case boot_stage::S1_DDR_TRAINING_FAILURE:
330 // ddr_training_status_msg()
331 logLevel = log_level::BIOSFWPANIC;
332 description += " at DIMMs:";
333 // dimmIdxs = presentReading & 0x00ffffff;
334 description += dimmIdxsToString(presentReading & 0x00ffffff);
335 description += " of socket ";
336 description +=
337 (boot_stage::S0_DDR_TRAINING_FAILURE == byte3) ? "0" : "1";
338 break;
339 default:
340 if (byte0 >= bootStatMsg.size())
341 {
342 logLevel = log_level::BIOSFWPANIC;
343 description += " unknown status";
344 }
345 else
346 {
347 description += bootStatMsg[byte0];
348 }
349 break;
350 }
351
352 // Sensor report action is fail
353 if (boot::status::BOOT_STATUS_FAILURE == byte2)
354 {
355 logLevel = log_level::BIOSFWPANIC;
356 }
357 }
358 else
359 {
360 if (byte3 <= boot_stage::UEFI_STATUS_CLASS_CODE_MAX)
361 {
362 description +=
363 bootStageToMsgMap[boot_stage::UEFI_STATUS_CLASS_CODE_MIN];
364
365 strStream
366 << "Segment (0x" << std::setfill('0') << std::hex
367 << std::setw(8) << static_cast<uint32_t>(presentReading)
Chau Ly3de0d942024-10-03 08:57:11 +0000368 << "); Status Class (0x" << std::setw(2)
369 << static_cast<uint32_t>(byte3) << "); Status SubClass (0x"
Chau Lya743e382024-10-26 11:12:22 +0000370 << std::setw(2) << static_cast<uint32_t>(byte2)
Chau Ly3de0d942024-10-03 08:57:11 +0000371 << "); Operation Code (0x" << std::setw(4)
Chau Lya743e382024-10-26 11:12:22 +0000372 << static_cast<uint32_t>((presentReading & 0xffff0000) >> 16)
373 << ")" << std::dec;
374
375 description += strStream.str();
376 }
377 }
378
379 // Log to Redfish event
380 sendJournalRedfish(description, logLevel);
381}
382
383int OemEventManager::processNumericSensorEvent(
384 pldm_tid_t tid, uint16_t sensorId, const uint8_t* sensorData,
385 size_t sensorDataLength)
386{
387 uint8_t eventState = 0;
388 uint8_t previousEventState = 0;
389 uint8_t sensorDataSize = 0;
390 uint32_t presentReading;
391 auto rc = decode_numeric_sensor_data(
392 sensorData, sensorDataLength, &eventState, &previousEventState,
393 &sensorDataSize, &presentReading);
394 if (rc)
395 {
396 lg2::error(
397 "Failed to decode numericSensorState event for terminus ID {TID}, error {RC} ",
398 "TID", tid, "RC", rc);
399 return rc;
400 }
401
Chau Lycebf4762024-10-03 09:02:54 +0000402 // DIMMx_Status sensorID 4+2*index (index 0 -> maxDIMMInstantNum-1)
Thu Nguyen93d0ca32024-11-14 23:46:40 +0000403 if (auto dimmIdx = sensorIdToDIMMIdx(sensorId); dimmIdx < maxDIMMInstantNum)
Chau Lycebf4762024-10-03 09:02:54 +0000404 {
405 handleDIMMStatusEvent(tid, sensorId, presentReading);
406 return PLDM_SUCCESS;
407 }
408
Chau Lya743e382024-10-26 11:12:22 +0000409 switch (sensorId)
410 {
411 case BOOT_OVERALL:
412 handleBootOverallEvent(tid, sensorId, presentReading);
413 break;
Chau Ly3de0d942024-10-03 08:57:11 +0000414 case PCIE_HOT_PLUG:
415 handlePCIeHotPlugEvent(tid, sensorId, presentReading);
416 break;
Chau Lycebf4762024-10-03 09:02:54 +0000417 case DDR_STATUS:
418 handleDDRStatusEvent(tid, sensorId, presentReading);
419 break;
Chau Ly4cca3dc2024-10-03 09:07:09 +0000420 case PCP_VR_STATE:
421 case SOC_VR_STATE:
422 case DPHY_VR1_STATE:
423 case DPHY_VR2_STATE:
424 case D2D_VR_STATE:
425 case IOC_VR1_STATE:
426 case IOC_VR2_STATE:
427 case PCI_D_VR_STATE:
428 case PCI_A_VR_STATE:
429 handleVRDStatusEvent(tid, sensorId, presentReading);
430 break;
Chau Lyb01357f2024-10-17 09:18:01 +0000431 case WATCH_DOG:
432 handleNumericWatchdogEvent(tid, sensorId, presentReading);
433 break;
Chau Lya743e382024-10-26 11:12:22 +0000434 default:
435 std::string description;
436 std::stringstream strStream;
437 log_level logLevel = log_level::OK;
438
439 description += "SENSOR_EVENT : NUMERIC_SENSOR_STATE: ";
440 description += prefixMsgStrCreation(tid, sensorId);
441 strStream << std::setfill('0') << std::hex << "eventState 0x"
442 << std::setw(2) << static_cast<uint32_t>(eventState)
443 << " previousEventState 0x" << std::setw(2)
444 << static_cast<uint32_t>(previousEventState)
445 << " sensorDataSize 0x" << std::setw(2)
446 << static_cast<uint32_t>(sensorDataSize)
447 << " presentReading 0x" << std::setw(8)
448 << static_cast<uint32_t>(presentReading) << std::dec;
449 description += strStream.str();
450
451 sendJournalRedfish(description, logLevel);
452 break;
453 }
454 return PLDM_SUCCESS;
455}
456
457int OemEventManager::processStateSensorEvent(pldm_tid_t tid, uint16_t sensorId,
458 const uint8_t* sensorData,
459 size_t sensorDataLength)
460{
461 uint8_t sensorOffset = 0;
462 uint8_t eventState = 0;
463 uint8_t previousEventState = 0;
464
465 auto rc =
466 decode_state_sensor_data(sensorData, sensorDataLength, &sensorOffset,
467 &eventState, &previousEventState);
468 if (rc)
469 {
470 lg2::error(
471 "Failed to decode stateSensorState event for terminus ID {TID}, error {RC}",
472 "TID", tid, "RC", rc);
473 return rc;
474 }
475
476 std::string description;
Chau Lya743e382024-10-26 11:12:22 +0000477 log_level logLevel = log_level::OK;
478
Chau Lyef214b52024-10-16 09:40:38 +0000479 if (stateSensorToMsgMap.contains(sensorId))
480 {
481 description += prefixMsgStrCreation(tid, sensorId);
482 auto componentMap = stateSensorToMsgMap[sensorId];
483 if (sensorOffset < componentMap.size())
484 {
485 description += std::get<0>(componentMap[sensorOffset]);
486 auto stateMap = std::get<1>(componentMap[sensorOffset]);
487 if (stateMap.contains(eventState))
488 {
489 logLevel = std::get<0>(stateMap[eventState]);
490 description += " state : " + std::get<1>(stateMap[eventState]);
491 if (stateMap.contains(previousEventState))
492 {
493 description += "; previous state: " +
494 std::get<1>(stateMap[previousEventState]);
495 }
496 }
497 else
498 {
499 description += " sends unsupported event state: " +
500 std::to_string(eventState);
501 if (stateMap.contains(previousEventState))
502 {
503 description += "; previous state: " +
504 std::get<1>(stateMap[previousEventState]);
505 }
506 }
507 }
508 else
509 {
510 description += "sends unsupported component sensor offset " +
511 std::to_string(sensorOffset);
512 }
513 }
514 else
515 {
516 std::stringstream strStream;
517 description += "SENSOR_EVENT : STATE_SENSOR_STATE: ";
518 description += prefixMsgStrCreation(tid, sensorId);
519 strStream << std::setfill('0') << std::hex << "sensorOffset 0x"
520 << std::setw(2) << static_cast<uint32_t>(sensorOffset)
521 << "eventState 0x" << std::setw(2)
522 << static_cast<uint32_t>(eventState)
523 << " previousEventState 0x" << std::setw(2)
524 << static_cast<uint32_t>(previousEventState) << std::dec;
525 description += strStream.str();
526 }
Chau Lya743e382024-10-26 11:12:22 +0000527
528 sendJournalRedfish(description, logLevel);
529
530 return PLDM_SUCCESS;
531}
532
533int OemEventManager::processSensorOpStateEvent(
534 pldm_tid_t tid, uint16_t sensorId, const uint8_t* sensorData,
535 size_t sensorDataLength)
536{
537 uint8_t present_op_state = 0;
538 uint8_t previous_op_state = 0;
539
540 auto rc = decode_sensor_op_data(sensorData, sensorDataLength,
541 &present_op_state, &previous_op_state);
542 if (rc)
543 {
544 lg2::error(
545 "Failed to decode sensorOpState event for terminus ID {TID}, error {RC}",
546 "TID", tid, "RC", rc);
547 return rc;
548 }
549
550 std::string description;
551 std::stringstream strStream;
552 log_level logLevel = log_level::OK;
553
554 description += "SENSOR_EVENT : SENSOR_OP_STATE: ";
555 description += prefixMsgStrCreation(tid, sensorId);
556 strStream << std::setfill('0') << std::hex << "present_op_state 0x"
557 << std::setw(2) << static_cast<uint32_t>(present_op_state)
558 << "previous_op_state 0x" << std::setw(2)
559 << static_cast<uint32_t>(previous_op_state) << std::dec;
560 description += strStream.str();
561
562 sendJournalRedfish(description, logLevel);
563
564 return PLDM_SUCCESS;
565}
566
567int OemEventManager::handleSensorEvent(
568 const pldm_msg* request, size_t payloadLength, uint8_t /* formatVersion */,
569 pldm_tid_t tid, size_t eventDataOffset)
570{
571 /* This OEM event handler is only used for SoC terminus*/
572 if (!tidToSocketNameMap.contains(tid))
573 {
574 return PLDM_SUCCESS;
575 }
576 auto eventData =
577 reinterpret_cast<const uint8_t*>(request->payload) + eventDataOffset;
578 auto eventDataSize = payloadLength - eventDataOffset;
579
580 uint16_t sensorId = 0;
581 uint8_t sensorEventClassType = 0;
582 size_t eventClassDataOffset = 0;
583 auto rc =
584 decode_sensor_event_data(eventData, eventDataSize, &sensorId,
585 &sensorEventClassType, &eventClassDataOffset);
586 if (rc)
587 {
588 lg2::error("Failed to decode sensor event data return code {RC}.", "RC",
589 rc);
590 return rc;
591 }
592 const uint8_t* sensorData = eventData + eventClassDataOffset;
593 size_t sensorDataLength = eventDataSize - eventClassDataOffset;
594
595 switch (sensorEventClassType)
596 {
597 case PLDM_NUMERIC_SENSOR_STATE:
598 {
599 return processNumericSensorEvent(tid, sensorId, sensorData,
600 sensorDataLength);
601 }
602 case PLDM_STATE_SENSOR_STATE:
603 {
604 return processStateSensorEvent(tid, sensorId, sensorData,
605 sensorDataLength);
606 }
607 case PLDM_SENSOR_OP_STATE:
608 {
609 return processSensorOpStateEvent(tid, sensorId, sensorData,
610 sensorDataLength);
611 }
612 default:
613 std::string description;
614 std::stringstream strStream;
615 log_level logLevel = log_level::OK;
616
617 description += "SENSOR_EVENT : Unsupported Sensor Class " +
618 std::to_string(sensorEventClassType) + ": ";
619 description += prefixMsgStrCreation(tid, sensorId);
620 strStream << std::setfill('0') << std::hex
621 << std::setw(sizeof(sensorData) * 2) << "Sensor data: ";
622
623 auto dataPtr = sensorData;
624 for ([[maybe_unused]] const auto& i :
625 std::views::iota(0, (int)sensorDataLength))
626 {
627 strStream << "0x" << static_cast<uint32_t>(*dataPtr);
628 dataPtr += sizeof(sensorData);
629 }
630
631 description += strStream.str();
632
633 sendJournalRedfish(description, logLevel);
634 }
635 lg2::info("Unsupported class type {CLASSTYPE}", "CLASSTYPE",
636 sensorEventClassType);
637 return PLDM_ERROR;
638}
639
Chau Ly3de0d942024-10-03 08:57:11 +0000640void OemEventManager::handlePCIeHotPlugEvent(pldm_tid_t tid, uint16_t sensorId,
641 uint32_t presentReading)
642{
643 std::string description;
644 std::stringstream strStream;
645 PCIeHotPlugEventRecord_t record{presentReading};
646
647 std::string sAction = (!record.bits.action) ? "Insertion" : "Removal";
648 std::string sOpStatus = (!record.bits.opStatus) ? "Successful" : "Failed";
649 log_level logLevel =
650 (!record.bits.opStatus) ? log_level::OK : log_level::WARNING;
651
652 description += prefixMsgStrCreation(tid, sensorId);
653
654 strStream << "Segment (0x" << std::setfill('0') << std::hex << std::setw(2)
655 << static_cast<uint32_t>(record.bits.segment) << "); Bus (0x"
656 << std::setw(2) << static_cast<uint32_t>(record.bits.bus)
657 << "); Device (0x" << std::setw(2)
658 << static_cast<uint32_t>(record.bits.device) << "); Function (0x"
659 << std::setw(2) << static_cast<uint32_t>(record.bits.function)
660 << "); Action (" << sAction << "); Operation status ("
661 << sOpStatus << "); Media slot number (" << std::dec
662 << static_cast<uint32_t>(record.bits.mediaSlot) << ")";
663
664 description += strStream.str();
665
666 // Log to Redfish event
667 sendJournalRedfish(description, logLevel);
668}
669
Chau Lycebf4762024-10-03 09:02:54 +0000670std::string OemEventManager::dimmTrainingFailureToMsg(uint32_t failureInfo)
671{
672 std::string description;
673 DIMMTrainingFailure_t failure{failureInfo};
674
675 if (dimmTrainingFailureTypeMap.contains(failure.bits.type))
676 {
677 auto failureInfoMap = dimmTrainingFailureTypeMap[failure.bits.type];
678
679 description += std::get<0>(failureInfoMap);
680
681 description += "; MCU rank index " +
682 std::to_string(failure.bits.mcuRankIdx);
683
684 description += "; Slice number " +
685 std::to_string(failure.bits.sliceNum);
686
687 description += "; Upper nibble error status: ";
688 description += (!failure.bits.upperNibbStatErr)
689 ? "No error"
690 : "Found no rising edge";
691
692 description += "; Lower nibble error status: ";
693 description += (!failure.bits.lowerNibbStatErr)
694 ? "No error"
695 : "Found no rising edge";
696
697 description += "; Failure syndrome 0: ";
698
699 auto& syndromeMap = std::get<1>(failureInfoMap);
700 if (syndromeMap.contains(failure.bits.syndrome))
701 {
702 description += syndromeMap[failure.bits.syndrome];
703 }
704 else
705 {
706 description += "(Unknown syndrome)";
707 }
708 }
709 else
710 {
711 description += "Unknown training failure type " +
712 std::to_string(failure.bits.type);
713 }
714
715 return description;
716}
717
718void OemEventManager::handleDIMMStatusEvent(pldm_tid_t tid, uint16_t sensorId,
719 uint32_t presentReading)
720{
721 log_level logLevel{log_level::WARNING};
722 std::string description;
723 uint8_t byte3 = (presentReading & 0xff000000) >> 24;
724 uint32_t byte012 = presentReading & 0xffffff;
725
726 description += prefixMsgStrCreation(tid, sensorId);
727
Thu Nguyen93d0ca32024-11-14 23:46:40 +0000728 // DIMMx_Status sensorID 4+2*index (index 0 -> maxDIMMInstantNum-1)
729 auto dimmIdx = sensorIdToDIMMIdx(sensorId);
730 if (dimmIdx >= maxDIMMIdxBitNum)
731 {
732 return;
733 }
Chau Lycebf4762024-10-03 09:02:54 +0000734
735 description += "DIMM " + std::to_string(dimmIdx) + " ";
736
737 if (dimmStatusToMsgMap.contains(byte3))
738 {
739 if (byte3 == dimm_status::INSTALLED_NO_ERROR ||
740 byte3 == dimm_status::INSTALLED_BUT_DISABLED)
741 {
742 logLevel = log_level::OK;
743 }
744
745 description += dimmStatusToMsgMap[byte3];
746
747 if (byte3 == dimm_status::TRAINING_FAILURE)
748 {
749 description += "; " + dimmTrainingFailureToMsg(byte012);
750 }
751 else if (byte3 == dimm_status::PMIC_TEMP_ALERT)
752 {
753 uint8_t byte0 = (byte012 & 0xff);
754 if (byte0 < pmicTempAlertMsg.size())
755 {
756 description += ": " + pmicTempAlertMsg[byte0];
757 }
758 }
759 }
760 else
761 {
762 switch (byte3)
763 {
764 case dimm_status::PMIC_HIGH_TEMP:
765 if (byte012 == 0x01)
766 {
767 description += "has PMIC high temp condition";
768 }
769 break;
770 case dimm_status::TSx_HIGH_TEMP:
771 switch (byte012)
772 {
773 case 0x01:
774 description += "has TS0";
775 break;
776 case 0x02:
777 description += "has TS1";
778 break;
779 case 0x03:
780 description += "has TS0 and TS1";
781 break;
782 }
783 description += " exceeding their high temperature threshold";
784 break;
785 case dimm_status::SPD_HUB_HIGH_TEMP:
786 if (byte012 == 0x01)
787 {
788 description += "has SPD/HUB high temp condition";
789 }
790 break;
791 default:
792 description += "has unsupported status " +
793 std::to_string(byte3);
794 break;
795 }
796 }
797
798 // Log to Redfish event
799 sendJournalRedfish(description, logLevel);
800}
801
802void OemEventManager::handleDDRStatusEvent(pldm_tid_t tid, uint16_t sensorId,
803 uint32_t presentReading)
804{
805 log_level logLevel{log_level::WARNING};
806 std::string description;
807 uint8_t byte3 = (presentReading & 0xff000000) >> 24;
808 uint32_t byte012 = presentReading & 0xffffff;
809
810 description += prefixMsgStrCreation(tid, sensorId);
811
812 description += "DDR ";
813 if (ddrStatusToMsgMap.contains(byte3))
814 {
815 if (byte3 == ddr_status::NO_SYSTEM_LEVEL_ERROR)
816 {
817 logLevel = log_level::OK;
818 }
819
820 description += ddrStatusToMsgMap[byte3];
821
822 if (byte3 == ddr_status::CONFIGURATION_FAILURE ||
823 byte3 == ddr_status::TRAINING_FAILURE)
824 {
825 // List out failed DIMMs
826 description += dimmIdxsToString(byte012);
827 }
828 }
829 else
830 {
831 description += "has unsupported status " + std::to_string(byte3);
832 }
833
834 // Log to Redfish event
835 sendJournalRedfish(description, logLevel);
836}
837
Chau Ly4cca3dc2024-10-03 09:07:09 +0000838void OemEventManager::handleVRDStatusEvent(pldm_tid_t tid, uint16_t sensorId,
839 uint32_t presentReading)
840{
841 log_level logLevel{log_level::WARNING};
842 std::string description;
843 std::stringstream strStream;
844
845 description += prefixMsgStrCreation(tid, sensorId);
846
847 VRDStatus_t status{presentReading};
848
849 if (status.bits.warning && status.bits.critical)
850 {
851 description += "A VR warning and a VR critical";
852 logLevel = log_level::CRITICAL;
853 }
854 else
855 {
856 if (status.bits.warning)
857 {
858 description += "A VR warning";
859 }
860 else if (status.bits.critical)
861 {
862 description += "A VR critical";
863 logLevel = log_level::CRITICAL;
864 }
865 else
866 {
867 description += "No VR warning or critical";
868 logLevel = log_level::OK;
869 }
870 }
871 description += " condition observed";
872
873 strStream << "; VR status byte high is 0x" << std::setfill('0') << std::hex
874 << std::setw(2)
875 << static_cast<uint32_t>(status.bits.vr_status_byte_high)
876 << "; VR status byte low is 0x" << std::setw(2)
877 << static_cast<uint32_t>(status.bits.vr_status_byte_low)
878 << "; Reading is 0x" << std::setw(2)
879 << static_cast<uint32_t>(presentReading) << ";";
880
881 description += strStream.str();
882
883 // Log to Redfish event
884 sendJournalRedfish(description, logLevel);
885}
886
Chau Lyb01357f2024-10-17 09:18:01 +0000887void OemEventManager::handleNumericWatchdogEvent(
888 pldm_tid_t tid, uint16_t sensorId, uint32_t presentReading)
889{
890 std::string description;
891 log_level logLevel = log_level::CRITICAL;
892
893 description += prefixMsgStrCreation(tid, sensorId);
894
895 if (presentReading & 0x01)
896 {
897 description += "Global watchdog expired;";
898 }
899 if (presentReading & 0x02)
900 {
901 description += "Secure watchdog expired;";
902 }
903 if (presentReading & 0x04)
904 {
905 description += "Non-secure watchdog expired;";
906 }
907
908 // Log to Redfish event
909 sendJournalRedfish(description, logLevel);
910}
911
Dung Cao72c8aa02023-11-22 02:31:41 +0000912int OemEventManager::processOemMsgPollEvent(pldm_tid_t tid, uint16_t eventId,
913 const uint8_t* eventData,
914 size_t eventDataSize)
915{
916 EFI_AMPERE_ERROR_DATA ampHdr;
917
918 decodeCperRecord(eventData, eventDataSize, &ampHdr);
919
920 addCperSELLog(tid, eventId, &ampHdr);
921
Thu Nguyen4b537552024-11-19 08:43:23 +0000922 /* isBert at bit 12 of TypeId */
923 if (ampHdr.TypeId & 0x0800)
924 {
925 lg2::info("Ampere SoC BERT is triggered.");
926 std::variant<std::string> value(
927 "com.ampere.CrashCapture.Trigger.TriggerAction.Bert");
928 try
929 {
930 auto& bus = pldm::utils::DBusHandler::getBus();
931 auto method =
932 bus.new_method_call("com.ampere.CrashCapture.Trigger",
933 "/com/ampere/crashcapture/trigger",
934 pldm::utils::dbusProperties, "Set");
935 method.append("com.ampere.CrashCapture.Trigger", "TriggerActions",
936 value);
937 bus.call_noreply(method);
938 }
939 catch (const std::exception& e)
940 {
941 lg2::error("call BERT trigger error - {ERROR}", "ERROR", e);
942 }
943 }
944
Dung Cao72c8aa02023-11-22 02:31:41 +0000945 return PLDM_SUCCESS;
946}
947
Thu Nguyen79f9ff62024-11-22 03:36:27 +0000948int OemEventManager::handlepldmMessagePollEvent(
949 const pldm_msg* request, size_t payloadLength, uint8_t /* formatVersion */,
950 pldm_tid_t tid, size_t eventDataOffset)
951{
952 /* This OEM event handler is only used for SoC terminus*/
953 if (!tidToSocketNameMap.contains(tid))
954 {
955 return PLDM_SUCCESS;
956 }
957
958 auto eventData =
959 reinterpret_cast<const uint8_t*>(request->payload) + eventDataOffset;
960 auto eventDataSize = payloadLength - eventDataOffset;
961
962 pldm_message_poll_event poll_event{};
963 auto rc = decode_pldm_message_poll_event_data(eventData, eventDataSize,
964 &poll_event);
965 if (rc)
966 {
967 lg2::error("Failed to decode PldmMessagePollEvent event, error {RC} ",
968 "RC", rc);
969 return rc;
970 }
971
972 auto sensorID = poll_event.event_id;
973 /* The UE errors */
974 if (rasUESensorIDs.contains(sensorID))
975 {
976 pldm::utils::DBusMapping dbusMapping{
977 "/xyz/openbmc_project/led/groups/ras_ue_fault",
978 "xyz.openbmc_project.Led.Group", "Asserted", "bool"};
979 try
980 {
981 pldm::utils::DBusHandler().setDbusProperty(
982 dbusMapping, pldm::utils::PropertyValue{bool(true)});
983 }
984 catch (const std::exception& e)
985 {
986 lg2::error(
987 "Failed to set the RAS UE LED terminus ID {TID} sensor ID {SENSORID} - errors {ERROR}",
988 "TID", tid, "SENSORID", sensorID, "ERROR", e);
989 }
990 }
991
992 return PLDM_SUCCESS;
993}
994
Chau Lya743e382024-10-26 11:12:22 +0000995} // namespace oem_ampere
996} // namespace pldm