blob: cf8e1abac2043c0824aa49bba5b11ab3c45813e3 [file] [log] [blame]
Chau Lya743e382024-10-26 11:12:22 +00001#include "oem_event_manager.hpp"
2
3#include "requester/handler.hpp"
4#include "requester/request.hpp"
5
6#include <config.h>
7#include <libpldm/pldm.h>
8#include <libpldm/utils.h>
9#include <systemd/sd-journal.h>
10
11#include <phosphor-logging/lg2.hpp>
12#include <xyz/openbmc_project/Logging/Entry/server.hpp>
13
14#include <algorithm>
15#include <map>
16#include <sstream>
17#include <string>
18#include <unordered_map>
19
20namespace pldm
21{
22namespace oem_ampere
23{
24namespace boot_stage = boot::stage;
Chau Lycebf4762024-10-03 09:02:54 +000025namespace ddr_status = ddr::status;
26namespace dimm_status = dimm::status;
27namespace dimm_syndrome = dimm::training_failure::dimm_syndrome;
28namespace phy_syndrome = dimm::training_failure::phy_syndrome;
29namespace training_failure = dimm::training_failure;
Chau Lya743e382024-10-26 11:12:22 +000030
/*
   Redfish message registry IDs used when an event is written to the journal;
   logLevelToRedfishMsgIdMap selects one of them per log level.
*/
constexpr auto ampereEventRegistry = "OpenBMC.0.1.AmpereEvent.OK";
constexpr auto ampereWarningRegistry = "OpenBMC.0.1.AmpereWarning.Warning";
constexpr auto ampereCriticalRegistry = "OpenBMC.0.1.AmpereCritical.Critical";
constexpr auto BIOSFWPanicRegistry =
    "OpenBMC.0.1.BIOSFirmwarePanicReason.Warning";

/* Number of bit positions scanned when a DIMM bitmap is turned into text. */
constexpr int maxDIMMIdxBitNum{24};
/* Number of DIMM status sensor instances recognised per terminus. */
constexpr int maxDIMMInstantNum{24};
Chau Lya743e382024-10-26 11:12:22 +000040
/*
   Status suffixes appended after a boot stage name.
   Indexed by byte 0 of the boot code.
*/
std::array<std::string, 3> bootStatMsg{" booting", " completed", " failed"};
46
/*
   Status suffixes appended after the DDR training stage name.
   Indexed by byte 0 of the boot code.
*/
std::array<std::string, 3> ddrTrainingMsg{
    " progress started",
    " in-progress",
    " progress completed",
};
53
/*
   Logging strings for the PMIC temperature alert levels.
   Indexed by byte 0 of a DIMM status reading that reports a PMIC
   temperature alert.
*/
std::array<std::string, 8> pmicTempAlertMsg{
    "Below 85°C",
    "85°C",
    "95°C",
    "105°C",
    "115°C",
    "125°C",
    "135°C",
    "Equal or greater than 140°C",
};
60
61/*
Chau Lya743e382024-10-26 11:12:22 +000062 In Ampere systems, BMC only directly communicates with MCTP/PLDM SoC
63 EPs through SMBus and PCIe. When host boots up, SMBUS interface
64 comes up first. In this interface, BMC is bus owner.
65
66 mctpd will set the EID 0x14 for S0 and 0x16 for S1 (if available).
67 pldmd will always use TID 1 for S0 and TID 2 for S1 (if available).
68*/
69EventToMsgMap_t tidToSocketNameMap = {{1, "SOCKET 0"}, {2, "SOCKET 1"}};
70
71/*
72 A map between sensor IDs and their names in string.
73 Using pldm::oem::sensor_ids
74*/
Chau Ly4cca3dc2024-10-03 09:07:09 +000075EventToMsgMap_t sensorIdToStrMap = {
Chau Lyef214b52024-10-16 09:40:38 +000076 {DDR_STATUS, "DDR_STATUS"},
77 {PCP_VR_STATE, "PCP_VR_STATE"},
78 {SOC_VR_STATE, "SOC_VR_STATE"},
79 {DPHY_VR1_STATE, "DPHY_VR1_STATE"},
80 {DPHY_VR2_STATE, "DPHY_VR2_STATE"},
81 {D2D_VR_STATE, "D2D_VR_STATE"},
82 {IOC_VR1_STATE, "IOC_VR1_STATE"},
83 {IOC_VR2_STATE, "IOC_VR2_STATE"},
84 {PCI_D_VR_STATE, "PCI_D_VR_STATE"},
85 {PCI_A_VR_STATE, "PCI_A_VR_STATE"},
86 {PCIE_HOT_PLUG, "PCIE_HOT_PLUG"},
87 {BOOT_OVERALL, "BOOT_OVERALL"},
Chau Lyb01357f2024-10-17 09:18:01 +000088 {SOC_HEALTH_AVAILABILITY, "SOC_HEALTH_AVAILABILITY"},
89 {WATCH_DOG, "WATCH_DOG"}};
Chau Lya743e382024-10-26 11:12:22 +000090
91/*
92 A map between the boot stages and logging strings.
93 Using pldm::oem::boot::stage::boot_stage
94*/
95EventToMsgMap_t bootStageToMsgMap = {
96 {boot_stage::SECPRO, "SECpro"},
97 {boot_stage::MPRO, "Mpro"},
98 {boot_stage::ATF_BL1, "ATF BL1"},
99 {boot_stage::ATF_BL2, "ATF BL2"},
100 {boot_stage::DDR_INITIALIZATION, "DDR initialization"},
101 {boot_stage::DDR_TRAINING, "DDR training"},
102 {boot_stage::S0_DDR_TRAINING_FAILURE, "DDR training failure"},
103 {boot_stage::ATF_BL31, "ATF BL31"},
104 {boot_stage::ATF_BL32, "ATF BL32"},
105 {boot_stage::S1_DDR_TRAINING_FAILURE, "DDR training failure"},
106 {boot_stage::UEFI_STATUS_CLASS_CODE_MIN,
107 "ATF BL33 (UEFI) booting status = "}};
108
109/*
Chau Lycebf4762024-10-03 09:02:54 +0000110 A map between DDR status and logging strings.
111 Using pldm::oem::ddr::status::ddr_status
112*/
113EventToMsgMap_t ddrStatusToMsgMap = {
114 {ddr_status::NO_SYSTEM_LEVEL_ERROR, "has no system level error"},
115 {ddr_status::ECC_INITIALIZATION_FAILURE, "has ECC initialization failure"},
116 {ddr_status::CONFIGURATION_FAILURE, "has configuration failure at DIMMs:"},
117 {ddr_status::TRAINING_FAILURE, "has training failure at DIMMs:"},
118 {ddr_status::OTHER_FAILURE, "has other failure"},
119 {ddr_status::BOOT_FAILURE_NO_VALID_CONFIG,
120 "has boot failure due to no configuration"},
121 {ddr_status::FAILSAFE_ACTIVATED_NEXT_BOOT_SUCCESS,
122 "failsafe activated but boot success with the next valid configuration"}};
123
124/*
125 A map between DIMM status and logging strings.
126 Using pldm::oem::dimm::status::dimm_status
127*/
128EventToMsgMap_t dimmStatusToMsgMap = {
129 {dimm_status::INSTALLED_NO_ERROR, "is installed and no error"},
130 {dimm_status::NOT_INSTALLED, "is not installed"},
131 {dimm_status::OTHER_FAILURE, "has other failure"},
132 {dimm_status::INSTALLED_BUT_DISABLED, "is installed but disabled"},
133 {dimm_status::TRAINING_FAILURE, "has training failure; "},
134 {dimm_status::PMIC_TEMP_ALERT, "has PMIC temperature alert"}};
135
136/*
137 A map between PHY training failure syndrome and logging strings.
138 Using
139 pldm::oem::dimm::training_faillure::phy_syndrome::phy_training_failure_syndrome
140*/
141EventToMsgMap_t phyTrainingFailureSyndromeToMsgMap = {
142 {phy_syndrome::NA, "(N/A)"},
143 {phy_syndrome::PHY_TRAINING_SETUP_FAILURE, "(PHY training setup failure)"},
144 {phy_syndrome::CA_LEVELING, "(CA leveling)"},
145 {phy_syndrome::PHY_WRITE_LEVEL_FAILURE,
146 "(PHY write level failure - see syndrome 1)"},
147 {phy_syndrome::PHY_READ_GATE_LEVELING_FAILURE,
148 "(PHY read gate leveling failure)"},
149 {phy_syndrome::PHY_READ_LEVEL_FAILURE, "(PHY read level failure)"},
150 {phy_syndrome::WRITE_DQ_LEVELING, "(Write DQ leveling)"},
151 {phy_syndrome::PHY_SW_TRAINING_FAILURE, "(PHY SW training failure)"}};
152
153/*
154 A map between DIMM training failure syndrome and logging strings.
155 Using
156 pldm::oem::dimm::training_faillure::dimm_syndrome::dimm_training_failure_syndrome
157*/
158EventToMsgMap_t dimmTrainingFailureSyndromeToMsgMap = {
159 {dimm_syndrome::NA, "(N/A)"},
160 {dimm_syndrome::DRAM_VREFDQ_TRAINING_FAILURE,
161 "(DRAM VREFDQ training failure)"},
162 {dimm_syndrome::LRDIMM_DB_TRAINING_FAILURE, "(LRDIMM DB training failure)"},
163 {dimm_syndrome::LRDRIMM_DB_SW_TRAINING_FAILURE,
164 "(LRDRIMM DB SW training failure)"}};
165
166/*
167 A map between DIMM training failure type and a pair of <logging strings -
168 syndrome map>. Using
169 pldm::oem::dimm::training_faillure::dimm_training_failure_type
170*/
171std::unordered_map<uint8_t, std::pair<std::string, EventToMsgMap_t>>
172 dimmTrainingFailureTypeMap = {
173 {training_failure::PHY_TRAINING_FAILURE_TYPE,
174 std::make_pair("PHY training failure",
175 phyTrainingFailureSyndromeToMsgMap)},
176 {training_failure::DIMM_TRAINING_FAILURE_TYPE,
177 std::make_pair("DIMM training failure",
178 dimmTrainingFailureSyndromeToMsgMap)}};
179
180/*
Chau Lya743e382024-10-26 11:12:22 +0000181 A map between log level and the registry used for Redfish SEL log
182 Using pldm::oem::log_level
183*/
184std::unordered_map<log_level, std::string> logLevelToRedfishMsgIdMap = {
Chau Ly3de0d942024-10-03 08:57:11 +0000185 {log_level::OK, ampereEventRegistry},
186 {log_level::WARNING, ampereWarningRegistry},
187 {log_level::CRITICAL, ampereCriticalRegistry},
Chau Lya743e382024-10-26 11:12:22 +0000188 {log_level::BIOSFWPANIC, BIOSFWPanicRegistry}};
189
Chau Lyef214b52024-10-16 09:40:38 +0000190std::unordered_map<
191 uint16_t,
192 std::vector<std::pair<
193 std::string,
194 std::unordered_map<uint8_t, std::pair<log_level, std::string>>>>>
195 stateSensorToMsgMap = {
196 {SOC_HEALTH_AVAILABILITY,
197 {{"SoC Health",
198 {{1, {log_level::OK, "Normal"}},
199 {2, {log_level::WARNING, "Non-Critical"}},
200 {3, {log_level::CRITICAL, "Critical"}},
201 {4, {log_level::CRITICAL, "Fatal"}}}},
202 {"SoC Availability",
203 {{1, {log_level::OK, "Enabled"}},
204 {2, {log_level::WARNING, "Disabled"}},
Chau Lyb01357f2024-10-17 09:18:01 +0000205 {3, {log_level::CRITICAL, "Shutdown"}}}}}},
206 {WATCH_DOG,
207 {{"Global Watch Dog",
208 {{1, {log_level::OK, "Normal"}},
209 {2, {log_level::CRITICAL, "Timer Expired"}}}},
210 {"Secure Watch Dog",
211 {{1, {log_level::OK, "Normal"}},
212 {2, {log_level::CRITICAL, "Timer Expired"}}}},
213 {"Non-secure Watch Dog",
214 {{1, {log_level::OK, "Normal"}},
215 {2, {log_level::CRITICAL, "Timer Expired"}}}}}}};
Chau Lyef214b52024-10-16 09:40:38 +0000216
Chau Lya743e382024-10-26 11:12:22 +0000217std::string
218 OemEventManager::prefixMsgStrCreation(pldm_tid_t tid, uint16_t sensorId)
219{
220 std::string description;
221 if (!tidToSocketNameMap.contains(tid))
222 {
223 description += "TID " + std::to_string(tid) + ": ";
224 }
225 else
226 {
227 description += tidToSocketNameMap[tid] + ": ";
228 }
229
230 if (!sensorIdToStrMap.contains(sensorId))
231 {
232 description += "Sensor ID " + std::to_string(sensorId) + ": ";
233 }
234 else
235 {
236 description += sensorIdToStrMap[sensorId] + ": ";
237 }
238
239 return description;
240}
241
242void OemEventManager::sendJournalRedfish(const std::string& description,
243 log_level& logLevel)
244{
245 if (description.empty())
246 {
247 return;
248 }
249
250 if (!logLevelToRedfishMsgIdMap.contains(logLevel))
251 {
252 lg2::error("Invalid {LEVEL} Description {DES}", "LEVEL", logLevel,
253 "DES", description);
254 return;
255 }
256 auto redfishMsgId = logLevelToRedfishMsgIdMap[logLevel];
257 lg2::info("MESSAGE={DES}", "DES", description, "REDFISH_MESSAGE_ID",
258 redfishMsgId, "REDFISH_MESSAGE_ARGS", description);
259}
260
261std::string OemEventManager::dimmIdxsToString(uint32_t dimmIdxs)
262{
263 std::string description;
264 for (const auto bitIdx : std::views::iota(0, maxDIMMIdxBitNum))
265 {
266 if (dimmIdxs & (static_cast<uint32_t>(1) << bitIdx))
267 {
268 description += " #" + std::to_string(bitIdx);
269 }
270 }
271 return description;
272}
273
274void OemEventManager::handleBootOverallEvent(
275 pldm_tid_t /*tid*/, uint16_t /*sensorId*/, uint32_t presentReading)
276{
277 log_level logLevel{log_level::OK};
278 std::string description;
279 std::stringstream strStream;
280
281 uint8_t byte0 = (presentReading & 0x000000ff);
282 uint8_t byte1 = (presentReading & 0x0000ff00) >> 8;
283 uint8_t byte2 = (presentReading & 0x00ff0000) >> 16;
284 uint8_t byte3 = (presentReading & 0xff000000) >> 24;
285 /*
286 * Handle SECpro, Mpro, ATF BL1, ATF BL2, ATF BL31,
287 * ATF BL32 and DDR initialization
288 */
289 if (bootStageToMsgMap.contains(byte3))
290 {
291 // Boot stage adding
292 description += bootStageToMsgMap[byte3];
293
294 switch (byte3)
295 {
296 case boot_stage::DDR_TRAINING:
297 if (byte0 >= ddrTrainingMsg.size())
298 {
299 logLevel = log_level::BIOSFWPANIC;
300 description += " unknown status";
301 }
302 else
303 {
304 description += ddrTrainingMsg[byte0];
305 }
306 if (0x01 == byte0)
307 {
308 // Add complete percentage
309 description += " at " + std::to_string(byte1) + "%";
310 }
311 break;
312 case boot_stage::S0_DDR_TRAINING_FAILURE:
313 case boot_stage::S1_DDR_TRAINING_FAILURE:
314 // ddr_training_status_msg()
315 logLevel = log_level::BIOSFWPANIC;
316 description += " at DIMMs:";
317 // dimmIdxs = presentReading & 0x00ffffff;
318 description += dimmIdxsToString(presentReading & 0x00ffffff);
319 description += " of socket ";
320 description +=
321 (boot_stage::S0_DDR_TRAINING_FAILURE == byte3) ? "0" : "1";
322 break;
323 default:
324 if (byte0 >= bootStatMsg.size())
325 {
326 logLevel = log_level::BIOSFWPANIC;
327 description += " unknown status";
328 }
329 else
330 {
331 description += bootStatMsg[byte0];
332 }
333 break;
334 }
335
336 // Sensor report action is fail
337 if (boot::status::BOOT_STATUS_FAILURE == byte2)
338 {
339 logLevel = log_level::BIOSFWPANIC;
340 }
341 }
342 else
343 {
344 if (byte3 <= boot_stage::UEFI_STATUS_CLASS_CODE_MAX)
345 {
346 description +=
347 bootStageToMsgMap[boot_stage::UEFI_STATUS_CLASS_CODE_MIN];
348
349 strStream
350 << "Segment (0x" << std::setfill('0') << std::hex
351 << std::setw(8) << static_cast<uint32_t>(presentReading)
Chau Ly3de0d942024-10-03 08:57:11 +0000352 << "); Status Class (0x" << std::setw(2)
353 << static_cast<uint32_t>(byte3) << "); Status SubClass (0x"
Chau Lya743e382024-10-26 11:12:22 +0000354 << std::setw(2) << static_cast<uint32_t>(byte2)
Chau Ly3de0d942024-10-03 08:57:11 +0000355 << "); Operation Code (0x" << std::setw(4)
Chau Lya743e382024-10-26 11:12:22 +0000356 << static_cast<uint32_t>((presentReading & 0xffff0000) >> 16)
357 << ")" << std::dec;
358
359 description += strStream.str();
360 }
361 }
362
363 // Log to Redfish event
364 sendJournalRedfish(description, logLevel);
365}
366
367int OemEventManager::processNumericSensorEvent(
368 pldm_tid_t tid, uint16_t sensorId, const uint8_t* sensorData,
369 size_t sensorDataLength)
370{
371 uint8_t eventState = 0;
372 uint8_t previousEventState = 0;
373 uint8_t sensorDataSize = 0;
374 uint32_t presentReading;
375 auto rc = decode_numeric_sensor_data(
376 sensorData, sensorDataLength, &eventState, &previousEventState,
377 &sensorDataSize, &presentReading);
378 if (rc)
379 {
380 lg2::error(
381 "Failed to decode numericSensorState event for terminus ID {TID}, error {RC} ",
382 "TID", tid, "RC", rc);
383 return rc;
384 }
385
Chau Lycebf4762024-10-03 09:02:54 +0000386 // DIMMx_Status sensorID 4+2*index (index 0 -> maxDIMMInstantNum-1)
387 if (auto dimmIdx = (sensorId - 4) / 2;
388 sensorId >= 4 && dimmIdx >= 0 && dimmIdx < maxDIMMInstantNum)
389 {
390 handleDIMMStatusEvent(tid, sensorId, presentReading);
391 return PLDM_SUCCESS;
392 }
393
Chau Lya743e382024-10-26 11:12:22 +0000394 switch (sensorId)
395 {
396 case BOOT_OVERALL:
397 handleBootOverallEvent(tid, sensorId, presentReading);
398 break;
Chau Ly3de0d942024-10-03 08:57:11 +0000399 case PCIE_HOT_PLUG:
400 handlePCIeHotPlugEvent(tid, sensorId, presentReading);
401 break;
Chau Lycebf4762024-10-03 09:02:54 +0000402 case DDR_STATUS:
403 handleDDRStatusEvent(tid, sensorId, presentReading);
404 break;
Chau Ly4cca3dc2024-10-03 09:07:09 +0000405 case PCP_VR_STATE:
406 case SOC_VR_STATE:
407 case DPHY_VR1_STATE:
408 case DPHY_VR2_STATE:
409 case D2D_VR_STATE:
410 case IOC_VR1_STATE:
411 case IOC_VR2_STATE:
412 case PCI_D_VR_STATE:
413 case PCI_A_VR_STATE:
414 handleVRDStatusEvent(tid, sensorId, presentReading);
415 break;
Chau Lyb01357f2024-10-17 09:18:01 +0000416 case WATCH_DOG:
417 handleNumericWatchdogEvent(tid, sensorId, presentReading);
418 break;
Chau Lya743e382024-10-26 11:12:22 +0000419 default:
420 std::string description;
421 std::stringstream strStream;
422 log_level logLevel = log_level::OK;
423
424 description += "SENSOR_EVENT : NUMERIC_SENSOR_STATE: ";
425 description += prefixMsgStrCreation(tid, sensorId);
426 strStream << std::setfill('0') << std::hex << "eventState 0x"
427 << std::setw(2) << static_cast<uint32_t>(eventState)
428 << " previousEventState 0x" << std::setw(2)
429 << static_cast<uint32_t>(previousEventState)
430 << " sensorDataSize 0x" << std::setw(2)
431 << static_cast<uint32_t>(sensorDataSize)
432 << " presentReading 0x" << std::setw(8)
433 << static_cast<uint32_t>(presentReading) << std::dec;
434 description += strStream.str();
435
436 sendJournalRedfish(description, logLevel);
437 break;
438 }
439 return PLDM_SUCCESS;
440}
441
442int OemEventManager::processStateSensorEvent(pldm_tid_t tid, uint16_t sensorId,
443 const uint8_t* sensorData,
444 size_t sensorDataLength)
445{
446 uint8_t sensorOffset = 0;
447 uint8_t eventState = 0;
448 uint8_t previousEventState = 0;
449
450 auto rc =
451 decode_state_sensor_data(sensorData, sensorDataLength, &sensorOffset,
452 &eventState, &previousEventState);
453 if (rc)
454 {
455 lg2::error(
456 "Failed to decode stateSensorState event for terminus ID {TID}, error {RC}",
457 "TID", tid, "RC", rc);
458 return rc;
459 }
460
461 std::string description;
Chau Lya743e382024-10-26 11:12:22 +0000462 log_level logLevel = log_level::OK;
463
Chau Lyef214b52024-10-16 09:40:38 +0000464 if (stateSensorToMsgMap.contains(sensorId))
465 {
466 description += prefixMsgStrCreation(tid, sensorId);
467 auto componentMap = stateSensorToMsgMap[sensorId];
468 if (sensorOffset < componentMap.size())
469 {
470 description += std::get<0>(componentMap[sensorOffset]);
471 auto stateMap = std::get<1>(componentMap[sensorOffset]);
472 if (stateMap.contains(eventState))
473 {
474 logLevel = std::get<0>(stateMap[eventState]);
475 description += " state : " + std::get<1>(stateMap[eventState]);
476 if (stateMap.contains(previousEventState))
477 {
478 description += "; previous state: " +
479 std::get<1>(stateMap[previousEventState]);
480 }
481 }
482 else
483 {
484 description += " sends unsupported event state: " +
485 std::to_string(eventState);
486 if (stateMap.contains(previousEventState))
487 {
488 description += "; previous state: " +
489 std::get<1>(stateMap[previousEventState]);
490 }
491 }
492 }
493 else
494 {
495 description += "sends unsupported component sensor offset " +
496 std::to_string(sensorOffset);
497 }
498 }
499 else
500 {
501 std::stringstream strStream;
502 description += "SENSOR_EVENT : STATE_SENSOR_STATE: ";
503 description += prefixMsgStrCreation(tid, sensorId);
504 strStream << std::setfill('0') << std::hex << "sensorOffset 0x"
505 << std::setw(2) << static_cast<uint32_t>(sensorOffset)
506 << "eventState 0x" << std::setw(2)
507 << static_cast<uint32_t>(eventState)
508 << " previousEventState 0x" << std::setw(2)
509 << static_cast<uint32_t>(previousEventState) << std::dec;
510 description += strStream.str();
511 }
Chau Lya743e382024-10-26 11:12:22 +0000512
513 sendJournalRedfish(description, logLevel);
514
515 return PLDM_SUCCESS;
516}
517
518int OemEventManager::processSensorOpStateEvent(
519 pldm_tid_t tid, uint16_t sensorId, const uint8_t* sensorData,
520 size_t sensorDataLength)
521{
522 uint8_t present_op_state = 0;
523 uint8_t previous_op_state = 0;
524
525 auto rc = decode_sensor_op_data(sensorData, sensorDataLength,
526 &present_op_state, &previous_op_state);
527 if (rc)
528 {
529 lg2::error(
530 "Failed to decode sensorOpState event for terminus ID {TID}, error {RC}",
531 "TID", tid, "RC", rc);
532 return rc;
533 }
534
535 std::string description;
536 std::stringstream strStream;
537 log_level logLevel = log_level::OK;
538
539 description += "SENSOR_EVENT : SENSOR_OP_STATE: ";
540 description += prefixMsgStrCreation(tid, sensorId);
541 strStream << std::setfill('0') << std::hex << "present_op_state 0x"
542 << std::setw(2) << static_cast<uint32_t>(present_op_state)
543 << "previous_op_state 0x" << std::setw(2)
544 << static_cast<uint32_t>(previous_op_state) << std::dec;
545 description += strStream.str();
546
547 sendJournalRedfish(description, logLevel);
548
549 return PLDM_SUCCESS;
550}
551
552int OemEventManager::handleSensorEvent(
553 const pldm_msg* request, size_t payloadLength, uint8_t /* formatVersion */,
554 pldm_tid_t tid, size_t eventDataOffset)
555{
556 /* This OEM event handler is only used for SoC terminus*/
557 if (!tidToSocketNameMap.contains(tid))
558 {
559 return PLDM_SUCCESS;
560 }
561 auto eventData =
562 reinterpret_cast<const uint8_t*>(request->payload) + eventDataOffset;
563 auto eventDataSize = payloadLength - eventDataOffset;
564
565 uint16_t sensorId = 0;
566 uint8_t sensorEventClassType = 0;
567 size_t eventClassDataOffset = 0;
568 auto rc =
569 decode_sensor_event_data(eventData, eventDataSize, &sensorId,
570 &sensorEventClassType, &eventClassDataOffset);
571 if (rc)
572 {
573 lg2::error("Failed to decode sensor event data return code {RC}.", "RC",
574 rc);
575 return rc;
576 }
577 const uint8_t* sensorData = eventData + eventClassDataOffset;
578 size_t sensorDataLength = eventDataSize - eventClassDataOffset;
579
580 switch (sensorEventClassType)
581 {
582 case PLDM_NUMERIC_SENSOR_STATE:
583 {
584 return processNumericSensorEvent(tid, sensorId, sensorData,
585 sensorDataLength);
586 }
587 case PLDM_STATE_SENSOR_STATE:
588 {
589 return processStateSensorEvent(tid, sensorId, sensorData,
590 sensorDataLength);
591 }
592 case PLDM_SENSOR_OP_STATE:
593 {
594 return processSensorOpStateEvent(tid, sensorId, sensorData,
595 sensorDataLength);
596 }
597 default:
598 std::string description;
599 std::stringstream strStream;
600 log_level logLevel = log_level::OK;
601
602 description += "SENSOR_EVENT : Unsupported Sensor Class " +
603 std::to_string(sensorEventClassType) + ": ";
604 description += prefixMsgStrCreation(tid, sensorId);
605 strStream << std::setfill('0') << std::hex
606 << std::setw(sizeof(sensorData) * 2) << "Sensor data: ";
607
608 auto dataPtr = sensorData;
609 for ([[maybe_unused]] const auto& i :
610 std::views::iota(0, (int)sensorDataLength))
611 {
612 strStream << "0x" << static_cast<uint32_t>(*dataPtr);
613 dataPtr += sizeof(sensorData);
614 }
615
616 description += strStream.str();
617
618 sendJournalRedfish(description, logLevel);
619 }
620 lg2::info("Unsupported class type {CLASSTYPE}", "CLASSTYPE",
621 sensorEventClassType);
622 return PLDM_ERROR;
623}
624
Chau Ly3de0d942024-10-03 08:57:11 +0000625void OemEventManager::handlePCIeHotPlugEvent(pldm_tid_t tid, uint16_t sensorId,
626 uint32_t presentReading)
627{
628 std::string description;
629 std::stringstream strStream;
630 PCIeHotPlugEventRecord_t record{presentReading};
631
632 std::string sAction = (!record.bits.action) ? "Insertion" : "Removal";
633 std::string sOpStatus = (!record.bits.opStatus) ? "Successful" : "Failed";
634 log_level logLevel =
635 (!record.bits.opStatus) ? log_level::OK : log_level::WARNING;
636
637 description += prefixMsgStrCreation(tid, sensorId);
638
639 strStream << "Segment (0x" << std::setfill('0') << std::hex << std::setw(2)
640 << static_cast<uint32_t>(record.bits.segment) << "); Bus (0x"
641 << std::setw(2) << static_cast<uint32_t>(record.bits.bus)
642 << "); Device (0x" << std::setw(2)
643 << static_cast<uint32_t>(record.bits.device) << "); Function (0x"
644 << std::setw(2) << static_cast<uint32_t>(record.bits.function)
645 << "); Action (" << sAction << "); Operation status ("
646 << sOpStatus << "); Media slot number (" << std::dec
647 << static_cast<uint32_t>(record.bits.mediaSlot) << ")";
648
649 description += strStream.str();
650
651 // Log to Redfish event
652 sendJournalRedfish(description, logLevel);
653}
654
Chau Lycebf4762024-10-03 09:02:54 +0000655std::string OemEventManager::dimmTrainingFailureToMsg(uint32_t failureInfo)
656{
657 std::string description;
658 DIMMTrainingFailure_t failure{failureInfo};
659
660 if (dimmTrainingFailureTypeMap.contains(failure.bits.type))
661 {
662 auto failureInfoMap = dimmTrainingFailureTypeMap[failure.bits.type];
663
664 description += std::get<0>(failureInfoMap);
665
666 description += "; MCU rank index " +
667 std::to_string(failure.bits.mcuRankIdx);
668
669 description += "; Slice number " +
670 std::to_string(failure.bits.sliceNum);
671
672 description += "; Upper nibble error status: ";
673 description += (!failure.bits.upperNibbStatErr)
674 ? "No error"
675 : "Found no rising edge";
676
677 description += "; Lower nibble error status: ";
678 description += (!failure.bits.lowerNibbStatErr)
679 ? "No error"
680 : "Found no rising edge";
681
682 description += "; Failure syndrome 0: ";
683
684 auto& syndromeMap = std::get<1>(failureInfoMap);
685 if (syndromeMap.contains(failure.bits.syndrome))
686 {
687 description += syndromeMap[failure.bits.syndrome];
688 }
689 else
690 {
691 description += "(Unknown syndrome)";
692 }
693 }
694 else
695 {
696 description += "Unknown training failure type " +
697 std::to_string(failure.bits.type);
698 }
699
700 return description;
701}
702
703void OemEventManager::handleDIMMStatusEvent(pldm_tid_t tid, uint16_t sensorId,
704 uint32_t presentReading)
705{
706 log_level logLevel{log_level::WARNING};
707 std::string description;
708 uint8_t byte3 = (presentReading & 0xff000000) >> 24;
709 uint32_t byte012 = presentReading & 0xffffff;
710
711 description += prefixMsgStrCreation(tid, sensorId);
712
713 uint8_t dimmIdx = (sensorId - 4) / 2;
714
715 description += "DIMM " + std::to_string(dimmIdx) + " ";
716
717 if (dimmStatusToMsgMap.contains(byte3))
718 {
719 if (byte3 == dimm_status::INSTALLED_NO_ERROR ||
720 byte3 == dimm_status::INSTALLED_BUT_DISABLED)
721 {
722 logLevel = log_level::OK;
723 }
724
725 description += dimmStatusToMsgMap[byte3];
726
727 if (byte3 == dimm_status::TRAINING_FAILURE)
728 {
729 description += "; " + dimmTrainingFailureToMsg(byte012);
730 }
731 else if (byte3 == dimm_status::PMIC_TEMP_ALERT)
732 {
733 uint8_t byte0 = (byte012 & 0xff);
734 if (byte0 < pmicTempAlertMsg.size())
735 {
736 description += ": " + pmicTempAlertMsg[byte0];
737 }
738 }
739 }
740 else
741 {
742 switch (byte3)
743 {
744 case dimm_status::PMIC_HIGH_TEMP:
745 if (byte012 == 0x01)
746 {
747 description += "has PMIC high temp condition";
748 }
749 break;
750 case dimm_status::TSx_HIGH_TEMP:
751 switch (byte012)
752 {
753 case 0x01:
754 description += "has TS0";
755 break;
756 case 0x02:
757 description += "has TS1";
758 break;
759 case 0x03:
760 description += "has TS0 and TS1";
761 break;
762 }
763 description += " exceeding their high temperature threshold";
764 break;
765 case dimm_status::SPD_HUB_HIGH_TEMP:
766 if (byte012 == 0x01)
767 {
768 description += "has SPD/HUB high temp condition";
769 }
770 break;
771 default:
772 description += "has unsupported status " +
773 std::to_string(byte3);
774 break;
775 }
776 }
777
778 // Log to Redfish event
779 sendJournalRedfish(description, logLevel);
780}
781
782void OemEventManager::handleDDRStatusEvent(pldm_tid_t tid, uint16_t sensorId,
783 uint32_t presentReading)
784{
785 log_level logLevel{log_level::WARNING};
786 std::string description;
787 uint8_t byte3 = (presentReading & 0xff000000) >> 24;
788 uint32_t byte012 = presentReading & 0xffffff;
789
790 description += prefixMsgStrCreation(tid, sensorId);
791
792 description += "DDR ";
793 if (ddrStatusToMsgMap.contains(byte3))
794 {
795 if (byte3 == ddr_status::NO_SYSTEM_LEVEL_ERROR)
796 {
797 logLevel = log_level::OK;
798 }
799
800 description += ddrStatusToMsgMap[byte3];
801
802 if (byte3 == ddr_status::CONFIGURATION_FAILURE ||
803 byte3 == ddr_status::TRAINING_FAILURE)
804 {
805 // List out failed DIMMs
806 description += dimmIdxsToString(byte012);
807 }
808 }
809 else
810 {
811 description += "has unsupported status " + std::to_string(byte3);
812 }
813
814 // Log to Redfish event
815 sendJournalRedfish(description, logLevel);
816}
817
Chau Ly4cca3dc2024-10-03 09:07:09 +0000818void OemEventManager::handleVRDStatusEvent(pldm_tid_t tid, uint16_t sensorId,
819 uint32_t presentReading)
820{
821 log_level logLevel{log_level::WARNING};
822 std::string description;
823 std::stringstream strStream;
824
825 description += prefixMsgStrCreation(tid, sensorId);
826
827 VRDStatus_t status{presentReading};
828
829 if (status.bits.warning && status.bits.critical)
830 {
831 description += "A VR warning and a VR critical";
832 logLevel = log_level::CRITICAL;
833 }
834 else
835 {
836 if (status.bits.warning)
837 {
838 description += "A VR warning";
839 }
840 else if (status.bits.critical)
841 {
842 description += "A VR critical";
843 logLevel = log_level::CRITICAL;
844 }
845 else
846 {
847 description += "No VR warning or critical";
848 logLevel = log_level::OK;
849 }
850 }
851 description += " condition observed";
852
853 strStream << "; VR status byte high is 0x" << std::setfill('0') << std::hex
854 << std::setw(2)
855 << static_cast<uint32_t>(status.bits.vr_status_byte_high)
856 << "; VR status byte low is 0x" << std::setw(2)
857 << static_cast<uint32_t>(status.bits.vr_status_byte_low)
858 << "; Reading is 0x" << std::setw(2)
859 << static_cast<uint32_t>(presentReading) << ";";
860
861 description += strStream.str();
862
863 // Log to Redfish event
864 sendJournalRedfish(description, logLevel);
865}
866
Chau Lyb01357f2024-10-17 09:18:01 +0000867void OemEventManager::handleNumericWatchdogEvent(
868 pldm_tid_t tid, uint16_t sensorId, uint32_t presentReading)
869{
870 std::string description;
871 log_level logLevel = log_level::CRITICAL;
872
873 description += prefixMsgStrCreation(tid, sensorId);
874
875 if (presentReading & 0x01)
876 {
877 description += "Global watchdog expired;";
878 }
879 if (presentReading & 0x02)
880 {
881 description += "Secure watchdog expired;";
882 }
883 if (presentReading & 0x04)
884 {
885 description += "Non-secure watchdog expired;";
886 }
887
888 // Log to Redfish event
889 sendJournalRedfish(description, logLevel);
890}
891
Chau Lya743e382024-10-26 11:12:22 +0000892} // namespace oem_ampere
893} // namespace pldm