oem-ampere: eventManager: Handle DIMM and DDR status sensor event

This commit adds the handlers and supporting APIs to process DIMM and DDR
status events, received as Numeric Sensor Events, in Ampere's
OemEventManager class. The handlers decode the event data, convert it to
human-readable text and log it to the Redfish Event Log.

Change-Id: I8d9e91356971efaa5e838992a22d98604e72c130
Signed-off-by: Chau Ly <chaul@amperecomputing.com>
diff --git a/oem/ampere/event/oem_event_manager.cpp b/oem/ampere/event/oem_event_manager.cpp
index 9f52756..1aae241 100644
--- a/oem/ampere/event/oem_event_manager.cpp
+++ b/oem/ampere/event/oem_event_manager.cpp
@@ -22,6 +22,11 @@
 namespace oem_ampere
 {
 namespace boot_stage = boot::stage;
+namespace ddr_status = ddr::status;
+namespace dimm_status = dimm::status;
+namespace dimm_syndrome = dimm::training_failure::dimm_syndrome;
+namespace phy_syndrome = dimm::training_failure::phy_syndrome;
+namespace training_failure = dimm::training_failure;
 
 constexpr const char* ampereEventRegistry = "OpenBMC.0.1.AmpereEvent.OK";
 constexpr const char* ampereWarningRegistry =
@@ -31,6 +36,7 @@
 constexpr const char* BIOSFWPanicRegistry =
     "OpenBMC.0.1.BIOSFirmwarePanicReason.Warning";
 constexpr auto maxDIMMIdxBitNum = 24;
+constexpr auto maxDIMMInstantNum = 24;
 
 /*
     An array of possible boot status of a boot stage.
@@ -46,6 +52,13 @@
     " progress started", " in-progress", " progress completed"};
 
 /*
+    PMIC temperature alert logging strings, indexed by byte 0 of the reading.
+*/
+std::array<std::string, 8> pmicTempAlertMsg = {
+    "Below 85°C", "85°C",  "95°C",  "105°C",
+    "115°C",      "125°C", "135°C", "Equal or greater than 140°C"};
+
+/*
     In Ampere systems, BMC only directly communicates with MCTP/PLDM SoC
     EPs through SMBus and PCIe. When host boots up, SMBUS interface
     comes up first. In this interface, BMC is bus owner.
@@ -59,7 +72,8 @@
     A map between sensor IDs and their names in string.
     Using pldm::oem::sensor_ids
 */
-EventToMsgMap_t sensorIdToStrMap = {{PCIE_HOT_PLUG, "PCIE_HOT_PLUG"},
+EventToMsgMap_t sensorIdToStrMap = {{DDR_STATUS, "DDR_STATUS"},
+                                    {PCIE_HOT_PLUG, "PCIE_HOT_PLUG"},
                                     {BOOT_OVERALL, "BOOT_OVERALL"}};
 
 /*
@@ -81,6 +95,77 @@
      "ATF BL33 (UEFI) booting status = "}};
 
 /*
+    A map between DDR status and logging strings.
+    Using pldm::oem::ddr::status::ddr_status
+*/
+EventToMsgMap_t ddrStatusToMsgMap = {
+    {ddr_status::NO_SYSTEM_LEVEL_ERROR, "has no system level error"},
+    {ddr_status::ECC_INITIALIZATION_FAILURE, "has ECC initialization failure"},
+    {ddr_status::CONFIGURATION_FAILURE, "has configuration failure at DIMMs:"},
+    {ddr_status::TRAINING_FAILURE, "has training failure at DIMMs:"},
+    {ddr_status::OTHER_FAILURE, "has other failure"},
+    {ddr_status::BOOT_FAILURE_NO_VALID_CONFIG,
+     "has boot failure due to no configuration"},
+    {ddr_status::FAILSAFE_ACTIVATED_NEXT_BOOT_SUCCESS,
+     "failsafe activated but boot success with the next valid configuration"}};
+
+/*
+    A map between DIMM status and logging strings.
+    Using pldm::oem::dimm::status::dimm_status
+*/
+EventToMsgMap_t dimmStatusToMsgMap = {
+    {dimm_status::INSTALLED_NO_ERROR, "is installed and no error"},
+    {dimm_status::NOT_INSTALLED, "is not installed"},
+    {dimm_status::OTHER_FAILURE, "has other failure"},
+    {dimm_status::INSTALLED_BUT_DISABLED, "is installed but disabled"},
+    {dimm_status::TRAINING_FAILURE, "has training failure; "},
+    {dimm_status::PMIC_TEMP_ALERT, "has PMIC temperature alert"}};
+
+/*
+    A map between PHY training failure syndrome and logging strings.
+    Using
+   pldm::oem::dimm::training_failure::phy_syndrome::phy_training_failure_syndrome
+*/
+EventToMsgMap_t phyTrainingFailureSyndromeToMsgMap = {
+    {phy_syndrome::NA, "(N/A)"},
+    {phy_syndrome::PHY_TRAINING_SETUP_FAILURE, "(PHY training setup failure)"},
+    {phy_syndrome::CA_LEVELING, "(CA leveling)"},
+    {phy_syndrome::PHY_WRITE_LEVEL_FAILURE,
+     "(PHY write level failure - see syndrome 1)"},
+    {phy_syndrome::PHY_READ_GATE_LEVELING_FAILURE,
+     "(PHY read gate leveling failure)"},
+    {phy_syndrome::PHY_READ_LEVEL_FAILURE, "(PHY read level failure)"},
+    {phy_syndrome::WRITE_DQ_LEVELING, "(Write DQ leveling)"},
+    {phy_syndrome::PHY_SW_TRAINING_FAILURE, "(PHY SW training failure)"}};
+
+/*
+    A map between DIMM training failure syndrome and logging strings.
+    Using
+   pldm::oem::dimm::training_failure::dimm_syndrome::dimm_training_failure_syndrome
+*/
+EventToMsgMap_t dimmTrainingFailureSyndromeToMsgMap = {
+    {dimm_syndrome::NA, "(N/A)"},
+    {dimm_syndrome::DRAM_VREFDQ_TRAINING_FAILURE,
+     "(DRAM VREFDQ training failure)"},
+    {dimm_syndrome::LRDIMM_DB_TRAINING_FAILURE, "(LRDIMM DB training failure)"},
+    {dimm_syndrome::LRDRIMM_DB_SW_TRAINING_FAILURE,
+     "(LRDIMM DB SW training failure)"}};
+
+/*
+    A map between DIMM training failure type and a pair of <logging string,
+    syndrome map>. Using
+    pldm::oem::dimm::training_failure::dimm_training_failure_type
+*/
+std::unordered_map<uint8_t, std::pair<std::string, EventToMsgMap_t>>
+    dimmTrainingFailureTypeMap = {
+        {training_failure::PHY_TRAINING_FAILURE_TYPE,
+         std::make_pair("PHY training failure",
+                        phyTrainingFailureSyndromeToMsgMap)},
+        {training_failure::DIMM_TRAINING_FAILURE_TYPE,
+         std::make_pair("DIMM training failure",
+                        dimmTrainingFailureSyndromeToMsgMap)}};
+
+/*
     A map between log level and the registry used for Redfish SEL log
     Using pldm::oem::log_level
 */
@@ -259,6 +344,14 @@
         return rc;
     }
 
+    // DIMMx_Status uses even IDs 4+2*index only; DDR_STATUS (51) is odd
+    if (auto dimmIdx = (sensorId - 4) / 2;
+        sensorId >= 4 && sensorId % 2 == 0 && dimmIdx < maxDIMMInstantNum)
+    {
+        handleDIMMStatusEvent(tid, sensorId, presentReading);
+        return PLDM_SUCCESS;
+    }
+
     switch (sensorId)
     {
         case BOOT_OVERALL:
@@ -267,6 +360,9 @@
         case PCIE_HOT_PLUG:
             handlePCIeHotPlugEvent(tid, sensorId, presentReading);
             break;
+        case DDR_STATUS:
+            handleDDRStatusEvent(tid, sensorId, presentReading);
+            break;
         default:
             std::string description;
             std::stringstream strStream;
@@ -465,5 +561,168 @@
     sendJournalRedfish(description, logLevel);
 }
 
+std::string OemEventManager::dimmTrainingFailureToMsg(uint32_t failureInfo)
+{
+    // Overlay the bit-field layout on the raw failure information.
+    DIMMTrainingFailure_t failure{failureInfo};
+
+    // Look the failure type up once and bind the entry by reference, so the
+    // syndrome map it carries is not copied on every event.
+    const auto typeIt = dimmTrainingFailureTypeMap.find(failure.bits.type);
+    if (typeIt == dimmTrainingFailureTypeMap.end())
+    {
+        return "Unknown training failure type " +
+               std::to_string(failure.bits.type);
+    }
+    const auto& [typeMsg, syndromeMap] = typeIt->second;
+
+    std::string description = typeMsg;
+
+    description += "; MCU rank index " +
+                   std::to_string(failure.bits.mcuRankIdx);
+
+    description += "; Slice number " + std::to_string(failure.bits.sliceNum);
+
+    // A set nibble status bit is reported as "Found no rising edge".
+    description += "; Upper nibble error status: ";
+    description += (!failure.bits.upperNibbStatErr) ? "No error"
+                                                    : "Found no rising edge";
+
+    description += "; Lower nibble error status: ";
+    description += (!failure.bits.lowerNibbStatErr) ? "No error"
+                                                    : "Found no rising edge";
+
+    description += "; Failure syndrome 0: ";
+
+    // Syndrome codes are decoded with the map that belongs to the failure
+    // type; codes missing from that map are reported as unknown.
+    if (const auto syndromeIt = syndromeMap.find(failure.bits.syndrome);
+        syndromeIt != syndromeMap.end())
+    {
+        description += syndromeIt->second;
+    }
+    else
+    {
+        description += "(Unknown syndrome)";
+    }
+
+    return description;
+}
+
+void OemEventManager::handleDIMMStatusEvent(pldm_tid_t tid, uint16_t sensorId,
+                                            uint32_t presentReading)
+{
+    log_level logLevel{log_level::WARNING};
+    std::string description;
+    // Byte 3 is the DIMM status code; bytes 0-2 are status-specific data.
+    uint8_t byte3 = (presentReading & 0xff000000) >> 24;
+    uint32_t byte012 = presentReading & 0xffffff;
+
+    description += prefixMsgStrCreation(tid, sensorId);
+    // DIMMx_Status sensor IDs are 4 + 2 * index.
+    uint8_t dimmIdx = (sensorId - 4) / 2;
+    description += "DIMM " + std::to_string(dimmIdx) + " ";
+
+    if (dimmStatusToMsgMap.contains(byte3))
+    {
+        if (byte3 == dimm_status::INSTALLED_NO_ERROR ||
+            byte3 == dimm_status::INSTALLED_BUT_DISABLED)
+        {
+            logLevel = log_level::OK;
+        }
+        description += dimmStatusToMsgMap[byte3];
+
+        if (byte3 == dimm_status::TRAINING_FAILURE)
+        {
+            // The mapped message already ends with "; "; add no separator.
+            description += dimmTrainingFailureToMsg(byte012);
+        }
+        else if (byte3 == dimm_status::PMIC_TEMP_ALERT)
+        {
+            uint8_t byte0 = (byte012 & 0xff);
+            if (byte0 < pmicTempAlertMsg.size())
+            {
+                description += ": " + pmicTempAlertMsg[byte0];
+            }
+        }
+    }
+    else
+    {
+        switch (byte3)
+        {
+            case dimm_status::PMIC_HIGH_TEMP:
+                if (byte012 == 0x01)
+                {
+                    description += "has PMIC high temp condition";
+                }
+                break;
+            case dimm_status::TSx_HIGH_TEMP:
+                switch (byte012)
+                {
+                    case 0x01:
+                        description += "has TS0";
+                        break;
+                    case 0x02:
+                        description += "has TS1";
+                        break;
+                    case 0x03:
+                        description += "has TS0 and TS1";
+                        break;
+                }
+                description += " exceeding their high temperature threshold";
+                break;
+            case dimm_status::SPD_HUB_HIGH_TEMP:
+                if (byte012 == 0x01)
+                {
+                    description += "has SPD/HUB high temp condition";
+                }
+                break;
+            default:
+                description += "has unsupported status " +
+                               std::to_string(byte3);
+                break;
+        }
+    }
+
+    // Log to Redfish event
+    sendJournalRedfish(description, logLevel);
+}
+
+void OemEventManager::handleDDRStatusEvent(pldm_tid_t tid, uint16_t sensorId,
+                                           uint32_t presentReading)
+{
+    // Top byte is the DDR status code; the low three bytes are its payload.
+    const uint8_t statusCode = (presentReading & 0xff000000) >> 24;
+    const uint32_t payload = presentReading & 0xffffff;
+    log_level severity{log_level::WARNING};
+
+    std::string message;
+    message += prefixMsgStrCreation(tid, sensorId);
+    message += "DDR ";
+
+    const auto statusIt = ddrStatusToMsgMap.find(statusCode);
+    if (statusIt == ddrStatusToMsgMap.end())
+    {
+        message += "has unsupported status " + std::to_string(statusCode);
+    }
+    else
+    {
+        // Only the "no system level error" status is logged as OK.
+        if (statusCode == ddr_status::NO_SYSTEM_LEVEL_ERROR)
+        {
+            severity = log_level::OK;
+        }
+        message += statusIt->second;
+        // Configuration and training failures also list the failed DIMMs.
+        if (statusCode == ddr_status::CONFIGURATION_FAILURE ||
+            statusCode == ddr_status::TRAINING_FAILURE)
+        {
+            message += dimmIdxsToString(payload);
+        }
+    }
+
+    sendJournalRedfish(message, severity);
+}
+
 } // namespace oem_ampere
 } // namespace pldm
diff --git a/oem/ampere/event/oem_event_manager.hpp b/oem/ampere/event/oem_event_manager.hpp
index 46ba526..93186cf 100644
--- a/oem/ampere/event/oem_event_manager.hpp
+++ b/oem/ampere/event/oem_event_manager.hpp
@@ -19,6 +19,7 @@
 
 enum sensor_ids
 {
+    DDR_STATUS = 51,
     PCIE_HOT_PLUG = 169,
     BOOT_OVERALL = 175,
 };
@@ -92,6 +93,95 @@
     } __attribute__((packed)) bits;
 } PCIeHotPlugEventRecord_t;
 
+typedef union
+{
+    uint32_t value; // raw 32-bit training failure information
+    struct
+    {
+        uint32_t type:2;       // failure type, see dimm_training_failure_type
+        uint32_t mcuRankIdx:3; // logged as "MCU rank index"
+        uint32_t reserved_1:3; // completes byte0 (2 + 3 + 3 bits)
+        uint32_t sliceNum:4;   // logged as "Slice number"
+        uint32_t upperNibbStatErr:1; // non-zero: "Found no rising edge"
+        uint32_t lowerNibbStatErr:1; // non-zero: "Found no rising edge"
+        uint32_t reserved_2:2; // completes byte1 (4 + 1 + 1 + 2 bits)
+        uint32_t syndrome:4;   // failure syndrome, decoded per failure type
+        uint32_t reserved_3:4; // completes byte2 (4 + 4 bits)
+        uint32_t reserved_byte:8; // byte3, unused by dimmTrainingFailureToMsg
+    } __attribute__((packed)) bits;
+} DIMMTrainingFailure_t;
+
+namespace ddr
+{
+namespace status
+{
+enum ddr_status
+{
+    NO_SYSTEM_LEVEL_ERROR = 0x01,
+    ECC_INITIALIZATION_FAILURE = 0x04,
+    CONFIGURATION_FAILURE = 0x05,
+    TRAINING_FAILURE = 0x06,
+    OTHER_FAILURE = 0x07,
+    BOOT_FAILURE_NO_VALID_CONFIG = 0x08,
+    FAILSAFE_ACTIVATED_NEXT_BOOT_SUCCESS = 0x09,
+};
+}
+} // namespace ddr
+
+namespace dimm
+{
+namespace status
+{
+enum dimm_status
+{
+    INSTALLED_NO_ERROR = 0x01,
+    NOT_INSTALLED = 0x02,
+    OTHER_FAILURE = 0x07,
+    INSTALLED_BUT_DISABLED = 0x10,
+    TRAINING_FAILURE = 0x12,
+    PMIC_HIGH_TEMP = 0x13,
+    TSx_HIGH_TEMP = 0x14,
+    SPD_HUB_HIGH_TEMP = 0x15,
+    PMIC_TEMP_ALERT = 0x16,
+};
+} // namespace status
+
+namespace training_failure
+{
+enum dimm_training_failure_type
+{
+    PHY_TRAINING_FAILURE_TYPE = 0x01,
+    DIMM_TRAINING_FAILURE_TYPE = 0x02,
+};
+
+namespace phy_syndrome
+{
+enum phy_training_failure_syndrome
+{
+    NA = 0x00,
+    PHY_TRAINING_SETUP_FAILURE = 0x01,
+    CA_LEVELING = 0x02,
+    PHY_WRITE_LEVEL_FAILURE = 0x03,
+    PHY_READ_GATE_LEVELING_FAILURE = 0x04,
+    PHY_READ_LEVEL_FAILURE = 0x05,
+    WRITE_DQ_LEVELING = 0x06,
+    PHY_SW_TRAINING_FAILURE = 0x07,
+};
+} // namespace phy_syndrome
+
+namespace dimm_syndrome
+{
+enum dimm_training_failure_syndrome
+{
+    NA = 0x00,
+    DRAM_VREFDQ_TRAINING_FAILURE = 0x01,
+    LRDIMM_DB_TRAINING_FAILURE = 0x02,
+    LRDRIMM_DB_SW_TRAINING_FAILURE = 0x03,
+};
+} // namespace dimm_syndrome
+} // namespace training_failure
+} // namespace dimm
+
 /**
  * @brief OemEventManager
  *
@@ -153,6 +243,14 @@
      */
     std::string dimmIdxsToString(uint32_t dimmIdxs);
 
+    /** @brief Convert the DIMM training failure into logging string.
+     *
+     *  @param[in] failureInfo - the training failure info (reading bytes 0-2)
+     *
+     *  @return std::string - the returned logging string
+     */
+    std::string dimmTrainingFailureToMsg(uint32_t failureInfo);
+
     /** @brief Handle numeric sensor event message from PCIe hot-plug sensor.
      *
      *  @param[in] tid - TID
@@ -171,6 +269,24 @@
     void handleBootOverallEvent(pldm_tid_t /*tid*/, uint16_t /*sensorId*/,
                                 uint32_t presentReading);
 
+    /** @brief Handle numeric sensor event message from DIMM status sensor.
+     *
+     *  @param[in] tid - TID
+     *  @param[in] sensorId - Sensor ID
+     *  @param[in] presentReading - the present reading of the sensor
+     */
+    void handleDIMMStatusEvent(pldm_tid_t tid, uint16_t sensorId,
+                               uint32_t presentReading);
+
+    /** @brief Handle numeric sensor event message from DDR status sensor.
+     *
+     *  @param[in] tid - TID
+     *  @param[in] sensorId - Sensor ID
+     *  @param[in] presentReading - the present reading of the sensor
+     */
+    void handleDDRStatusEvent(pldm_tid_t tid, uint16_t sensorId,
+                              uint32_t presentReading);
+
     /** @brief Handle numeric sensor event messages.
      *
      *  @param[in] tid - TID