meta-ampere: ac01: boot-progress: update dimm training fail log
Log DIMM training failure information when DDR initialization
fails.
Tested: Inject DDR error and verify with Redfish and/or sel log:
1. IPMI chassis power off
2. tweak nvparam to inject DDR error
3. IPMI chassis power on
4. Read the SEL log and/or Redfish to confirm that the correct DDR
event is logged
Signed-off-by: Thang Q. Nguyen <thang@os.amperecomputing.com>
Change-Id: I2a87c9b775faf17b8112d08eff4df44286a444bc
diff --git a/meta-ampere/meta-common/recipes-ampere/host/ac01-boot-progress/ampere-boot-progress.service b/meta-ampere/meta-common/recipes-ampere/host/ac01-boot-progress/ampere-boot-progress.service
index 267d901..44e4b96 100644
--- a/meta-ampere/meta-common/recipes-ampere/host/ac01-boot-progress/ampere-boot-progress.service
+++ b/meta-ampere/meta-common/recipes-ampere/host/ac01-boot-progress/ampere-boot-progress.service
@@ -1,6 +1,6 @@
[Unit]
Description=Ampere Altra Boot Progress Handling
-After=ampere-host-already-on@0.target
+After=obmc-host-already-on@0.target
BindTo=obmc-host-already-on@0.target
[Service]
diff --git a/meta-ampere/meta-common/recipes-ampere/host/ac01-boot-progress/ampere_boot_progress.sh b/meta-ampere/meta-common/recipes-ampere/host/ac01-boot-progress/ampere_boot_progress.sh
index 87ad34f..7c5edbf 100755
--- a/meta-ampere/meta-common/recipes-ampere/host/ac01-boot-progress/ampere_boot_progress.sh
+++ b/meta-ampere/meta-common/recipes-ampere/host/ac01-boot-progress/ampere_boot_progress.sh
@@ -149,7 +149,7 @@
while [ $cnt -lt 30 ];
do
# Sleep 200ms
- usleep 200000
+ sleep 1s
if ! read -r bg <<< "$(cat /sys/bus/platform/devices/smpro-misc.2.auto/boot_progress)";
then
cnt=$((cnt + 1))
@@ -180,6 +180,11 @@
if [ "${boot_status}" == "03" ]; then
# Log Redfish Event if failure.
log_redfish_bios_panic_event "$boot_stage" "$uefi_code"
+ # Dimm training failed, check errors
+ if [ "${boot_stage}" == "04" ]; then
+ /usr/sbin/dimm_train_fail_log.sh 0
+ /usr/sbin/dimm_train_fail_log.sh 1
+ fi
elif [ "${boot_status}" == "01" ]; then
# Check and set boot progress to dbus
set_boot_progress "$boot_stage" "$uefi_code"
diff --git a/meta-ampere/meta-common/recipes-ampere/host/ac01-boot-progress/dimm_train_fail_log.sh b/meta-ampere/meta-common/recipes-ampere/host/ac01-boot-progress/dimm_train_fail_log.sh
new file mode 100644
index 0000000..f1f64c1
--- /dev/null
+++ b/meta-ampere/meta-common/recipes-ampere/host/ac01-boot-progress/dimm_train_fail_log.sh
@@ -0,0 +1,190 @@
+#!/bin/bash
+smpro_path() {
+ if [ "$1" == 0 ]; then
+ echo "/sys/bus/i2c/drivers/smpro-core/2-004f"
+ else
+ echo "/sys/bus/i2c/drivers/smpro-core/2-004e"
+ fi
+}
+
+function log_ampere_oem_redfish_event()
+{
+    # Send one record to journald through logger-systemd.
+    # Arguments: $1 MESSAGE, $2 PRIORITY, $3 SEVERITY,
+    #   $4 REDFISH_MESSAGE_ID, $5 and $6 are joined with a comma into
+    #   REDFISH_MESSAGE_ARGS. All six are stored in global variables.
+    msg=$1 priority=$2 severity=$3
+    msgID=$4 msgArgs1=$5 msgArgs2=$6
+
+    printf '%s\n' \
+        "MESSAGE=${msg}" \
+        "PRIORITY=${priority}" \
+        "SEVERITY=${severity}" \
+        "REDFISH_MESSAGE_ID=${msgID}" \
+        "REDFISH_MESSAGE_ARGS=${msgArgs1},${msgArgs2}" |
+        logger-systemd --journald
+}
+
+parse_phy_syndrome_s1_type() {
+ s1=$1
+ slice=$((s1 & 0xf))
+ ubit=$(((s1 & 0x10) >> 4))
+ lbit=$(((s1 & 0x20) >> 5))
+ uMsg="Upper Nibble: No Error"
+ lMsg="Lower Nibble: No Error"
+ if [ $ubit == 1 ]; then
+ uMsg="Upper Nibble: No rising edge error"
+ fi
+ if [ $lbit == 1 ]; then
+ lMsg="Lower Nibble: No rising edge error"
+ fi
+ echo "Slice $slice: $uMsg, $lMsg"
+}
+
+parse_phy_syndrome() {
+    #######################################
+    # Translate a PHY training syndrome into a readable message.
+    # Globals:
+    #   s0, s1, s1Msg (written)
+    # Arguments:
+    #   $1 - syndrome 0 (failure kind, 1-5; anything else gives "N/A")
+    #   $2 - syndrome 1 (decoded only for kind 2, write leveling)
+    # Outputs:
+    #   One line of text on stdout.
+    #######################################
+    s0=$1
+    s1=$2
+    case "$s0" in
+        1) echo "PHY Training Setup failure" ;;
+        2)
+            s1Msg=$(parse_phy_syndrome_s1_type "$s1")
+            echo "PHY Write Leveling failure: $s1Msg"
+            ;;
+        3) echo "PHY Read Gate Leveling failure" ;;
+        4) echo "PHY Read Leveling failure" ;;
+        5) echo "PHY Software Training failure" ;;
+        *) echo "N/A" ;;
+    esac
+}
+
+parse_dimm_syndrome() {
+    #######################################
+    # Translate a DIMM training syndrome into a readable message.
+    # Globals:
+    #   s0 (written)
+    # Arguments:
+    #   $1 - syndrome 0 (failure kind, 1-3; anything else gives "N/A")
+    # Outputs:
+    #   One line of text on stdout.
+    #######################################
+    s0=$1
+    case "$s0" in
+        1) echo "DRAM VREFDQ Training failure" ;;
+        2) echo "LRDIMM DB Training failure" ;;
+        3) echo "LRDIMM DB Software Training failure" ;;
+        *) echo "N/A" ;;
+    esac
+}
+
+log_err_to_redfish_err() {
+    # Log one DIMM training failure as an Ampere OEM Redfish event.
+    # $1: DIMM slot index, decimal (read as hex, 10-15 became 16-21).
+    # $2: syndrome word as hex digits; "0x" is prepended here.
+    channel="$(printf '%d' "$1" 2>/dev/null)"
+    data="$(printf '%d' "0x$2" 2>/dev/null)"
+    # Bits 0-1: error type, 2-4: rank, 5-7: syndrome 0, 8-15: syndrome 1.
+    trErrType=$((data & 0x03))
+    rank=$(((data & 0x1C) >> 2))
+    syndrome0=$(((data & 0xE0) >> 5))
+    syndrome1=$(((data & 0xFF00) >> 8))
+
+    redfisComp="DIMM"
+    if [ "$trErrType" == 1 ]; then
+        # PHY syndrome errors
+        fType="PHY training failure"
+        redfisMsg=$(parse_phy_syndrome "$syndrome0" "$syndrome1")
+    elif [ "$trErrType" == 2 ]; then
+        # DIMM training errors
+        fType="DIMM training failure"
+        redfisMsg=$(parse_dimm_syndrome "$syndrome0")
+    else
+        fType="Invalid DIMM Syndrome error type"
+        redfisMsg="NA"
+    fi
+    log_ampere_oem_redfish_event \
+        "" 2 "" "OpenBMC.0.1.AmpereCritical.Critical" \
+        "$redfisComp" "Slot $channel MCU rank $rank: $fType: $redfisMsg"
+}
+
+log_err_to_sel_err() {
+    # Add an OEM SEL record for one DIMM training failure via busctl.
+    # $1: DIMM slot index, decimal (read as hex, 10-15 became 16-21 and
+    #     no longer fit the slot nibble of event data 1).
+    # $2: syndrome word as hex digits; "0x" is prepended here.
+    # Reads the global 'socket' and the SEL constants set at file level.
+    channel="$(printf '%d' "$1" 2>/dev/null)"
+    data="$(printf '%d' "0x$2" 2>/dev/null)"
+    byte0=$(((data & 0xff00) >> 8))
+    byte1=$((data & 0xff))
+    evtdata0=$((EVENT_DIR_ASSERTION | OEM_SENSOR_SPECIFIC))
+    evtdata1=$(((channel << 4) | BOOT_SYNDROME_DATA | (socket << 3)))
+
+    # Array of 12 bytes: OEM ID (3), the two SENSOR_* constants, event
+    # data 0-1, syndrome high and low byte, 3 zero bytes; then 0xC0.
+    busctl call xyz.openbmc_project.Logging.IPMI \
+        /xyz/openbmc_project/Logging/IPMI \
+        xyz.openbmc_project.Logging.IPMI IpmiSelAddOem \
+        sayy "" 12 \
+        0x3a 0xcd 0x00 \
+        "$SENSOR_TYPE_SYSTEM_FW_PROGRESS" "$SENSOR_BOOT_PROGRESS" \
+        "$evtdata0" "$evtdata1" "$byte0" "$byte1" \
+        0x00 0x00 0x00 0xC0
+}
+
+BOOT_SYNDROME_DATA=4
+SENSOR_BOOT_PROGRESS=235
+EVENT_DIR_ASSERTION=0x00
+OEM_SENSOR_SPECIFIC=0x70
+SENSOR_TYPE_SYSTEM_FW_PROGRESS=0x0F
+
+socket=$1
+base="$(smpro_path "$socket")"
+
+# For the second socket, it is required to read out to
+# clear all old boot progress before query the dimm
+# training fail info.
+# Normally, it would take up to 12 times to read them all
+# Make the value to 16 to make sure it always works.
+if [ "$socket" == "1" ]; then
+ path=("$base"/smpro-misc.*.auto/boot_progress)
+ filename="${path[0]}"
+ if [ ! -f "$filename" ];
+ then
+ echo "Error: $filename not found"
+ else
+ for ((i=0; i<16; i++))
+ do
+ cat "$filename" > /dev/null 2>&1
+ done
+ fi
+fi
+
+# Checking for DIMM slot 0-15
+for ((i=0; i<16; i++))
+do
+ path=("$base"/smpro-errmon.*.auto/event_dimm"${i}"_syndrome)
+ filename="${path[0]}"
+ if [ ! -f "$filename" ];
+ then
+ echo "Error: $filename not found"
+ continue
+ fi
+
+ line=$(cat "$filename")
+ if [ -n "$line" ];
+ then
+ log_err_to_redfish_err "$i" "$line"
+ log_err_to_sel_err "$i" "$line"
+ fi
+done
+
+exit 0;