meta-ampere: ac01: boot-progress: update dimm training fail log

Log DIMM training failure information when DDR initialization
fails.

Tested: Inject DDR error and verify with Redfish and/or sel log:
 1. IPMI chassis power off
 2. tweak nvparam to inject DDR error
 3. IPMI chassis power on
 4. Read the SEL log and/or Redfish to confirm that the correct DDR
    event is logged

Signed-off-by: Thang Q. Nguyen <thang@os.amperecomputing.com>
Change-Id: I2a87c9b775faf17b8112d08eff4df44286a444bc
diff --git a/meta-ampere/meta-common/recipes-ampere/host/ac01-boot-progress.bb b/meta-ampere/meta-common/recipes-ampere/host/ac01-boot-progress.bb
index 823bea7..acd9292 100644
--- a/meta-ampere/meta-common/recipes-ampere/host/ac01-boot-progress.bb
+++ b/meta-ampere/meta-common/recipes-ampere/host/ac01-boot-progress.bb
@@ -1,5 +1,5 @@
-SUMMARY = "Altra Boot Progress Handling Service"
-DESCRIPTION = "OpenBMC Altra Boot Progress Handling Daemon"
+SUMMARY = "Phosphor OpenBMC Boot Progress Handling Service"
+DESCRIPTION = "Phosphor OpenBMC Altra Boot Progress Handling Daemon"
 
 PR = "r1"
 LICENSE = "Apache-2.0"
@@ -14,6 +14,7 @@
 
 SRC_URI = " \
            file://ampere_boot_progress.sh \
+           file://dimm_train_fail_log.sh \
           "
 
 SYSTEMD_PACKAGES = "${PN}"
@@ -29,5 +30,6 @@
 do_install () {
     install -d ${D}${sbindir}
     install -m 0755 ${WORKDIR}/ampere_boot_progress.sh ${D}${sbindir}/
+    install -m 0755 ${WORKDIR}/dimm_train_fail_log.sh ${D}${sbindir}/
 }
 
diff --git a/meta-ampere/meta-common/recipes-ampere/host/ac01-boot-progress/ampere-boot-progress.service b/meta-ampere/meta-common/recipes-ampere/host/ac01-boot-progress/ampere-boot-progress.service
index 267d901..44e4b96 100644
--- a/meta-ampere/meta-common/recipes-ampere/host/ac01-boot-progress/ampere-boot-progress.service
+++ b/meta-ampere/meta-common/recipes-ampere/host/ac01-boot-progress/ampere-boot-progress.service
@@ -1,6 +1,6 @@
 [Unit]
 Description=Ampere Altra Boot Progress Handling
-After=ampere-host-already-on@0.target
+After=obmc-host-already-on@0.target
 BindTo=obmc-host-already-on@0.target
 
 [Service]
diff --git a/meta-ampere/meta-common/recipes-ampere/host/ac01-boot-progress/ampere_boot_progress.sh b/meta-ampere/meta-common/recipes-ampere/host/ac01-boot-progress/ampere_boot_progress.sh
index 87ad34f..7c5edbf 100755
--- a/meta-ampere/meta-common/recipes-ampere/host/ac01-boot-progress/ampere_boot_progress.sh
+++ b/meta-ampere/meta-common/recipes-ampere/host/ac01-boot-progress/ampere_boot_progress.sh
@@ -149,7 +149,7 @@
 while [ $cnt -lt 30 ];
 do
 	# Sleep 200ms
-	usleep 200000
+	sleep 1s
 	if ! read -r bg <<< "$(cat /sys/bus/platform/devices/smpro-misc.2.auto/boot_progress)";
 	then
 		cnt=$((cnt + 1))
@@ -180,6 +180,11 @@
 	if [ "${boot_status}" == "03" ]; then
 		# Log Redfish Event if failure.
 		log_redfish_bios_panic_event "$boot_stage" "$uefi_code"
+		# Dimm training failed, check errors
+		if [ "${boot_stage}" == "04" ]; then
+			/usr/sbin/dimm_train_fail_log.sh 0
+			/usr/sbin/dimm_train_fail_log.sh 1
+		fi
 	elif [ "${boot_status}" == "01" ]; then
 		# Check and set boot progress to dbus
 		set_boot_progress "$boot_stage" "$uefi_code"
diff --git a/meta-ampere/meta-common/recipes-ampere/host/ac01-boot-progress/dimm_train_fail_log.sh b/meta-ampere/meta-common/recipes-ampere/host/ac01-boot-progress/dimm_train_fail_log.sh
new file mode 100644
index 0000000..f1f64c1
--- /dev/null
+++ b/meta-ampere/meta-common/recipes-ampere/host/ac01-boot-progress/dimm_train_fail_log.sh
@@ -0,0 +1,190 @@
+#!/bin/bash
+# Print the SMpro sysfs directory for socket $1 ("0", or any other value).
+smpro_path() {
+	case "$1" in
+		0) echo "/sys/bus/i2c/drivers/smpro-core/2-004f" ;;
+		*) echo "/sys/bus/i2c/drivers/smpro-core/2-004e" ;;
+	esac
+}
+
+# Feed one Redfish event entry to `logger-systemd --journald` on stdin.
+# Arguments:
+#   $1 - MESSAGE text          $2 - PRIORITY
+#   $3 - SEVERITY              $4 - REDFISH_MESSAGE_ID
+#   $5, $6 - the two values joined by a comma in REDFISH_MESSAGE_ARGS
+log_ampere_oem_redfish_event() {
+	msg=$1 priority=$2 severity=$3
+	msgID=$4 msgArgs1=$5 msgArgs2=$6
+
+	printf '%s\n' \
+		"MESSAGE=${msg}" \
+		"PRIORITY=${priority}" \
+		"SEVERITY=${severity}" \
+		"REDFISH_MESSAGE_ID=${msgID}" \
+		"REDFISH_MESSAGE_ARGS=${msgArgs1},${msgArgs2}" |
+		logger-systemd --journald
+}
+
+# Describe PHY syndrome 1 ($1) on stdout as "Slice N: <upper>, <lower>".
+# Bits 0-3 hold the slice number, bit 4 flags the upper nibble and
+# bit 5 flags the lower nibble.
+parse_phy_syndrome_s1_type() {
+	s1=$1
+	slice=$((s1 & 0xf))
+	ubit=$(((s1 >> 4) & 1))
+	lbit=$(((s1 >> 5) & 1))
+	# Start from the "no error" text and override it when the flag is set.
+	uMsg="Upper Nibble: No Error"
+	lMsg="Lower Nibble: No Error"
+	((ubit)) && uMsg="Upper Nibble: No rising edge error"
+	((lbit)) && lMsg="Lower Nibble: No rising edge error"
+	echo "Slice $slice: $uMsg, $lMsg"
+}
+
+#######################################
+# Print a description of a PHY training failure.
+# Arguments:
+#   $1 - syndrome 0, the failure code (1-5; anything else prints "N/A")
+#   $2 - syndrome 1, decoded only for a write leveling failure (code 2)
+# Outputs:
+#   Writes the description to stdout.
+#######################################
+parse_phy_syndrome() {
+	s0=$1
+	s1=$2
+
+	case $s0 in
+		1) echo "PHY Training Setup failure" ;;
+		2)
+			# Only the write leveling case decodes syndrome 1.
+			s1Msg=$(parse_phy_syndrome_s1_type "$s1")
+			echo "PHY Write Leveling failure: $s1Msg"
+			;;
+		3) echo "PHY Read Gate Leveling failure" ;;
+		4) echo "PHY Read Leveling failure" ;;
+		5) echo "PHY Software Training failure" ;;
+		*) echo "N/A" ;;
+	esac
+}
+
+#######################################
+# Print a description of a DIMM training failure.
+# Arguments:
+#   $1 - syndrome 0, the failure code
+# Outputs:
+#   Writes the description to stdout; "N/A" for any
+#   code other than 1, 2 or 3.
+#######################################
+parse_dimm_syndrome() {
+	s0=$1
+	case $s0 in
+		1) echo "DRAM VREFDQ Training failure" ;;
+		2) echo "LRDIMM DB Training failure" ;;
+		3) echo "LRDIMM DB Software Training failure" ;;
+		*) echo "N/A" ;;
+	esac
+}
+
+# Decode a DIMM training syndrome and log it as a Redfish event.
+#   $1 - DIMM slot, $2 - syndrome word; both hex digits without "0x"
+log_err_to_redfish_err() {
+	channel="$(printf '%d' "0x$1" 2>/dev/null)"
+	data="$(printf '%d' "0x$2" 2>/dev/null)"
+	# Bits 0-1: error type, 2-4: rank, 5-7: syndrome 0, 8-15: syndrome 1.
+	trErrType=$((data & 0x03))
+	rank=$(((data >> 2) & 0x07))
+	syndrome0=$(((data >> 5) & 0x07))
+	syndrome1=$(((data >> 8) & 0xFF))
+	redfisComp="DIMM"
+	case "$trErrType" in
+		1)	# PHY syndrome errors
+			fType="PHY training failure"
+			redfisMsg=$(parse_phy_syndrome "$syndrome0" "$syndrome1")
+			;;
+		2)	# DIMM training errors
+			fType="DIMM training failure"
+			redfisMsg=$(parse_dimm_syndrome "$syndrome0")
+			;;
+		*)
+			fType="Invalid DIMM Syndrome error type"
+			redfisMsg="NA"
+			;;
+	esac
+	log_ampere_oem_redfish_event \
+		"" 2 "" "OpenBMC.0.1.AmpereCritical.Critical" \
+		"$redfisComp" "Slot $channel MCU rank $rank: $fType: $redfisMsg"
+}
+
+# Log one DIMM training syndrome as an OEM SEL record by calling
+# IpmiSelAddOem on xyz.openbmc_project.Logging.IPMI.
+#   $1 - DIMM slot, $2 - syndrome word; both hex digits without "0x"
+# Reads the global "socket" and the constants defined later in this script.
+log_err_to_sel_err() {
+	channel="$(printf '%d' "0x$1" 2>/dev/null)"
+	data="$(printf '%d' "0x$2" 2>/dev/null)"
+	byte0=$(((data >> 8) & 0xff))
+	byte1=$((data & 0xff))
+	evtdata0=$((EVENT_DIR_ASSERTION | OEM_SENSOR_SPECIFIC))
+	evtdata1=$(((channel << 4) | BOOT_SYNDROME_DATA | (socket << 3)))
+	# 12-byte OEM payload: 3-byte OEM id, two sensor bytes, four data
+	# bytes and three zero bytes; the trailing 0xC0 is the final byte arg.
+	local -a payload=(
+		0x3a 0xcd 0x00
+		"$SENSOR_TYPE_SYSTEM_FW_PROGRESS" "$SENSOR_BOOT_PROGRESS"
+		"$evtdata0" "$evtdata1" "$byte0" "$byte1"
+		0x00 0x00 0x00
+	)
+	busctl call xyz.openbmc_project.Logging.IPMI \
+		/xyz/openbmc_project/Logging/IPMI \
+		xyz.openbmc_project.Logging.IPMI IpmiSelAddOem \
+		sayy "" "${#payload[@]}" "${payload[@]}" 0xC0
+}
+
+# Constants used by log_err_to_sel_err when building the OEM SEL record.
+BOOT_SYNDROME_DATA=4
+SENSOR_BOOT_PROGRESS=235
+EVENT_DIR_ASSERTION=0x00
+OEM_SENSOR_SPECIFIC=0x70
+SENSOR_TYPE_SYSTEM_FW_PROGRESS=0x0F
+
+# Usage: dimm_train_fail_log.sh <socket>
+#   socket 0 selects the 2-004f SMpro device, any other value 2-004e.
+socket=$1
+base="$(smpro_path "$socket")"
+
+# For the second socket, stale boot progress entries must be read out
+# (which clears them) before the DIMM training failure info is queried.
+# Draining normally takes up to 12 reads; 16 are done to be safe.
+if [ "$socket" == "1" ]; then
+	path=("$base"/smpro-misc.*.auto/boot_progress)
+	filename="${path[0]}"
+	if [ ! -f "$filename" ]; then
+		echo "Error: $filename not found"
+	else
+		for ((i=0; i<16; i++)); do
+			cat "$filename" > /dev/null 2>&1
+		done
+	fi
+fi
+
+# Check the syndrome file of every DIMM slot (0-15).
+for ((i=0; i<16; i++)); do
+	path=("$base"/smpro-errmon.*.auto/event_dimm"${i}"_syndrome)
+	filename="${path[0]}"
+	if [ ! -f "$filename" ]; then
+		echo "Error: $filename not found"
+		continue
+	fi
+
+	line=$(cat "$filename")
+	if [ -n "$line" ]; then
+		# The log helpers parse the slot as hexadecimal ("0x$1"), so it
+		# must be passed in hex: a decimal 10-15 would be decoded as
+		# slot 16-21 and no longer fit the 4-bit slot field of the SEL.
+		slot=$(printf '%x' "$i")
+		log_err_to_redfish_err "$slot" "$line"
+		log_err_to_sel_err "$slot" "$line"
+	fi
+done
+
+exit 0