meta-ampere: add fault monitor support

Support to detect GPIO, PSU, FAN, ... faults and turn ON/OFF fault LED.

Tested:
1. Unplug a PSU and check if Fault LED is turned ON.
2. Unplug a FAN and check if Fault LED is turned ON.
3. Simulate a GPIO fault pattern and check if the BMC can detect it.

Signed-off-by: Thang Q. Nguyen <thang@os.amperecomputing.com>
Signed-off-by: Hieu Huynh <hieuh@os.amperecomputing.com>
Signed-off-by: Quang Nguyen <quangn@amperecomputing.com>
Change-Id: Idfcd32953cf811fbe9299a162f604cb8fd028962
diff --git a/meta-ampere/meta-common/recipes-ampere/platform/ampere-fault-monitor.bb b/meta-ampere/meta-common/recipes-ampere/platform/ampere-fault-monitor.bb
new file mode 100644
index 0000000..79d7a3d
--- /dev/null
+++ b/meta-ampere/meta-common/recipes-ampere/platform/ampere-fault-monitor.bb
@@ -0,0 +1,23 @@
+SUMMARY = "Ampere Computing LLC Fault Monitor"
+DESCRIPTION = "Monitor fault events and update fault led status for Ampere systems"
+PR = "r1"
+
+LICENSE = "Apache-2.0"
+
+LIC_FILES_CHKSUM = "file://${COMMON_LICENSE_DIR}/Apache-2.0;md5=89aea4e17d99a7cacdbeed46a0096b10"
+
+inherit systemd
+inherit obmc-phosphor-systemd
+
+FILESEXTRAPATHS:append := "${THISDIR}/${PN}:"
+
+SYSTEMD_SERVICE:${PN} = "ampere-fault-monitor.service"
+
+GPIO_FAULT_START_TGT = "ampere-check-gpio-fault@.service"
+GPIO_FAULT_START_S0_INSTMPL = "ampere-check-gpio-fault@{0}.service"
+SYSTEMD_SERVICE:${PN} += "${GPIO_FAULT_START_TGT}"
+
+HOST_ON_STARTMIN_TGTFMT = "obmc-host-startmin@{0}.target"
+GPIO_FAULT_START_S0_STARTMIN_FMT = "../${GPIO_FAULT_START_TGT}:${HOST_ON_STARTMIN_TGTFMT}.wants/${GPIO_FAULT_START_S0_INSTMPL}"
+SYSTEMD_LINK:${PN} += "${@compose_list_zip(d, 'GPIO_FAULT_START_S0_STARTMIN_FMT', 'OBMC_HOST_INSTANCES')}"
+
diff --git a/meta-ampere/meta-common/recipes-ampere/platform/ampere-fault-monitor/ampere-check-gpio-fault@.service b/meta-ampere/meta-common/recipes-ampere/platform/ampere-fault-monitor/ampere-check-gpio-fault@.service
new file mode 100644
index 0000000..8502b73
--- /dev/null
+++ b/meta-ampere/meta-common/recipes-ampere/platform/ampere-fault-monitor/ampere-check-gpio-fault@.service
@@ -0,0 +1,12 @@
+[Unit]
+Description=Monitor GPIO fault and update fault LED status %i
+Before=obmc-host-start-pre@0.target
+After=obmc-host-already-on@0.target
+Conflicts=obmc-host-stop@0.target
+
+[Service]
+Restart=no
+ExecStart=/bin/sh -c "if [ -f /usr/sbin/ampere_check_gpio_fault.sh ]; then /usr/sbin/ampere_check_gpio_fault.sh %i; fi"
+ExecStopPost=/bin/sh -c "if [ -f /tmp/gpio_fault ]; then sleep 5; rm /tmp/gpio_fault; fi"
+SyslogIdentifier=ampere_check_fault_gpio
+Type=simple
diff --git a/meta-ampere/meta-common/recipes-ampere/platform/ampere-fault-monitor/ampere-fault-monitor.service b/meta-ampere/meta-common/recipes-ampere/platform/ampere-fault-monitor/ampere-fault-monitor.service
new file mode 100644
index 0000000..ee6af1e
--- /dev/null
+++ b/meta-ampere/meta-common/recipes-ampere/platform/ampere-fault-monitor/ampere-fault-monitor.service
@@ -0,0 +1,12 @@
+[Unit]
+Description=Monitor fault events and update fault LED status
+After=xyz.openbmc_project.State.Host@0.service
+
+[Service]
+Restart=on-failure
+ExecStart=/usr/sbin/ampere_fault_monitor.sh
+SyslogIdentifier=ampere_fault_monitor.sh
+Type=simple
+
+[Install]
+WantedBy={SYSTEMD_DEFAULT_TARGET}
diff --git a/meta-ampere/meta-jade/conf/machine/mtjade.conf b/meta-ampere/meta-jade/conf/machine/mtjade.conf
index 914c160..f1e51bb 100644
--- a/meta-ampere/meta-jade/conf/machine/mtjade.conf
+++ b/meta-ampere/meta-jade/conf/machine/mtjade.conf
@@ -33,6 +33,7 @@
                                    phosphor-ipmi-blobs \
                                    phosphor-ipmi-blobs-binarystore \
                                    ampere-driver-binder \
+                                   ampere-fault-monitor \
                                   "
 
 PREFERRED_PROVIDER_virtual/obmc-chassis-mgmt = "packagegroup-ampere-apps"
diff --git a/meta-ampere/meta-jade/recipes-ampere/platform/ampere-fault-monitor.bbappend b/meta-ampere/meta-jade/recipes-ampere/platform/ampere-fault-monitor.bbappend
new file mode 100644
index 0000000..2f3e457
--- /dev/null
+++ b/meta-ampere/meta-jade/recipes-ampere/platform/ampere-fault-monitor.bbappend
@@ -0,0 +1,14 @@
+FILESEXTRAPATHS:append := "${THISDIR}/${PN}:"
+
+RDEPENDS:${PN} = "bash"
+
+SRC_URI += " \
+            file://ampere_fault_monitor.sh \
+            file://ampere_check_gpio_fault.sh \
+           "
+
+do_install() {
+    install -d ${D}/${sbindir}
+    install -m 755 ${WORKDIR}/ampere_fault_monitor.sh ${D}/${sbindir}/
+    install -m 755 ${WORKDIR}/ampere_check_gpio_fault.sh ${D}/${sbindir}/
+}
diff --git a/meta-ampere/meta-jade/recipes-ampere/platform/ampere-fault-monitor/ampere_check_gpio_fault.sh b/meta-ampere/meta-jade/recipes-ampere/platform/ampere-fault-monitor/ampere_check_gpio_fault.sh
new file mode 100755
index 0000000..141c50c
--- /dev/null
+++ b/meta-ampere/meta-jade/recipes-ampere/platform/ampere-fault-monitor/ampere_check_gpio_fault.sh
@@ -0,0 +1,205 @@
+#!/bin/bash
+
+# This script monitors S0/S1 fault GPIO and detects errors or warnings from CPUs
+#
+# According to OpenBMC_Software_Functional_Specification, section 3.16,
+#
+# When the BMC detects the GPIO_FAULT signal indicating an SCP booting failure:
+# •  If a non-critical error/warning from the SCP occurs, the BMC blinks the Fault LED once.
+# •  If a critical error from the SCP occurs, the BMC turns on the Fault LED.
+# The BMC monitors the GPIO_FAULT signal from the SCP during SCP booting to determine whether
+# the error is non-critical or critical. A fatal error is indicated when the signal is On and then Off
+# continuously, followed by a “quiet” period of about three seconds, and this pattern repeats. If the “quiet”
+# period is longer than three seconds, the error is non-fatal. The BMC must set up appropriate debounce
+# times to detect such errors. The BMC is expected to turn on the Fault LED forever for fatal errors, or to
+# turn on the Fault LED and turn it off when the fault clears for non-fatal errors.
+#
+# Usage: <app_name> <socket 0/1>
+
+# shellcheck source=/dev/null
+source /usr/sbin/gpio-lib.sh
+
+# global variables
+	error_flag='/tmp/fault_err'
+	warning_flag='/tmp/fault_warning'
+
+	duty_cycle=250000
+	scan_pulse=100000
+	blank_num=8
+
+	curr_pattern=0
+	prev_pattern=0
+
+	gpio_status=0
+	repeat=0
+
+	socket=$1
+
+	socket1_present=15
+	socket1_status=1
+
+	S0_fault_gpio=73
+	S1_fault_gpio=201
+
+map_event_name() {
+	case $curr_pattern in
+		1)
+			event_name="RAS_GPIO_INVALID_LCS"
+			;;
+		2)
+			event_name="RAS_GPIO_FILE_HDR_INVALID"
+			;;
+		3)
+			event_name="RAS_GPIO_FILE_INTEGRITY_INVALID"
+			;;
+		4)
+			event_name="RAS_GPIO_KEY_CERT_AUTH_ERR"
+			;;
+		5)
+			event_name="RAS_GPIO_CNT_CERT_AUTH_ERR"
+			;;
+		6)
+			event_name="RAS_GPIO_I2C_HARDWARE_ERR"
+			;;
+		7)
+			event_name="RAS_GPIO_CRYPTO_ENGINE_ERR"
+			;;
+		8)
+			event_name="RAS_GPIO_ROTPK_EFUSE_INVALID"
+			;;
+		9)
+			event_name="RAS_GPIO_SEED_EFUSE_INVALID"
+			;;
+		10)
+			event_name="RAS_GPIO_LCS_FROM_EFUSE_INVALID"
+			;;
+		11)
+			event_name="RAS_GPIO_PRIM_ROLLBACK_EFUSE_INVALID"
+			;;
+		12)
+			event_name="RAS_GPIO_SEC_ROLLBACK_EFUSE_INVALID"
+			;;
+		13)
+			event_name="RAS_GPIO_HUK_EFUSE_INVALID"
+			;;
+		14)
+			event_name="RAS_GPIO_CERT_DATA_INVALID"
+			;;
+		15)
+			event_name="RAS_GPIO_INTERNAL_HW_ERR"
+			;;
+		*)
+			event_name="NOT_SUPPORT"
+			;;
+	esac
+}
+
+detect_patern_repeat() {
+	local prev=0
+	local curr=0
+	local cnt=13	# at most cnt + 1 = 14 polls, one every scan_pulse
+
+	while true
+	do
+		usleep $scan_pulse
+		gpio_status=$(cat /sys/class/gpio/gpio"$gpio_Id"/value)
+		prev=$curr
+		curr=$gpio_status
+		if [ "$prev" == 0 ] && [ "$curr" == 1 ]; then
+			# rising edge: the pattern starts again; the caller compares it with the previous one
+			repeat=1
+			break
+		fi
+		if [ "$cnt" == 0 ]; then
+			map_event_name
+			echo "detected a warning from fault GPIO #$fault_gpio $socket, event $event_name"
+			# no rising edge seen in the scan window: this is a warning, raise the warning flag
+			touch $warning_flag
+			break
+		fi
+		cnt=$(( cnt - 1 ))
+	done
+}
+
+detect_pattern() {
+	local cnt_falling_edge=0
+	local cnt_blank=0
+
+	local prev=0
+	local curr=0
+
+	while true
+	do
+		prev=$curr
+		curr=$gpio_status
+		# count the falling edges; each one restarts the quiet-gap counter
+		if [ "$prev" == 1 ] && [ "$curr" == 0 ]; then
+			cnt_falling_edge=$(( cnt_falling_edge + 1 ))
+			cnt_blank=0
+			continue
+		# check if we are in the quiet gap
+		elif [ "$prev" == 0 ] && [ "$curr" == 0 ]; then
+			cnt_blank=$(( cnt_blank + 1 ))
+			if [ "$cnt_blank" == "$blank_num" ]; then
+				# echo "pattern number falling_edge=$cnt_falling_edge blank=$cnt_blank"
+				curr_pattern=$cnt_falling_edge
+				# all falling edges are counted; now check whether the pattern repeats
+				detect_patern_repeat
+				break
+			fi
+		fi
+		usleep $scan_pulse
+		gpio_status=$(cat /sys/class/gpio/gpio"$gpio_Id"/value)
+	done
+}
+
+gpio_config_input() {
+	echo "$gpio_Id" > /sys/class/gpio/export
+	echo "in" > /sys/class/gpio/gpio"${gpio_Id}"/direction
+}
+
+init_sysfs_fault_gpio() {
+	gpio_Id=$(gpio_number "$fault_gpio")
+	if [ -d /sys/class/gpio/gpio"$gpio_Id" ]; then
+		return
+	fi
+	gpio_config_input "$fault_gpio"
+}
+
+# init
+if [ "$socket" == "0" ]; then
+	fault_gpio=$S0_fault_gpio
+else
+	socket1_status=$(gpioget 0 "$socket1_present")
+	if [ "$socket1_status" == 1 ]; then
+		echo "socket 1 not present"
+		exit 1
+	fi
+	fault_gpio=$S1_fault_gpio
+fi
+
+init_sysfs_fault_gpio
+
+# daemon start
+while true
+do
+	# detect when pattern starts
+	if [ "$gpio_status" == 1 ]; then
+		# something is on the gpio: decode it; repeat is only valid for this pass
+		detect_pattern
+		if [ "$repeat" == 1 ] && [ "$prev_pattern" == "$curr_pattern" ]; then
+			map_event_name
+			echo "detected an error from fault GPIO #$fault_gpio $socket, event#$curr_pattern $event_name"
+			touch $error_flag
+		fi
+		repeat=0
+		prev_pattern=$curr_pattern
+		curr_pattern=0
+		continue
+	fi
+	usleep $duty_cycle
+	gpio_status=$(cat /sys/class/gpio/gpio"$gpio_Id"/value)
+
+done
+
+exit 1
diff --git a/meta-ampere/meta-jade/recipes-ampere/platform/ampere-fault-monitor/ampere_fault_monitor.sh b/meta-ampere/meta-jade/recipes-ampere/platform/ampere-fault-monitor/ampere_fault_monitor.sh
new file mode 100644
index 0000000..44cbb11
--- /dev/null
+++ b/meta-ampere/meta-jade/recipes-ampere/platform/ampere-fault-monitor/ampere_fault_monitor.sh
@@ -0,0 +1,180 @@
+#!/bin/bash
+
+# This script monitors fan, over-temperature, PSU, CPU/SCP failure and update fault LED status
+
+# shellcheck disable=SC2004
+# shellcheck disable=SC2046
+# shellcheck source=/dev/null
+
+# common variables
+	warning_fault_flag='/tmp/fault_warning'
+	error_fault_flag='/tmp/fault_err'
+	overtemp_fault_flag='/tmp/fault_overtemp'
+	fault_RAS_UE_flag='/tmp/fault_RAS_UE'
+
+	blink_rate=100000
+
+	fault="false"
+
+	on="true"
+	off="false"
+
+	gpio_fault="false"
+
+# fan variables
+	fan_failed="false"
+	fan_failed_flag='/tmp/fan_failed'
+
+# PSU variables
+	psu_failed="false"
+	psu_bus=6
+	psu0_addr=0x58
+	psu1_addr=0x59
+	status_word_cmd=0x79
+	# Following the PMBus Specification
+	# Bit[1]: CML faults
+	# Bit[2]: Over temperature faults
+	# Bit[3]: Under voltage faults
+	# Bit[4]: Over current faults
+	# Bit[5]: Over voltage fault
+	# Bit[10]: Fan faults
+	psu_fault_bitmask=0x43e
+
+# led variables
+	led_service='xyz.openbmc_project.LED.GroupManager'
+	led_fault_path='/xyz/openbmc_project/led/groups/system_fault'
+	led_fault_interface='xyz.openbmc_project.Led.Group'
+	fault_led_status=$off
+
+# functions declaration
+check_fan_failed() {
+	# Mirror the presence of the fan-failure flag file into fan_failed
+	fan_failed="false"
+	if [[ -f $fan_failed_flag ]]; then
+		fan_failed="true"
+	fi
+}
+
+turn_on_off_fault_led() {
+	busctl set-property $led_service $led_fault_path $led_fault_interface Asserted b "$1" >> /dev/null
+}
+
+check_psu_failed() {
+	local psu0_presence
+	local psu1_presence
+	local psu0_value
+	local psu1_value
+
+	psu0_presence=$(gpioget $(gpiofind PSU1_PRESENT))
+	psu0_failed="true"
+	if [ "$psu0_presence" == "0" ]; then
+		# PSU0 present: check the PMBus STATUS_WORD; an empty (failed) read counts as a fault
+		psu0_value=$(i2cget -f -y $psu_bus $psu0_addr $status_word_cmd w)
+		psu0_bit_fault=$((${psu0_value:-$psu_fault_bitmask} & $psu_fault_bitmask))
+		if [ "$psu0_bit_fault" == "0" ]; then
+			psu0_failed="false"
+		fi
+	fi
+
+	psu1_presence=$(gpioget $(gpiofind PSU2_PRESENT))
+	psu1_failed="true"
+	if [ "$psu1_presence" == "0" ]; then
+		# PSU1 present: check the PMBus STATUS_WORD; an empty (failed) read counts as a fault
+		psu1_value=$(i2cget -f -y $psu_bus $psu1_addr $status_word_cmd w)
+		psu1_bit_fault=$((${psu1_value:-$psu_fault_bitmask} & $psu_fault_bitmask))
+		if [ "$psu1_bit_fault" == "0" ]; then
+			psu1_failed="false"
+		fi
+	fi
+
+	if [ "$psu0_failed" == "true" ] || [ "$psu1_failed" == "true" ]; then
+		psu_failed="true"
+	else
+		psu_failed="false"
+	fi
+}
+
+check_fault() {
+	if [[ "$fan_failed" == "true" ]] || [[ "$psu_failed" == "true" ]] \
+									|| [[ "$gpio_fault" == "true" ]] \
+									|| [[ "$RAS_UE_occured" == "true" ]] \
+									|| [[ "$overtemp_occured" == "true" ]]; then
+		fault="true"
+	else
+		fault="false"
+	fi
+}
+
+control_fault_led() {
+	# Drive the LED only when the wanted state differs from the cached one
+	local wanted=$off
+
+	if [ "$fault" == "true" ]; then
+		wanted=$on
+	fi
+
+	if [ "$fault_led_status" != "$wanted" ]; then
+		turn_on_off_fault_led "$wanted"
+		fault_led_status=$wanted
+	fi
+}
+
+blink_fault_led() {
+	if [ "$fault_led_status" == $off ]; then
+		turn_on_off_fault_led $on
+		usleep $blink_rate
+		turn_on_off_fault_led $off
+	else
+		turn_on_off_fault_led $off
+		usleep $blink_rate
+		turn_on_off_fault_led $on
+	fi
+}
+
+check_gpio_fault() {
+	if [[ -f $error_fault_flag ]]; then
+		gpio_fault="true"
+	else
+		if [ -f $warning_fault_flag ]; then
+			blink_fault_led
+			rm $warning_fault_flag
+		fi
+		gpio_fault="false"
+	fi
+}
+
+check_RAS_UE_occured() {	# mirror the RAS UE flag file into RAS_UE_occured
+	if [[ ! -f $fault_RAS_UE_flag ]]; then
+		RAS_UE_occured="false"
+	elif [[ "$RAS_UE_occured" != "true" ]]; then	# first poll that sees the flag: log once
+		echo "RAS UE error occured, turn on fault LED"
+		RAS_UE_occured="true"
+	fi
+}
+
+check_overtemp_occured() {	# mirror the over-temperature flag file into overtemp_occured
+	if [[ ! -f $overtemp_fault_flag ]]; then
+		overtemp_occured="false"
+	elif [[ "$overtemp_occured" != "true" ]]; then	# first poll that sees the flag: log once
+		echo "Over temperature occured, turn on fault LED"
+		overtemp_occured="true"
+	fi
+}
+
+# daemon start
+while true
+do
+	check_gpio_fault
+	check_fan_failed
+	check_overtemp_occured
+	check_RAS_UE_occured
+
+	# Monitors PSU presence
+	check_psu_failed
+
+	check_fault
+	control_fault_led
+	sleep 2
+done
+
+exit 1
diff --git a/meta-ampere/meta-mitchell/conf/machine/mtmitchell.conf b/meta-ampere/meta-mitchell/conf/machine/mtmitchell.conf
index aecb597..712401b 100644
--- a/meta-ampere/meta-mitchell/conf/machine/mtmitchell.conf
+++ b/meta-ampere/meta-mitchell/conf/machine/mtmitchell.conf
@@ -36,6 +36,7 @@
                                    phosphor-ipmi-blobs-binarystore \
                                    util-linux \
                                    ampere-sysfw-hang-handler \
+                                   ampere-fault-monitor \
                                   "
 
 PREFERRED_PROVIDER_virtual/obmc-chassis-mgmt = "packagegroup-obmc-ampere-apps"
diff --git a/meta-ampere/meta-mitchell/recipes-ampere/platform/ampere-fault-monitor.bbappend b/meta-ampere/meta-mitchell/recipes-ampere/platform/ampere-fault-monitor.bbappend
new file mode 100644
index 0000000..2f3e457
--- /dev/null
+++ b/meta-ampere/meta-mitchell/recipes-ampere/platform/ampere-fault-monitor.bbappend
@@ -0,0 +1,14 @@
+FILESEXTRAPATHS:append := "${THISDIR}/${PN}:"
+
+RDEPENDS:${PN} = "bash"
+
+SRC_URI += " \
+            file://ampere_fault_monitor.sh \
+            file://ampere_check_gpio_fault.sh \
+           "
+
+do_install() {
+    install -d ${D}/${sbindir}
+    install -m 755 ${WORKDIR}/ampere_fault_monitor.sh ${D}/${sbindir}/
+    install -m 755 ${WORKDIR}/ampere_check_gpio_fault.sh ${D}/${sbindir}/
+}
diff --git a/meta-ampere/meta-mitchell/recipes-ampere/platform/ampere-fault-monitor/ampere_check_gpio_fault.sh b/meta-ampere/meta-mitchell/recipes-ampere/platform/ampere-fault-monitor/ampere_check_gpio_fault.sh
new file mode 100755
index 0000000..9922420
--- /dev/null
+++ b/meta-ampere/meta-mitchell/recipes-ampere/platform/ampere-fault-monitor/ampere_check_gpio_fault.sh
@@ -0,0 +1,351 @@
+#!/bin/bash
+
+# This script monitors S0/S1 GPIO fault and detects errors from CPUs
+#
+# So far, there is no specification describing the behavior of the LED when an error occurs (a pattern is detected).
+# So when detecting a pattern, we simply set the gpio fault flag and turn on the SYS LED.
+#
+# The pattern has the format:
+# <minor_byte> <quiet_gap_1second> <major_byte> <stop_condition_low_for_3seconds>
+#
+# Ex: pattern minor_byte=0x03, major_byte=0x02, you will see the waveform like
+# _1010100...(quiet gap, low for 1 second)..0111111111000000000111111111110000000000...(stop condition, low for 3 seconds)..
+#
+# Usage: <app_name> <socket 0/1>
+#
+# shellcheck source=/dev/null
+source /usr/sbin/gpio-lib.sh
+
+# global variables
+    error_flag='/tmp/gpio_fault'
+
+    # the command "cat /sys/class/gpio/gpio"$gpio_Id"/value" itself, takes 10ms~35ms to complete, depends on CPU loading
+    polling_minor_byte_rate=0
+    polling_major_byte_rate=200000
+    polling_rate=$polling_minor_byte_rate
+
+    # the number of consecutive low samples that marks the end of minor_byte and the start of the quiet gap
+    # this value depends on the polling_minor_byte_rate
+    max_low_in_minor_byte=9
+
+    # the number of consecutive low samples that marks the end of major_byte and the start of the stop condition
+    # this value depends on the polling_major_byte_rate
+    max_low_in_major_byte=9
+
+    max_low=$max_low_in_minor_byte
+
+    # state machines:
+    # detecting_minor_byte=0
+    # detecting_major_byte=1
+    curr_state=0
+
+    minor_byte=0
+    major_byte=0
+
+    gpio_status=0
+
+    socket=$1
+
+    socket1_present=151
+    socket1_status=1
+
+    S0_fault_gpio='s0-fault-alert'
+    S1_fault_gpio='s1-fault-alert'
+
+map_event_name() {  # sets event_major/event_minor from major_byte/minor_byte
+    case $major_byte in
+        2)
+            event_major="FAULT_LED_BOOT_ERROR"
+            case $minor_byte in
+                1)
+                    event_minor="SOC_BOOTDEV_INIT_SEC_ERROR"
+                    ;;
+                2)
+                    event_minor="SECJMP_FAIL_ERROR"
+                    ;;
+                3)
+                    event_minor="UART_INIT_WARN"
+                    ;;
+                4)
+                    event_minor="UART_TX_WARN"
+                    ;;
+                5)
+                    event_minor="SOC_ROMPATCH_BAD_ERROR"
+                    ;;
+                6)
+                    event_minor="SOC_ROMPATCH_RANGE_ERROR"
+                    ;;
+                7)
+                    event_minor="SPI_INIT_ERROR"
+                    ;;
+                8)
+                    event_minor="SPI_TX_ERROR"
+                    ;;
+                9)
+                    event_minor="SPINOR_UNKNOW_DEVICE_WARN"
+                    ;;
+                10)
+                    event_minor="EEPROM_BAD_NVP_HEADER_WARN"
+                    ;;
+                11)
+                    event_minor="EEPROM_BAD_NVP_FIELD_WARN"
+                    ;;
+                12)
+                    event_minor="EEPROM_BAD_CHECKSUM_ERROR_WARN"
+                    ;;
+                13)
+                    event_minor="I2C_DMA_ERROR"
+                    ;;
+                14)
+                    event_minor="I2C_TIMEOUT_ERROR"
+                    ;;
+                15)
+                    event_minor="SOC_BOOTDEV_SPI_LOAD_ERROR"
+                    ;;
+                16)
+                    event_minor="SOC_BOOTDEV_AUTHENTICATION_ERROR"
+                    ;;
+                17)
+                    event_minor="PCP_POWERUP_FAILED"
+                    ;;
+                18)
+                    event_minor="PCP_POWERDOWN_FAILED"
+                    ;;
+                19)
+                    event_minor="CPUPLL_INIT_FAILED"
+                    ;;
+                20)
+                    event_minor="MESHPLL_INIT_FAILED"
+                    ;;
+                *)
+                    event_minor="NOT_SUPPORT"
+            esac
+            ;;
+        3)
+            event_major="FAULT_LED_FW_LOAD_ERROR"
+            case $minor_byte in
+                9)
+                    event_minor="LFS_ERROR"
+                    ;;
+                *)
+                    event_minor="NOT_SUPPORT"
+            esac
+            ;;
+        4)
+            event_major="FAULT_LED_SECURITY_ERROR"
+            case $minor_byte in
+                1)
+                    event_minor="SEC_INVALID_KEY_CERT"
+                    ;;
+                2)
+                    event_minor="SEC_INVALID_CONT_CERT"
+                    ;;
+                3)
+                    event_minor="SEC_INVALID_ROOT_KEY"
+                    ;;
+                4)
+                    event_minor="SEC_INVALID_SECPRO_KEY"
+                    ;;
+                5)
+                    event_minor="SEC_INVALID_KEY_CERT_SIG"
+                    ;;
+                6)
+                    event_minor="SEC_INVALID_CONT_CERT_SIG"
+                    ;;
+                7)
+                    event_minor="SEC_INVALID_IMAGE_HASH"
+                    ;;
+                8)
+                    event_minor="SEC_INVALID_PRI_VERSION"
+                    ;;
+                9)
+                    event_minor="SEC_HUK_MISMATCH"
+                    ;;
+                10)
+                    event_minor="SEC_FUSE_BLOW_CERT_WITHOUT_SPECIAL_BOOT_PIN"
+                    ;;
+                11)
+                    event_minor="SEC_INVALID_CERT_SUBTYPE_STRUCT"
+                    ;;
+                12)
+                    event_minor="SEC_TMMCFG_FAIL"
+                    ;;
+                13)
+                    event_minor="SEC_INVALID_LCS_FROM_EFUSE"
+                    ;;
+                14)
+                    event_minor="SEC_EFUSE_WRITE_FAILED"
+                    ;;
+                15)
+                    event_minor="SEC_INVALID_CERT_VALUE"
+                    ;;
+                16)
+                    event_minor="SEC_INVALID_CERT_VERSION"
+                    ;;
+                *)
+                    event_minor="NOT_SUPPORT"
+                    ;;
+            esac
+            ;;
+        5)
+            event_major="FAULT_LED_EXCEPTION_ERROR"
+            case $minor_byte in
+                1)
+                    event_minor="KERNEL_EXCEPTION_UNKNOWN_REASON_ERROR"
+                    ;;
+                2)
+                    event_minor="KERNEL_EXCEPTION_HARD_FAULT_ERROR"
+                    ;;
+                3)
+                    event_minor="KERNEL_EXCEPTION_BUS_FAULT_ERROR"
+                    ;;
+                4)
+                    event_minor="KERNEL_EXCEPTION_MEMMANAGE_FAULT_ERROR"
+                    ;;
+                5)
+                    event_minor="KERNEL_EXCEPTION_USAGE_FAULT_ERROR"
+                    ;;
+                *)
+                    event_minor="NOT_SUPPORT"
+                    ;;
+            esac
+            ;;
+        *)
+            event_major="NOT_SUPPORT"; event_minor="NOT_SUPPORT"  # never leave a stale minor name
+            ;;
+    esac
+}
+
+set_unset_gpio_fault_flag() {
+    if [ ! -f $error_flag ] && [ "$1" == 1 ] ; then
+        touch $error_flag
+    elif [ -f $error_flag ] && [ "$1" == 0 ]; then
+        rm $error_flag
+    fi
+}
+
+toggle_state() {
+    if [ "$curr_state" == 0 ]; then
+        curr_state=1
+        polling_rate=$polling_major_byte_rate
+    else
+        curr_state=0
+        polling_rate=$polling_minor_byte_rate
+        map_event_name
+        echo "detected major_byte=$event_major, minor_byte=$event_minor"
+        set_unset_gpio_fault_flag 1
+    fi
+}
+
+save_pulse_of_byte() {
+    if [ "$curr_state" == 0 ]; then
+        minor_byte=$1
+        #echo "minor_byte=$1"
+    else
+        major_byte=$1
+        #echo "major_byte=$1"
+    fi
+}
+
+# we do not care whether a pulse is 50ms or 500ms; what matters is the number of high pulses
+cnt_falling_edge_in_byte() {
+    local cnt_falling_edge=0
+    local cnt_low=0
+
+    local prev=0
+    local curr=0
+
+    while true
+    do
+        prev=$curr
+        curr=$gpio_status
+        # count the falling edges; each one restarts the consecutive-low counter
+        if [ "$prev" == 1 ] && [ "$curr" == 0 ]; then
+            cnt_falling_edge=$(( cnt_falling_edge + 1 ))
+            cnt_low=0
+            continue
+        # check if we are in the quiet gap or stop condition
+        elif [ "$prev" == 0 ] && [ "$curr" == 0 ]; then
+            cnt_low=$(( cnt_low + 1 ))
+            if [ "$cnt_low" == "$max_low" ]; then
+                save_pulse_of_byte "$cnt_falling_edge"
+                toggle_state
+                break
+            fi
+        fi
+        usleep $polling_rate
+        gpio_status=$(cat /sys/class/gpio/gpio"$gpio_Id"/value)
+    done
+}
+
+gpio_config_input() {
+    echo "$gpio_Id" > /sys/class/gpio/export
+    echo "in" > /sys/class/gpio/gpio"${gpio_Id}"/direction
+}
+
+gpio_number() {
+    local offset
+    local gpioPin
+    local str gpioid
+
+    # gpiofind output is parsed as "gpiochipN <offset>"; any non-zero exit means no usable line
+    if ! str=$(gpiofind "$1"); then
+        echo -1
+    else
+        gpioid=${str%% *}; gpioid=${gpioid#gpiochip}
+        offset=${str##* }
+        gpioPin=$((offset + ${AST2600_GPIO_BASE[$gpioid]}))
+        echo "$gpioPin"
+    fi
+}
+
+init_sysfs_fault_gpio() {
+    gpio_Id=$(gpio_number "$fault_gpio")
+    if [ "$gpio_Id" == "-1" ]; then
+        echo "Invalid GPIO number"
+        exit 1
+    fi
+
+    if [ -d /sys/class/gpio/gpio"$gpio_Id" ]; then
+        return
+    fi
+    gpio_config_input "$gpio_Id"
+}
+
+# init
+if [ "$socket" == "0" ]; then
+	fault_gpio="$S0_fault_gpio"
+else
+	socket1_status=$(gpioget 0 "$socket1_present")
+	if [ "$socket1_status" == 1 ]; then
+		echo "socket 1 not present"
+		exit 0
+	fi
+	fault_gpio=$S1_fault_gpio
+fi
+
+init_sysfs_fault_gpio
+
+# daemon start
+while true
+do
+    # detect when pattern starts
+    if [ "$gpio_status" == 1 ]; then
+        if [ "$curr_state" == 0 ]; then
+            # detecting minor byte, set up minor byte variables
+            max_low=$max_low_in_minor_byte
+            polling_rate=$polling_minor_byte_rate
+        else
+            # detecting major byte, set up major byte variables
+            max_low=$max_low_in_major_byte
+            polling_rate=$polling_major_byte_rate
+        fi
+        # now, there is something on gpio, check if that is a byte pattern
+        cnt_falling_edge_in_byte
+    fi
+
+    usleep $polling_rate
+    gpio_status=$(cat /sys/class/gpio/gpio"$gpio_Id"/value)
+done
+
+exit 1
diff --git a/meta-ampere/meta-mitchell/recipes-ampere/platform/ampere-fault-monitor/ampere_fault_monitor.sh b/meta-ampere/meta-mitchell/recipes-ampere/platform/ampere-fault-monitor/ampere_fault_monitor.sh
new file mode 100644
index 0000000..e176629
--- /dev/null
+++ b/meta-ampere/meta-mitchell/recipes-ampere/platform/ampere-fault-monitor/ampere_fault_monitor.sh
@@ -0,0 +1,218 @@
+#!/bin/bash
+# This script monitors fan, over-temperature, PSU, CPU/SCP failure and update fault LED status
+
+# shellcheck disable=SC2004
+# shellcheck source=/dev/null
+source /usr/sbin/gpio-lib.sh
+
+# common variables
+	on=1
+	off=0
+
+    overtemp_fault_flag='/tmp/fault_overtemp'
+
+# gpio fault
+	gpio_fault="false"
+	gpio_fault_flag="/tmp/gpio_fault"
+
+# fan variables
+	fan_failed="false"
+	fan_failed_flag='/tmp/fan_failed'
+
+# PSU variables
+	psu_failed="false"
+	psu_bus=2
+	psu0_addr=0x58
+	psu1_addr=0x59
+	status_word_cmd=0x79
+	# Following the PMBus Specification
+	# Bit[1]: CML faults
+	# Bit[2]: Over temperature faults
+	# Bit[3]: Under voltage faults
+	# Bit[4]: Over current faults
+	# Bit[5]: Over voltage fault
+	# Bit[10]: Fan faults
+	psu_fault_bitmask=0x43e
+
+# led variables
+	fan_fault_led_status=$off
+	psu_fault_led_status=$off
+	led_bus=15
+	led_addr=0x22
+	led_port0_config=0x06
+	led_port0_output=0x02
+
+# functions declaration
+check_fan_failed() {
+	if [[ -f $fan_failed_flag ]]; then
+		fan_failed="true"
+	else
+		fan_failed="false"
+	fi
+}
+
+turn_on_off_fan_fault_led() {
+	# Control fan fault led via CPLD's I2C at slave address 0x22, I2C16.
+	# Get Port0 value
+	p0_val=$(i2cget -f -y $led_bus $led_addr $led_port0_config)
+	p0_val=$(("$p0_val" & ~1))
+	# Config CPLD's IOepx Port0[0] from input to output, clear IOepx Port0[0].
+	i2cset -f -y $led_bus $led_addr $led_port0_config $p0_val
+
+	# Get led value
+	led_st=$(i2cget -f -y $led_bus $led_addr $led_port0_output)
+
+	if [ "$1" == $on ]; then
+		led_st=$(("$led_st" | 1))
+	else
+		led_st=$(("$led_st" & ~1))
+	fi
+
+	# Turn on/off fan fault led
+	i2cset -f -y $led_bus $led_addr $led_port0_output $led_st
+}
+
+turn_on_off_psu_fault_led() {
+	# Control psu fault led via CPLD's I2C at slave address 0x22, I2C16 (led_bus=15; presumably 0-based, TODO confirm).
+	# Get Port0 configuration value (the psu fault led is bit 1 of Port0)
+	p1_val=$(i2cget -f -y $led_bus $led_addr $led_port0_config)
+	p1_val=$(("$p1_val" & ~2))
+	# Config CPLD's IOepx Port0[1] from input to output, clear IOepx Port0[1].
+	i2cset -f -y $led_bus $led_addr $led_port0_config $p1_val
+
+	# Get the current Port0 output value so only bit 1 is changed
+	led_st=$(i2cget -f -y $led_bus $led_addr $led_port0_output)
+	if [ "$1" == $on ]; then
+		led_st=$(("$led_st" | 2))
+	else
+		led_st=$(("$led_st" & ~2))
+	fi
+
+	# Turn on/off psu fault led
+	i2cset -f -y $led_bus $led_addr $led_port0_output $led_st
+}
+
+control_fan_fault_led() {
+	if [ "$fan_failed" == "true" ]; then
+		if [ "$fan_fault_led_status" == $off ]; then
+			turn_on_off_fan_fault_led $on
+			fan_fault_led_status=$on
+		fi
+	else
+		if [ "$fan_fault_led_status" == $on ]; then
+			turn_on_off_fan_fault_led $off
+			fan_fault_led_status=$off
+		fi
+	fi
+}
+
+check_psu_failed() {
+	local psu0_presence
+	local psu1_presence
+	local psu0_value
+	local psu1_value
+
+	psu0_presence=$(gpio_name_get presence-ps0)
+	psu0_failed="true"
+	if [ "$psu0_presence" == "0" ]; then
+		# PSU0 present: check the PMBus STATUS_WORD; an empty (failed) read counts as a fault
+		psu0_value=$(i2cget -f -y $psu_bus $psu0_addr $status_word_cmd w)
+		psu0_bit_fault=$((${psu0_value:-$psu_fault_bitmask} & $psu_fault_bitmask))
+		if [ "$psu0_bit_fault" == "0" ]; then
+			psu0_failed="false"
+		fi
+	fi
+
+	psu1_presence=$(gpio_name_get presence-ps1)
+	psu1_failed="true"
+	if [ "$psu1_presence" == "0" ]; then
+		# PSU1 present: check the PMBus STATUS_WORD; an empty (failed) read counts as a fault
+		psu1_value=$(i2cget -f -y $psu_bus $psu1_addr $status_word_cmd w)
+		psu1_bit_fault=$((${psu1_value:-$psu_fault_bitmask} & $psu_fault_bitmask))
+		if [ "$psu1_bit_fault" == "0" ]; then
+			psu1_failed="false"
+		fi
+	fi
+
+	if [ "$psu0_failed" == "true" ] || [ "$psu1_failed" == "true" ]; then
+		psu_failed="true"
+	else
+		psu_failed="false"
+	fi
+}
+
+control_psu_fault_led() {
+	if [ "$psu_failed" == "true" ]; then
+		if [ "$psu_fault_led_status" == $off ]; then
+			turn_on_off_psu_fault_led $on
+			psu_fault_led_status=$on
+		fi
+	else
+		if [ "$psu_fault_led_status" == $on ]; then
+			turn_on_off_psu_fault_led $off
+			psu_fault_led_status=$off
+		fi
+	fi
+}
+
+check_overtemp_occured() {  # mirror the over-temperature flag file into overtemp_occured
+    if [[ ! -f $overtemp_fault_flag ]]; then
+        overtemp_occured="false"
+    elif [[ "$overtemp_occured" != "true" ]]; then  # first poll that sees the flag: log once
+        echo "Over temperature occured, turn on fault LED"
+        overtemp_occured="true"
+    fi
+}
+
+
+check_gpio_fault() {  # mirror the gpio fault flag file into gpio_fault
+    if [[ ! -f $gpio_fault_flag ]]; then
+        gpio_fault="false"
+    elif [[ "$gpio_fault" != "true" ]]; then  # first poll that sees the flag: log once
+        echo "GPIO fault event(s) occured, turn on fault LED"
+        gpio_fault="true"
+    fi
+}
+
+check_fault() {
+	if [[ "$fan_failed" == "true" ]] || [[ "$psu_failed" == "true" ]] \
+                                    || [[ "$overtemp_occured" == "true" ]] \
+                                    || [[ "$gpio_fault" == "true" ]]; then
+		fault="true"
+	else
+		fault="false"
+	fi
+}
+
+# The System Fault Led turns on upon the system error, update the System Fault Led
+# based on the Fan fault status and PSU fault status
+control_sys_fault_led() {
+	# Turn on/off the System Fault Led
+	if [ "$fault" == "true" ]; then
+		gpio_name_set led-fault $on
+	else
+		gpio_name_set led-fault $off
+	fi
+}
+
+# daemon start
+while true
+do
+	#  Monitors Fan speeds
+	check_fan_failed
+	# Monitors PSU presence
+	check_psu_failed
+
+	check_overtemp_occured
+	check_gpio_fault
+	# Check fault to update fail
+	check_fault
+	control_sys_fault_led
+
+	control_fan_fault_led
+	control_psu_fault_led
+
+	sleep 2
+done
+
+exit 1