meta-ampere: add fault monitor support
Add support for detecting GPIO, PSU, fan and other faults and turning the fault LED ON/OFF accordingly.
Tested:
1. Unplug a PSU and check if Fault LED is turned ON.
2. Unplug a FAN and check if Fault LED is turned ON.
3. Simulate a GPIO fault pattern and check that the BMC detects it.
Signed-off-by: Thang Q. Nguyen <thang@os.amperecomputing.com>
Signed-off-by: Hieu Huynh <hieuh@os.amperecomputing.com>
Signed-off-by: Quang Nguyen <quangn@amperecomputing.com>
Change-Id: Idfcd32953cf811fbe9299a162f604cb8fd028962
diff --git a/meta-ampere/meta-mitchell/conf/machine/mtmitchell.conf b/meta-ampere/meta-mitchell/conf/machine/mtmitchell.conf
index aecb597..712401b 100644
--- a/meta-ampere/meta-mitchell/conf/machine/mtmitchell.conf
+++ b/meta-ampere/meta-mitchell/conf/machine/mtmitchell.conf
@@ -36,6 +36,7 @@
phosphor-ipmi-blobs-binarystore \
util-linux \
ampere-sysfw-hang-handler \
+ ampere-fault-monitor \
"
PREFERRED_PROVIDER_virtual/obmc-chassis-mgmt = "packagegroup-obmc-ampere-apps"
diff --git a/meta-ampere/meta-mitchell/recipes-ampere/platform/ampere-fault-monitor.bbappend b/meta-ampere/meta-mitchell/recipes-ampere/platform/ampere-fault-monitor.bbappend
new file mode 100644
index 0000000..2f3e457
--- /dev/null
+++ b/meta-ampere/meta-mitchell/recipes-ampere/platform/ampere-fault-monitor.bbappend
@@ -0,0 +1,14 @@
+# Search this layer's ${PN}/ directory first when resolving the file:// entries below.
+FILESEXTRAPATHS:prepend := "${THISDIR}/${PN}:"
+# Both installed scripts use a bash shebang.
+RDEPENDS:${PN} = "bash"
+SRC_URI += " \
+            file://ampere_fault_monitor.sh \
+            file://ampere_check_gpio_fault.sh \
+           "
+
+# Install both fault-monitor scripts, mode 755, into ${sbindir} of the image.
+do_install() {
+    install -d ${D}/${sbindir}
+    install -m 755 ${WORKDIR}/ampere_fault_monitor.sh ${WORKDIR}/ampere_check_gpio_fault.sh ${D}/${sbindir}/
+}
diff --git a/meta-ampere/meta-mitchell/recipes-ampere/platform/ampere-fault-monitor/ampere_check_gpio_fault.sh b/meta-ampere/meta-mitchell/recipes-ampere/platform/ampere-fault-monitor/ampere_check_gpio_fault.sh
new file mode 100755
index 0000000..9922420
--- /dev/null
+++ b/meta-ampere/meta-mitchell/recipes-ampere/platform/ampere-fault-monitor/ampere_check_gpio_fault.sh
@@ -0,0 +1,351 @@
+#!/bin/bash
+
+# This script monitors S0/S1 GPIO fault and detects errors from CPUs
+#
+# So far, no specification describes how the LED should behave when an error occurs (i.e. when a pattern is detected).
+# So when a pattern is detected, we simply set the GPIO fault flag and turn on the SYS LED.
+#
+# The pattern has the format:
+# <minor_byte> <quiet_gap_1second> <major_byte> <stop_condition_low_for_3seconds>
+#
+# Ex: with minor_byte=0x03 and major_byte=0x02, the waveform looks like
+# _1010100...(quiet gap, low for 1 second)..0111111111000000000111111111110000000000...(stop condition, low for 3 seconds)..
+#
+# Usage: <app_name> <socket 0/1>
+#
+# shellcheck source=/dev/null
+source /usr/sbin/gpio-lib.sh
+
+# global variables
+ error_flag='/tmp/gpio_fault' # flag file created once a complete fault pattern has been decoded
+
+ # the command "cat /sys/class/gpio/gpio"$gpio_Id"/value" itself takes 10ms~35ms to complete, depending on CPU load
+ polling_minor_byte_rate=0 # usleep argument used while sampling the minor byte
+ polling_major_byte_rate=200000 # usleep argument used while sampling the major byte
+ polling_rate=$polling_minor_byte_rate
+
+ # the number of low samples after the last falling edge that proves the minor byte is over and we are in the quiet gap
+ # this value depends on the polling_minor_byte_rate
+ max_low_in_minor_byte=9
+
+ # the number of low samples after the last falling edge that proves the major byte is over and we are in the stop condition
+ # this value depends on the polling_major_byte_rate
+ max_low_in_major_byte=9
+
+ max_low=$max_low_in_minor_byte
+
+ # state machine states:
+ # detecting_minor_byte=0
+ # detecting_major_byte=1
+ curr_state=0
+
+ minor_byte=0 # falling-edge count of the last minor byte
+ major_byte=0 # falling-edge count of the last major byte
+
+ gpio_status=0 # most recent sample of the fault GPIO
+
+ socket=$1 # "0" selects socket 0; any other value is handled as socket 1
+
+ socket1_present=151 # line offset passed to "gpioget 0" to read socket 1 presence
+ socket1_status=1
+
+ S0_fault_gpio='s0-fault-alert'
+ S1_fault_gpio='s1-fault-alert'
+
+# Translate the decoded major_byte/minor_byte pair into symbolic event names.
+# Globals read:    major_byte, minor_byte
+# Globals written: event_major, event_minor ("NOT_SUPPORT" for unknown codes)
+map_event_name() {
+    case "$major_byte" in
+        2)
+            event_major="FAULT_LED_BOOT_ERROR"
+            case "$minor_byte" in
+                1)
+                    event_minor="SOC_BOOTDEV_INIT_SEC_ERROR"
+                    ;;
+                2)
+                    event_minor="SECJMP_FAIL_ERROR"
+                    ;;
+                3)
+                    event_minor="UART_INIT_WARN"
+                    ;;
+                4)
+                    event_minor="UART_TX_WARN"
+                    ;;
+                5)
+                    event_minor="SOC_ROMPATCH_BAD_ERROR"
+                    ;;
+                6)
+                    event_minor="SOC_ROMPATCH_RANGE_ERROR"
+                    ;;
+                7)
+                    event_minor="SPI_INIT_ERROR"
+                    ;;
+                8)
+                    event_minor="SPI_TX_ERROR"
+                    ;;
+                9)
+                    event_minor="SPINOR_UNKNOW_DEVICE_WARN"
+                    ;;
+                10)
+                    event_minor="EEPROM_BAD_NVP_HEADER_WARN"
+                    ;;
+                11)
+                    event_minor="EEPROM_BAD_NVP_FIELD_WARN"
+                    ;;
+                12)
+                    event_minor="EEPROM_BAD_CHECKSUM_ERROR_WARN"
+                    ;;
+                13)
+                    event_minor="I2C_DMA_ERROR"
+                    ;;
+                14)
+                    event_minor="I2C_TIMEOUT_ERROR"
+                    ;;
+                15)
+                    event_minor="SOC_BOOTDEV_SPI_LOAD_ERROR"
+                    ;;
+                16)
+                    event_minor="SOC_BOOTDEV_AUTHENTICATION_ERROR"
+                    ;;
+                17)
+                    event_minor="PCP_POWERUP_FAILED"
+                    ;;
+                18)
+                    event_minor="PCP_POWERDOWN_FAILED"
+                    ;;
+                19)
+                    event_minor="CPUPLL_INIT_FAILED"
+                    ;;
+                20)
+                    event_minor="MESHPLL_INIT_FAILED"
+                    ;;
+                *) event_minor="NOT_SUPPORT" ;;
+            esac
+            ;;
+        3)
+            event_major="FAULT_LED_FW_LOAD_ERROR"
+            case "$minor_byte" in
+                9)
+                    event_minor="LFS_ERROR"
+                    ;;
+                *) event_minor="NOT_SUPPORT" ;;
+            esac
+            ;;
+        4)
+            event_major="FAULT_LED_SECURITY_ERROR"
+            case "$minor_byte" in
+                1)
+                    event_minor="SEC_INVALID_KEY_CERT"
+                    ;;
+                2)
+                    event_minor="SEC_INVALID_CONT_CERT"
+                    ;;
+                3)
+                    event_minor="SEC_INVALID_ROOT_KEY"
+                    ;;
+                4)
+                    event_minor="SEC_INVALID_SECPRO_KEY"
+                    ;;
+                5)
+                    event_minor="SEC_INVALID_KEY_CERT_SIG"
+                    ;;
+                6)
+                    event_minor="SEC_INVALID_CONT_CERT_SIG"
+                    ;;
+                7)
+                    event_minor="SEC_INVALID_IMAGE_HASH"
+                    ;;
+                8)
+                    event_minor="SEC_INVALID_PRI_VERSION"
+                    ;;
+                9)
+                    event_minor="SEC_HUK_MISMATCH"
+                    ;;
+                10)
+                    event_minor="SEC_FUSE_BLOW_CERT_WITHOUT_SPECIAL_BOOT_PIN"
+                    ;;
+                11)
+                    event_minor="SEC_INVALID_CERT_SUBTYPE_STRUCT"
+                    ;;
+                12)
+                    event_minor="SEC_TMMCFG_FAIL"
+                    ;;
+                13)
+                    event_minor="SEC_INVALID_LCS_FROM_EFUSE"
+                    ;;
+                14)
+                    event_minor="SEC_EFUSE_WRITE_FAILED"
+                    ;;
+                15)
+                    event_minor="SEC_INVALID_CERT_VALUE"
+                    ;;
+                16)
+                    event_minor="SEC_INVALID_CERT_VERSION"
+                    ;;
+                *) event_minor="NOT_SUPPORT" ;;
+            esac
+            ;;
+        5)
+            event_major="FAULT_LED_EXCEPTION_ERROR"
+            case "$minor_byte" in
+                1)
+                    event_minor="KERNEL_EXCEPTION_UNKNOWN_REASON_ERROR"
+                    ;;
+                2)
+                    event_minor="KERNEL_EXCEPTION_HARD_FAULT_ERROR"
+                    ;;
+                3)
+                    event_minor="KERNEL_EXCEPTION_BUS_FAULT_ERROR"
+                    ;;
+                4)
+                    event_minor="KERNEL_EXCEPTION_MEMMANAGE_FAULT_ERROR"
+                    ;;
+                5)
+                    event_minor="KERNEL_EXCEPTION_USAGE_FAULT_ERROR"
+                    ;;
+                *) event_minor="NOT_SUPPORT" ;;
+            esac
+            ;;
+        *)
+            # Unknown major code: reset the minor name as well, otherwise the
+            # name left over from a previously decoded pattern would be reported.
+            event_major="NOT_SUPPORT"
+            event_minor="NOT_SUPPORT"
+            ;;
+    esac
+}
+
+# Create ($1 == 1) or delete ($1 == 0) the flag file; no-op if it is already in that state.
+set_unset_gpio_fault_flag() {
+    case "$1" in
+        1) [ -f $error_flag ] || touch $error_flag ;;
+        0) [ ! -f $error_flag ] || rm $error_flag ;;
+    esac
+}
+
+# Switch the decoder state; on leaving the major-byte state, report the pattern and set the fault flag.
+toggle_state() {
+    if [ "$curr_state" != 0 ]; then
+        curr_state=0
+        polling_rate=$polling_minor_byte_rate
+        map_event_name
+        echo "detected major_byte=$event_major, minor_byte=$event_minor"
+        set_unset_gpio_fault_flag 1
+        return
+    fi
+    curr_state=1; polling_rate=$polling_major_byte_rate
+}
+
+# Store the pulse count $1 into minor_byte or major_byte, depending on which
+# byte the decoder is currently reading (curr_state 0 = minor, otherwise major).
+save_pulse_of_byte() {
+    case "$curr_state" in
+        0) minor_byte=$1 ;;
+        *) major_byte=$1 ;;
+    esac
+    #echo "curr_state=$curr_state pulses=$1"
+}
+
+# Count the high pulses (falling edges) of one byte; the pulse width (50ms or 500ms) is irrelevant, only the count matters
+cnt_falling_edge_in_byte() {
+ local cnt_falling_edge=0 # number of 1 -> 0 transitions seen so far
+ local cnt_low=0 # low samples counted since the last falling edge
+
+ local prev=0 # previous sample
+ local curr=0 # current sample
+
+ while true
+ do
+ prev=$curr
+ curr=$gpio_status
+ # count the falling edges; when one occurs, just reset cnt_low
+ if [ "$prev" == 1 ] && [ "$curr" == 0 ]; then
+ cnt_falling_edge=$(( cnt_falling_edge + 1 ))
+ cnt_low=0
+ continue # skips the sleep and re-sample below, so this same low sample is counted as the first low
+ # check if we are in the quiet gap or stop condition
+ elif [ "$prev" == 0 ] && [ "$curr" == 0 ]; then
+ cnt_low=$(( cnt_low + 1 ))
+ if [ "$cnt_low" == "$max_low" ]; then # line stayed low long enough: the byte is complete
+ save_pulse_of_byte "$cnt_falling_edge"
+ toggle_state
+ break
+ fi
+ fi
+ usleep $polling_rate
+ gpio_status=$(cat /sys/class/gpio/gpio"$gpio_Id"/value) # take the next sample
+ done
+}
+
+# Export the sysfs GPIO numbered by the global gpio_Id and switch it to input.
+gpio_config_input() {
+    echo "$gpio_Id" > /sys/class/gpio/export; echo "in" > "/sys/class/gpio/gpio${gpio_Id}/direction"
+}
+
+# Print the sysfs GPIO number of the line named $1, or -1 if it cannot be resolved.
+# Relies on gpiofind and on the AST2600_GPIO_BASE array (expected from gpio-lib.sh).
+gpio_number() {
+    local str base re='^gpiochip([0-9]+) ([0-9]+)$'
+    # Any gpiofind failure (not just exit status 1) or output that is not
+    # "gpiochip<N> <offset>" means the line cannot be resolved.
+    if ! str=$(gpiofind "$1") || [[ ! "$str" =~ $re ]]; then
+        echo -1
+        return
+    fi
+    base=${AST2600_GPIO_BASE[${BASH_REMATCH[1]}]}
+    # A chip without a known base gives no usable sysfs number.
+    if [ -z "$base" ]; then echo -1; return; fi
+    echo $(( BASH_REMATCH[2] + base ))
+}
+
+# Resolve fault_gpio to a sysfs number in gpio_Id (exit 1 if unknown), then
+# export it as an input unless its sysfs directory already exists.
+init_sysfs_fault_gpio() {
+    gpio_Id=$(gpio_number "$fault_gpio")
+    case "$gpio_Id" in
+        -1)
+            echo "Invalid GPIO number"
+            exit 1
+            ;;
+    esac
+    [ -d /sys/class/gpio/gpio"$gpio_Id" ] || gpio_config_input "$gpio_Id"
+}
+
+# init: pick the fault GPIO for the requested socket; any value other than "0"
+# is handled as socket 1, which is skipped when its presence GPIO reads 1.
+if [ "$socket" != "0" ]; then
+    socket1_status=$(gpioget 0 "$socket1_present")
+    if [ "$socket1_status" == 1 ]; then
+        echo "socket 1 not present"
+        exit 0
+    fi
+    fault_gpio=$S1_fault_gpio
+else
+    fault_gpio="$S0_fault_gpio"
+fi
+init_sysfs_fault_gpio
+
+# daemon start: poll the fault GPIO forever and decode one byte each time the line is sampled high
+while true
+do
+ # a high sample marks the start of a byte
+ if [ "$gpio_status" == 1 ]; then
+ if [ "$curr_state" == 0 ]; then
+ # detecting minor byte, set up minor byte variables
+ max_low=$max_low_in_minor_byte
+ polling_rate=$polling_minor_byte_rate
+ else
+ # detecting major byte, set up major byte variables
+ max_low=$max_low_in_major_byte
+ polling_rate=$polling_major_byte_rate
+ fi
+ # the line is high: count the pulses of this byte (the decoder state is switched inside)
+ cnt_falling_edge_in_byte
+ fi
+
+ usleep $polling_rate
+ gpio_status=$(cat /sys/class/gpio/gpio"$gpio_Id"/value) # take the next sample
+done
+
+exit 1 # the loop above contains no break, so this is not reached in normal operation
diff --git a/meta-ampere/meta-mitchell/recipes-ampere/platform/ampere-fault-monitor/ampere_fault_monitor.sh b/meta-ampere/meta-mitchell/recipes-ampere/platform/ampere-fault-monitor/ampere_fault_monitor.sh
new file mode 100644
index 0000000..e176629
--- /dev/null
+++ b/meta-ampere/meta-mitchell/recipes-ampere/platform/ampere-fault-monitor/ampere_fault_monitor.sh
@@ -0,0 +1,218 @@
+#!/bin/bash
+# This script monitors fan, over-temperature, PSU, CPU/SCP failure and update fault LED status
+
+# shellcheck disable=SC2004
+# shellcheck source=/dev/null
+source /usr/sbin/gpio-lib.sh
+
+# common variables
+ on=1
+ off=0
+
+ overtemp_fault_flag='/tmp/fault_overtemp' # over-temperature is reported while this file exists
+
+# gpio fault
+ gpio_fault="false"
+ gpio_fault_flag="/tmp/gpio_fault" # a GPIO fault is reported while this file exists
+
+# fan variables
+ fan_failed="false"
+ fan_failed_flag='/tmp/fan_failed' # a fan failure is reported while this file exists
+
+# PSU variables
+ psu_failed="false"
+ psu_bus=2 # I2C bus number used to reach both PSUs
+ psu0_addr=0x58
+ psu1_addr=0x59
+ status_word_cmd=0x79 # PMBus command code read as STATUS_WORD
+ # Following the PMBus Specification
+ # Bit[1]: CML faults
+ # Bit[2]: Over temperature faults
+ # Bit[3]: Under voltage faults
+ # Bit[4]: Over current faults
+ # Bit[5]: Over voltage fault
+ # Bit[10]: Fan faults
+ psu_fault_bitmask=0x43e # bits 1-5 and bit 10 listed above
+
+# led variables
+ fan_fault_led_status=$off # cached state of the fan fault LED
+ psu_fault_led_status=$off # cached state of the PSU fault LED
+ led_bus=15 # I2C bus number passed to i2cget/i2cset for the LED controller
+ led_addr=0x22
+ led_port0_config=0x06 # port 0 configuration register
+ led_port0_output=0x02 # port 0 output register
+
+# functions declaration
+# Set fan_failed to "true" when the fan-failure flag file exists, else "false".
+check_fan_failed() {
+    # Assume healthy, then override when the flag file is present.
+    fan_failed="false"
+    [[ -f $fan_failed_flag ]] || return 0
+    fan_failed="true"
+}
+
+# Drive the fan fault LED: "$1" equal to $on lights it, anything else turns it off.
+# The LED is bit 0 of port 0 on the I2C device at $led_addr on bus $led_bus.
+# Returns 1 without writing anything when a register cannot be read.
+turn_on_off_fan_fault_led() {
+    local p0_val led_st
+    # Clear bit 0 of the port 0 configuration register (presumably selects
+    # output mode for that pin - confirm against the CPLD documentation).
+    # Bail out on a failed read: an empty value would break the arithmetic.
+    p0_val=$(i2cget -f -y $led_bus $led_addr $led_port0_config) || return 1
+    i2cset -f -y $led_bus $led_addr $led_port0_config $(( p0_val & ~1 ))
+
+    # Read-modify-write bit 0 of the output register so other bits are kept.
+    led_st=$(i2cget -f -y $led_bus $led_addr $led_port0_output) || return 1
+    if [ "$1" == $on ]; then
+        led_st=$(( led_st | 1 ))
+    else
+        led_st=$(( led_st & ~1 ))
+    fi
+    i2cset -f -y $led_bus $led_addr $led_port0_output $led_st
+}
+
+# Drive the PSU fault LED: "$1" equal to $on lights it, anything else turns it off.
+# The LED is bit 1 of port 0 on the I2C device at $led_addr on bus $led_bus.
+# Returns 1 without writing anything when a register cannot be read.
+turn_on_off_psu_fault_led() {
+    local p0_val led_st
+    # Clear bit 1 of the port 0 configuration register (presumably output mode -
+    # confirm). A failed read aborts, as an empty value breaks the arithmetic.
+    p0_val=$(i2cget -f -y $led_bus $led_addr $led_port0_config) || return 1
+    i2cset -f -y $led_bus $led_addr $led_port0_config $(( p0_val & ~2 ))
+
+    # Read-modify-write bit 1 of the output register so other bits are kept.
+    led_st=$(i2cget -f -y $led_bus $led_addr $led_port0_output) || return 1
+    if [ "$1" == $on ]; then
+        led_st=$(( led_st | 2 ))
+    else
+        led_st=$(( led_st & ~2 ))
+    fi
+    i2cset -f -y $led_bus $led_addr $led_port0_output $led_st
+}
+
+# Bring the fan fault LED in line with fan_failed, touching the hardware only
+# when the cached state (fan_fault_led_status) differs from the wanted one.
+control_fan_fault_led() {
+    local wanted=$off
+    [ "$fan_failed" == "true" ] && wanted=$on
+    if [ "$fan_fault_led_status" != "$wanted" ]; then
+        # Remember the new state only if the update succeeded, so that a
+        # failed I2C access is retried on the next monitoring cycle.
+        if turn_on_off_fan_fault_led "$wanted"; then
+            fan_fault_led_status=$wanted
+        fi
+    fi
+}
+
+# Set psu_failed to "true" when either PSU is absent or faulty, else "false".
+# A PSU counts as healthy only when its presence GPIO reads 0 and none of the
+# psu_fault_bitmask bits is set in its PMBus STATUS_WORD.
+check_psu_failed() {
+    local psu0_presence psu1_presence psu0_value psu1_value
+
+    psu0_presence=$(gpio_name_get presence-ps0)
+    psu0_failed="true"
+    # Only query a PSU that is present. A failed or empty i2cget leaves the PSU
+    # marked as failed instead of feeding an empty value to the arithmetic.
+    if [ "$psu0_presence" == "0" ] &&
+        psu0_value=$(i2cget -f -y $psu_bus $psu0_addr $status_word_cmd w) && [ -n "$psu0_value" ]; then
+        psu0_bit_fault=$(( psu0_value & psu_fault_bitmask ))
+        if [ "$psu0_bit_fault" == "0" ]; then
+            psu0_failed="false"
+        fi
+    fi
+
+    psu1_presence=$(gpio_name_get presence-ps1)
+    psu1_failed="true"
+    if [ "$psu1_presence" == "0" ] &&
+        psu1_value=$(i2cget -f -y $psu_bus $psu1_addr $status_word_cmd w) && [ -n "$psu1_value" ]; then
+        psu1_bit_fault=$(( psu1_value & psu_fault_bitmask ))
+        if [ "$psu1_bit_fault" == "0" ]; then
+            psu1_failed="false"
+        fi
+    fi
+
+    if [ "$psu0_failed" == "true" ] || [ "$psu1_failed" == "true" ]; then
+        psu_failed="true"
+    else
+        psu_failed="false"
+    fi
+}
+
+# Bring the PSU fault LED in line with psu_failed, touching the hardware only
+# when the cached state (psu_fault_led_status) differs from the wanted one.
+control_psu_fault_led() {
+    local wanted=$off
+    [ "$psu_failed" == "true" ] && wanted=$on
+    if [ "$psu_fault_led_status" != "$wanted" ]; then
+        # Remember the new state only if the update succeeded, so that a
+        # failed I2C access is retried on the next monitoring cycle.
+        if turn_on_off_psu_fault_led "$wanted"; then
+            psu_fault_led_status=$wanted
+        fi
+    fi
+}
+
+# Set overtemp_occured to "true" (and log a message) when the over-temperature
+# flag file exists, else "false".
+check_overtemp_occured() {
+    overtemp_occured="false"
+    [[ -f $overtemp_fault_flag ]] || return 0
+    echo "Over temperature occured, turn on fault LED"
+    overtemp_occured="true"
+}
+
+
+# Set gpio_fault to "true" (and log a message) when the GPIO fault flag file
+# exists, else "false".
+check_gpio_fault() {
+    gpio_fault="false"
+    [[ -f $gpio_fault_flag ]] || return 0
+    echo "GPIO fault event(s) occured, turn on fault LED"
+    gpio_fault="true"
+}
+
+# Combine the individual fault indicators: fault is "true" when any of
+# fan_failed, psu_failed, overtemp_occured or gpio_fault is "true".
+check_fault() {
+    local indicator
+    fault="false"
+    for indicator in "$fan_failed" "$psu_failed" "$overtemp_occured" "$gpio_fault"; do
+        if [[ "$indicator" == "true" ]]; then fault="true"; fi
+    done
+}
+
+# Drive the System Fault LED from the aggregated fault state: led-fault is set
+# to $on while fault is "true" (fan, PSU, over-temperature or GPIO fault) and
+# to $off otherwise.
+control_sys_fault_led() {
+    local led_value=$off
+    if [ "$fault" == "true" ]; then
+        led_value=$on
+    fi
+    gpio_name_set led-fault $led_value
+}
+
+# daemon start: re-evaluate every fault source and refresh the LEDs every 2 seconds
+while true
+do
+ # Check the fan failure flag file
+ check_fan_failed
+ # Check PSU presence and PMBus status
+ check_psu_failed
+
+ check_overtemp_occured
+ check_gpio_fault
+ # Aggregate the individual results into the overall fault state
+ check_fault
+ control_sys_fault_led
+
+ control_fan_fault_led
+ control_psu_fault_led
+
+ sleep 2
+done
+
+exit 1 # the loop above contains no break, so this is not reached in normal operation