meta-ampere: add fault monitor support
Add support for detecting GPIO, PSU, FAN, ... faults and turning the fault LED ON/OFF.
Tested:
1. Unplug a PSU and check if Fault LED is turned ON.
2. Unplug a FAN and check if Fault LED is turned ON.
3. Simulate a GPIO fault pattern and check if the BMC can detect it.
Signed-off-by: Thang Q. Nguyen <thang@os.amperecomputing.com>
Signed-off-by: Hieu Huynh <hieuh@os.amperecomputing.com>
Signed-off-by: Quang Nguyen <quangn@amperecomputing.com>
Change-Id: Idfcd32953cf811fbe9299a162f604cb8fd028962
diff --git a/meta-ampere/meta-jade/conf/machine/mtjade.conf b/meta-ampere/meta-jade/conf/machine/mtjade.conf
index 914c160..f1e51bb 100644
--- a/meta-ampere/meta-jade/conf/machine/mtjade.conf
+++ b/meta-ampere/meta-jade/conf/machine/mtjade.conf
@@ -33,6 +33,7 @@
phosphor-ipmi-blobs \
phosphor-ipmi-blobs-binarystore \
ampere-driver-binder \
+ ampere-fault-monitor \
"
PREFERRED_PROVIDER_virtual/obmc-chassis-mgmt = "packagegroup-ampere-apps"
diff --git a/meta-ampere/meta-jade/recipes-ampere/platform/ampere-fault-monitor.bbappend b/meta-ampere/meta-jade/recipes-ampere/platform/ampere-fault-monitor.bbappend
new file mode 100644
index 0000000..2f3e457
--- /dev/null
+++ b/meta-ampere/meta-jade/recipes-ampere/platform/ampere-fault-monitor.bbappend
@@ -0,0 +1,14 @@
+FILESEXTRAPATHS:append := "${THISDIR}/${PN}:"
+
+RDEPENDS:${PN} = "bash"
+
+SRC_URI += " \
+ file://ampere_fault_monitor.sh \
+ file://ampere_check_gpio_fault.sh \
+ "
+
+do_install() {
+ install -d ${D}/${sbindir} # make sure the sbin directory exists under ${D}
+ install -m 755 ${WORKDIR}/ampere_fault_monitor.sh ${D}/${sbindir}/ # fault LED daemon, installed executable
+ install -m 755 ${WORKDIR}/ampere_check_gpio_fault.sh ${D}/${sbindir}/ # per-socket fault GPIO decoder, installed executable
+}
diff --git a/meta-ampere/meta-jade/recipes-ampere/platform/ampere-fault-monitor/ampere_check_gpio_fault.sh b/meta-ampere/meta-jade/recipes-ampere/platform/ampere-fault-monitor/ampere_check_gpio_fault.sh
new file mode 100755
index 0000000..141c50c
--- /dev/null
+++ b/meta-ampere/meta-jade/recipes-ampere/platform/ampere-fault-monitor/ampere_check_gpio_fault.sh
@@ -0,0 +1,205 @@
+#!/bin/bash
+
+# This script monitors S0/S1 fault GPIO and detects errors or warnings from CPUs
+#
+# According to OpenBMC_Software_Funcional_Specification, section 3.16,
+#
+# When the BMC detects the GPIO_FAULT signal indicating an SCP booting failure:
+# • If a non-critical error/warning from the SCP occurs, the BMC blinks the Fault LED once.
+# • If a critical error from the SCP occurs, the BMC turns on the Fault LED.
+# The BMC monitors the GPIO_FAULT signal from the SCP during SCP booting to determine whether
+# the error is non-critical or critical. A fatal error is indicated when the signal is On and then Off
+# continuously, followed by a “quiet” period of about three seconds, and this pattern repeats. If the “quiet”
+# period is longer than three seconds, the error is non-fatal. The BMC must set up appropriate debounce
+# times to detect such errors. The BMC is expected to turn on the Fault LED forever for fatal errors, or to
+# turn on the Fault LED and turn it off when the fault clears for non-fatal errors.
+#
+# Usage: <app_name> <socket 0/1>
+
+# shellcheck source=/dev/null
+source /usr/sbin/gpio-lib.sh
+
+# global variables
+ error_flag='/tmp/fault_err' # touched when a repeated pattern (error) is detected
+ warning_flag='/tmp/fault_warning' # touched when a pattern is seen but does not repeat (warning)
+
+ duty_cycle=250000 # usleep argument between idle polls in the main loop
+ scan_pulse=100000 # usleep argument between samples while decoding a pattern
+ blank_num=8 # consecutive low samples that end a pattern
+
+ curr_pattern=0 # falling-edge count of the pattern being decoded
+ prev_pattern=0 # falling-edge count of the previously decoded pattern
+
+ gpio_status=0 # last value read from the fault GPIO
+ repeat=0 # set to 1 when a new pattern starts right after the quiet gap
+
+ socket=$1 # first argument: socket index (0 or 1)
+
+ socket1_present=15 # line offset passed to "gpioget 0"; a value of 1 is treated as "not present"
+ socket1_status=1
+
+ S0_fault_gpio=73 # passed to gpio_number to obtain the sysfs GPIO number
+ S1_fault_gpio=201
+
+map_event_name() { # translate curr_pattern (falling-edge count) into event_name
+ case $curr_pattern in
+ 1)
+ event_name="RAS_GPIO_INVALID_LCS"
+ ;;
+ 2)
+ event_name="RAS_GPIO_FILE_HDR_INVALID"
+ ;;
+ 3)
+ event_name="RAS_GPIO_FILE_INTEGRITY_INVALID"
+ ;;
+ 4)
+ event_name="RAS_GPIO_KEY_CERT_AUTH_ERR"
+ ;;
+ 5)
+ event_name="RAS_GPIO_CNT_CERT_AUTH_ERR"
+ ;;
+ 6)
+ event_name="RAS_GPIO_I2C_HARDWARE_ERR"
+ ;;
+ 7)
+ event_name="RAS_GPIO_CRYPTO_ENGINE_ERR"
+ ;;
+ 8)
+ event_name="RAS_GPIO_ROTPK_EFUSE_INVALID"
+ ;;
+ 9)
+ event_name="RAS_GPIO_SEED_EFUSE_INVALID"
+ ;;
+ 10)
+ event_name="RAS_GPIO_LCS_FROM_EFUSE_INVALID"
+ ;;
+ 11)
+ event_name="RAS_GPIO_PRIM_ROLLBACK_EFUSE_INVALID"
+ ;;
+ 12)
+ event_name="RAS_GPIO_SEC_ROLLBACK_EFUSE_INVALID"
+ ;;
+ 13)
+ event_name="RAS_GPIO_HUK_EFUSE_INVALID"
+ ;;
+ 14)
+ event_name="RAS_GPIO_CERT_DATA_INVALID"
+ ;;
+ 15)
+ event_name="RAS_GPIO_INTERNAL_HW_ERR"
+ ;;
+ *) # any count outside 1..15, including 0
+ event_name="NOT_SUPPORT"
+ ;;
+ esac
+}
+
+detect_patern_repeat() { # after the quiet gap: wait for a new rising edge (repeat) or give up (warning)
+ local prev=0
+ local curr=0
+ local cnt=13 # counts down to 0, so at most 14 samples are taken
+
+ while true
+ do
+ usleep $scan_pulse
+ gpio_status=$(cat /sys/class/gpio/gpio"$gpio_Id"/value)
+ prev=$curr
+ curr=$gpio_status
+ if [ "$prev" == 0 ] && [ "$curr" == 1 ]; then
+ # rising edge: the pattern starts repeating; the main loop compares previous and current pattern
+ repeat=1
+ break
+ fi
+ if [ "$cnt" == 0 ]; then
+ map_event_name
+ echo "detected a warning from fault GPIO #$fault_gpio $socket, event $event_name"
+ # the pattern did not repeat: this is a warning, so create the warning flag
+ touch $warning_flag
+ break
+ fi
+ cnt=$(( cnt - 1 ))
+ done
+}
+
+detect_pattern() { # count falling edges until blank_num low samples in a row, then store the count in curr_pattern
+ local cnt_falling_edge=0
+ local cnt_blank=0
+
+ local prev=0
+ local curr=0
+
+ while true
+ do
+ prev=$curr
+ curr=$gpio_status
+ # count the falling edges; whenever one appears, reset cnt_blank
+ if [ "$prev" == 1 ] && [ "$curr" == 0 ]; then
+ cnt_falling_edge=$(( cnt_falling_edge + 1 ))
+ cnt_blank=0
+ continue
+ # check if we are in the quiet gap
+ elif [ "$prev" == 0 ] && [ "$curr" == 0 ]; then
+ cnt_blank=$(( cnt_blank + 1 ))
+ if [ "$cnt_blank" == "$blank_num" ]; then
+ # echo "pattern number falling_edge=$cnt_falling_edge blank=$cnt_blank"
+ curr_pattern=$cnt_falling_edge
+ # all falling edges are counted; now check whether the pattern repeats after the quiet gap
+ detect_patern_repeat
+ break
+ fi
+ fi
+ usleep $scan_pulse
+ gpio_status=$(cat /sys/class/gpio/gpio"$gpio_Id"/value)
+ done
+}
+
+gpio_config_input() { # export the sysfs GPIO held in gpio_Id and switch it to input; arguments are not read
+    printf '%s\n' "$gpio_Id" > /sys/class/gpio/export
+    printf '%s\n' "in" > "/sys/class/gpio/gpio${gpio_Id}/direction"
+}
+
+init_sysfs_fault_gpio() { # resolve fault_gpio to its sysfs number (gpio_Id) and export it as an input
+    gpio_Id=$(gpio_number "$fault_gpio")
+    # An empty number would make every later sysfs access fail, so stop right away.
+    [ -n "$gpio_Id" ] || { echo "cannot resolve fault GPIO #$fault_gpio" >&2; exit 1; }
+    [ -d /sys/class/gpio/gpio"$gpio_Id" ] && return # already exported, nothing to configure
+    gpio_config_input "$gpio_Id"
+}
+
+# init: pick the fault GPIO for the requested socket, then export it through sysfs
+if [ "$socket" == "0" ]; then
+ fault_gpio=$S0_fault_gpio
+else # any argument other than "0" (including none) is handled as socket 1
+ socket1_status=$(gpioget 0 "$socket1_present")
+ if [ "$socket1_status" == 1 ]; then # a reading of 1 is treated as "socket 1 absent"
+ echo "socket 1 not present"
+ exit 1
+ fi
+ fault_gpio=$S1_fault_gpio
+fi
+
+init_sysfs_fault_gpio
+
+# daemon start: poll the fault GPIO and decode a pattern whenever it goes high
+while true
+do
+ # detect when a pattern starts
+ if [ "$gpio_status" == 1 ]; then
+ # the GPIO is high: decode the pattern (sets curr_pattern, and repeat when it starts again)
+ detect_pattern
+ if [ "$repeat" == 1 ] && [ "$prev_pattern" == "$curr_pattern" ]; then
+ map_event_name
+ echo "detected an error from fault GPIO #$fault_gpio $socket, event#$curr_pattern $event_name"
+ touch $error_flag
+ repeat=0 # NOTE(review): repeat is only cleared here, so it stays 1 when the patterns differ - confirm intended
+ fi
+ prev_pattern=$curr_pattern
+ curr_pattern=0
+ continue # skip the idle sleep so the gpio_status left by the decoder is examined immediately
+ fi
+ usleep $duty_cycle
+ gpio_status=$(cat /sys/class/gpio/gpio"$gpio_Id"/value)
+
+done
+
+exit 1 # only reached if the loop above is ever left
diff --git a/meta-ampere/meta-jade/recipes-ampere/platform/ampere-fault-monitor/ampere_fault_monitor.sh b/meta-ampere/meta-jade/recipes-ampere/platform/ampere-fault-monitor/ampere_fault_monitor.sh
new file mode 100644
index 0000000..44cbb11
--- /dev/null
+++ b/meta-ampere/meta-jade/recipes-ampere/platform/ampere-fault-monitor/ampere_fault_monitor.sh
@@ -0,0 +1,180 @@
+#!/bin/bash
+
+# This script monitors fan, over-temperature, PSU and CPU/SCP failures and updates the fault LED status
+
+# shellcheck disable=SC2004
+# shellcheck disable=SC2046
+# shellcheck source=/dev/null
+
+# common variables
+ warning_fault_flag='/tmp/fault_warning' # same path that ampere_check_gpio_fault.sh touches for warnings
+ error_fault_flag='/tmp/fault_err' # same path that ampere_check_gpio_fault.sh touches for errors
+ overtemp_fault_flag='/tmp/fault_overtemp' # only tested for existence here; created outside this script
+ fault_RAS_UE_flag='/tmp/fault_RAS_UE' # only tested for existence here; created outside this script
+
+ blink_rate=100000 # usleep argument between the two LED writes of a blink
+
+ fault="false" # overall fault state computed by check_fault
+
+ on="true" # values written to the LED group's Asserted property
+ off="false"
+
+ gpio_fault="false"
+
+# fan variables
+ fan_failed="false"
+ fan_failed_flag='/tmp/fan_failed' # only tested for existence here; created outside this script
+
+# PSU variables
+ psu_failed="false"
+ psu_bus=6
+ psu0_addr=0x58
+ psu1_addr=0x59
+ status_word_cmd=0x79
+ # STATUS_WORD bits treated as a fault, following the PMBus Specification
+ # Bit[1]: CML faults
+ # Bit[2]: Over temperature faults
+ # Bit[3]: Under voltage faults
+ # Bit[4]: Over current faults
+ # Bit[5]: Over voltage fault
+ # Bit[10]: Fan faults
+ psu_fault_bitmask=0x43e
+
+# led variables
+ led_service='xyz.openbmc_project.LED.GroupManager'
+ led_fault_path='/xyz/openbmc_project/led/groups/system_fault'
+ led_fault_interface='xyz.openbmc_project.Led.Group'
+ fault_led_status=$off # last state this script wrote to the LED group
+
+# functions declaration
+check_fan_failed() {
+    # Mirror the presence of the fan_failed_flag file into fan_failed.
+    fan_failed="false"
+    if [ -f "$fan_failed_flag" ]; then
+        fan_failed="true"
+    fi
+}
+
+turn_on_off_fault_led() { # $1: boolean ("true"/"false") written to the Asserted property of the fault LED group
+    busctl set-property "$led_service" "$led_fault_path" "$led_fault_interface" Asserted b "$1" > /dev/null
+}
+
+check_psu_failed() {
+    # Set psu_failed to "true" when either PSU is absent, cannot be read over
+    # I2C, or reports one of the psu_fault_bitmask bits in its STATUS_WORD.
+    local psu0_presence psu1_presence
+    local psu0_value psu1_value
+
+    psu0_presence=$(gpioget $(gpiofind PSU1_PRESENT))
+    psu0_failed="true"
+    if [ "$psu0_presence" == "0" ]; then
+        # PSU0 is present: read STATUS_WORD over PMBus. The reading is validated first,
+        # because arithmetic on an empty or garbled value would abort the daemon loop.
+        psu0_value=$(i2cget -f -y $psu_bus $psu0_addr $status_word_cmd w)
+        if [[ "$psu0_value" =~ ^0x[0-9a-fA-F]+$ ]] && (( (psu0_value & psu_fault_bitmask) == 0 )); then
+            psu0_failed="false"
+        fi
+    fi
+
+    psu1_presence=$(gpioget $(gpiofind PSU2_PRESENT))
+    psu1_failed="true"
+    if [ "$psu1_presence" == "0" ]; then
+        # PSU1 is present: same check as for PSU0; an unreadable PSU stays
+        # marked as failed instead of breaking the arithmetic below.
+        psu1_value=$(i2cget -f -y $psu_bus $psu1_addr $status_word_cmd w)
+        if [[ "$psu1_value" =~ ^0x[0-9a-fA-F]+$ ]] && (( (psu1_value & psu_fault_bitmask) == 0 )); then
+            psu1_failed="false"
+        fi
+    fi
+
+    if [ "$psu0_failed" == "true" ] || [ "$psu1_failed" == "true" ]; then
+        psu_failed="true"
+    else
+        psu_failed="false"
+    fi
+}
+
+check_fault() {
+    # Any single failure source is enough to raise the overall fault state.
+    local state
+    fault="false"
+    for state in "$fan_failed" "$psu_failed" "$gpio_fault" "$RAS_UE_occured" "$overtemp_occured"; do
+        if [[ "$state" == "true" ]]; then
+            fault="true"
+        fi
+    done
+}
+
+control_fault_led() {
+    # Only talk to the LED manager when the wanted state differs from the
+    # state recorded in fault_led_status.
+    local wanted=$off
+    if [ "$fault" == "true" ]; then
+        wanted=$on
+    fi
+
+    if [ "$fault_led_status" != "$wanted" ]; then
+        turn_on_off_fault_led "$wanted"
+        fault_led_status=$wanted
+    fi
+}
+
+blink_fault_led() {
+    # Briefly invert the LED, then put it back to the state in fault_led_status.
+    local first=$off second=$on
+    if [ "$fault_led_status" == $off ]; then
+        first=$on
+        second=$off
+    fi
+    turn_on_off_fault_led "$first"
+    usleep $blink_rate
+    turn_on_off_fault_led "$second"
+}
+
+check_gpio_fault() { # error flag: gpio_fault becomes "true"; warning flag: one blink, then the flag is removed
+    if [ -f "$error_fault_flag" ]; then
+        gpio_fault="true"
+        return
+    fi
+    if [[ -f $warning_fault_flag ]]; then
+        blink_fault_led
+        rm "$warning_fault_flag"
+    fi
+    gpio_fault="false"
+}
+
+check_RAS_UE_occured() {
+    # Reflect the presence of the fault_RAS_UE_flag file in RAS_UE_occured.
+    RAS_UE_occured="false"
+    if [ -f "$fault_RAS_UE_flag" ]; then
+        echo "RAS UE error occured, turn on fault LED"
+        RAS_UE_occured="true"
+    fi
+}
+
+check_overtemp_occured() {
+    # Reflect the presence of the overtemp_fault_flag file in overtemp_occured.
+    overtemp_occured="false"
+    if [ -f "$overtemp_fault_flag" ]; then
+        echo "Over temperature occured, turn on fault LED"
+        overtemp_occured="true"
+    fi
+}
+
+# daemon start: refresh every fault source, then update the fault LED, every 2 seconds
+while true
+do
+ check_gpio_fault
+ check_fan_failed
+ check_overtemp_occured
+ check_RAS_UE_occured
+
+ # Checks PSU presence and the PSU STATUS_WORD fault bits
+ check_psu_failed
+
+ check_fault
+ control_fault_led
+ sleep 2
+done
+
+exit 1 # only reached if the loop above is ever left