blob: 141c50c36e40d3547b71e88cfa3df29e67340726 [file] [log] [blame]
#!/bin/bash
# This script monitors S0/S1 fault GPIO and detects errors or warnings from CPUs
#
# According to OpenBMC_Software_Funcional_Specification, section 3.16,
#
# When the BMC detects the GPIO_FAULT signal indicating an SCP booting failure:
# • If a non-critical error/warning from the SCP occurs, the BMC blinks the Fault LED once.
# • If a critical error from the SCP occurs, the BMC turns on the Fault LED.
# The BMC monitors the GPIO_FAULT signal from the SCP during SCP booting to determine whether
# the error is non-critical or critical. A fatal error is indicated when the signal is On and then Off
# continuously, followed by a “quiet” period of about three seconds, and this pattern repeats. If the “quiet”
# period is longer than three seconds, the error is non-fatal. The BMC must set up appropriate debounce
# times to detect such errors. The BMC is expected to turn on the Fault LED forever for fatal errors, or to
# turn on the Fault LED and turn it off when the fault clears for non-fatal errors.
#
# Usage: <app_name> <socket 0/1>
# shellcheck source=/dev/null
source /usr/sbin/gpio-lib.sh
# global variables
error_flag='/tmp/fault_err'
warning_flag='/tmp/fault_warning'
duty_cycle=250000
scan_pulse=100000
blank_num=8
curr_pattern=0
prev_pattern=0
gpio_status=0
repeat=0
socket=$1
socket1_present=15
socket1_status=1
S0_fault_gpio=73
S1_fault_gpio=201
map_event_name() {
case $curr_pattern in
1)
event_name="RAS_GPIO_INVALID_LCS"
;;
2)
event_name="RAS_GPIO_FILE_HDR_INVALID"
;;
3)
event_name="RAS_GPIO_FILE_INTEGRITY_INVALID"
;;
4)
event_name="RAS_GPIO_KEY_CERT_AUTH_ERR"
;;
5)
event_name="RAS_GPIO_CNT_CERT_AUTH_ERR"
;;
6)
event_name="RAS_GPIO_I2C_HARDWARE_ERR"
;;
7)
event_name="RAS_GPIO_CRYPTO_ENGINE_ERR"
;;
8)
event_name="RAS_GPIO_ROTPK_EFUSE_INVALID"
;;
9)
event_name="RAS_GPIO_SEED_EFUSE_INVALID"
;;
10)
event_name="RAS_GPIO_LCS_FROM_EFUSE_INVALID"
;;
11)
event_name="RAS_GPIO_PRIM_ROLLBACK_EFUSE_INVALID"
;;
12)
event_name="RAS_GPIO_SEC_ROLLBACK_EFUSE_INVALID"
;;
13)
event_name="RAS_GPIO_HUK_EFUSE_INVALID"
;;
14)
event_name="RAS_GPIO_CERT_DATA_INVALID"
;;
15)
event_name="RAS_GPIO_INTERNAL_HW_ERR"
;;
*)
event_name="NOT_SUPPORT"
;;
esac
}
detect_patern_repeat() {
local prev=0
local curr=0
local cnt=13
while true
do
usleep $scan_pulse
gpio_status=$(cat /sys/class/gpio/gpio"$gpio_Id"/value)
prev=$curr
curr=$gpio_status
if [ "$prev" == 0 ] && [ "$curr" == 1 ]; then
# patern start repeating, check if previous and current pattern are the same
repeat=1
break
fi
if [ "$cnt" == 0 ]; then
map_event_name
echo "detected a warning from fault GPIO #$fault_gpio $socket, event $event_name"
# pattern not repeat, this is a warning, turn on warning flag
touch $warning_flag
break
fi
cnt=$(( cnt - 1 ))
done
}
detect_pattern() {
local cnt_falling_edge=0
local cnt_blank=0
local prev=0
local curr=0
while true
do
prev=$curr
curr=$gpio_status
# count the falling edges, if they appear, just reset cnt_blank
if [ "$prev" == 1 ] && [ "$curr" == 0 ]; then
cnt_falling_edge=$(( cnt_falling_edge + 1 ))
cnt_blank=0
continue
# check if we are in the quite gap
elif [ "$prev" == 0 ] && [ "$curr" == 0 ]; then
cnt_blank=$(( cnt_blank + 1 ))
if [ "$cnt_blank" == "$blank_num" ]; then
# echo "pattern number falling_edge=$cnt_falling_edge blank=$cnt_blank"
curr_pattern=$cnt_falling_edge
# after count all falling edges, now check if patern repeat after 3s
detect_patern_repeat
break
fi
fi
usleep $scan_pulse
gpio_status=$(cat /sys/class/gpio/gpio"$gpio_Id"/value)
done
}
gpio_config_input() {
echo "$gpio_Id" > /sys/class/gpio/export
echo "in" > /sys/class/gpio/gpio"${gpio_Id}"/direction
}
init_sysfs_fault_gpio() {
gpio_Id=$(gpio_number "$fault_gpio")
if [ -d /sys/class/gpio/gpio"$gpio_Id" ]; then
return
fi
gpio_config_input "$fault_gpio"
}
# init
if [ "$socket" == "0" ]; then
fault_gpio=$S0_fault_gpio
else
socket1_status=$(gpioget 0 "$socket1_present")
if [ "$socket1_status" == 1 ]; then
echo "socket 1 not present"
exit 1
fi
fault_gpio=$S1_fault_gpio
fi
init_sysfs_fault_gpio
# daemon start
while true
do
# detect when pattern starts
if [ "$gpio_status" == 1 ]; then
# now, there is something on gpio, check if that is a pattern
detect_pattern
if [ "$repeat" == 1 ] && [ "$prev_pattern" == "$curr_pattern" ]; then
map_event_name
echo "detected an error from fault GPIO #$fault_gpio $socket, event#$curr_pattern $event_name"
touch $error_flag
repeat=0
fi
prev_pattern=$curr_pattern
curr_pattern=0
continue
fi
usleep $duty_cycle
gpio_status=$(cat /sys/class/gpio/gpio"$gpio_Id"/value)
done
exit 1