Thang Q. Nguyen | 98de8b9 | 2023-05-05 15:33:27 +0700 | [diff] [blame] | 1 | #!/bin/bash |
| 2 | |
# This script monitors fan, over-temperature, PSU, and CPU/SCP failures and updates the fault LED status
| 4 | |
| 5 | # shellcheck disable=SC2004 |
| 6 | # shellcheck disable=SC2046 |
Thang Q. Nguyen | 98de8b9 | 2023-05-05 15:33:27 +0700 | [diff] [blame] | 7 | |
| 8 | # common variables |
| 9 | warning_fault_flag='/tmp/fault_warning' |
| 10 | error_fault_flag='/tmp/fault_err' |
| 11 | overtemp_fault_flag='/tmp/fault_overtemp' |
| 12 | fault_RAS_UE_flag='/tmp/fault_RAS_UE' |
| 13 | |
| 14 | blink_rate=100000 |
| 15 | |
| 16 | fault="false" |
| 17 | |
| 18 | on="true" |
| 19 | off="false" |
| 20 | |
| 21 | gpio_fault="false" |
| 22 | |
| 23 | # fan variables |
| 24 | fan_failed="false" |
| 25 | fan_failed_flag='/tmp/fan_failed' |
| 26 | |
| 27 | # PSU variables |
| 28 | psu_failed="false" |
| 29 | psu_bus=6 |
| 30 | psu0_addr=0x58 |
| 31 | psu1_addr=0x59 |
| 32 | status_word_cmd=0x79 |
| 33 | # Following the PMBus Specification |
| 34 | # Bit[1]: CML faults |
| 35 | # Bit[2]: Over temperature faults |
| 36 | # Bit[3]: Under voltage faults |
| 37 | # Bit[4]: Over current faults |
| 38 | # Bit[5]: Over voltage fault |
| 39 | # Bit[10]: Fan faults |
| 40 | psu_fault_bitmask=0x43e |
| 41 | |
| 42 | # led variables |
| 43 | led_service='xyz.openbmc_project.LED.GroupManager' |
| 44 | led_fault_path='/xyz/openbmc_project/led/groups/system_fault' |
| 45 | led_fault_interface='xyz.openbmc_project.Led.Group' |
| 46 | fault_led_status=$off |
| 47 | |
| 48 | # functions declaration |
| 49 | check_fan_failed() { |
| 50 | if [[ -f $fan_failed_flag ]]; then |
| 51 | fan_failed="true" |
| 52 | else |
| 53 | fan_failed="false" |
| 54 | fi |
| 55 | } |
| 56 | |
| 57 | turn_on_off_fault_led() { |
| 58 | busctl set-property $led_service $led_fault_path $led_fault_interface Asserted b "$1" >> /dev/null |
| 59 | } |
| 60 | |
| 61 | check_psu_failed() { |
| 62 | local psu0_presence |
| 63 | local psu1_presence |
| 64 | local psu0_value |
| 65 | local psu1_value |
| 66 | |
| 67 | psu0_presence=$(gpioget $(gpiofind PSU1_PRESENT)) |
| 68 | psu0_failed="true" |
| 69 | if [ "$psu0_presence" == "0" ]; then |
| 70 | # PSU0 presence, monitor the PSUs using pmbus, check the STATUS_WORD |
| 71 | psu0_value=$(i2cget -f -y $psu_bus $psu0_addr $status_word_cmd w) |
| 72 | psu0_bit_fault=$(($psu0_value & $psu_fault_bitmask)) |
| 73 | if [ "$psu0_bit_fault" == "0" ]; then |
| 74 | psu0_failed="false" |
| 75 | fi |
| 76 | fi |
| 77 | |
| 78 | psu1_presence=$(gpioget $(gpiofind PSU2_PRESENT)) |
| 79 | psu1_failed="true" |
| 80 | if [ "$psu1_presence" == "0" ]; then |
| 81 | # PSU1 presence, monitor the PSUs using pmbus, check the STATUS_WORD |
| 82 | psu1_value=$(i2cget -f -y $psu_bus $psu1_addr $status_word_cmd w) |
| 83 | psu1_bit_fault=$(($psu1_value & $psu_fault_bitmask)) |
| 84 | if [ "$psu1_bit_fault" == "0" ]; then |
| 85 | psu1_failed="false" |
| 86 | fi |
| 87 | fi |
| 88 | |
| 89 | if [ "$psu0_failed" == "true" ] || [ "$psu1_failed" == "true" ]; then |
| 90 | psu_failed="true" |
| 91 | else |
| 92 | psu_failed="false" |
| 93 | fi |
| 94 | } |
| 95 | |
| 96 | check_fault() { |
| 97 | if [[ "$fan_failed" == "true" ]] || [[ "$psu_failed" == "true" ]] \ |
| 98 | || [[ "$gpio_fault" == "true" ]] \ |
| 99 | || [[ "$RAS_UE_occured" == "true" ]] \ |
| 100 | || [[ "$overtemp_occured" == "true" ]]; then |
| 101 | fault="true" |
| 102 | else |
| 103 | fault="false" |
| 104 | fi |
| 105 | } |
| 106 | |
| 107 | control_fault_led() { |
| 108 | if [ "$fault" == "true" ]; then |
| 109 | if [ "$fault_led_status" == $off ]; then |
| 110 | turn_on_off_fault_led $on |
| 111 | fault_led_status=$on |
| 112 | fi |
| 113 | else |
| 114 | if [ "$fault_led_status" == $on ]; then |
| 115 | turn_on_off_fault_led $off |
| 116 | fault_led_status=$off |
| 117 | fi |
| 118 | fi |
| 119 | } |
| 120 | |
| 121 | blink_fault_led() { |
| 122 | if [ "$fault_led_status" == $off ]; then |
| 123 | turn_on_off_fault_led $on |
| 124 | usleep $blink_rate |
| 125 | turn_on_off_fault_led $off |
| 126 | else |
| 127 | turn_on_off_fault_led $off |
| 128 | usleep $blink_rate |
| 129 | turn_on_off_fault_led $on |
| 130 | fi |
| 131 | } |
| 132 | |
| 133 | check_gpio_fault() { |
| 134 | if [[ -f $error_fault_flag ]]; then |
| 135 | gpio_fault="true" |
| 136 | else |
| 137 | if [ -f $warning_fault_flag ]; then |
| 138 | blink_fault_led |
| 139 | rm $warning_fault_flag |
| 140 | fi |
| 141 | gpio_fault="false" |
| 142 | fi |
| 143 | } |
| 144 | |
| 145 | check_RAS_UE_occured() { |
| 146 | if [[ -f $fault_RAS_UE_flag ]]; then |
| 147 | echo "RAS UE error occured, turn on fault LED" |
| 148 | RAS_UE_occured="true" |
| 149 | else |
| 150 | RAS_UE_occured="false" |
| 151 | fi |
| 152 | } |
| 153 | |
| 154 | check_overtemp_occured() { |
| 155 | if [[ -f $overtemp_fault_flag ]]; then |
| 156 | echo "Over temperature occured, turn on fault LED" |
| 157 | overtemp_occured="true" |
| 158 | else |
| 159 | overtemp_occured="false" |
| 160 | fi |
| 161 | } |
| 162 | |
| 163 | # daemon start |
| 164 | while true |
| 165 | do |
| 166 | check_gpio_fault |
| 167 | check_fan_failed |
| 168 | check_overtemp_occured |
| 169 | check_RAS_UE_occured |
| 170 | |
| 171 | # Monitors PSU presence |
| 172 | check_psu_failed |
| 173 | |
| 174 | check_fault |
| 175 | control_fault_led |
| 176 | sleep 2 |
| 177 | done |
| 178 | |
| 179 | exit 1 |