Thang Q. Nguyen | 98de8b9 | 2023-05-05 15:33:27 +0700 | [diff] [blame] | 1 | #!/bin/bash |
# This script monitors fan, over-temperature, PSU and CPU/SCP failures and updates the fault LED status
| 3 | |
| 4 | # shellcheck disable=SC2004 |
| 5 | # shellcheck source=/dev/null |
| 6 | source /usr/sbin/gpio-lib.sh |
| 7 | |
# ---- Shared constants ----
# Logical on/off levels passed to the LED helpers and to gpio_name_set.
on=1
off=0

# ---- Fault flag files ----
# The check_* functions treat the existence of each file as "fault active".
overtemp_fault_flag="/tmp/fault_overtemp"
gpio_fault_flag="/tmp/gpio_fault"
fan_failed_flag="/tmp/fan_failed"

# ---- Fault state ("true"/"false"), refreshed by the check_* functions ----
gpio_fault="false"
fan_failed="false"
psu_failed="false"

# ---- PSU access over PMBus ----
psu_bus=2
psu0_addr=0x58
psu1_addr=0x59
# Command code used to read the PSU status word.
status_word_cmd=0x79
# Status-word bits that count as a PSU fault, following the PMBus
# Specification:
#   Bit[1]:  CML faults
#   Bit[2]:  Over temperature faults
#   Bit[3]:  Under voltage faults
#   Bit[4]:  Over current faults
#   Bit[5]:  Over voltage fault
#   Bit[10]: Fan faults
psu_fault_bitmask=0x43e

# ---- Fan/PSU fault LEDs ----
# Last LED level this script requested for each LED.
fan_fault_led_status=$off
psu_fault_led_status=$off
# I2C location of the device driving the LEDs, and its Port0 registers.
led_bus=15
led_addr=0x22
led_port0_config=0x06
led_port0_output=0x02
| 44 | |
| 45 | # functions declaration |
# Refresh the global fan_failed ("true"/"false") from the presence of the
# regular file named by $fan_failed_flag.
check_fan_failed() {
  fan_failed="false"
  if [[ -f $fan_failed_flag ]]; then
    fan_failed="true"
  fi
}
| 53 | |
# Drive the fan fault LED through the CPLD I/O expander at I2C address
# $led_addr on bus $led_bus. The LED is bit 0 of Port0.
# Globals:   on, led_bus, led_addr, led_port0_config, led_port0_output (read)
# Arguments: $1 - $on to set the LED bit, any other value to clear it
# Returns:   0 on success, non-zero if an I2C read or write failed
turn_on_off_fan_fault_led() {
  local -r led_bit=1
  local -r hex_re='^0[xX][0-9a-fA-F]+$'
  local cfg_val
  local out_val

  # Read the Port0 configuration register. A failed or empty read must not
  # reach the arithmetic below: an empty operand is a syntax error, and Bash
  # then abandons the caller's whole loop instead of just this call.
  cfg_val=$(i2cget -f -y "$led_bus" "$led_addr" "$led_port0_config") || return 1
  [[ "$cfg_val" =~ $hex_re ]] || return 1

  # Config CPLD's IOepx Port0[0] from input to output by clearing its bit.
  cfg_val=$(( cfg_val & ~led_bit ))
  i2cset -f -y "$led_bus" "$led_addr" "$led_port0_config" "$cfg_val" || return 1

  # Read-modify-write the output register so the other Port0 bits are kept.
  out_val=$(i2cget -f -y "$led_bus" "$led_addr" "$led_port0_output") || return 1
  [[ "$out_val" =~ $hex_re ]] || return 1

  if [[ "$1" == "$on" ]]; then
    out_val=$(( out_val | led_bit ))
  else
    out_val=$(( out_val & ~led_bit ))
  fi

  # Turn on/off fan fault led; the write status is the function's status.
  i2cset -f -y "$led_bus" "$led_addr" "$led_port0_output" "$out_val"
}
| 74 | |
# Drive the PSU fault LED through the CPLD I/O expander at I2C address
# $led_addr on bus $led_bus. The LED is bit 1 of Port0.
# Globals:   on, led_bus, led_addr, led_port0_config, led_port0_output (read)
# Arguments: $1 - $on to set the LED bit, any other value to clear it
# Returns:   0 on success, non-zero if an I2C read or write failed
turn_on_off_psu_fault_led() {
  local -r led_bit=2
  local -r hex_re='^0[xX][0-9a-fA-F]+$'
  local cfg_val
  local out_val

  # Read the Port0 configuration register. A failed or empty read must not
  # reach the arithmetic below: an empty operand is a syntax error, and Bash
  # then abandons the caller's whole loop instead of just this call.
  cfg_val=$(i2cget -f -y "$led_bus" "$led_addr" "$led_port0_config") || return 1
  [[ "$cfg_val" =~ $hex_re ]] || return 1

  # Config CPLD's IOepx Port0[1] from input to output by clearing its bit.
  cfg_val=$(( cfg_val & ~led_bit ))
  i2cset -f -y "$led_bus" "$led_addr" "$led_port0_config" "$cfg_val" || return 1

  # Read-modify-write the output register so the other Port0 bits are kept.
  out_val=$(i2cget -f -y "$led_bus" "$led_addr" "$led_port0_output") || return 1
  [[ "$out_val" =~ $hex_re ]] || return 1

  if [[ "$1" == "$on" ]]; then
    out_val=$(( out_val | led_bit ))
  else
    out_val=$(( out_val & ~led_bit ))
  fi

  # Turn on/off psu fault led; the write status is the function's status.
  i2cset -f -y "$led_bus" "$led_addr" "$led_port0_output" "$out_val"
}
| 94 | |
# Bring the fan fault LED in line with $fan_failed, touching the hardware
# only when the requested level differs from the last one applied.
# Globals:   fan_failed, on, off (read); fan_fault_led_status (read/written)
# Returns:   0 when the LED already matches or was updated, 1 when the
#            hardware update failed
control_fan_fault_led() {
  local wanted=$off
  if [[ "$fan_failed" == "true" ]]; then
    wanted=$on
  fi

  if [[ "$fan_fault_led_status" == "$wanted" ]]; then
    return 0
  fi

  # Record the new level only after the write succeeded; otherwise the cached
  # state would claim the LED changed and the update would never be retried.
  if ! turn_on_off_fan_fault_led "$wanted"; then
    return 1
  fi
  fan_fault_led_status=$wanted
}
| 108 | |
# Refresh the global psu_failed ("true"/"false") for PSU0 and PSU1.
# A PSU counts as failed when its presence GPIO does not read "0", when its
# status word cannot be read, or when any bit of $psu_fault_bitmask is set
# in that status word.
# Globals:   psu_bus, psu0_addr, psu1_addr, status_word_cmd,
#            psu_fault_bitmask (read); psu_failed (written)
check_psu_failed() {
  local -r hex_re='^0[xX][0-9a-fA-F]+$'
  local -a psu_addrs=("$psu0_addr" "$psu1_addr")
  local idx
  local presence
  local status_word
  local any_failed="false"

  for idx in 0 1; do
    presence=$(gpio_name_get "presence-ps${idx}")
    if [[ "$presence" != "0" ]]; then
      # PSU not reported as present.
      any_failed="true"
      continue
    fi

    # PSU presence, monitor the PSU using pmbus, check the STATUS_WORD.
    # A failed or malformed read is treated as a fault. Letting an empty value
    # reach the arithmetic test would be a syntax error that makes Bash
    # abandon the caller's whole loop.
    if ! status_word=$(i2cget -f -y "$psu_bus" "${psu_addrs[idx]}" "$status_word_cmd" w) \
      || [[ ! "$status_word" =~ $hex_re ]]; then
      any_failed="true"
      continue
    fi

    if (( status_word & psu_fault_bitmask )); then
      any_failed="true"
    fi
  done

  psu_failed=$any_failed
}
| 143 | |
# Bring the PSU fault LED in line with $psu_failed, touching the hardware
# only when the requested level differs from the last one applied.
# Globals:   psu_failed, on, off (read); psu_fault_led_status (read/written)
# Returns:   0 when the LED already matches or was updated, 1 when the
#            hardware update failed
control_psu_fault_led() {
  local wanted=$off
  if [[ "$psu_failed" == "true" ]]; then
    wanted=$on
  fi

  if [[ "$psu_fault_led_status" == "$wanted" ]]; then
    return 0
  fi

  # Record the new level only after the write succeeded; otherwise the cached
  # state would claim the LED changed and the update would never be retried.
  if ! turn_on_off_psu_fault_led "$wanted"; then
    return 1
  fi
  psu_fault_led_status=$wanted
}
| 157 | |
# Refresh the global overtemp_occured ("true"/"false") from the presence of
# the regular file named by $overtemp_fault_flag. Prints a notice on stdout
# each time the flag file is found.
check_overtemp_occured() {
  if [[ ! -f $overtemp_fault_flag ]]; then
    overtemp_occured="false"
    return 0
  fi

  echo "Over temperature occured, turn on fault LED"
  overtemp_occured="true"
}
| 166 | |
| 167 | |
# Refresh the global gpio_fault ("true"/"false") from the presence of the
# regular file named by $gpio_fault_flag. Prints a notice on stdout each
# time the flag file is found.
check_gpio_fault() {
  if [[ ! -f $gpio_fault_flag ]]; then
    gpio_fault="false"
    return 0
  fi

  echo "GPIO fault event(s) occured, turn on fault LED"
  gpio_fault="true"
}
| 176 | |
# Combine the individual fault states into the global fault ("true"/"false"):
# "true" when any of fan_failed, psu_failed, overtemp_occured or gpio_fault
# is "true".
check_fault() {
  local state

  fault="false"
  for state in "$fan_failed" "$psu_failed" "$overtemp_occured" "$gpio_fault"; do
    if [[ "$state" == "true" ]]; then
      fault="true"
      break
    fi
  done
}
| 186 | |
# The System Fault Led turns on upon a system error. Set the led-fault GPIO
# to $on when the global fault is "true" and to $off otherwise; the GPIO is
# written on every call.
control_sys_fault_led() {
  local level=$off

  if [[ "$fault" == "true" ]]; then
    level=$on
  fi

  # Turn on/off the System Fault Led
  gpio_name_set led-fault "$level"
}
| 197 | |
# daemon start
# Poll forever: refresh every fault source, then update the LEDs, pausing
# two seconds between passes.
while :; do
  # Refresh the individual fault states.
  check_fan_failed
  check_psu_failed
  check_overtemp_occured
  check_gpio_fault

  # Combine them and drive the system fault LED.
  check_fault
  control_sys_fault_led

  # Drive the dedicated fan and PSU fault LEDs.
  control_fan_fault_led
  control_psu_fault_led

  sleep 2
done

# The loop has no normal exit, so reaching this line means it was aborted;
# report failure.
exit 1