#!/bin/bash
# This script monitors fan, over-temperature, PSU, CPU/SCP failure and updates
# the fault LED status.

# The $(gpiofind ...) command substitutions in this script are left unquoted on
# purpose, hence the file-wide suppression below.
# shellcheck disable=SC2046

# common variables
on=1
off=0

# over-temperature fault: flag file checked by check_overtemp_occured
overtemp_occured="false"
overtemp_fault_flag='/tmp/fault_overtemp'

# gpio fault
gpio_fault="false"
gpio_fault_flag="/tmp/gpio_fault"

# fan variables
fan_failed="false"
fan_failed_flag='/tmp/fan_failed'

# PSU variables
psu_failed="false"
psu_bus=2
psu0_addr=0x58
psu1_addr=0x59
# Command code used to read the PSU STATUS_WORD
status_word_cmd=0x79
# STATUS_WORD bits treated as a PSU fault, following the PMBus Specification
#   Bit[1]:  CML faults
#   Bit[2]:  Over temperature faults
#   Bit[3]:  Under voltage faults
#   Bit[4]:  Over current faults
#   Bit[5]:  Over voltage fault
#   Bit[10]: Fan faults
psu_fault_bitmask=0x43e

# Aggregated fault state, recomputed by check_fault on every poll
fault="false"

# led variables
# Cached state of the fan/PSU fault LEDs, so they are only rewritten on change
fan_fault_led_status=$off
psu_fault_led_status=$off
led_bus=15
led_addr=0x22
led_port0_config=0x06
led_port0_output=0x02
| 42 | |
# Function declarations
# Refresh fan_failed from the fan failure flag file.
# Globals: fan_failed_flag (read), fan_failed (written)
# Sets fan_failed to "true" when $fan_failed_flag is an existing regular file,
# and to "false" otherwise.
check_fan_failed() {
  fan_failed="false"
  if [[ -f $fan_failed_flag ]]; then
    fan_failed="true"
  fi
}
| 51 | |
# Switch the fan fault LED on or off through the CPLD's I2C interface
# (bus $led_bus, slave address $led_addr).
# NOTE(review): the original comment names the bus "I2C16" while led_bus is 15;
# presumably a 1-based hardware name - confirm.
# Globals:   led_bus, led_addr, led_port0_config, led_port0_output, on (read)
# Arguments: $1 - $on to light the LED, any other value to turn it off
# Returns:   0 on success, non-zero when an I2C read or write fails
turn_on_off_fan_fault_led() {
  local -r led_bit=1 # Port0[0] carries the fan fault LED
  local cfg led_st

  # Read the Port0 configuration register. Stop on a failed read: an empty
  # value would evaluate to 0 below and overwrite every bit of the register.
  cfg=$(i2cget -f -y "$led_bus" "$led_addr" "$led_port0_config") || return 1
  [[ -n "$cfg" ]] || return 1

  # Config CPLD's IOepx Port0[0] from input to output by clearing its bit,
  # leaving the other bits as they were read.
  i2cset -f -y "$led_bus" "$led_addr" "$led_port0_config" \
    "$((cfg & ~led_bit))" || return 1

  # Read the current output value so only the LED bit is modified.
  led_st=$(i2cget -f -y "$led_bus" "$led_addr" "$led_port0_output") || return 1
  [[ -n "$led_st" ]] || return 1

  if [[ "$1" == "$on" ]]; then
    led_st=$((led_st | led_bit))
  else
    led_st=$((led_st & ~led_bit))
  fi

  # Turn on/off fan fault led
  i2cset -f -y "$led_bus" "$led_addr" "$led_port0_output" "$led_st"
}
| 72 | |
# Switch the PSU fault LED on or off through the CPLD's I2C interface
# (bus $led_bus, slave address $led_addr).
# NOTE(review): the original comment names the bus "I2C16" while led_bus is 15;
# presumably a 1-based hardware name - confirm.
# Globals:   led_bus, led_addr, led_port0_config, led_port0_output, on (read)
# Arguments: $1 - $on to light the LED, any other value to turn it off
# Returns:   0 on success, non-zero when an I2C read or write fails
turn_on_off_psu_fault_led() {
  local -r led_bit=2 # Port0[1] carries the PSU fault LED
  local cfg led_st

  # Read the Port0 configuration register. Stop on a failed read: an empty
  # value would evaluate to 0 below and overwrite every bit of the register.
  cfg=$(i2cget -f -y "$led_bus" "$led_addr" "$led_port0_config") || return 1
  [[ -n "$cfg" ]] || return 1

  # Config CPLD's IOepx Port0[1] from input to output by clearing its bit,
  # leaving the other bits as they were read.
  i2cset -f -y "$led_bus" "$led_addr" "$led_port0_config" \
    "$((cfg & ~led_bit))" || return 1

  # Read the current output value so only the LED bit is modified.
  led_st=$(i2cget -f -y "$led_bus" "$led_addr" "$led_port0_output") || return 1
  [[ -n "$led_st" ]] || return 1

  if [[ "$1" == "$on" ]]; then
    led_st=$((led_st | led_bit))
  else
    led_st=$((led_st & ~led_bit))
  fi

  # Turn on/off psu fault led
  i2cset -f -y "$led_bus" "$led_addr" "$led_port0_output" "$led_st"
}
| 92 | |
# Bring the fan fault LED in line with the current fan_failed state.
# The LED is only rewritten when the requested state differs from the cached
# one, and the cache is only updated when turn_on_off_fan_fault_led reports
# success (exit status 0), so a failed update is retried on the next call.
# Globals: fan_failed, on, off (read); fan_fault_led_status (read/written)
control_fan_fault_led() {
  local wanted=$off

  if [[ "$fan_failed" == "true" ]]; then
    wanted=$on
  fi

  # LED already shows the requested state: nothing to do.
  if [[ "$fan_fault_led_status" == "$wanted" ]]; then
    return 0
  fi

  if turn_on_off_fan_fault_led "$wanted"; then
    fan_fault_led_status=$wanted
  fi
}
| 106 | |
# Tell whether a single PSU has to be considered failed.
# A PSU counts as failed when its presence line does not read "0", when its
# STATUS_WORD cannot be read, or when any bit of psu_fault_bitmask is set.
# Globals:   psu_bus, status_word_cmd, psu_fault_bitmask (read)
# Arguments: $1 - name of the presence GPIO line, $2 - I2C address of the PSU
# Returns:   0 when the PSU is failed, 1 when it is healthy
_psu_is_failed() {
  local presence status_word

  # gpiofind output is left unquoted on purpose (see SC2046 note at file top).
  # shellcheck disable=SC2046
  presence=$(gpioget $(gpiofind "$1"))
  [[ "$presence" == "0" ]] || return 0

  # PSU presence, monitor the PSU using pmbus, check the STATUS_WORD.
  # An unreadable PSU must not be mistaken for a healthy one: an empty value
  # would evaluate to 0 in the mask test below.
  status_word=$(i2cget -f -y "$psu_bus" "$2" "$status_word_cmd" w) || return 0
  [[ -n "$status_word" ]] || return 0

  ((status_word & psu_fault_bitmask))
}

# Refresh psu_failed: "true" when PSU0 or PSU1 is failed, "false" otherwise.
# Globals: psu0_addr, psu1_addr (read); psu_failed (written)
check_psu_failed() {
  psu_failed="false"
  if _psu_is_failed presence-ps0 "$psu0_addr" ||
    _psu_is_failed presence-ps1 "$psu1_addr"; then
    psu_failed="true"
  fi
}
| 141 | |
# Bring the PSU fault LED in line with the current psu_failed state.
# The LED is only rewritten when the requested state differs from the cached
# one, and the cache is only updated when turn_on_off_psu_fault_led reports
# success (exit status 0), so a failed update is retried on the next call.
# Globals: psu_failed, on, off (read); psu_fault_led_status (read/written)
control_psu_fault_led() {
  local wanted=$off

  if [[ "$psu_failed" == "true" ]]; then
    wanted=$on
  fi

  # LED already shows the requested state: nothing to do.
  if [[ "$psu_fault_led_status" == "$wanted" ]]; then
    return 0
  fi

  if turn_on_off_psu_fault_led "$wanted"; then
    psu_fault_led_status=$wanted
  fi
}
| 155 | |
# Refresh overtemp_occured from the over-temperature flag file.
# Sets overtemp_occured to "true" when $overtemp_fault_flag is an existing
# regular file and to "false" otherwise. The message is printed only when the
# state changes to "true", so repeated polling while the flag is present does
# not flood the log with identical lines.
# Globals: overtemp_fault_flag (read); overtemp_occured (read/written)
# Outputs: one line on stdout on each false -> true transition
check_overtemp_occured() {
  if [[ -f "$overtemp_fault_flag" ]]; then
    # overtemp_occured still holds the result of the previous call here
    # (unset on the very first call, which counts as "false").
    if [[ "${overtemp_occured:-false}" != "true" ]]; then
      echo "Over temperature occured, turn on fault LED"
    fi
    overtemp_occured="true"
  else
    overtemp_occured="false"
  fi
}
| 164 | |
| 165 | |
# Refresh gpio_fault from the GPIO fault flag file.
# Sets gpio_fault to "true" when $gpio_fault_flag is an existing regular file
# and to "false" otherwise. The message is printed only when the state changes
# to "true", so repeated polling while the flag is present does not flood the
# log with identical lines.
# Globals: gpio_fault_flag (read); gpio_fault (read/written)
# Outputs: one line on stdout on each false -> true transition
check_gpio_fault() {
  if [[ -f "$gpio_fault_flag" ]]; then
    # gpio_fault still holds the result of the previous call here.
    if [[ "${gpio_fault:-false}" != "true" ]]; then
      echo "GPIO fault event(s) occured, turn on fault LED"
    fi
    gpio_fault="true"
  else
    gpio_fault="false"
  fi
}
| 174 | |
# Aggregate the individual fault indicators into the global "fault" flag.
# Globals: fan_failed, psu_failed, overtemp_occured, gpio_fault (read);
#          fault (written)
# fault becomes "true" as soon as one indicator equals "true", else "false".
check_fault() {
  local indicator

  fault="false"
  for indicator in "$fan_failed" "$psu_failed" \
    "$overtemp_occured" "$gpio_fault"; do
    if [[ "$indicator" == "true" ]]; then
      fault="true"
      return
    fi
  done
}
| 184 | |
# Drive the System Fault LED from the aggregated fault state: the "led-fault"
# GPIO line is set to 1 when fault is "true" and to 0 otherwise.
# Globals: fault (read)
control_sys_fault_led() {
  local level=0

  if [ "$fault" == "true" ]; then
    level=1
  fi

  # Turn on/off the System Fault Led. The gpiofind output is left unquoted so
  # that it expands into separate gpioset arguments.
  # shellcheck disable=SC2046
  gpioset $(gpiofind led-fault)="$level"
}
| 195 | |
# daemon start
# Poll every fault source, then refresh the LEDs, once every 2 seconds.
main() {
  while :; do
    # Gather the individual fault states: fan flag, PSU presence/status,
    # over-temperature flag and GPIO fault flag.
    check_fan_failed
    check_psu_failed
    check_overtemp_occured
    check_gpio_fault

    # Combine them and update the System Fault Led first ...
    check_fault
    control_sys_fault_led

    # ... then the dedicated fan and PSU fault LEDs.
    control_fan_fault_led
    control_psu_fault_led

    sleep 2
  done
}

main

# Only reached if the polling loop ever terminates.
exit 1