#!/bin/bash
# This script monitors fan, over-temperature, PSU and CPU/SCP failures and
# updates the fault LED status.

# gpiofind prints "<chip> <offset>", which must be word-split when it is passed
# on to gpioget/gpioset, hence the file-wide exemption below.
# shellcheck disable=SC2046

# common variables
on=1
off=0

# over-temperature variables
# The presence of this flag file marks an over-temperature event.
# NOTE(review): the file is presumably created by another service - confirm.
overtemp_occured="false"
overtemp_fault_flag='/tmp/fault_overtemp'

# gpio fault
# The presence of this flag file marks one or more GPIO fault events.
gpio_fault="false"
gpio_fault_flag="/tmp/gpio_fault"

# fan variables
# The presence of this flag file marks a fan failure.
fan_failed="false"
fan_failed_flag='/tmp/fan_failed'

# PSU variables
psu_failed="false"
psu_bus=2
psu0_addr=0x58
psu1_addr=0x59
# PMBus STATUS_WORD command code
status_word_cmd=0x79
# Following the PMBus Specification
# Bit[1]:  CML faults
# Bit[2]:  Over temperature faults
# Bit[3]:  Under voltage faults
# Bit[4]:  Over current faults
# Bit[5]:  Over voltage fault
# Bit[10]: Fan faults
psu_fault_bitmask=0x43e

# Aggregated fault state, recomputed on every polling cycle
fault="false"

# led variables
# Cached state of each LED so that the hardware is only written on a change
fan_fault_led_status=$off
psu_fault_led_status=$off
led_bus=15
led_addr=0x22
led_port0_config=0x06
led_port0_output=0x02
42
# Function declarations
# Refresh the fan failure state from its flag file.
# Globals:   fan_failed_flag (read), fan_failed (written: "true" / "false")
# Arguments: none
check_fan_failed() {
    fan_failed="false"
    # The fan is considered failed for as long as the flag file exists.
    if [[ -f "$fan_failed_flag" ]]; then
        fan_failed="true"
    fi
}
51
# Turn the fan fault LED on or off.
# The LED is controlled via the CPLD's I2C at slave address 0x22 (I2C16 per the
# original note), on bit 0 of IOepx Port0.
# Globals:   led_bus, led_addr, led_port0_config, led_port0_output, on (read)
# Arguments: $1 - $on to turn the LED on, any other value to turn it off
# Returns:   0 on success, non-zero when an I2C access fails (in which case no
#            further register is written)
turn_on_off_fan_fault_led() {
    local -r led_bit=1
    local port_cfg
    local port_out

    # Get the Port0 configuration. Bail out on a failed/empty read: feeding an
    # empty value to $(( )) is an arithmetic syntax error.
    if ! port_cfg=$(i2cget -f -y "$led_bus" "$led_addr" "$led_port0_config") \
        || [[ -z "$port_cfg" ]]; then
        echo "Failed to read fan fault LED port configuration" >&2
        return 1
    fi
    # Config CPLD's IOepx Port0[0] from input to output by clearing its bit.
    port_cfg=$((port_cfg & ~led_bit))
    i2cset -f -y "$led_bus" "$led_addr" "$led_port0_config" "$port_cfg" || return 1

    # Get the current output value so that the other bits are preserved.
    if ! port_out=$(i2cget -f -y "$led_bus" "$led_addr" "$led_port0_output") \
        || [[ -z "$port_out" ]]; then
        echo "Failed to read fan fault LED port output" >&2
        return 1
    fi

    if [[ "$1" == "$on" ]]; then
        port_out=$((port_out | led_bit))
    else
        port_out=$((port_out & ~led_bit))
    fi

    # Turn on/off the fan fault led
    i2cset -f -y "$led_bus" "$led_addr" "$led_port0_output" "$port_out"
}
72
# Turn the PSU fault LED on or off.
# The LED is controlled via the CPLD's I2C at slave address 0x22 (I2C16 per the
# original note), on bit 1 of IOepx Port0.
# Globals:   led_bus, led_addr, led_port0_config, led_port0_output, on (read)
# Arguments: $1 - $on to turn the LED on, any other value to turn it off
# Returns:   0 on success, non-zero when an I2C access fails (in which case no
#            further register is written)
turn_on_off_psu_fault_led() {
    local -r led_bit=2
    local port_cfg
    local port_out

    # Get the Port0 configuration. Bail out on a failed/empty read: feeding an
    # empty value to $(( )) is an arithmetic syntax error.
    if ! port_cfg=$(i2cget -f -y "$led_bus" "$led_addr" "$led_port0_config") \
        || [[ -z "$port_cfg" ]]; then
        echo "Failed to read PSU fault LED port configuration" >&2
        return 1
    fi
    # Config CPLD's IOepx Port0[1] from input to output by clearing its bit.
    port_cfg=$((port_cfg & ~led_bit))
    i2cset -f -y "$led_bus" "$led_addr" "$led_port0_config" "$port_cfg" || return 1

    # Get the current output value so that the other bits are preserved.
    if ! port_out=$(i2cget -f -y "$led_bus" "$led_addr" "$led_port0_output") \
        || [[ -z "$port_out" ]]; then
        echo "Failed to read PSU fault LED port output" >&2
        return 1
    fi

    if [[ "$1" == "$on" ]]; then
        port_out=$((port_out | led_bit))
    else
        port_out=$((port_out & ~led_bit))
    fi

    # Turn on/off the psu fault led
    i2cset -f -y "$led_bus" "$led_addr" "$led_port0_output" "$port_out"
}
92
# Bring the fan fault LED in line with the current fan failure state.
# The LED is only written when the wanted state differs from the cached one.
# Globals:   fan_failed, on, off (read), fan_fault_led_status (read/written)
# Arguments: none
control_fan_fault_led() {
    local wanted=$off

    if [[ "$fan_failed" == "true" ]]; then
        wanted=$on
    fi

    if [[ "$fan_fault_led_status" != "$wanted" ]]; then
        # Only remember the new state when the write succeeded, so that a
        # failed update is retried on the next polling cycle.
        if turn_on_off_fan_fault_led "$wanted"; then
            fan_fault_led_status=$wanted
        fi
    fi
}
106
# Tell whether one PSU has to be reported as failed.
# Globals:   psu_bus, status_word_cmd, psu_fault_bitmask (read)
# Arguments: $1 - name of the presence GPIO line, $2 - PMBus address of the PSU
# Returns:   0 when the PSU is failed, non-zero when it is healthy
_psu_has_fault() {
    local -r presence_line=$1
    local -r addr=$2
    local presence
    local status_word

    # gpiofind prints "<chip> <offset>" which must be split into two words.
    # shellcheck disable=SC2046
    presence=$(gpioget $(gpiofind "$presence_line"))
    # Presence reads "0" when the PSU is plugged in; anything else (including a
    # failed read) is reported as a failure.
    if [[ "$presence" != "0" ]]; then
        return 0
    fi

    # PSU presence, monitor the PSU using pmbus, check the STATUS_WORD.
    # An unreadable status word must not be mistaken for "no fault": an empty
    # value would silently evaluate to 0 in the arithmetic below.
    if ! status_word=$(i2cget -f -y "$psu_bus" "$addr" "$status_word_cmd" w) \
        || [[ -z "$status_word" ]]; then
        return 0
    fi

    (( (status_word & psu_fault_bitmask) != 0 ))
}

# Refresh the PSU failure state: failed as soon as either PSU is absent, is
# unreadable or reports one of the fault bits selected by psu_fault_bitmask.
# Globals:   psu0_addr, psu1_addr (read), psu_failed (written: "true" / "false")
# Arguments: none
check_psu_failed() {
    local psu0_failed="false"
    local psu1_failed="false"

    # Both PSUs are always probed, whatever the state of the other one.
    if _psu_has_fault presence-ps0 "$psu0_addr"; then
        psu0_failed="true"
    fi
    if _psu_has_fault presence-ps1 "$psu1_addr"; then
        psu1_failed="true"
    fi

    if [[ "$psu0_failed" == "true" || "$psu1_failed" == "true" ]]; then
        psu_failed="true"
    else
        psu_failed="false"
    fi
}
141
# Bring the PSU fault LED in line with the current PSU failure state.
# The LED is only written when the wanted state differs from the cached one.
# Globals:   psu_failed, on, off (read), psu_fault_led_status (read/written)
# Arguments: none
control_psu_fault_led() {
    local wanted=$off

    if [[ "$psu_failed" == "true" ]]; then
        wanted=$on
    fi

    if [[ "$psu_fault_led_status" != "$wanted" ]]; then
        # Only remember the new state when the write succeeded, so that a
        # failed update is retried on the next polling cycle.
        if turn_on_off_psu_fault_led "$wanted"; then
            psu_fault_led_status=$wanted
        fi
    fi
}
155
# Refresh the over-temperature state from its flag file.
# Globals:   overtemp_fault_flag (read),
#            overtemp_occured (written: "true" / "false")
# Arguments: none
# Outputs:   a notice on stdout each time the flag file is found
check_overtemp_occured() {
    overtemp_occured="false"
    # The event is considered active for as long as the flag file exists.
    if [[ -f "$overtemp_fault_flag" ]]; then
        echo "Over temperature occured, turn on fault LED"
        overtemp_occured="true"
    fi
}
164
165
# Refresh the GPIO fault state from its flag file.
# Globals:   gpio_fault_flag (read), gpio_fault (written: "true" / "false")
# Arguments: none
# Outputs:   a notice on stdout each time the flag file is found
check_gpio_fault() {
    gpio_fault="false"
    # The fault is considered active for as long as the flag file exists.
    if [[ -f "$gpio_fault_flag" ]]; then
        echo "GPIO fault event(s) occured, turn on fault LED"
        gpio_fault="true"
    fi
}
174
# Aggregate the individual fault states into the global system fault state.
# Globals:   fan_failed, psu_failed, overtemp_occured, gpio_fault (read),
#            fault (written: "true" when any of them is "true", else "false")
# Arguments: none
check_fault() {
    fault="false"
    if [[ "$fan_failed" == "true" || "$psu_failed" == "true" \
        || "$overtemp_occured" == "true" || "$gpio_fault" == "true" ]]; then
        fault="true"
    fi
}
184
# The System Fault Led turns on upon a system error: drive it from the global
# fault state.
# Globals:   fault (read; LED is on when it is "true", off otherwise)
# Arguments: none
# Returns:   0 on success, non-zero when the GPIO line cannot be found or set
control_sys_fault_led() {
    local value=0
    local led_line

    if [[ "$fault" == "true" ]]; then
        value=1
    fi

    # Without a line, gpioset would be invoked with a bare "=<value>" argument.
    if ! led_line=$(gpiofind led-fault) || [[ -z "$led_line" ]]; then
        echo "Failed to find GPIO line led-fault" >&2
        return 1
    fi

    # Turn on/off the System Fault Led.
    # gpiofind prints "<chip> <offset>" which must be split into two words.
    # shellcheck disable=SC2086
    gpioset $led_line="$value"
}
195
# daemon start
# Poll every fault source, then refresh the LEDs, once every 2 seconds.
while true; do
    # Monitors Fan speeds (through the fan failure flag file)
    check_fan_failed
    # Monitors PSU presence and PSU status
    check_psu_failed

    check_overtemp_occured
    check_gpio_fault
    # Aggregate the individual states into the system fault state
    check_fault
    control_sys_fault_led

    control_fan_fault_led
    control_psu_fault_led

    sleep 2
done

# The loop above never terminates on its own, so getting here means that it was
# abandoned unexpectedly: report it with a non-zero exit status.
exit 1
216exit 1