blob: 8f9637c82e5899d3e8ba200354528690a8ec58d6 [file] [log] [blame]
Thang Q. Nguyen98de8b92023-05-05 15:33:27 +07001#!/bin/bash
2
# This script monitors fan, over-temperature, PSU and CPU/SCP failures and updates the fault LED status accordingly.
4
5# shellcheck disable=SC2004
6# shellcheck disable=SC2046
Thang Q. Nguyen98de8b92023-05-05 15:33:27 +07007
# common variables
# Flag files: the check_* functions below only test whether these exist.
readonly warning_fault_flag='/tmp/fault_warning'
readonly error_fault_flag='/tmp/fault_err'
readonly overtemp_fault_flag='/tmp/fault_overtemp'
readonly fault_RAS_UE_flag='/tmp/fault_RAS_UE'

# Delay handed to usleep between the two LED toggles of one blink.
readonly blink_rate=100000

# Aggregated fault state, recomputed by check_fault on every poll.
fault="false"

# Boolean strings written to the LED group's Asserted property.
readonly on="true"
readonly off="false"

# Per-source fault states, refreshed by the matching check_* function.
gpio_fault="false"
RAS_UE_occured="false"
overtemp_occured="false"

# fan variables
fan_failed="false"
readonly fan_failed_flag='/tmp/fan_failed'

# PSU variables
psu_failed="false"
readonly psu_bus=6
readonly psu0_addr=0x58
readonly psu1_addr=0x59
readonly status_word_cmd=0x79
# Following the PMBus Specification
# Bit[1]:  CML faults
# Bit[2]:  Over temperature faults
# Bit[3]:  Under voltage faults
# Bit[4]:  Over current faults
# Bit[5]:  Over voltage fault
# Bit[10]: Fan faults
readonly psu_fault_bitmask=0x43e

# led variables
readonly led_service='xyz.openbmc_project.LED.GroupManager'
readonly led_fault_path='/xyz/openbmc_project/led/groups/system_fault'
readonly led_fault_interface='xyz.openbmc_project.Led.Group'
# Cached LED state as last written by this script.
# NOTE(review): this starts as "off" without reading the real LED state, so
# after a restart the cache may disagree with the hardware - confirm whether
# an initial sync is wanted.
fault_led_status=$off
47
# function declarations
#######################################
# Refresh the fan failure state from the fan failure flag file.
# Globals:   fan_failed_flag (read), fan_failed (written)
# Arguments: none
#######################################
check_fan_failed() {
    # Start from "healthy" and flip only when the flag file is present.
    fan_failed="false"
    if [[ -f $fan_failed_flag ]]; then
        fan_failed="true"
    fi
}
56
#######################################
# Write the Asserted property of the system fault LED group via busctl.
# Globals:   led_service, led_fault_path, led_fault_interface (read)
# Arguments: $1 - boolean string to write ("true" or "false")
# Outputs:   busctl's stdout is discarded
# Returns:   exit status of busctl
#######################################
turn_on_off_fault_led() {
    local asserted="$1"

    busctl set-property "$led_service" "$led_fault_path" \
        "$led_fault_interface" Asserted b "$asserted" > /dev/null
}
60
#######################################
# Decide whether a single PSU has to be treated as failed.
# A PSU is reported as failed when its presence line does not read "0",
# when its STATUS_WORD cannot be read, or when any bit selected by
# psu_fault_bitmask is set in STATUS_WORD.
# Globals:   psu_bus, status_word_cmd, psu_fault_bitmask (read)
# Arguments: $1 - GPIO line name of the PSU presence signal
#            $2 - I2C address of the PSU
# Returns:   0 when the PSU is failed, 1 when it is healthy
#######################################
_psu_has_failed() {
    local presence_line=$1
    local psu_addr=$2
    local presence
    local status_word
    local -r hex_re='^0[xX][0-9a-fA-F]+$'

    # The gpiofind output is deliberately left unquoted so it is word-split
    # into separate arguments for gpioget.
    # shellcheck disable=SC2046
    presence=$(gpioget $(gpiofind "$presence_line"))
    if [[ "$presence" != "0" ]]; then
        return 0
    fi

    # PSU present: read STATUS_WORD over PMBus.
    status_word=$(i2cget -f -y "$psu_bus" "$psu_addr" "$status_word_cmd" w)

    # An empty or malformed reply (e.g. i2cget failed) must not reach the
    # arithmetic below: a syntax error there would abort the caller's
    # polling loop. An unreadable PSU is counted as failed instead.
    if [[ ! "$status_word" =~ $hex_re ]]; then
        return 0
    fi

    (( (status_word & psu_fault_bitmask) != 0 ))
}

#######################################
# Refresh the PSU failure state for both power supplies.
# Globals:   psu0_addr, psu1_addr (read), psu_failed (written)
# Arguments: none
#######################################
check_psu_failed() {
    local psu0_failed="false"
    local psu1_failed="false"

    # Both supplies are always probed, even if the first one already failed.
    if _psu_has_failed PSU1_PRESENT "$psu0_addr"; then
        psu0_failed="true"
    fi
    if _psu_has_failed PSU2_PRESENT "$psu1_addr"; then
        psu1_failed="true"
    fi

    if [[ "$psu0_failed" == "true" || "$psu1_failed" == "true" ]]; then
        psu_failed="true"
    else
        psu_failed="false"
    fi
}
95
#######################################
# Combine all individual fault states into the overall fault state.
# Globals:   fan_failed, psu_failed, gpio_fault, RAS_UE_occured,
#            overtemp_occured (read), fault (written)
# Arguments: none
#######################################
check_fault() {
    local source_state

    fault="false"
    # Any single source reporting "true" is enough to raise the fault.
    for source_state in "$fan_failed" "$psu_failed" "$gpio_fault" \
        "$RAS_UE_occured" "$overtemp_occured"; do
        if [[ "$source_state" == "true" ]]; then
            fault="true"
            break
        fi
    done
}
106
#######################################
# Drive the fault LED so that it follows the overall fault state.
# The LED is only written when the wanted state differs from the cached one.
# Globals:   fault, on, off (read), fault_led_status (read/written)
# Arguments: none
#######################################
control_fault_led() {
    local wanted=$off

    if [[ "$fault" == "true" ]]; then
        wanted=$on
    fi

    # Nothing to do when the cached LED state already matches.
    if [[ "$fault_led_status" == "$wanted" ]]; then
        return 0
    fi

    # Update the cache only after the LED was really set. If the write
    # fails, the cache keeps the old value so the next poll retries
    # instead of silently leaving the LED in the wrong state.
    if turn_on_off_fault_led "$wanted"; then
        fault_led_status=$wanted
    fi
}
120
#######################################
# Blink the fault LED once and leave it in its cached state.
# The LED is first driven to the opposite of fault_led_status, then back
# after a pause of blink_rate.
# Globals:   fault_led_status, on, off, blink_rate (read)
# Arguments: none
# Returns:   status of the final LED write
#######################################
blink_fault_led() {
    local first_state=$on
    local final_state=$off

    # LED currently on (anything but "off"): blink by going dark first.
    if [[ "$fault_led_status" != "$off" ]]; then
        first_state=$off
        final_state=$on
    fi

    turn_on_off_fault_led "$first_state"
    usleep "$blink_rate"
    turn_on_off_fault_led "$final_state"
}
132
#######################################
# Refresh the GPIO fault state from the error and warning flag files.
# An error flag raises gpio_fault. Otherwise a pending warning flag causes
# one LED blink and is then removed.
# Globals:   error_fault_flag, warning_fault_flag (read),
#            gpio_fault (written)
# Arguments: none
#######################################
check_gpio_fault() {
    # The error flag takes precedence; the warning flag is left untouched.
    if [[ -f $error_fault_flag ]]; then
        gpio_fault="true"
        return 0
    fi

    if [[ -f $warning_fault_flag ]]; then
        blink_fault_led
        # Consume the warning so it blinks only once per occurrence.
        rm "$warning_fault_flag"
    fi
    gpio_fault="false"
}
144
#######################################
# Refresh the RAS UE state from the RAS UE flag file.
# Globals:   fault_RAS_UE_flag (read), RAS_UE_occured (read/written)
# Arguments: none
# Outputs:   one message on stdout when the state changes to "true"
#######################################
check_RAS_UE_occured() {
    if [[ -f $fault_RAS_UE_flag ]]; then
        # Report only the transition into the error state; printing on
        # every poll would repeat the same line for as long as the flag
        # file exists.
        if [[ "$RAS_UE_occured" != "true" ]]; then
            echo "RAS UE error occured, turn on fault LED"
        fi
        RAS_UE_occured="true"
    else
        RAS_UE_occured="false"
    fi
}
153
#######################################
# Refresh the over-temperature state from the over-temperature flag file.
# Globals:   overtemp_fault_flag (read), overtemp_occured (read/written)
# Arguments: none
# Outputs:   one message on stdout when the state changes to "true"
#######################################
check_overtemp_occured() {
    if [[ -f $overtemp_fault_flag ]]; then
        # Report only the transition into the over-temperature state;
        # printing on every poll would repeat the same line for as long
        # as the flag file exists.
        if [[ "$overtemp_occured" != "true" ]]; then
            echo "Over temperature occured, turn on fault LED"
        fi
        overtemp_occured="true"
    else
        overtemp_occured="false"
    fi
}
162
# daemon start

# One polling pass: refresh every fault source, combine them and update
# the fault LED.
monitor_once() {
    check_gpio_fault
    check_fan_failed
    check_overtemp_occured
    check_RAS_UE_occured

    # Monitors PSU presence
    check_psu_failed

    check_fault
    control_fault_led
}

# Poll forever, pausing 2 seconds between passes.
while :; do
    monitor_once
    sleep 2
done

# Only reached if the loop above stops unexpectedly; report failure.
exit 1