#!/bin/bash
# Source blob: 44cbb11b137ac2f35bc344ff023c38f004d47feb
# Last change: 98de8b9, Thang Q. Nguyen, 2023-05-05 15:33:27 +0700

# This script monitors fan, over-temperature, PSU and CPU/SCP failures and
# updates the fault LED status.

# shellcheck disable=SC2004
# shellcheck disable=SC2046
# shellcheck source=/dev/null

# common variables
# Flag files: the checks in this script only test whether these files exist.
warning_fault_flag='/tmp/fault_warning'
error_fault_flag='/tmp/fault_err'
overtemp_fault_flag='/tmp/fault_overtemp'
fault_RAS_UE_flag='/tmp/fault_RAS_UE'

# Delay handed to usleep between the two LED toggles of a blink
blink_rate=100000

# Aggregated fault state, recomputed by check_fault on every cycle
fault="false"

# Boolean values written to the LED group's Asserted property
on="true"
off="false"

# Set by check_gpio_fault while the error flag file exists
gpio_fault="false"

# fan variables
# fan_failed is refreshed by check_fan_failed from the presence of the flag file.
fan_failed="false"
fan_failed_flag='/tmp/fan_failed'

# PSU variables
psu_failed="false"
# I2C bus and addresses used by check_psu_failed to read each PSU
psu_bus=6
psu0_addr=0x58
psu1_addr=0x59
# Command code used to read the PMBus STATUS_WORD
status_word_cmd=0x79
# STATUS_WORD bits treated as a PSU fault, following the PMBus Specification:
# Bit[1]:  CML faults
# Bit[2]:  Over temperature faults
# Bit[3]:  Under voltage faults
# Bit[4]:  Over current faults
# Bit[5]:  Over voltage fault
# Bit[10]: Fan faults
psu_fault_bitmask=0x43e

# led variables
# D-Bus service, object path and interface of the system fault LED group
led_service='xyz.openbmc_project.LED.GroupManager'
led_fault_path='/xyz/openbmc_project/led/groups/system_fault'
led_fault_interface='xyz.openbmc_project.Led.Group'
# Last LED state this script requested; starts as "off"
fault_led_status=$off

# Function declarations
# Sets the global fan_failed to "true" while the fan failure flag file
# ($fan_failed_flag) exists, and to "false" otherwise.
check_fan_failed() {
    if [[ -f "$fan_failed_flag" ]]; then
        fan_failed="true"
    else
        fan_failed="false"
    fi
}

# Writes the Asserted property of the fault LED group over D-Bus.
#   $1 - boolean value to write ("true" or "false")
# Standard output of busctl is discarded; its exit status is returned.
turn_on_off_fault_led() {
    busctl set-property "$led_service" "$led_fault_path" \
        "$led_fault_interface" Asserted b "$1" > /dev/null
}

# Succeeds (returns 0) only when the PSU is present and its STATUS_WORD has
# none of the bits of psu_fault_bitmask set; returns non-zero otherwise.
#   $1 - name of the presence GPIO line (reads "0" when the PSU is present)
#   $2 - address of the PSU on I2C bus $psu_bus
_psu_is_healthy() {
    local presence
    local status_word

    # gpiofind prints "<chip> <offset>", so its output is deliberately left
    # unquoted to be split into the two arguments gpioget expects.
    presence=$(gpioget $(gpiofind "$1"))
    [[ "$presence" == "0" ]] || return 1

    # PSU present: read the STATUS_WORD over PMBus. A failed or malformed
    # read cannot prove the PSU is fine, so it is reported as unhealthy
    # instead of feeding an empty string to the arithmetic test below.
    status_word=$(i2cget -f -y "$psu_bus" "$2" "$status_word_cmd" w) || return 1
    [[ "$status_word" =~ ^0[xX][0-9a-fA-F]+$ ]] || return 1

    (( (status_word & psu_fault_bitmask) == 0 ))
}

# Updates the global psu_failed: "true" when either PSU is absent, cannot be
# read or reports a monitored fault bit; "false" when both are present and
# report no monitored fault.
check_psu_failed() {
    if _psu_is_healthy PSU1_PRESENT "$psu0_addr" \
        && _psu_is_healthy PSU2_PRESENT "$psu1_addr"; then
        psu_failed="false"
    else
        psu_failed="true"
    fi
}

# Aggregates the individual monitor results into the global fault: "true" if
# any of fan_failed, psu_failed, gpio_fault, RAS_UE_occured or
# overtemp_occured is "true", "false" otherwise.
check_fault() {
    if [[ "$fan_failed" == "true" || "$psu_failed" == "true" \
        || "$gpio_fault" == "true" || "$RAS_UE_occured" == "true" \
        || "$overtemp_occured" == "true" ]]; then
        fault="true"
    else
        fault="false"
    fi
}

# Drives the fault LED from the global fault, touching D-Bus only when the
# requested state differs from the cached fault_led_status.
# The cache is updated only when turn_on_off_fault_led succeeds, so a failed
# request is retried on the next call instead of leaving the LED out of sync.
control_fault_led() {
    if [[ "$fault" == "true" ]]; then
        if [[ "$fault_led_status" == "$off" ]] && turn_on_off_fault_led "$on"; then
            fault_led_status=$on
        fi
    elif [[ "$fault_led_status" == "$on" ]] && turn_on_off_fault_led "$off"; then
        fault_led_status=$off
    fi
}

# Blinks the fault LED once: switches it to the opposite of the cached
# fault_led_status, waits blink_rate (via usleep), then switches it back.
# fault_led_status itself is left unchanged.
blink_fault_led() {
    local steady=$on
    local flipped=$off

    if [[ "$fault_led_status" == "$off" ]]; then
        steady=$off
        flipped=$on
    fi

    turn_on_off_fault_led "$flipped"
    usleep "$blink_rate"
    turn_on_off_fault_led "$steady"
}

# Evaluates the GPIO fault flag files:
#  - error flag present: gpio_fault becomes "true" and the warning flag is
#    left untouched;
#  - otherwise gpio_fault becomes "false"; a pending warning flag triggers
#    one LED blink and is then removed.
check_gpio_fault() {
    if [[ -f "$error_fault_flag" ]]; then
        gpio_fault="true"
        return
    fi

    if [[ -f "$warning_fault_flag" ]]; then
        blink_fault_led
        # -f: the flag may already be gone by the time it is removed
        rm -f -- "$warning_fault_flag"
    fi
    gpio_fault="false"
}

# Sets the global RAS_UE_occured to "true" while the RAS UE flag file
# ($fault_RAS_UE_flag) exists and to "false" otherwise.
# The message is printed only when the condition first appears, not on every
# polling cycle, so a persistent flag does not flood the log.
check_RAS_UE_occured() {
    if [[ -f "$fault_RAS_UE_flag" ]]; then
        if [[ "$RAS_UE_occured" != "true" ]]; then
            echo "RAS UE error occured, turn on fault LED"
        fi
        RAS_UE_occured="true"
    else
        RAS_UE_occured="false"
    fi
}

# Sets the global overtemp_occured to "true" while the over-temperature flag
# file ($overtemp_fault_flag) exists and to "false" otherwise.
# The message is printed only when the condition first appears, not on every
# polling cycle, so a persistent flag does not flood the log.
check_overtemp_occured() {
    if [[ -f "$overtemp_fault_flag" ]]; then
        if [[ "$overtemp_occured" != "true" ]]; then
            echo "Over temperature occured, turn on fault LED"
        fi
        overtemp_occured="true"
    else
        overtemp_occured="false"
    fi
}

# daemon start
# Poll every monitor, aggregate the results and update the fault LED,
# once every 2 seconds.
while true; do
    check_gpio_fault
    check_fan_failed
    check_overtemp_occured
    check_RAS_UE_occured

    # Monitors PSU presence
    check_psu_failed

    check_fault
    control_fault_led
    sleep 2
done

# Not reached while the loop above keeps running; leaving it is an error.
exit 1