blob: ffa070ecbff6953b084afdd0dda2a12ae6c1f1da [file] [log] [blame]
*** Settings ***
Documentation    Stress the system GPUs using the HTX exerciser.

# Test Parameters:
# OPENBMC_HOST        The BMC host name or IP address.
# OS_HOST             The OS host name or IP Address.
# OS_USERNAME         The OS login userid (usually "root").
# OS_PASSWORD         The password for the OS login.
# HTX_DURATION        Duration of HTX run, for example, "2h", or "30m".
# HTX_LOOP            The number of times to loop HTX.
# HTX_INTERVAL        The time delay between consecutive checks of HTX
#                     status, for example, "15m".
#                     In summary: Run HTX for $HTX_DURATION, looping
#                     $HTX_LOOP times checking for errors every
#                     $HTX_INTERVAL.  Then allow extra time for OS
#                     Boot, HTX startup, shutdown.
# HTX_KEEP_RUNNING    If set to 1, this indicates that the HTX is to
#                     continue running after an error was found.

Resource         ../syslib/utils_os.robot

# SOL console logging is started once per suite and stopped in the test
# teardown (see "Test Teardown Execution").
Suite Setup      Run Keyword    Start SOL Console Logging
Test Setup       Test Setup Execution
Test Teardown    Test Teardown Execution

*** Variables ***

# Defaults for the test parameters described in the Settings comments.
${HTX_DURATION}        1h
${HTX_LOOP}            ${1}
${HTX_INTERVAL}        30m
${HTX_KEEP_RUNNING}    ${0}
# NOTE(review): stack_mode is not referenced by name in this file; presumably
# it is consumed by keywords in the imported resource - confirm.
${stack_mode}          skip

*** Test Cases ***

GPU Stress Test
    [Documentation]    Stress the GPU using HTX exerciser.
    [Tags]    GPU_Stress_Test

    # Get number of GPU reported by the BMC.
    ${num_bmc_gpus}=    Count GPUs From BMC
    Rpvars    num_bmc_gpus

    # The BMC and OS should report the same number of GPUs.
    # ${num_os_gpus} is published as a suite variable by Test Setup Execution.
    ${failmsg01}=    Catenate    OS reports ${num_os_gpus} GPUs, but BMC
    ...    reports ${num_bmc_gpus} present and functional GPUs.
    Run Keyword If    '${num_os_gpus}' != '${num_bmc_gpus}'
    ...    Fail    msg=${failmsg01}

    # Show parameters for HTX stress test.
    Rprintn
    Rpvars    HTX_DURATION    HTX_LOOP    HTX_INTERVAL

    # Set the iteration (loop) counter.  Execute GPU Test increments it.
    Set Suite Variable    ${iteration}    ${0}    children=true

    # Shutdown HTX if it is already running.
    ${status}=    Is HTX Running
    Run Keyword If    '${status}' == 'True'
    ...    Shutdown HTX Exerciser

    Repeat Keyword    ${HTX_LOOP} times    Execute GPU Test

*** Keywords ***

Execute GPU Test
    [Documentation]    Run one iteration of the HTX GPU stress test and
    ...    verify the GPU power, temperature and clock readings.
    # Test Flow:
    #              - Power on
    #              - Establish SSH connection session
    #              - Collect GPU nvidia status output
    #              - Create HTX mdt profile
    #              - Run GPU specific HTX exerciser
    #              - Check for errors

    # Advance the suite-level iteration counter set up by the test case.
    Set Suite Variable    ${iteration}    ${iteration + 1}
    ${loop_count}=    Catenate    Starting iteration: ${iteration}
    Rprintn
    Rpvars    loop_count

    REST Power On    stack_mode=skip

    # Collect data before the test starts.
    Collect NVIDIA Log File    start

    # Collect NVIDIA maximum limits.
    ${power_max}=    Get GPU Power Limit
    ${temperature_max}=    Get GPU Temperature Limit
    ${clock_max}=    Get GPU Clock Limit

    # NOTE(review): HTX_MDT_PROFILE is not defined in this file; presumably
    # it comes from the imported resource or the command line - confirm.
    Run Keyword If    '${HTX_MDT_PROFILE}' == 'mdt.bu'
    ...    Create Default MDT Profile

    Run MDT Profile

    Loop HTX Health Check

    # Post test loop look out for dmesg error logged.
    Check For Errors On OS Dmesg Log

    # Check NVIDIA power, temperature, and clocks.
    ${power}=    Get GPU Power
    ${temperature}=    Get GPU Temperature
    ${temperature_via_rest}=    Get GPU Temperature Via REST
    ${clock}=    Get GPU Clock
    Rprintn
    Rpvars    power    power_max    temperature    temperature_via_rest
    ...    temperature_max    clock    clock_max

    Run Keyword If    ${power} > ${power_max}    Fail
    ...    msg=GPU Power ${power} exceeds limit of ${power_max}.

    ${err_msg}=    Catenate    GPU temperature of ${temperature} exceeds limit
    ...    of ${temperature_max}.
    Run Keyword If    ${temperature} > ${temperature_max}    Fail    msg=${err_msg}

    Run Keyword If    ${clock} > ${clock_max}    Fail
    ...    msg=GPU clock of ${clock} exceeds limit of ${clock_max}.

    # The REST-reported temperature must agree with the nvidia-smi reading
    # to within +/- 5 degrees.
    ${err_msg}=    Catenate    The GPU temperature reported by REST is not within
    ...    5 degrees of the nvidia_smi reported temperature.
    ${upper_limit}=    Evaluate    ${temperature_via_rest}+5
    ${lower_limit}=    Evaluate    ${temperature_via_rest}-5
    Run Keyword If
    ...    ${temperature} > ${upper_limit} or ${temperature} < ${lower_limit}
    ...    Fail    msg=${err_msg}

    Shutdown HTX Exerciser

    Collect NVIDIA Log File    end
    Error Logs Should Not Exist
    REST Power Off

    Flush REST Sessions

    Rprint Timen    HTX Test ran for: ${HTX_DURATION}

    ${loop_count}=    Catenate    Ending iteration: ${iteration}
    Rprintn
    Rpvars    loop_count

Loop HTX Health Check
    [Documentation]    Check the HTX run status every HTX_INTERVAL for a
    ...    total of HTX_DURATION.

    # Repeat Keyword with a time string keeps re-running the keyword pair
    # until the given duration has elapsed.
    Repeat Keyword    ${HTX_DURATION}
    ...    Run Keywords    Check HTX Run Status
    ...    AND    Sleep    ${HTX_INTERVAL}

Test Setup Execution
    [Documentation]    Do the initial test setup.

    REST Power On    stack_mode=skip
    Delete All Error Logs
    Tool Exist    lspci
    Tool Exist    htxcmdline
    Tool Exist    nvidia-smi

    # Get number of GPUs reported by the OS.
    ${cmd}=    Catenate    lspci | grep NVIDIA | wc -l
    ${num_os_gpus}    ${stderr}    ${rc}=    OS Execute Command    ${cmd}
    Rprintn
    Rpvars    num_os_gpus

    # If no GPUs detected, we cannot continue.
    Run Keyword If    '${num_os_gpus}' == '${0}'    Fail
    ...    msg=No GPUs detected so cannot run test.

    # Publish the count so the test case can compare it with the BMC count.
    Set Suite Variable    ${num_os_gpus}    children=true

Test Teardown Execution
    [Documentation]    Do the post test teardown.

    # On failure, shut HTX down unless the user set HTX_KEEP_RUNNING to 1.
    Run Keyword If    '${TEST_STATUS}' == 'FAIL' and ${HTX_KEEP_RUNNING} == ${0}
    ...    Shutdown HTX Exerciser

    # The "\ " keeps a leading space on the second cell, so the catenated
    # string has two spaces between the keyword name and its argument.
    ${keyword_buf}=    Catenate    Stop SOL Console Logging
    ...    \ targ_file_path=${EXECDIR}${/}logs${/}SOL.log
    Run Key    ${keyword_buf}

    FFDC On Test Case Fail
    Close All Connections