blob: 2b03864345cf2ae5884cdafe720d9162b61f8f1e [file] [log] [blame]
*** Settings ***
Documentation       Stress the system GPUs using the HTX exerciser.

# Test Parameters:
# OPENBMC_HOST        Host name or IP address of the BMC.
# OS_HOST             Host name or IP address of the OS.
# OS_USERNAME         Userid for the OS login (usually "root").
# OS_PASSWORD         Password for the OS login.
# HTX_DURATION        How long HTX runs, for example, "2h", or "30m".
# HTX_LOOP            How many times the HTX run is repeated.
# HTX_INTERVAL        Delay between consecutive HTX status checks,
#                     for example, "15m".
#                     In summary: HTX runs for $HTX_DURATION, repeated
#                     $HTX_LOOP times, with an error check every
#                     $HTX_INTERVAL.  Allow extra time on top of that
#                     for OS boot, HTX startup and shutdown.
# HTX_KEEP_RUNNING    When set to 1, HTX is left running after an
#                     error was found.

Resource            ../lib/os_utilities.robot

Suite Setup         Run Keyword    Start SOL Console Logging
Test Setup          Test Setup Execution
Test Teardown       Test Teardown Execution

Test Tags           GPU_Stress

*** Variables ***

# Default values for the test parameters described in the Settings
# comment header.

# Length of one HTX run (time string).
${HTX_DURATION}         1h
# Number of times Execute GPU Test is repeated.
${HTX_LOOP}             ${1}
# Sleep between consecutive HTX status checks (time string).
${HTX_INTERVAL}         30m
# 0 = shut HTX down when the test fails, 1 = leave it running.
${HTX_KEEP_RUNNING}     ${0}
# NOTE(review): not referenced by name in this file; presumably read by
# the boot keywords (REST Power On is called with stack_mode=skip here)
# - confirm before removing.
${stack_mode}           skip

*** Test Cases ***

GPU Stress Test
    [Documentation]    Stress the GPU using HTX exerciser.
    [Tags]    GPU_Stress_Test

    # Ask the BMC how many GPUs it reports.
    ${num_bmc_gpus}=    Count GPUs From BMC
    Rpvars    num_bmc_gpus

    # The OS-side count (suite variable set by Test Setup Execution) must
    # agree with the BMC-side count.
    ${mismatch_msg}=    Catenate    OS reports ${num_os_gpus} GPUs, but BMC reports
    ...    ${num_bmc_gpus} present and functional GPUs.
    IF    '${num_os_gpus}' != '${num_bmc_gpus}'
        Fail    msg=${mismatch_msg}
    END

    # Display the parameters that drive the HTX stress run.
    Printn
    Rpvars    HTX_DURATION    HTX_LOOP    HTX_INTERVAL

    # Start the iteration counter at zero; Execute GPU Test increments it.
    Set Suite Variable    ${iteration}    ${0}    children=true

    # Stop HTX first if it is already running.
    ${htx_active}=    Is HTX Running
    IF    '${htx_active}' == 'True'
        Shutdown HTX Exerciser
    END

    Repeat Keyword    ${HTX_LOOP} times    Execute GPU Test

*** Keywords ***

Execute GPU Test
    [Documentation]    Run one iteration of the GPU HTX stress test and
    ...    verify GPU power, temperature and clock against their limits.
    # Test Flow:
    # - Power on
    # - Establish SSH connection session
    # - Collect GPU nvidia status output
    # - Create HTX mdt profile
    # - Run GPU specific HTX exerciser
    # - Check for errors
    #
    # Reads:    ${iteration} (suite variable), ${HTX_MDT_PROFILE},
    #           ${HTX_DURATION}.
    # Updates:  ${iteration} (incremented by one per call).

    Set Suite Variable    ${iteration}    ${iteration + 1}
    ${loop_count}=    Catenate    Starting iteration: ${iteration}
    Printn
    Rpvars    loop_count

    REST Power On    stack_mode=skip
    Run Key U    Sleep \ 15s

    # Collect data before the test starts.
    Collect NVIDIA Log File    start

    # Collect NVIDIA maximum limits.
    ${power_max}=    Get GPU Power Limit
    ${temperature_max}=    Get GPU Temperature Limit
    ${clock_max}=    Get GPU Clock Limit

    # Only the 'mdt.bu' profile is created here; any other profile name
    # is passed through to Run MDT Profile as-is.
    IF    '${HTX_MDT_PROFILE}' == 'mdt.bu'
        Create Default MDT Profile
    END

    Run MDT Profile

    Loop HTX Health Check

    # Post test loop look out for dmesg error logged.
    Check For Errors On OS Dmesg Log

    # Check NVIDIA power, temperature, and clocks.
    ${power}=    Get GPU Max Power
    ${temperature}=    Get GPU Max Temperature
    ${temperature_via_rest}=    Get GPU Temperature Via REST
    ${clock}=    Get GPU Clock
    Printn
    Rpvars    power    power_max    temperature    temperature_via_rest
    ...    temperature_max    clock    clock_max

    IF    ${power} > ${power_max}
        Fail    msg=GPU Power ${power} exceeds limit of ${power_max}.
    END

    ${err_msg}=    Catenate    GPU temperature of ${temperature} exceeds limit
    ...    of ${temperature_max}.
    IF    ${temperature} > ${temperature_max}
        Fail    msg=${err_msg}
    END

    IF    ${clock} > ${clock_max}
        Fail    msg=GPU clock of ${clock} exceeds limit of ${clock_max}.
    END

    # The nvidia-smi temperature must be within +/-5 of the value
    # obtained via REST.
    ${err_msg}=    Catenate    The GPU temperature reported by REST is not within
    ...    5 degrees of the nvidia_smi reported temperature.
    ${upper_limit}=    Evaluate    ${temperature_via_rest}+5
    ${lower_limit}=    Evaluate    ${temperature_via_rest}-5

    IF    ${temperature} > ${upper_limit} or ${temperature} < ${lower_limit}
        Fail    msg=${err_msg}
    END

    Shutdown HTX Exerciser

    Collect NVIDIA Log File    end
    Error Logs Should Not Exist
    REST Power Off

    Flush REST Sessions

    Print Timen    HTX Test ran for: ${HTX_DURATION}

    ${loop_count}=    Catenate    Ending iteration: ${iteration}
    Printn
    Rpvars    loop_count

Loop HTX Health Check
    [Documentation]    Repeatedly check the HTX run status, sleeping
    ...    HTX_INTERVAL after each check, for HTX_DURATION.

    # HTX_DURATION is handed to Repeat Keyword as its repeat argument; a
    # failing status check fails this keyword.
    Repeat Keyword    ${HTX_DURATION}    Run Keywords
    ...    Check HTX Run Status
    ...    AND    Sleep    ${HTX_INTERVAL}

Test Setup Execution
    [Documentation]    Do the initial test setup.

    REST Power On    stack_mode=skip
    Run Key U    Sleep \ 15s
    Delete All Error Logs

    # Tools this test invokes on the OS.
    Tool Exist    lspci
    Tool Exist    htxcmdline
    Tool Exist    nvidia-smi

    # Count the NVIDIA devices that lspci lists on the OS.
    ${count_cmd}=    Set Variable    lspci | grep NVIDIA | wc -l
    ${num_os_gpus}    ${err_text}    ${exit_code}=    OS Execute Command    ${count_cmd}
    Printn
    Rpvars    num_os_gpus

    # Without any GPU there is nothing to stress.
    IF    '${num_os_gpus}' == '${0}'
        Fail    msg=No GPUs detected so cannot run test.
    END

    # Publish the count for the test case to compare with the BMC count.
    Set Suite Variable    ${num_os_gpus}    children=true

Test Teardown Execution
    [Documentation]    Do the post test teardown.

    # After a failure, HTX is shut down only when HTX_KEEP_RUNNING is 0;
    # any other value leaves it running.
    IF    '${TEST_STATUS}' == 'FAIL' and ${HTX_KEEP_RUNNING} == ${0}
        Shutdown HTX Exerciser
    END

    # Build the keyword string for Run Key.  The escaped leading space on
    # the continuation line is kept in the joined string.
    ${stop_sol_cmd}=    Catenate    Stop SOL Console Logging
    ...    \ targ_file_path=${EXECDIR}${/}logs${/}SOL.log
    Run Key    ${stop_sol_cmd}

    FFDC On Test Case Fail
    Close All Connections