blob: cfe3915bd636b3b379e20e816254e73cb1784611 [file] [log] [blame]
*** Settings ***
Documentation    Stress the system GPUs using the HTX exerciser.

# Parameters accepted by this test:
# OPENBMC_HOST      BMC host name or IP address.
# OS_HOST           OS host name or IP address.
# OS_USERNAME       Userid for the OS login (usually "root").
# OS_PASSWORD       Password for the OS login.
# HTX_DURATION      How long HTX runs, for example "2h" or "30m".
# HTX_LOOP          How many times the HTX run is looped.
# HTX_INTERVAL      Delay between consecutive HTX status checks,
#                   for example "15m".
#                   In summary: HTX runs for $HTX_DURATION, looping
#                   $HTX_LOOP times and checking for errors every
#                   $HTX_INTERVAL.  Allow extra time for OS boot,
#                   HTX startup and shutdown.
# HTX_KEEP_RUNNING  When set to 1, HTX is left running after an
#                   error was found.


Resource         ../syslib/utils_os.robot
Resource         ../lib/boot_utils.robot

Suite Setup      Run Keyword  Start SOL Console Logging
Test Setup       Test Setup Execution
Test Teardown    Test Teardown Execution
George Keishing664a0162017-06-05 12:24:24 -050027
*** Variables ***

# Default values for the test parameters listed in the Settings comments.
${HTX_DURATION}      1h
${HTX_LOOP}          ${1}
${HTX_INTERVAL}      30m
${HTX_KEEP_RUNNING}  ${0}
${stack_mode}        skip
35
36*** Test Cases ***
37
GPU Stress Test
    [Documentation]  Stress the GPU using HTX exerciser.
    [Tags]  GPU_Stress_Test

    # Fetch the GPU count as seen by the BMC and print it.
    ${num_bmc_gpus}=  Count GPUs From BMC
    Rpvars  num_bmc_gpus

    # Fail when the OS count (stored as a suite variable by Test Setup
    # Execution) differs from the BMC count.
    ${mismatch_msg}=  Catenate  OS reports ${num_os_gpus} GPUs, but BMC
    ...  reports ${num_bmc_gpus} present and functional GPUs.
    Run Keyword If
    ...  '${num_os_gpus}' != '${num_bmc_gpus}'
    ...  Fail  msg=${mismatch_msg}

    # Print the HTX run parameters.
    Rprintn
    Rpvars  HTX_DURATION  HTX_LOOP  HTX_INTERVAL

    # Start the iteration counter at zero; Execute GPU Test increments it.
    Set Suite Variable  ${iteration}  ${0}  children=true

    # Stop HTX first if it is already running.
    ${htx_running}=  Run Keyword And Return Status  Is HTX Running
    Run Keyword If  '${htx_running}' == 'True'  Shutdown HTX Exerciser

    # Run the stress iteration HTX_LOOP times.
    Repeat Keyword  ${HTX_LOOP} times  Execute GPU Test
66
67
68*** Keywords ***
69
Execute GPU Test
    [Documentation]  Run one iteration of the HTX GPU stress test and
    ...  check the results for errors.
    # Test Flow:
    # - Power on
    # - Establish SSH connection session
    # - Collect GPU nvidia status output
    # - Create HTX mdt profile
    # - Run GPU specific HTX exerciser
    # - Check for errors

    # Bump the suite-level iteration counter and announce the iteration.
    Set Suite Variable  ${iteration}  ${iteration + 1}
    ${loop_count}=  Catenate  Starting iteration: ${iteration}
    Rprintn
    Rpvars  loop_count

    REST Power On  stack_mode=skip

    # Collect data before the test starts.
    Collect NVIDIA Log File  start

    # Collect NVIDIA maximum limits.
    ${power_max}=  Get GPU Power Limit
    ${temperature_max}=  Get GPU Temperature Limit
    ${clock_max}=  Get GPU Clock Limit

    # Only the "mdt.bu" profile triggers creation of the default profile.
    Run Keyword If  '${HTX_MDT_PROFILE}' == 'mdt.bu'
    ...  Create Default MDT Profile

    Run MDT Profile

    Loop HTX Health Check

    # Post test loop look out for dmesg error logged.
    Check For Errors On OS Dmesg Log

    # Check NVIDIA power, temperature, and clocks against the limits
    # collected before the run.
    ${power}=  Get GPU Power
    ${temperature}=  Get GPU Temperature
    ${clock}=  Get GPU Clock
    Rprintn
    Rpvars  power  power_max  temperature  temperature_max  clock  clock_max
    Run Keyword If  ${power} > ${power_max}  Fail
    ...  msg=GPU Power ${power} exceeds limit of ${power_max}.
    # Note: the keyword is BuiltIn "Catenate"; a misspelled name here would
    # abort the keyword before the temperature check could run.
    ${errmsg}=  Catenate  GPU temperature of ${temperature} exceeds limit
    ...  of ${temperature_max}.
    Run Keyword If  ${temperature} > ${temperature_max}  Fail  msg=${errmsg}
    Run Keyword If  ${clock} > ${clock_max}  Fail
    ...  msg=GPU clock of ${clock} exceeds limit of ${clock_max}.

    Shutdown HTX Exerciser

    # Collect data after the test, verify no error logs, and power off.
    Collect NVIDIA Log File  end
    Error Logs Should Not Exist
    REST Power Off

    Flush REST Sessions

    Rprint Timen  HTX Test ran for: ${HTX_DURATION}

    ${loop_count}=  Catenate  Ending iteration: ${iteration}
    Rprintn
    Rpvars  loop_count
132
George Keishing664a0162017-06-05 12:24:24 -0500133
Loop HTX Health Check
    [Documentation]  Check the HTX run status every HTX_INTERVAL for a
    ...  total of HTX_DURATION.

    # Each repetition checks the status and then sleeps for one interval.
    Repeat Keyword  ${HTX_DURATION}  Run Keywords
    ...  Check HTX Run Status  AND  Sleep  ${HTX_INTERVAL}
140
141
Test Setup Execution
    [Documentation]  Do the initial test setup.

    REST Power On  stack_mode=skip
    Delete All Error Logs

    # Tools looked up before the test proceeds.
    Tool Exist  lspci
    Tool Exist  htxcmdline
    Tool Exist  nvidia-smi

    # Count the NVIDIA entries that lspci lists on the OS.
    ${num_os_gpus}  ${stderr}  ${rc}=  OS Execute Command
    ...  lspci | grep NVIDIA | wc -l
    Rprintn
    Rpvars  num_os_gpus

    # Without any GPU there is nothing to stress, so stop here.
    Run Keyword If  '${num_os_gpus}' == '${0}'
    ...  Fail  msg=No GPUs detected so cannot run test.

    # Publish the OS GPU count for use by the test case.
    Set Suite Variable  ${num_os_gpus}  children=true
162
163
164
Test Teardown Execution
    [Documentation]  Do the post test teardown.

    # After a failed test, stop HTX unless HTX_KEEP_RUNNING is non-zero.
    Run Keyword If
    ...  '${TEST_STATUS}' == 'FAIL' and ${HTX_KEEP_RUNNING} == ${0}
    ...  Shutdown HTX Exerciser

    # Assemble the SOL-stop keyword call as one string and hand it to
    # Run Key.  The escaped leading space keeps the argument separated
    # from the keyword name after Catenate joins the pieces.
    ${sol_stop_cmd}=  Catenate  Stop SOL Console Logging
    ...  \ targ_file_path=${EXECDIR}${/}logs${/}SOL.log
    Run Key  ${sol_stop_cmd}

    FFDC On Test Case Fail
    Close All Connections