Add temperature, power, frequency tests to GPU stress test

Add GPU temperature, power, and frequency utilities to
utils_os.robot, and remove the common Test Setup Execution
utility from syslib.

Resolves  openbmc/openbmc-test-automation#1030
Resolves  openbmc/openbmc-test-automation#637
Change-Id: I823b9956b61d9c35b09faa4e7a4372aa9244a805
Signed-off-by: Steven Sombar <ssombar@us.ibm.com>
diff --git a/systest/gpu_stress_test.robot b/systest/gpu_stress_test.robot
old mode 100644
new mode 100755
index 40cfc50..cfe3915
--- a/systest/gpu_stress_test.robot
+++ b/systest/gpu_stress_test.robot
@@ -1,15 +1,36 @@
 *** Settings ***
-Documentation    Stress the system using HTX exerciser.
+Documentation    Stress the system GPUs using the HTX exerciser.
+
+# Test Parameters:
+# OPENBMC_HOST        The BMC host name or IP address.
+# OS_HOST             The OS host name or IP Address.
+# OS_USERNAME         The OS login userid (usually "root").
+# OS_PASSWORD         The password for the OS login.
+# HTX_DURATION        Duration of HTX run, for example, "2h", or "30m".
+# HTX_LOOP            The number of times to loop HTX.
+# HTX_INTERVAL        The time delay between consecutive checks of HTX
+#                     status, for example, "15m".
+#                     In summary: Run HTX for $HTX_DURATION, looping
+#                     $HTX_LOOP times checking for errors every
+#                     $HTX_INTERVAL.  Then allow extra time for OS
+#                     Boot, HTX startup, shutdown.
+# HTX_KEEP_RUNNING    If set to 1, this indicates that the HTX is to
+#                     continue running after an error was found.
+
 
 Resource         ../syslib/utils_os.robot
+Resource         ../lib/boot_utils.robot
 
 Suite Setup      Run Keyword  Start SOL Console Logging
 Test Setup       Test Setup Execution
 Test Teardown    Test Teardown Execution
-Suite Teardown   Suite Teardown Execution
 
 *** Variables ****
 
+${HTX_DURATION}      1h
+${HTX_LOOP}          ${1}
+${HTX_INTERVAL}      30m
+${HTX_KEEP_RUNNING}  ${0}
 ${stack_mode}        skip
 
 *** Test Cases ***
@@ -18,8 +39,28 @@
     [Documentation]  Stress the GPU using HTX exerciser.
     [Tags]  GPU_Stress_Test
 
+    # Get number of GPUs reported by the BMC.
+    ${num_bmc_gpus}=  Count GPUs From BMC
+    Rpvars  num_bmc_gpus
+
+    # The BMC and OS should report the same number of GPUs.
+    ${failmsg01}=  Catenate  OS reports ${num_os_gpus} GPUs, but BMC
+    ...  reports ${num_bmc_gpus} present and functional GPUs.
+    Run Keyword If  '${num_os_gpus}' != '${num_bmc_gpus}'
+    ...  Fail  msg=${failmsg01}
+
+    # Show parameters for HTX stress test.
     Rprintn
-    Rpvars  HTX_DURATION  HTX_INTERVAL
+    Rpvars  HTX_DURATION  HTX_LOOP  HTX_INTERVAL
+
+    # Set the iteration (loop) counter.
+    Set Suite Variable  ${iteration}  ${0}  children=true
+
+
+    # Shutdown HTX if it is already running.
+    ${status}=  Run Keyword And Return Status  Is HTX Running
+    Run Keyword If  '${status}' == 'True'
+    ...  Shutdown HTX Exerciser
 
     Repeat Keyword  ${HTX_LOOP} times  Execute GPU Test
 
@@ -34,11 +75,23 @@
     #              - Collect GPU nvidia status output
     #              - Create HTX mdt profile
     #              - Run GPU specific HTX exerciser
-    #              - Check HTX status for errors
+    #              - Check for errors
+
+    Set Suite Variable  ${iteration}  ${iteration + 1}
+    ${loop_count}=  Catenate  Starting iteration: ${iteration}
+    Rprintn
+    Rpvars  loop_count
+
+    REST Power On  stack_mode=skip
 
     # Collect data before the test starts.
     Collect NVIDIA Log File  start
 
+    # Collect NVIDIA maximum limits.
+    ${power_max}=  Get GPU Power Limit
+    ${temperature_max}=  Get GPU Temperature Limit
+    ${clock_max}=  Get GPU Clock Limit
+
     Run Keyword If  '${HTX_MDT_PROFILE}' == 'mdt.bu'
     ...  Create Default MDT Profile
 
@@ -49,10 +102,34 @@
     # Post test loop look out for dmesg error logged.
     Check For Errors On OS Dmesg Log
 
+    # Fail the iteration if power, temperature, or clock exceeds its limit.
+    ${power}=  Get GPU Power
+    ${temperature}=  Get GPU Temperature
+    ${clock}=  Get GPU Clock
+    Rprintn
+    Rpvars  power  power_max  temperature  temperature_max  clock  clock_max
+    Run Keyword If  ${power} > ${power_max}  Fail
+    ...  msg=GPU Power ${power} exceeds limit of ${power_max}.
+    ${errmsg}=  Catenate  GPU temperature of ${temperature} exceeds limit
+    ...  of ${temperature_max}.
+    Run Keyword If  ${temperature} > ${temperature_max}  Fail  msg=${errmsg}
+    Run Keyword If  ${clock} > ${clock_max}  Fail
+    ...  msg=GPU clock of ${clock} exceeds limit of ${clock_max}.
+
     Shutdown HTX Exerciser
 
+    Collect NVIDIA Log File  end
+    Error Logs Should Not Exist
+    REST Power Off
+
+    Flush REST Sessions
+
     Rprint Timen  HTX Test ran for: ${HTX_DURATION}
 
+    ${loop_count}=  Catenate  Ending iteration: ${iteration}
+    Rprintn
+    Rpvars  loop_count
+
 
 Loop HTX Health Check
     [Documentation]  Run until HTX exerciser fails.
@@ -62,25 +139,36 @@
     ...  AND  Sleep  ${HTX_INTERVAL}
 
 
+Test Setup Execution
+    [Documentation]  Do the initial test setup.
+
+    REST Power On  stack_mode=skip
+    Delete All Error Logs
+    Tool Exist  lspci
+    Tool Exist  htxcmdline
+    Tool Exist  nvidia-smi
+
+    # Count the NVIDIA entries that lspci lists on the OS.
+    ${lspci_cmd}=  Set Variable  lspci | grep NVIDIA | wc -l
+    ${num_os_gpus}  ${err_buf}  ${cmd_rc}=  OS Execute Command  ${lspci_cmd}
+    Rprintn
+    Rpvars  num_os_gpus
+
+    # Stop here when the OS count is zero; there is nothing to stress.
+    Run Keyword If  '${num_os_gpus}' == '0'
+    ...  Fail  msg=No GPUs detected so cannot run test.
+
+    Set Suite Variable  ${num_os_gpus}  children=true
+
+
+
 Test Teardown Execution
     [Documentation]  Do the post test teardown.
-    #  Shut down HTX exerciser if test Failed.
-    #  Collect NVIDIA log.
 
     # Keep HTX running if user set HTX_KEEP_RUNNING to 1.
     Run Keyword If  '${TEST_STATUS}' == 'FAIL' and ${HTX_KEEP_RUNNING} == ${0}
     ...  Shutdown HTX Exerciser
 
-    # Collect nvidia-smi output data on exit.
-    Collect NVIDIA Log File  end
-
-
-Suite Teardown Execution
-    [Documentation]  Do the final teardown and cleanup.
-    #  Stop SOL Console Logging.
-    #  Collect FFDC if Test Case Fail.
-    #  Close Connections.
-
     ${keyword_buf}=  Catenate  Stop SOL Console Logging
     ...  \ targ_file_path=${EXECDIR}${/}logs${/}SOL.log
     Run Key  ${keyword_buf}