Add temperature, power, frequency tests to GPU stress test

Add GPU temperature, power, and frequency utilities to
syslib/utils_os.robot, remove the common Test Setup Execution
keyword from syslib, and give each affected test suite its own
Test Setup Execution keyword.

Resolves  openbmc/openbmc-test-automation#1030
Resolves  openbmc/openbmc-test-automation#637
Change-Id: I823b9956b61d9c35b09faa4e7a4372aa9244a805
Signed-off-by: Steven Sombar <ssombar@us.ibm.com>
diff --git a/syslib/utils_os.robot b/syslib/utils_os.robot
index dbd1d22..9bba861 100755
--- a/syslib/utils_os.robot
+++ b/syslib/utils_os.robot
@@ -170,7 +170,7 @@
 
     ${dmesg_log}=  Execute Command On OS  dmesg | egrep '${ERROR_REGEX}'
     # To enable multiple string check.
-    Should Not Contain Any  ${dmesg_log}  ${ERROR_DBE_MSG}
+    Should Not Contain  ${dmesg_log}  ${ERROR_DBE_MSG}
 
 
 Collect NVIDIA Log File
@@ -180,8 +180,6 @@
     # suffix     String name to append.
 
     # Collects the output of the nvidia-smi command.
-    # TODO: GPU current temperature threshold check.
-    #       openbmc/openbmc-test-automation#637
     # +-----------------------------------------------------------------------------+
     # | NVIDIA-SMI 361.89                 Driver Version: 361.89                    |
     # |-------------------------------+----------------------+----------------------+
@@ -214,26 +212,131 @@
     Create Directory  ${htx_log_dir_path}
     ${cur_datetime}=  Get Current Date  result_format=%Y%m%d%H%M%S%f
 
-    ${nvidia_out}=  Execute Command On BMC  nvidia-smi
+    ${nvidia_out}  ${stderr}  ${rc}=  OS Execute Command  nvidia-smi
     Write Log Data To File
     ...  ${nvidia_out}
     ...  ${htx_log_dir_path}/${OS_HOST}_${cur_datetime}.nvidia_${suffix}
 
 
-Test Setup Execution
-    [Documentation]  Do the initial test setup.
-    # 1. Check if HTX tool exist.
-    # 2. Power on
+Get GPU Power Limit
+    [Documentation]  Get NVIDIA GPU maximum permitted power draw.
 
-    Boot To OS
-    Delete All Error Logs
-    Tool Exist  htxcmdline
-    Tool Exist  lshw
+    # nvidia-smi --query-gpu=power.limit --format=csv returns
+    # power.limit [W]
+    # 300.00 W
+    # 300.00 W
+    # 300.00 W
+    # 300.00 W
 
-    # Shutdown if HTX is running.
-    ${status}=  Run Keyword And Return Status  Is HTX Running
-    Run Keyword If  '${status}' == 'True'
-    ...  Shutdown HTX Exerciser
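+    # Cut the watt value from each line, then sort and return the highest
+    # power limit reported across the GPUs.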
+    ${cmd}=  Catenate  nvidia-smi --query-gpu=power.limit
+    ...  --format=csv | cut -f 1 -d ' ' | sort -n -u | tail -n 1
+    ${nvidia_out}  ${stderr}  ${rc}=  OS Execute Command  ${cmd}
+    # Allow for sensor overshoot.  That is, max power reported for
+    # a GPU could be a few watts above the limit.
+    ${power_max}=  Evaluate  ${nvidia_out}+${7.00}
+    [Return]  ${power_max}
+
+
+Get GPU Power
+    [Documentation]  Get the highest power draw reported among the GPUs.
+
+    # nvidia-smi --query-gpu=power.draw --format=csv returns
+    # power.draw [W]
+    # 34.12 W
+    # 34.40 W
+    # 36.55 W
+    # 36.05 W
+
+    ${cmd}=  Catenate  nvidia-smi --query-gpu=power.draw
+    ...  --format=csv | cut -f 1 -d ' ' | sort -n -u | tail -n 1
+    ${nvidia_out}  ${stderr}  ${rc}=  OS Execute Command  ${cmd}
+    [Return]  ${nvidia_out}
+
+
+Get GPU Temperature Limit
+    [Documentation]  Get NVIDIA GPU maximum permitted temperature.
+
+    # nvidia-smi -q -d TEMPERATURE  | grep "GPU Max" returns
+    #    GPU Max Operating Temp      : 83 C
+    #    GPU Max Operating Temp      : 83 C
+    #    GPU Max Operating Temp      : 83 C
+    #    GPU Max Operating Temp      : 83 C
+
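+    # Strip everything except digits from each line, then sort and return
+    # the highest temperature limit reported across the GPUs.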
+    ${cmd}=  Catenate  nvidia-smi -q -d TEMPERATURE  | grep "GPU Max"
+    ...  | cut -f 2 -d ":" |  tr -dc '0-9\n' | sort -n -u | tail -n 1
+    ${nvidia_out}  ${stderr}  ${rc}=  OS Execute Command  ${cmd}
+    [Return]  ${nvidia_out}
+
+
+Get GPU Temperature
+    [Documentation]  Get the highest temperature reported among the GPUs.
+
+    # nvidia-smi --query-gpu=temperature.gpu --format=csv returns
+    # 38
+    # 41
+    # 38
+    # 40
+
+    ${cmd}=  Catenate  nvidia-smi --query-gpu=temperature.gpu
+    ...  --format=csv | sort -n -u | tail -n 1
+    ${nvidia_out}  ${stderr}  ${rc}=  OS Execute Command  ${cmd}
+    [Return]  ${nvidia_out}
+
+
+Get GPU Clock Limit
+    [Documentation]  Get NVIDIA GPU maximum permitted graphics clock.
+
+    # nvidia-smi --query-gpu=clocks.max.gr --format=csv  returns
+    # 1530 MHz
+    # 1530 MHz
+    # 1530 MHz
+    # 1530 MHz
+
+    ${cmd}=  Catenate  nvidia-smi --query-gpu=clocks.max.gr
+    ...  --format=csv | cut -f 1 -d ' ' |  sort -n -u | tail -n 1
+    ${nvidia_out}  ${stderr}  ${rc}=  OS Execute Command  ${cmd}
+    [Return]  ${nvidia_out}
+
+
+Get GPU Clock
+    [Documentation]  Get the highest assigned value of the GPU graphics clock.
+
+    # nvidia-smi --query-gpu=clocks.gr --format=csv  returns
+    # 1230 MHz
+    # 1230 MHz
+    # 135 MHz
+    # 150 MHz
+
+    ${cmd}=  Catenate  nvidia-smi --query-gpu=clocks.gr
+    ...  --format=csv | cut -f 1 -d ' ' | sort -n -u | tail -n 1
+    ${nvidia_out}  ${stderr}  ${rc}=  OS Execute Command  ${cmd}
+    [Return]  ${nvidia_out}
+
+
+Count GPUs From BMC
+    [Documentation]  Return the number of GPUs reported by the BMC.  Hostboot
+    ...  must have run previously because the BMC obtains its GPU
+    ...  data from Hostboot.
+
+    # Example of gv* endpoint data:
+    # "/xyz/openbmc_project/inventory/system/chassis/motherboard/gv100card0": {
+    #     "Functional": 1,
+    #     "Present": 1,
+    #     "PrettyName": ""
+    # },
+
+    ${num_bmc_gpus}=  Set Variable  ${0}
+
+    ${gpu_list}=  Get Endpoint Paths
+    ...  ${HOST_INVENTORY_URI}system/chassis/motherboard  gv*
+
+    :FOR  ${gpu_uri}  IN  @{gpu_list}
+    \  ${present}=  Read Attribute  ${gpu_uri}  Present
+    \  ${state}=  Read Attribute  ${gpu_uri}  Functional
+    \  Rpvars  gpu_uri  present  state
+    \  ${num_bmc_gpus}=  Run Keyword If  ${present} and ${state}
+    ...  Evaluate  ${num_bmc_gpus}+${1}
+    ...  ELSE  Set Variable  ${num_bmc_gpus}
+    [Return]  ${num_bmc_gpus}
 
 
 Create Default MDT Profile
diff --git a/systest/gpu_stress_test.robot b/systest/gpu_stress_test.robot
old mode 100644
new mode 100755
index 40cfc50..cfe3915
--- a/systest/gpu_stress_test.robot
+++ b/systest/gpu_stress_test.robot
@@ -1,15 +1,36 @@
 *** Settings ***
-Documentation    Stress the system using HTX exerciser.
+Documentation    Stress the system GPUs using the HTX exerciser.
+
+# Test Parameters:
+# OPENBMC_HOST        The BMC host name or IP address.
+# OS_HOST             The OS host name or IP Address.
+# OS_USERNAME         The OS login userid (usually "root").
+# OS_PASSWORD         The password for the OS login.
+# HTX_DURATION        Duration of HTX run, for example, "2h" or "30m".
+# HTX_LOOP            The number of times to loop HTX.
+# HTX_INTERVAL        The time delay between consecutive checks of HTX
+#                     status, for example, "15m".
+#                     In summary: Run HTX for $HTX_DURATION, looping
+#                     $HTX_LOOP times checking for errors every
+#                     $HTX_INTERVAL.  Then allow extra time for OS
+#                     Boot, HTX startup, shutdown.
+# HTX_KEEP_RUNNING    If set to 1, this indicates that the HTX is to
+#                     continue running after an error was found.
+
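+# Example invocation (illustrative only; the host names, credentials, and
+# parameter values below are placeholders):
+# robot -v OPENBMC_HOST:<bmc> -v OS_HOST:<os> -v OS_USERNAME:root
+# -v OS_PASSWORD:<password> -v HTX_DURATION:2h -v HTX_LOOP:4
+# -v HTX_INTERVAL:15m systest/gpu_stress_test.robot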
 
 Resource         ../syslib/utils_os.robot
+Resource         ../lib/boot_utils.robot
 
 Suite Setup      Run Keyword  Start SOL Console Logging
 Test Setup       Test Setup Execution
 Test Teardown    Test Teardown Execution
-Suite Teardown   Suite Teardown Execution
 
 *** Variables ***
 
+${HTX_DURATION}      1h
+${HTX_LOOP}          ${1}
+${HTX_INTERVAL}      30m
+${HTX_KEEP_RUNNING}  ${0}
 ${stack_mode}        skip
 
 *** Test Cases ***
@@ -18,8 +39,28 @@
     [Documentation]  Stress the GPU using HTX exerciser.
     [Tags]  GPU_Stress_Test
 
+    # Get the number of GPUs reported by the BMC.
+    ${num_bmc_gpus}=  Count GPUs From BMC
+    Rpvars  num_bmc_gpus
+
+    # The BMC and OS should report the same number of GPUs.
+    ${failmsg01}=  Catenate  OS reports ${num_os_gpus} GPUs, but BMC
+    ...  reports ${num_bmc_gpus} present and functional GPUs.
+    Run Keyword If  '${num_os_gpus}' != '${num_bmc_gpus}'
+    ...  Fail  msg=${failmsg01}
+
+    # Show parameters for HTX stress test.
     Rprintn
-    Rpvars  HTX_DURATION  HTX_INTERVAL
+    Rpvars  HTX_DURATION  HTX_LOOP  HTX_INTERVAL
+
+    # Set the iteration (loop) counter.
+    Set Suite Variable  ${iteration}  ${0}  children=true
+
+    # Shutdown HTX if it is already running.
+    ${status}=  Run Keyword And Return Status  Is HTX Running
+    Run Keyword If  '${status}' == 'True'
+    ...  Shutdown HTX Exerciser
 
     Repeat Keyword  ${HTX_LOOP} times  Execute GPU Test
 
@@ -34,11 +75,23 @@
     #              - Collect GPU nvidia status output
     #              - Create HTX mdt profile
     #              - Run GPU specific HTX exerciser
-    #              - Check HTX status for errors
+    #              - Check for errors
+
+    Set Suite Variable  ${iteration}  ${iteration + 1}
+    ${loop_count}=  Catenate  Starting iteration: ${iteration}
+    Rprintn
+    Rpvars  loop_count
+
+    REST Power On  stack_mode=skip
 
     # Collect data before the test starts.
     Collect NVIDIA Log File  start
 
+    # Collect the NVIDIA GPU limits.
+    ${power_max}=  Get GPU Power Limit
+    ${temperature_max}=  Get GPU Temperature Limit
+    ${clock_max}=  Get GPU Clock Limit
+
     Run Keyword If  '${HTX_MDT_PROFILE}' == 'mdt.bu'
     ...  Create Default MDT Profile
 
@@ -49,10 +102,34 @@
     # After the test loop, check the OS dmesg log for errors.
     Check For Errors On OS Dmesg Log
 
+    # Check NVIDIA power, temperature, and clocks.
+    ${power}=  Get GPU Power
+    ${temperature}=  Get GPU Temperature
+    ${clock}=  Get GPU Clock
+    Rprintn
+    Rpvars  power  power_max  temperature  temperature_max  clock  clock_max
+    Run Keyword If  ${power} > ${power_max}  Fail
+    ...  msg=GPU power of ${power} exceeds limit of ${power_max}.
+    ${errmsg}=  Catenate  GPU temperature of ${temperature} exceeds limit
+    ...  of ${temperature_max}.
+    Run Keyword If  ${temperature} > ${temperature_max}  Fail  msg=${errmsg}
+    Run Keyword If  ${clock} > ${clock_max}  Fail
+    ...  msg=GPU clock of ${clock} exceeds limit of ${clock_max}.
+
     Shutdown HTX Exerciser
 
+    Collect NVIDIA Log File  end
+    Error Logs Should Not Exist
+    REST Power Off
+
+    Flush REST Sessions
+
     Rprint Timen  HTX Test ran for: ${HTX_DURATION}
 
+    ${loop_count}=  Catenate  Ending iteration: ${iteration}
+    Rprintn
+    Rpvars  loop_count
+
 
 Loop HTX Health Check
     [Documentation]  Run until HTX exerciser fails.
@@ -62,25 +139,36 @@
     ...  AND  Sleep  ${HTX_INTERVAL}
 
 
+Test Setup Execution
+    [Documentation]  Do the initial test setup.
+
+    REST Power On  stack_mode=skip
+    Delete All Error Logs
+    Tool Exist  lspci
+    Tool Exist  htxcmdline
+    Tool Exist  nvidia-smi
+
+    # Get number of GPUs reported by the OS.
+    ${cmd}=  Catenate  lspci | grep NVIDIA | wc -l
+    ${num_os_gpus}  ${stderr}  ${rc}=  OS Execute Command  ${cmd}
+    Rprintn
+    Rpvars  num_os_gpus
+
+    # If no GPUs detected, we cannot continue.
+    Run Keyword If  '${num_os_gpus}' == '${0}'  Fail
+    ...  msg=No GPUs detected so cannot run test.
+
+    Set Suite Variable  ${num_os_gpus}  children=true
+
+
 Test Teardown Execution
     [Documentation]  Do the post test teardown.
-    #  Shut down HTX exerciser if test Failed.
-    #  Collect NVIDIA log.
 
     # Keep HTX running if user set HTX_KEEP_RUNNING to 1.
     Run Keyword If  '${TEST_STATUS}' == 'FAIL' and ${HTX_KEEP_RUNNING} == ${0}
     ...  Shutdown HTX Exerciser
 
-    # Collect nvidia-smi output data on exit.
-    Collect NVIDIA Log File  end
-
-
-Suite Teardown Execution
-    [Documentation]  Do the final teardown and cleanup.
-    #  Stop SOL Console Logging.
-    #  Collect FFDC if Test Case Fail.
-    #  Close Connections.
-
     ${keyword_buf}=  Catenate  Stop SOL Console Logging
     ...  \ targ_file_path=${EXECDIR}${/}logs${/}SOL.log
     Run Key  ${keyword_buf}
diff --git a/systest/htx_hardbootme_test.robot b/systest/htx_hardbootme_test.robot
index f0a0fba..c1b2fd5 100755
--- a/systest/htx_hardbootme_test.robot
+++ b/systest/htx_hardbootme_test.robot
@@ -12,8 +12,9 @@
 # HTX_INTERVAL        The time delay between consecutive checks of HTX
 #                     status, for example, 15m.
 #                     In summary: Run HTX for $HTX_DURATION, looping
-#                     $HTX_LOOP times checking for errors every $HTX_INTERVAL.
-#                     Then allow extra time for OS Boot, HTX startup, shutdown.
+#                     $HTX_LOOP times checking for errors every
+#                     $HTX_INTERVAL.  Then allow extra time for OS
+#                     Boot, HTX startup, shutdown.
 # HTX_KEEP_RUNNING    If set to 1, this indicates that the HTX is to
 #                     continue running after an error was found.
 # CHECK_INVENTORY     If set to 0 or False, OS inventory checking before
@@ -115,7 +116,7 @@
     Rprintn
     Rpvars  loop_count  estimated_loop_time   estimated_time_remaining
 
-    Boot To OS
+    REST Power On  stack_mode=skip
 
     # Post Power off and on, the OS SSH session needs to be established.
     Login To OS
@@ -208,6 +209,19 @@
     ...  AND  Sleep  ${HTX_INTERVAL}
 
 
+Test Setup Execution
+    [Documentation]  Do the initial test setup.
+
+    REST Power On  stack_mode=skip
+    Delete All Error Logs
+    Tool Exist  htxcmdline
+
+    # Shutdown if HTX is running.
+    ${status}=  Run Keyword And Return Status  Is HTX Running
+    Run Keyword If  '${status}' == 'True'
+    ...  Shutdown HTX Exerciser
+
+
 Test Teardown Execution
     [Documentation]  Do the post test teardown.
     # 1. Shut down HTX exerciser if test Failed.
diff --git a/systest/network_stability_test.robot b/systest/network_stability_test.robot
old mode 100644
new mode 100755
index 3e233f4..da09939
--- a/systest/network_stability_test.robot
+++ b/systest/network_stability_test.robot
@@ -35,7 +35,7 @@
     #              - Check HTX status for errors
     #              - Shutdown HTX if no error when timer expires
 
-    Boot To OS
+    REST Power On  stack_mode=skip
 
     # Post Power off and on, the OS SSH session needs to be established.
     Login To OS
@@ -74,6 +74,19 @@
     REST Upload File To BMC
 
 
+Test Setup Execution
+    [Documentation]  Do the initial test setup.
+
+    REST Power On  stack_mode=skip
+    Delete All Error Logs
+    Tool Exist  htxcmdline
+
+    # Shutdown if HTX is running.
+    ${status}=  Run Keyword And Return Status  Is HTX Running
+    Run Keyword If  '${status}' == 'True'
+    ...  Shutdown HTX Exerciser
+
+
 Test Teardown Execution
     [Documentation]  Do the post test teardown.
     # 1. Shut down HTX exerciser if test Failed.
diff --git a/systest/test_bmc_poll_errors.robot b/systest/test_bmc_poll_errors.robot
old mode 100644
new mode 100755
index 7ceed27..ce5c0a2
--- a/systest/test_bmc_poll_errors.robot
+++ b/systest/test_bmc_poll_errors.robot
@@ -6,7 +6,7 @@
 Resource          ../lib/resource.txt
 Resource          ../lib/boot_utils.robot
 
-Suite Setup      Setup The Suite
+Suite Setup      Suite Setup Execution
 Test Teardown    Post Test Case Execution
 
 *** Variables ***
@@ -37,7 +37,7 @@
     Error Logs Should Not Exist
 
 
-Setup The Suite
+Suite Setup Execution
     [Documentation]  Do test setup initialization.
 
     Should Not Be Empty