GPU test related outputs utility keywords
Added:
- Methods for parsing HTX error logs
- Methods for parsing dmesg logs
- Methods for parsing nvidia output
Resolves openbmc/openbmc-test-automation#635
Change-Id: I932d57b40a1586b561b7c0dec13ff2de3f6c0d34
Signed-off-by: George Keishing <gkeishin@in.ibm.com>
diff --git a/syslib/utils_keywords.py b/syslib/utils_keywords.py
index 8d47d89..e6a4e0e 100644
--- a/syslib/utils_keywords.py
+++ b/syslib/utils_keywords.py
@@ -8,10 +8,12 @@
import time
from robot.libraries.BuiltIn import BuiltIn
from robot.libraries import DateTime
+import re
###############################################################################
-def run_until_keyword_fails(retry, retry_interval, name, *args):
+
+def run_until_keyword_fails(retry, retry_interval, name, *args):
r"""
Execute a robot keyword repeatedly until it either fails or the timeout
value is exceeded.
@@ -25,30 +27,196 @@
"""
# Convert the retry time in seconds
- retry_seconds= DateTime.convert_time(retry)
+ retry_seconds = DateTime.convert_time(retry)
timeout = time.time() + int(retry_seconds)
# Convert the interval time in seconds
- interval_seconds= DateTime.convert_time(retry_interval)
+ interval_seconds = DateTime.convert_time(retry_interval)
interval = int(interval_seconds)
BuiltIn().log(timeout)
BuiltIn().log(interval)
while True:
- status= BuiltIn().run_keyword_and_return_status(name, *args)
+ status = BuiltIn().run_keyword_and_return_status(name, *args)
# Return if keywords returns as failure.
- if status==False:
+ if status is False:
BuiltIn().log("Failed as expected")
- return False
+ return False
# Return if retry timeout as success.
elif time.time() > timeout > 0:
BuiltIn().log("Max retry timeout")
- return True
+ return True
time.sleep(interval)
BuiltIn().log(time.time())
- return True
+ return True
+###############################################################################
+
+
+###############################################################################
+def htx_error_log_to_list(htx_error_log_output):
+
+    r"""
+    Parse htx error log output string and return list of strings in the form
+    "<field name>:<field value>".
+    The output of this function may be passed to the build_error_dict function.
+
+    Description of argument(s):
+    htx_error_log_output        Error entry string containing the stdout
+                                generated by "htxcmdline -geterrlog".
+
+    Example of htx_error_log_output contents:
+
+    ######################## Result Starts Here ###############################
+    Currently running ECG/MDT : /usr/lpp/htx/mdt/mdt.whit
+    ===========================
+    ---------------------------------------------------------------------
+    Device id:/dev/nvidia0
+    Timestamp:Mar 29 19:41:54 2017
+    err=00000027
+    sev=1
+    Exerciser Name:hxenvidia
+    Serial No:Not Available
+    Part No:Not Available
+    Location:Not Available
+    FRU Number:Not Available
+    Device:Not Available
+    Error Text:cudaEventSynchronize for stopEvent returned err = 0039 from file
+    , line 430.
+    ---------------------------------------------------------------------
+    ---------------------------------------------------------------------
+    Device id:/dev/nvidia0
+    Timestamp:Mar 29 19:41:54 2017
+    err=00000027
+    sev=1
+    Exerciser Name:hxenvidia
+    Serial No:Not Available
+    Part No:Not Available
+    Location:Not Available
+    FRU Number:Not Available
+    Device:Not Available
+    Error Text:Hardware Exerciser stopped on error
+    ---------------------------------------------------------------------
+    ######################### Result Ends Here ################################
+
+    Example output:
+    Returns the lists of error string per entry
+    ['Device id:/dev/nvidia0',
+     'Timestamp:Mar 29 19:41:54 2017',
+     'err=00000027',
+     'sev=1',
+     'Exerciser Name:hxenvidia',
+     'Serial No:Not Available',
+     'Part No:Not Available',
+     'Location:Not Available',
+     'FRU Number:Not Available',
+     'Device:Not Available',
+     'Error Text:cudaEventSynchronize for stopEvent returned err = 0039
+      from file , line 430.']
+    """
+
+    # List which will hold one list of field strings per error entry.
+    error_list = []
+
+    temp_error_list = []
+    parse_walk = False
+
+    for line in htx_error_log_output.splitlines():
+        # Skip lines starting with "#"
+        if line.startswith("#"):
+            continue
+
+        # Mark line starting with "-" and set parse flag.
+        if line.startswith("-") and parse_walk is False:
+            parse_walk = True
+            continue
+        # Mark line starting with "-" and reset parse flag.
+        # Set temp error list to EMPTY.
+        elif line.startswith("-"):
+            error_list.append(temp_error_list)
+            parse_walk = False
+            temp_error_list = []
+        # Add line to the current entry while inside a parsed (non-empty) section.
+        elif parse_walk:
+            temp_error_list.append(str(line))
+
+    return error_list
+###############################################################################
+
+
+###############################################################################
+def build_error_dict(htx_error_log_output):
+
+ r"""
+ Builds error list into a list of dictionary entries.
+
+ Description of argument(s):
+ error_list Error list entries.
+
+ Example output dictionary:
+ {
+ 0:
+ {
+ 'sev': '1',
+ 'err': '00000027',
+ 'Timestamp': 'Mar 29 19:41:54 2017',
+ 'Part No': 'Not Available',
+ 'Serial No': 'Not Available',
+ 'Device': 'Not Available',
+ 'FRU Number': 'Not Available',
+ 'Location': 'Not Available',
+ 'Device id': '/dev/nvidia0',
+ 'Error Text': 'cudaEventSynchronize for stopEvent returned err = 0039
+ from file , line 430.',
+ 'Exerciser Name': 'hxenvidia'
+ },
+ 1:
+ {
+ 'sev': '1',
+ 'err': '00000027',
+ 'Timestamp': 'Mar 29 19:41:54 2017',
+ 'Part No': 'Not Available',
+ 'Serial No': 'Not Available',
+ 'Device': 'Not Available',
+ 'FRU Number': 'Not Available',
+ 'Location': 'Not Available',
+ 'Device id': '/dev/nvidia0',
+ 'Error Text': 'Hardware Exerciser stopped on error',
+ 'Exerciser Name': 'hxenvidia'
+ }
+ },
+
+ """
+
+ # List which will hold all the list of entries.
+ error_list = []
+ error_list = htx_error_log_to_list(htx_error_log_output)
+
+ # dictionary which holds the error dictionry entry.
+ error_dict = {}
+
+ temp_error_dict = {}
+ error_index = 0
+
+ # Loop through the error list.
+ for entry_list in error_list:
+ # Loop through the first error list entry.
+ for entry in entry_list:
+ # Split string into list for key value update.
+ # Example: 'Device id:/dev/nvidia0'
+ # Example: 'err=00000027'
+ parm_split = re.split("[:=]", entry)
+ # Populate temp dictionary with key value pair data.
+ temp_error_dict[str(parm_split[0])] = parm_split[1]
+
+ # Update the master dictionary per entry index.
+ error_dict[error_index] = temp_error_dict
+ # Reset temp dict to EMPTY and increment index count.
+ temp_error_dict = {}
+ error_index += 1
+
+ return error_dict
###############################################################################
diff --git a/syslib/utils_os.robot b/syslib/utils_os.robot
index 0f4f314..206164c 100755
--- a/syslib/utils_os.robot
+++ b/syslib/utils_os.robot
@@ -17,6 +17,11 @@
${htx_log_dir_path} ${EXECDIR}${/}logs${/}
+# Error strings to check from dmesg.
+${ERROR_REGEX} error|GPU|NVRM|nvidia
+
+# GPU specific error message from dmesg.
+${ERROR_DBE_MSG} (DBE) has been detected on GPU
*** Keywords ***
@@ -152,3 +157,53 @@
# Switch back to OS SSH connection.
Switch Connection os_connection
+
+Check For Errors On OS Dmesg Log
+    [Documentation]  Check if dmesg has nvidia errors logged.
+
+    ${dmesg_log}=  Execute Command On OS  dmesg | egrep '${ERROR_REGEX}'
+    # Should Not Contain Any supports checking multiple error strings at once.
+    Should Not Contain Any  ${dmesg_log}  ${ERROR_DBE_MSG}
+
+
+Collect NVIDIA Log File
+    [Documentation]  Collect nvidia-smi command output.
+
+    # Collects the output of the nvidia-smi command (run on the OS host).
+    # TODO: GPU current temperature threshold check.
+    #       openbmc/openbmc-test-automation#637
+    # +-----------------------------------------------------------------------------+
+    # | NVIDIA-SMI 361.89                 Driver Version: 361.89                    |
+    # |-------------------------------+----------------------+----------------------+
+    # | GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
+    # | Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
+    # |===============================+======================+======================|
+    # |   0  Tesla P100-SXM2...  On   | 0002:01:00.0     Off |                    0 |
+    # | N/A   25C    P0    35W / 300W |    931MiB / 16280MiB |      0%      Default |
+    # +-------------------------------+----------------------+----------------------+
+    # |   1  Tesla P100-SXM2...  On   | 0003:01:00.0     Off |                    0 |
+    # | N/A   26C    P0    40W / 300W |   1477MiB / 16280MiB |      0%      Default |
+    # +-------------------------------+----------------------+----------------------+
+    # |   2  Tesla P100-SXM2...  On   | 0006:01:00.0     Off |                    0 |
+    # | N/A   25C    P0    35W / 300W |    931MiB / 16280MiB |      0%      Default |
+    # +-------------------------------+----------------------+----------------------+
+    # |   3  Tesla P100-SXM2...  On   | 0007:01:00.0     Off |                    0 |
+    # | N/A   44C    P0   290W / 300W |    965MiB / 16280MiB |     99%      Default |
+    # +-------------------------------+----------------------+----------------------+
+    # +-----------------------------------------------------------------------------+
+    # | Processes:                                                       GPU Memory |
+    # |  GPU       PID  Type  Process name                               Usage      |
+    # |=============================================================================|
+    # |    0     28459    C   hxenvidia                                     929MiB  |
+    # |    1     28460    C   hxenvidia                                    1475MiB  |
+    # |    2     28461    C   hxenvidia                                     929MiB  |
+    # |    3     28462    C   hxenvidia                                     963MiB  |
+    # +-----------------------------------------------------------------------------+
+
+    # Create logs directory and get current datetime.
+    Create Directory  ${htx_log_dir_path}
+    ${cur_datetime}=  Get Current Date  result_format=%Y%m%d%H%M%S%f
+
+    ${nvidia_out}=  Execute Command On OS  nvidia-smi
+    Write Log Data To File
+    ...  ${nvidia_out}  ${htx_log_dir_path}/${OS_HOST}_${cur_datetime}.nvidia