GPU test related outputs utility keywords
Added:
- Methods for parsing HTX error logs
- Methods for parsing dmesg logs
- Methods for parsing nvidia output
Resolves openbmc/openbmc-test-automation#635
Change-Id: I932d57b40a1586b561b7c0dec13ff2de3f6c0d34
Signed-off-by: George Keishing <gkeishin@in.ibm.com>
diff --git a/syslib/utils_keywords.py b/syslib/utils_keywords.py
index 8d47d89..e6a4e0e 100644
--- a/syslib/utils_keywords.py
+++ b/syslib/utils_keywords.py
@@ -8,10 +8,12 @@
import time
from robot.libraries.BuiltIn import BuiltIn
from robot.libraries import DateTime
+import re
###############################################################################
-def run_until_keyword_fails(retry, retry_interval, name, *args):
+
+def run_until_keyword_fails(retry, retry_interval, name, *args):
r"""
Execute a robot keyword repeatedly until it either fails or the timeout
value is exceeded.
@@ -25,30 +27,196 @@
"""
# Convert the retry time in seconds
- retry_seconds= DateTime.convert_time(retry)
+ retry_seconds = DateTime.convert_time(retry)
timeout = time.time() + int(retry_seconds)
# Convert the interval time in seconds
- interval_seconds= DateTime.convert_time(retry_interval)
+ interval_seconds = DateTime.convert_time(retry_interval)
interval = int(interval_seconds)
BuiltIn().log(timeout)
BuiltIn().log(interval)
while True:
- status= BuiltIn().run_keyword_and_return_status(name, *args)
+ status = BuiltIn().run_keyword_and_return_status(name, *args)
# Return if keywords returns as failure.
- if status==False:
+ if status is False:
BuiltIn().log("Failed as expected")
- return False
+ return False
# Return if retry timeout as success.
elif time.time() > timeout > 0:
BuiltIn().log("Max retry timeout")
- return True
+ return True
time.sleep(interval)
BuiltIn().log(time.time())
- return True
+ return True
+###############################################################################
+
+
+###############################################################################
+def htx_error_log_to_list(htx_error_log_output):
+
+    r"""
+    Parse htx error log output string and return list of strings in the form
+    "<field name>:<field value>".
+    The output of this function may be passed to the build_error_dict function.
+
+    Description of argument(s):
+    htx_error_log_output        Error entry string containing the stdout
+                                generated by "htxcmdline -geterrlog".
+
+    Example of htx_error_log_output contents:
+
+    ######################## Result Starts Here ###############################
+    Currently running ECG/MDT : /usr/lpp/htx/mdt/mdt.whit
+    ===========================
+    ---------------------------------------------------------------------
+    Device id:/dev/nvidia0
+    Timestamp:Mar 29 19:41:54 2017
+    err=00000027
+    sev=1
+    Exerciser Name:hxenvidia
+    Serial No:Not Available
+    Part No:Not Available
+    Location:Not Available
+    FRU Number:Not Available
+    Device:Not Available
+    Error Text:cudaEventSynchronize for stopEvent returned err = 0039 from file
+    , line 430.
+    ---------------------------------------------------------------------
+    ---------------------------------------------------------------------
+    Device id:/dev/nvidia0
+    Timestamp:Mar 29 19:41:54 2017
+    err=00000027
+    sev=1
+    Exerciser Name:hxenvidia
+    Serial No:Not Available
+    Part No:Not Available
+    Location:Not Available
+    FRU Number:Not Available
+    Device:Not Available
+    Error Text:Hardware Exerciser stopped on error
+    ---------------------------------------------------------------------
+    ######################### Result Ends Here ################################
+
+    Example output:
+    Returns the lists of error string per entry
+    ['Device id:/dev/nvidia0',
+     'Timestamp:Mar 29 19:41:54 2017',
+     'err=00000027',
+     'sev=1',
+     'Exerciser Name:hxenvidia',
+     'Serial No:Not Available',
+     'Part No:Not Available',
+     'Location:Not Available',
+     'FRU Number:Not Available',
+     'Device:Not Available',
+     'Error Text:cudaEventSynchronize for stopEvent returned err = 0039
+      from file , line 430.']
+    """
+
+    # List which will hold one list of field strings per error entry.
+    error_list = []
+
+    temp_error_list = []
+    parse_walk = False
+
+    for line in htx_error_log_output.splitlines():
+        # Skip lines starting with "#"
+        if line.startswith("#"):
+            continue
+
+        # Mark line starting with "-" and set parse flag.
+        if line.startswith("-") and parse_walk is False:
+            parse_walk = True
+            continue
+        # Mark line starting with "-" and reset parse flag.
+        # Set temp error list to EMPTY.
+        elif line.startswith("-"):
+            error_list.append(temp_error_list)
+            parse_walk = False
+            temp_error_list = []
+        # Add line to the current entry while inside a parsed (non-empty) section.
+        elif parse_walk:
+            temp_error_list.append(str(line))
+
+    return error_list
+###############################################################################
+
+
+###############################################################################
+def build_error_dict(htx_error_log_output):
+
+ r"""
+ Builds error list into a list of dictionary entries.
+
+ Description of argument(s):
+ error_list Error list entries.
+
+ Example output dictionary:
+ {
+ 0:
+ {
+ 'sev': '1',
+ 'err': '00000027',
+ 'Timestamp': 'Mar 29 19:41:54 2017',
+ 'Part No': 'Not Available',
+ 'Serial No': 'Not Available',
+ 'Device': 'Not Available',
+ 'FRU Number': 'Not Available',
+ 'Location': 'Not Available',
+ 'Device id': '/dev/nvidia0',
+ 'Error Text': 'cudaEventSynchronize for stopEvent returned err = 0039
+ from file , line 430.',
+ 'Exerciser Name': 'hxenvidia'
+ },
+ 1:
+ {
+ 'sev': '1',
+ 'err': '00000027',
+ 'Timestamp': 'Mar 29 19:41:54 2017',
+ 'Part No': 'Not Available',
+ 'Serial No': 'Not Available',
+ 'Device': 'Not Available',
+ 'FRU Number': 'Not Available',
+ 'Location': 'Not Available',
+ 'Device id': '/dev/nvidia0',
+ 'Error Text': 'Hardware Exerciser stopped on error',
+ 'Exerciser Name': 'hxenvidia'
+ }
+ },
+
+ """
+
+ # List which will hold all the list of entries.
+ error_list = []
+ error_list = htx_error_log_to_list(htx_error_log_output)
+
+ # dictionary which holds the error dictionry entry.
+ error_dict = {}
+
+ temp_error_dict = {}
+ error_index = 0
+
+ # Loop through the error list.
+ for entry_list in error_list:
+ # Loop through the first error list entry.
+ for entry in entry_list:
+ # Split string into list for key value update.
+ # Example: 'Device id:/dev/nvidia0'
+ # Example: 'err=00000027'
+ parm_split = re.split("[:=]", entry)
+ # Populate temp dictionary with key value pair data.
+ temp_error_dict[str(parm_split[0])] = parm_split[1]
+
+ # Update the master dictionary per entry index.
+ error_dict[error_index] = temp_error_dict
+ # Reset temp dict to EMPTY and increment index count.
+ temp_error_dict = {}
+ error_index += 1
+
+ return error_dict
###############################################################################
diff --git a/syslib/utils_os.robot b/syslib/utils_os.robot
index 0f4f314..206164c 100755
--- a/syslib/utils_os.robot
+++ b/syslib/utils_os.robot
@@ -17,6 +17,11 @@
${htx_log_dir_path} ${EXECDIR}${/}logs${/}
+# Error strings to check from dmesg.
+${ERROR_REGEX} error|GPU|NVRM|nvidia
+
+# GPU specific error message from dmesg.
+${ERROR_DBE_MSG} (DBE) has been detected on GPU
*** Keywords ***
@@ -152,3 +157,53 @@
# Switch back to OS SSH connection.
Switch Connection os_connection
+
+Check For Errors On OS Dmesg Log
+    [Documentation]  Check if dmesg has nvidia errors logged.
+
+    ${dmesg_log}=  Execute Command On OS  dmesg | egrep '${ERROR_REGEX}'
+    # Should Not Contain Any supports checking multiple error strings at once.
+    Should Not Contain Any  ${dmesg_log}  ${ERROR_DBE_MSG}
+
+
+Collect NVIDIA Log File
+    [Documentation]  Collect nvidia-smi command output.
+
+    # Collects the output of the nvidia-smi command (run on the OS host).
+    # TODO: GPU current temperature threshold check.
+    #       openbmc/openbmc-test-automation#637
+    # +-----------------------------------------------------------------------------+
+    # | NVIDIA-SMI 361.89                 Driver Version: 361.89                    |
+    # |-------------------------------+----------------------+----------------------+
+    # | GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
+    # | Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
+    # |===============================+======================+======================|
+    # |   0  Tesla P100-SXM2...  On   | 0002:01:00.0     Off |                    0 |
+    # | N/A   25C    P0    35W / 300W |    931MiB / 16280MiB |      0%      Default |
+    # +-------------------------------+----------------------+----------------------+
+    # |   1  Tesla P100-SXM2...  On   | 0003:01:00.0     Off |                    0 |
+    # | N/A   26C    P0    40W / 300W |   1477MiB / 16280MiB |      0%      Default |
+    # +-------------------------------+----------------------+----------------------+
+    # |   2  Tesla P100-SXM2...  On   | 0006:01:00.0     Off |                    0 |
+    # | N/A   25C    P0    35W / 300W |    931MiB / 16280MiB |      0%      Default |
+    # +-------------------------------+----------------------+----------------------+
+    # |   3  Tesla P100-SXM2...  On   | 0007:01:00.0     Off |                    0 |
+    # | N/A   44C    P0   290W / 300W |    965MiB / 16280MiB |     99%      Default |
+    # +-------------------------------+----------------------+----------------------+
+    # +-----------------------------------------------------------------------------+
+    # | Processes:                                                       GPU Memory |
+    # |  GPU       PID  Type  Process name                               Usage      |
+    # |=============================================================================|
+    # |    0     28459    C   hxenvidia                                     929MiB  |
+    # |    1     28460    C   hxenvidia                                    1475MiB  |
+    # |    2     28461    C   hxenvidia                                     929MiB  |
+    # |    3     28462    C   hxenvidia                                     963MiB  |
+    # +-----------------------------------------------------------------------------+
+
+    # Create logs directory and get current datetime.
+    Create Directory  ${htx_log_dir_path}
+    ${cur_datetime}=  Get Current Date  result_format=%Y%m%d%H%M%S%f
+
+    ${nvidia_out}=  Execute Command On OS  nvidia-smi
+    Write Log Data To File
+    ...  ${nvidia_out}  ${htx_log_dir_path}/${OS_HOST}_${cur_datetime}.nvidia