Add gpu_core_temperature from REST into GPU stress test

Add a library keyword to get the GPU core temperature from REST, and
check that the REST temperature is within 5 degrees of the temperature
reported by nvidia-smi.

Resolves openbmc/openbmc-test-automation#1355

Change-Id: Ib136054e0dfff16482711e4f8c95eb2b24315558
Signed-off-by: Steven Sombar <ssombar@us.ibm.com>
diff --git a/syslib/utils_os.robot b/syslib/utils_os.robot
index 1f28d7c..629a504 100755
--- a/syslib/utils_os.robot
+++ b/syslib/utils_os.robot
@@ -346,6 +346,29 @@
     [Return]  ${nvidia_out}
 
 
+Get GPU Temperature Via REST
+    [Documentation]  Return the temperature in degrees C of the warmest GPU
+    ...  as reported by REST.
+
+    # NOTE: This endpoint path is not defined until system has been powered-on.
+    ${temperature_objs}=  Read Properties  ${SENSORS_URI}temperature/enumerate
+    ...  timeout=30  quiet=1
+
+    # Build, then evaluate, a filter keeping only the *_core_temp sensors.
+    ${core_temp_expr}=  Catenate  {k:v for k,v in $temperature_objs.items()
+    ...  if re.match('${SENSORS_URI}temperature/.*_core_temp', k)}
+    ${gpu_temperature_objs_dict}=  Evaluate  ${core_temp_expr}  modules=re
+
+    # Create a list containing all of the GPU temperatures.
+    ${gpu_temperatures}=  Evaluate
+    ...  [ x['Value'] for x in $gpu_temperature_objs_dict.values() ]
+
+    # Floor-divide the max by 1000 so the result is an integer on Python 2 & 3.
+    ${max_gpu_temperature}=  Evaluate  max(map(int, $gpu_temperatures))//1000
+
+    [Return]  ${max_gpu_temperature}
+
+
 Get GPU Clock Limit
     [Documentation]  Get NVIDIA GPU maximum permitted graphics clock.
 
diff --git a/systest/gpu_stress_test.robot b/systest/gpu_stress_test.robot
index 8788ad5..ffa070e 100755
--- a/systest/gpu_stress_test.robot
+++ b/systest/gpu_stress_test.robot
@@ -104,17 +104,30 @@
     # Check NVIDIA power, temperature, and clocks.
     ${power}=  Get GPU Power
     ${temperature}=  Get GPU Temperature
+    ${temperature_via_rest}=  Get GPU Temperature Via REST
     ${clock}=  Get GPU Clock
     Rprintn
-    Rpvars  power  power_max  temperature  temperature_max  clock  clock_max
+    Rpvars  power  power_max  temperature  temperature_via_rest
+    ...  temperature_max  clock  clock_max
+
     Run Keyword If  ${power} > ${power_max}  Fail
     ...  msg=GPU Power ${power} exceeds limit of ${power_max}.
-    ${errmsg}=  Catenate  GPU temperature of ${temperature} exceeds limit
+
+    ${err_msg}=  Catenate  GPU temperature of ${temperature} exceeds limit
     ...  of ${temperature_max}.
-    Run Keyword If  ${temperature} > ${temperature_max}  Fail  msg=${errmsg}
+    Run Keyword If  ${temperature} > ${temperature_max}  Fail  msg=${err_msg}
+
     Run Keyword If  ${clock} > ${clock_max}  Fail
     ...  msg=GPU clock of ${clock} exceeds limit of ${clock_max}.
 
+    ${err_msg}=  Catenate  The GPU temperature reported by REST is not within
+    ...  5 degrees of the nvidia_smi reported temperature.
+    ${upper_limit}=  Evaluate  ${temperature_via_rest}+5
+    ${lower_limit}=  Evaluate  ${temperature_via_rest}-5
+    Run Keyword If
+    ...  ${temperature} > ${upper_limit} or ${temperature} < ${lower_limit}
+    ...  Fail  msg=${err_msg}
+
     Shutdown HTX Exerciser
 
     Collect NVIDIA Log File  end