Force fans to high when no valid temp from sensors

DDR5 DIMMs can have redundant DRAM thermal sensors. If neither of
these sensors provides a valid temperature and at least one of them
has failed, the fans should be set to max.
If both DRAM sensors are reporting 0 (not available), the fans are
not set to max.

No change when there are no redundant sensors. In that case, if the
temperature is:
- 0/not available: the sensor is not used for fan control
- error: fans will be set to high speed

Tested on HW by forcing an invalid address.
OCC reported temps:
                   D0010004 01 29 59 00 (41C membuf)
                   D0010004 07 00 55 00 (0C pmic)
                   D0010004 02 00 45 00 (0C dimm)
                   D0010004 02 FF 45 00 (ERROR dimm)
Fans ramped up:
TARGET SENSOR    TARGET(RPM)   FEEDBACK SENSOR    FEEDBACK(RPM)
===============================================================
fan0_0                 10400            fan0_0            10190
fan1_0                 10400            fan1_0            10330

Change-Id: I6ae920a4c45f3cc44dd3c1d614c495dad138b62c
Signed-off-by: Chris Cain <cjcain@us.ibm.com>
diff --git a/occ_manager.cpp b/occ_manager.cpp
index 9bd83ea..70adeca 100644
--- a/occ_manager.cpp
+++ b/occ_manager.cpp
@@ -760,7 +760,7 @@
 }
 
 #ifdef READ_OCC_SENSORS
-void Manager::readTempSensors(const fs::path& path, uint32_t id)
+void Manager::readTempSensors(const fs::path& path, uint32_t occInstance)
 {
     // There may be more than one sensor with the same FRU type
     // and label so make two passes: the first to read the temps
@@ -816,13 +816,15 @@
 
         if (fruTypeValue == VRMVdd)
         {
-            sensorPath.append("vrm_vdd" + std::to_string(id) + "_temp");
+            sensorPath.append("vrm_vdd" + std::to_string(occInstance) +
+                              "_temp");
         }
         else if (fruTypeValue == processorIoRing)
         {
-            sensorPath.append("proc" + std::to_string(id) + "_ioring_temp");
+            sensorPath.append("proc" + std::to_string(occInstance) +
+                              "_ioring_temp");
             dvfsTempPath = std::string{OCC_SENSORS_ROOT} + "/temperature/proc" +
-                           std::to_string(id) + "_ioring_dvfs_temp";
+                           std::to_string(occInstance) + "_ioring_dvfs_temp";
         }
         else
         {
@@ -863,13 +865,13 @@
                     // core mode, so use a big core name.
                     uint16_t coreNum = instanceID / 2;
                     uint16_t tempNum = instanceID % 2;
-                    sensorPath.append("proc" + std::to_string(id) + "_core" +
-                                      std::to_string(coreNum) + "_" +
+                    sensorPath.append("proc" + std::to_string(occInstance) +
+                                      "_core" + std::to_string(coreNum) + "_" +
                                       std::to_string(tempNum) + "_temp");
 
-                    dvfsTempPath = std::string{OCC_SENSORS_ROOT} +
-                                   "/temperature/proc" + std::to_string(id) +
-                                   "_core_dvfs_temp";
+                    dvfsTempPath =
+                        std::string{OCC_SENSORS_ROOT} + "/temperature/proc" +
+                        std::to_string(occInstance) + "_core_dvfs_temp";
                 }
                 else
                 {
@@ -917,44 +919,41 @@
             continue;
         }
 
-        // NOTE: if OCC sends back 0xFF kernal sets this fault value to 1.
+        double tempValue{0};
+        // NOTE: if OCC sends back 0xFF, kernal sets this fault value to 1.
         if (faultValue != 0)
         {
-            // For cases when there are multiple readings per fru type/label,
-            // don't overwrite a good value with an NaN.
-            if (!sensorData.contains(sensorPath))
-            {
-                sensorData[sensorPath] =
-                    std::numeric_limits<double>::quiet_NaN();
-            }
-            continue;
+            tempValue = std::numeric_limits<double>::quiet_NaN();
         }
-
-        double tempValue{0};
-
-        try
+        else
         {
-            tempValue = readFile<double>(filePathString + inputSuffix);
-        }
-        catch (const std::system_error& e)
-        {
-            log<level::DEBUG>(
-                fmt::format("readTempSensors: Failed reading {}, errno = {}",
-                            filePathString + inputSuffix, e.code().value())
-                    .c_str());
-
-            // if errno == EAGAIN(Resource temporarily unavailable) then set
-            // temp to 0, to avoid using old temp, and affecting FAN Control.
-            if (e.code().value() == EAGAIN)
+            // Read the temperature
+            try
             {
-                tempValue = 0;
+                tempValue = readFile<double>(filePathString + inputSuffix);
             }
-            // else the errno would be something like
-            //     EBADF(Bad file descriptor)
-            // or ENOENT(No such file or directory)
-            else
+            catch (const std::system_error& e)
             {
-                continue;
+                log<level::DEBUG>(
+                    fmt::format(
+                        "readTempSensors: Failed reading {}, errno = {}",
+                        filePathString + inputSuffix, e.code().value())
+                        .c_str());
+
+                // if errno == EAGAIN(Resource temporarily unavailable) then set
+                // temp to 0, to avoid using old temp, and affecting FAN
+                // Control.
+                if (e.code().value() == EAGAIN)
+                {
+                    tempValue = 0;
+                }
+                // else the errno would be something like
+                //     EBADF(Bad file descriptor)
+                // or ENOENT(No such file or directory)
+                else
+                {
+                    continue;
+                }
             }
         }
 
@@ -963,6 +962,16 @@
         auto existing = sensorData.find(sensorPath);
         if (existing != sensorData.end())
         {
+            // Multiple sensors found for this FRU type
+            if ((std::isnan(existing->second) && (tempValue == 0)) ||
+                ((existing->second == 0) && std::isnan(tempValue)))
+            {
+                // One of the redundant sensors has failed (0xFF/nan), and the
+                // other sensor has no reading (0), so set the FRU to NaN to
+                // force fan increase
+                tempValue = std::numeric_limits<double>::quiet_NaN();
+                existing->second = tempValue;
+            }
             if (std::isnan(existing->second) || (tempValue > existing->second))
             {
                 existing->second = tempValue;
@@ -970,6 +979,7 @@
         }
         else
         {
+            // First sensor for this FRU type
             sensorData[sensorPath] = tempValue;
         }
     }
@@ -989,7 +999,7 @@
                 objectPath);
         }
 
-        existingSensors[objectPath] = id;
+        existingSensors[objectPath] = occInstance;
     }
 }