Improve BMC error handling for OCC comm failures

- Delay starting the OCC reset until all OCCs have been detected (or
detection times out). This prevents multiple resets from being
triggered and helps detect when the reset has completed (the active
sensor is set after the reset completes).
- Wait for the PLDM responses to the OCC reset and HRESET requests, and
retry if they fail.
- If HRESET returns NOT_READY, collect SBE FFDC and try OCC reset. A
persistent failure will put the system in safe state.

- Prevent overwriting the dvfs over-temp filename for p10 and beyond,
since the old file is only present in older kernels.
- Prevent an assert when opening sysfs files (added a catch that
creates an OCC comm failure PEL, which will force an OCC reset).
- Check return code after reading sysfs files to confirm success. If
read fails, try reset to recover.

- Updated traces to include which processor/OCC encountered issues.
- Improved recovery to close windows that were leaving the system in a
partial good state.

JIRA: PFES-66
Change-Id: I0b087d0e05bd8562682062e1c662f9e18164a720
Signed-off-by: Chris Cain <cjcain@us.ibm.com>
diff --git a/occ_status.cpp b/occ_status.cpp
index f8a3e8d..7d654b9 100644
--- a/occ_status.cpp
+++ b/occ_status.cpp
@@ -36,15 +36,30 @@
             // Set the device active
             device.setActive(true);
 
-            // Update the OCC active sensor
-            Base::Status::occActive(value);
-
-            // Start watching for errors
-            addErrorWatch();
-
             // Reset last OCC state
             lastState = 0;
 
+            // Start watching for errors (throttles, etc)
+            try
+            {
+                addErrorWatch();
+            }
+            catch (const OpenFailure& e)
+            {
+                // Failed to add watch for throttle events, request reset to try
+                // to recover comm
+                log<level::ERR>(
+                    std::format(
+                        "Status::occActive: Unable to add error watch(s) for OCC{} watch: {}",
+                        instance, e.what())
+                        .c_str());
+                deviceError(Error::Descriptor(OCC_COMM_ERROR_PATH));
+                return Base::Status::occActive(false);
+            }
+
+            // Update the OCC active sensor
+            Base::Status::occActive(value);
+
             if (device.master())
             {
                 // Update powercap bounds from OCC
@@ -104,7 +119,22 @@
         device.setActive(true);
 
         // Add error watch again
-        addErrorWatch();
+        try
+        {
+            addErrorWatch();
+        }
+        catch (const OpenFailure& e)
+        {
+            // Failed to add watch for throttle events, request reset to try to
+            // recover comm
+            log<level::ERR>(
+                std::format(
+                    "Status::occActive: Unable to add error watch(s) again for OCC{} watch: {}",
+                    instance, e.what())
+                    .c_str());
+            deviceError(Error::Descriptor(OCC_COMM_ERROR_PATH));
+            return Base::Status::occActive(false);
+        }
     }
     else if (!value && device.active())
     {
@@ -150,6 +180,7 @@
         std::format(">>Status::resetOCC() - requesting reset for OCC{}",
                     instance)
             .c_str());
+    this->occActive(false);
 #ifdef PLDM
     if (resetCallBack)
     {
@@ -206,7 +237,11 @@
 // Called from Manager::pollerTimerExpired() in preperation to POLL OCC.
 void Status::readOccState()
 {
-    currentOccReadRetriesCount = occReadRetries;
+    if (stateValid)
+    {
+        // Reset retry count (since state is good)
+        currentOccReadRetriesCount = occReadRetries;
+    }
     occReadStateNow();
 }
 
@@ -318,8 +353,8 @@
 
         if (status == CmdStatus::COMM_FAILURE)
         {
-            // Disable and reset to try recovering
-            deviceError();
+            // Disable due to OCC comm failure and reset to try recovering
+            deviceError(Error::Descriptor(OCC_COMM_ERROR_PATH));
         }
     }
 
@@ -333,7 +368,7 @@
     {
         log<level::INFO>(
             std::format(
-                "safeStateDelayExpired: OCC{} is in SAFE state, requesting reset",
+                "safeStateDelayExpired: OCC{} state missing or not valid, requesting reset",
                 instance)
                 .c_str());
         // Disable and reset to try recovering
@@ -352,7 +387,7 @@
 
         if (!hwmonPath.empty())
         {
-            log<level::ERR>(
+            log<level::WARNING>(
                 std::format("Status::getHwmonPath(): path no longer exists: {}",
                             hwmonPath.c_str())
                     .c_str());
@@ -405,7 +440,7 @@
     return hwmonPath;
 }
 
-// Called to read state and upon failure to read after occReadStateFailTimer.
+// Called to read state and handle any errors
 void Status::occReadStateNow()
 {
     unsigned int state;
@@ -425,8 +460,38 @@
     {
         goodFile = true;
         file >> state;
+        // Read the error code (if any) to check status of the read
+        std::ios_base::iostate readState = file.rdstate();
+        if (readState)
+        {
+            // There was a failure reading the file
+            if (lastOccReadStatus != -1)
+            {
+                // Trace error bits
+                std::string errorBits = "";
+                if (readState & std::ios_base::eofbit)
+                {
+                    errorBits += " EOF";
+                }
+                if (readState & std::ios_base::failbit)
+                {
+                    errorBits += " failbit";
+                }
+                if (readState & std::ios_base::badbit)
+                {
+                    errorBits += " badbit";
+                }
+                log<level::ERR>(
+                    std::format(
+                        "readOccState: Failed to read OCC{} state: Read error on I/O operation -{}",
+                        instance, errorBits)
+                        .c_str());
+                lastOccReadStatus = -1;
+            }
+            goodFile = false;
+        }
 
-        if (state != lastState)
+        if (goodFile && (state != lastState))
         {
             // Trace OCC state changes
             log<level::INFO>(
@@ -474,18 +539,22 @@
                     safeStateDelayTimer.setEnabled(false);
                 }
             }
-            // Else not Valid state We would be in SAFE mode.
-            // This captures both SAFE mode, and 0x00, or other invalid
-            // state values.
             else
             {
+                // OCC is in SAFE or some other unsupported state
                 if (!safeStateDelayTimer.isEnabled())
                 {
+                    log<level::ERR>(
+                        std::format(
+                            "readOccState: Invalid OCC{} state of {}, starting safe state delay timer",
+                            instance, state)
+                            .c_str());
                     // start safe delay timer (before requesting reset)
                     using namespace std::literals::chrono_literals;
                     safeStateDelayTimer.restartOnce(60s);
                 }
-                // Not valid state, update sensors to Nan & not functional.
+                // Not a supported state (update sensors to NaN and not
+                // functional)
                 stateValid = false;
             }
 #else
@@ -494,6 +563,13 @@
 #endif
         }
     }
+#ifdef POWER10
+    else
+    {
+        // Unable to read state
+        stateValid = false;
+    }
+#endif
     file.close();
 
     // if failed to Read a state or not a valid state -> Attempt retry
@@ -503,10 +579,15 @@
         if (!goodFile)
         {
             // If not able to read, OCC may be offline
-            log<level::ERR>(
-                std::format("Status::readOccState: open failed (errno={})",
-                            openErrno)
-                    .c_str());
+            if (openErrno != lastOccReadStatus)
+            {
+                log<level::ERR>(
+                    std::format(
+                        "Status::readOccState: open/read failed trying to read OCC{} state (open errno={})",
+                        instance, openErrno)
+                        .c_str());
+                lastOccReadStatus = openErrno;
+            }
         }
         else
         {
@@ -529,26 +610,14 @@
         if (currentOccReadRetriesCount > 0)
         {
             --currentOccReadRetriesCount;
-#ifdef POWER10
-            using namespace std::chrono_literals;
-            occReadStateFailTimer.restartOnce(1s);
-#endif
         }
         else
         {
-#ifdef POWER10
-            if (!stateValid && occActive())
-            {
-                if (!safeStateDelayTimer.isEnabled())
-                {
-                    log<level::ERR>(
-                        "Starting 60 sec delay timer before requesting a reset");
-                    // start safe delay timer (before requesting reset)
-                    using namespace std::literals::chrono_literals;
-                    safeStateDelayTimer.restartOnce(60s);
-                }
-            }
-#else
+            log<level::ERR>(
+                std::format("readOccState: failed to read OCC{} state!",
+                            instance)
+                    .c_str());
+
             // State could not be determined, set it to NO State.
             lastState = 0;
 
@@ -556,9 +625,23 @@
             // Active again.
             stateValid = false;
 
-            // Disable and reset to try recovering
-            deviceError();
-#endif
+            // Disable due to OCC comm failure and reset to try recovering
+            deviceError(Error::Descriptor(OCC_COMM_ERROR_PATH));
+
+            // Reset retry count (for next attempt after recovery)
+            currentOccReadRetriesCount = occReadRetries;
+        }
+    }
+    else
+    {
+        if (lastOccReadStatus != 0)
+        {
+            log<level::INFO>(
+                std::format(
+                    "Status::readOccState: successfully read OCC{} state: {}",
+                    instance, state)
+                    .c_str());
+            lastOccReadStatus = 0; // no error
         }
     }
 }