Clean up error cases on boot or when app restarted

- Detect when no master or duplicate masters are found (force reset)
- Prevent sending commands to OCC when OCC is disabled
- Detect OCC state changes to SAFE (force reset after 60 sec)

Tested on Everest and Rainier hardware

Signed-off-by: Chris Cain <cjcain@us.ibm.com>
Change-Id: I490f182405e11da207b42a0607a532566479bfd9
diff --git a/occ_status.cpp b/occ_status.cpp
index ad0d0ab..a2ab6ab 100644
--- a/occ_status.cpp
+++ b/occ_status.cpp
@@ -52,6 +52,14 @@
                 this->callBack(value);
             }
 
+#ifdef POWER10
+            if (safeStateDelayTimer.isEnabled())
+            {
+                // stop safe delay timer
+                safeStateDelayTimer.setEnabled(false);
+            }
+#endif
+
             // Stop watching for errors
             removeErrorWatch();
 
@@ -184,13 +192,14 @@
             lastState = state;
 
 #ifdef POWER10
-            if ((OccState(state) == OccState::ACTIVE) && (device.master()))
-            {
-                // Kernel detected that the master OCC went to active state
-                occsWentActive();
-            }
             if (OccState(state) == OccState::ACTIVE)
             {
+                if (device.master())
+                {
+                    // Special processing by master OCC when it goes active
+                    occsWentActive();
+                }
+
                 CmdStatus status = sendAmbient();
                 if (status != CmdStatus::SUCCESS)
                 {
@@ -201,6 +210,18 @@
                             .c_str());
                 }
             }
+
+            if (OccState(state) == OccState::SAFE)
+            {
+                // start safe delay timer (before requesting reset)
+                using namespace std::literals::chrono_literals;
+                safeStateDelayTimer.restartOnce(60s);
+            }
+            else if (safeStateDelayTimer.isEnabled())
+            {
+                // stop safe delay timer (no longer in SAFE state)
+                safeStateDelayTimer.setEnabled(false);
+            }
 #endif
         }
         file.close();
@@ -676,6 +697,21 @@
 
     return status;
 }
+
+// On safe-delay timer expiry: if OCC is active, log and call deviceError()
+void Status::safeStateDelayExpired()
+{
+    if (this->occActive())
+    {
+        log<level::INFO>(
+            fmt::format(
+                "safeStateDelayExpired: OCC{} is in SAFE state, requesting reset",
+                instance)
+                .c_str());
+        // Disable and reset to try recovering
+        deviceError();
+    }
+}
 #endif // POWER10
 
 } // namespace occ