Prevent multiple PM complex resets from being queued

- Clear any prior reset request when notified that OCCs are active
- If OCC state is safe/not valid, do not request a reset immediately.
Instead, start a safe state delay timer and request a reset only if
the OCC does not recover before the timer expires.
- If unable to read the OCC state after a retry, then request a reset.
(no change to this behavior)

Problem: On a system where the OCC went to safe state, the BMC requested
a reset even though HTMT had already requested one, so the PM complex
was reset multiple times unnecessarily.

Tested on Rainier/Fuji

Change-Id: Id40b00e6d3708358478271bb6d5acef804715d4a
Signed-off-by: Chris Cain <cjcain@us.ibm.com>
diff --git a/occ_manager.cpp b/occ_manager.cpp
index 699f4a8..3ae11f6 100644
--- a/occ_manager.cpp
+++ b/occ_manager.cpp
@@ -437,6 +437,16 @@
         lg2::error(
             "initiateOccRequest: Initiating PM Complex reset due to OCC{INST}",
             "INST", instance);
+
+        // Make sure ALL OCC comm stops to all OCCs before the reset
+        for (auto& obj : statusObjects)
+        {
+            if (obj->occActive())
+            {
+                obj->occActive(false);
+            }
+        }
+
 #ifdef PLDM
         pldmHandle->resetOCC(instance);
 #endif
@@ -507,9 +517,20 @@
 #endif
         }
 
-        // Start poll timer if not already started
+        // Start poll timer if not already started (since at least one OCC is
+        // running)
         if (!_pollTimer->isEnabled())
         {
+            // An OCC just went active, PM Complex is just coming online so
+            // clear any outstanding reset requests
+            if (resetRequired)
+            {
+                resetRequired = false;
+                lg2::error(
+                    "statusCallBack: clearing resetRequired (since OCC{INST} went active, resetInProgress={RIP})",
+                    "INST", instance, "RIP", resetInProgress);
+            }
+
             lg2::info("Manager: OCCs will be polled every {TIME} seconds",
                       "TIME", pollInterval);
 
diff --git a/occ_status.cpp b/occ_status.cpp
index 220fcdd..ed44403 100644
--- a/occ_status.cpp
+++ b/occ_status.cpp
@@ -28,6 +28,7 @@
                   instance, "STATE", value);
         if (value)
         {
+            // OCC is active
             // Clear prior throttle reason (before setting device active)
             updateThrottle(false, THROTTLED_ALL);
 
@@ -70,7 +71,15 @@
         }
         else
         {
+            // OCC is no longer active
 #ifdef POWER10
+            if (sensorsValid)
+            {
+                sensorsValid = false;
+                // Sensors not supported (update to NaN and not functional)
+                manager.setSensorValueToNaN(instance);
+            }
+
             if (pmode && device.master())
             {
                 // Prevent mode changes
@@ -411,7 +420,7 @@
         fs::path(sysfsName + "." + std::to_string(instance + 1)) / "occ_state";
 
     std::ifstream file;
-    bool goodFile = false;
+    bool stateWasRead = false;
 
     // open file.
     file.open(filename, std::ios::in);
@@ -420,7 +429,7 @@
     // File is open and state can be used.
     if (file.is_open() && file.good())
     {
-        goodFile = true;
+        stateWasRead = true;
         file >> state;
         // Read the error code (if any) to check status of the read
         std::ios_base::iostate readState = file.rdstate();
@@ -448,10 +457,10 @@
                     "INST", instance, "ERROR", errorBits);
                 lastOccReadStatus = -1;
             }
-            goodFile = false;
+            stateWasRead = false;
         }
 
-        if (goodFile && (state != lastState))
+        if (stateWasRead && (state != lastState))
         {
             // Trace OCC state changes
             lg2::info(
@@ -489,6 +498,7 @@
             {
                 // Good OCC State then sensors valid again
                 stateValid = true;
+                sensorsValid = true;
 
                 if (safeStateDelayTimer.isEnabled())
                 {
@@ -502,15 +512,20 @@
                 if (!safeStateDelayTimer.isEnabled())
                 {
                     lg2::error(
-                        "readOccState: Invalid OCC{INST} state of {STATE}, starting safe state delay timer",
-                        "INST", instance, "STATE", state);
+                        "readOccState: Invalid OCC{INST} state of {STATE} (last state: {PRIOR}), starting safe state delay timer",
+                        "INST", instance, "STATE", lg2::hex, state, "PRIOR",
+                        lg2::hex, lastState);
                     // start safe delay timer (before requesting reset)
                     using namespace std::literals::chrono_literals;
                     safeStateDelayTimer.restartOnce(60s);
                 }
-                // Not a supported state (update sensors to NaN and not
-                // functional)
-                stateValid = false;
+
+                if (sensorsValid)
+                {
+                    sensorsValid = false;
+                    // Sensors not supported (update to NaN and not functional)
+                    manager.setSensorValueToNaN(instance);
+                }
             }
 #else
             // Before P10 state not checked, only used good file open.
@@ -527,37 +542,26 @@
 #endif
     file.close();
 
-    // if failed to Read a state or not a valid state -> Attempt retry
-    // after 1 Second delay if allowed.
-    if ((!goodFile) || (!stateValid))
+    // if failed to read the OCC state -> Attempt retry
+    if (!stateWasRead)
     {
-        if (!goodFile)
-        {
-            // If not able to read, OCC may be offline
-            if (openErrno != lastOccReadStatus)
-            {
-                lg2::error(
-                    "Status::readOccState: open/read failed trying to read OCC{INST} state (open errno={ERROR})",
-                    "INST", instance, "ERROR", openErrno);
-                lastOccReadStatus = openErrno;
-            }
-        }
-        else
-        {
-            // else this failed due to state not valid.
-            if (state != lastState)
-            {
-                lg2::error(
-                    "Status::readOccState: OCC{INST} Invalid state {STATE} (last state: {PRIOR})",
-                    "INST", instance, "STATE", lg2::hex, state, "PRIOR",
-                    lg2::hex, lastState);
-            }
-        }
-
 #ifdef READ_OCC_SENSORS
-        manager.setSensorValueToNaN(instance);
+        if (sensorsValid)
+        {
+            sensorsValid = false;
+            manager.setSensorValueToNaN(instance);
+        }
 #endif
 
+        // If not able to read, OCC may be offline
+        if (openErrno != lastOccReadStatus)
+        {
+            lg2::error(
+                "Status::readOccState: open/read failed trying to read OCC{INST} state (open errno={ERROR})",
+                "INST", instance, "ERROR", openErrno);
+            lastOccReadStatus = openErrno;
+        }
+
         // See occReadRetries for number of retry attempts.
         if (currentOccReadRetriesCount > 0)
         {
@@ -565,8 +569,9 @@
         }
         else
         {
-            lg2::error("readOccState: failed to read OCC{INST} state!", "INST",
-                       instance);
+            lg2::error(
+                "readOccState: failed to read OCC{INST} state! (last state: {PRIOR})",
+                "INST", instance, "PRIOR", lg2::hex, lastState);
 
             // State could not be determined, set it to NO State.
             lastState = 0;
@@ -584,15 +589,11 @@
             currentOccReadRetriesCount = occReadRetries;
         }
     }
-    else
+    else if (lastOccReadStatus != 0)
     {
-        if (lastOccReadStatus != 0)
-        {
-            lg2::info(
-                "Status::readOccState: successfully read OCC{INST} state: {STATE}",
-                "INST", instance, "STATE", state);
-            lastOccReadStatus = 0; // no error
-        }
+        lg2::info("readOccState: successfully read OCC{INST} state: {STATE}",
+                  "INST", instance, "STATE", state);
+        lastOccReadStatus = 0; // no error
     }
 }
 
diff --git a/occ_status.hpp b/occ_status.hpp
index 6493040..f7ea02c 100644
--- a/occ_status.hpp
+++ b/occ_status.hpp
@@ -287,6 +287,11 @@
     /** @brief The Trigger to indicate OCC State is valid or not. */
     bool stateValid = false;
 
+#ifdef POWER10
+    /** @brief The Trigger to indicate OCC sensors are valid or not. */
+    bool sensorsValid = false;
+#endif
+
     /** @brief OCC instance to Sensor definitions mapping */
     static const std::map<instanceID, sensorDefs> sensorMap;