Clean up error cases on boot or when app restarted

- Detect when no master or duplicate masters are found (force reset)
- Prevent sending commands to an OCC while that OCC is disabled
- Detect OCC state changes to SAFE (force reset after 60 seconds in SAFE)

Tested on Everest and Rainier hardware

Signed-off-by: Chris Cain <cjcain@us.ibm.com>
Change-Id: I490f182405e11da207b42a0607a532566479bfd9
diff --git a/occ_manager.cpp b/occ_manager.cpp
index c1ecf5b..33711be 100644
--- a/occ_manager.cpp
+++ b/occ_manager.cpp
@@ -27,6 +27,7 @@
 constexpr auto maxSuffix = "max";
 
 using namespace phosphor::logging;
+using namespace std::literals::chrono_literals;
 
 template <typename T>
 T readFile(const std::string& path)
@@ -72,7 +73,6 @@
         // a chance to settle.
         prevOCCSearch = occs;
 
-        using namespace std::literals::chrono_literals;
         discoverTimer->restartOnce(10s);
     }
     else
@@ -182,41 +182,80 @@
         elog<InternalFailure>();
     }
 
-    activeCount += status ? 1 : -1;
-
-    // Only start presence detection if all the OCCs are bound
-    if (activeCount == statusObjects.size())
+    if (status == true)
     {
-        for (auto& obj : statusObjects)
+        // OCC went active
+        ++activeCount;
+
+#ifdef POWER10
+        if (activeCount == 1)
         {
-            obj->addPresenceWatchMaster();
-        }
-    }
-
-    if ((!_pollTimer->isEnabled()) && (activeCount > 0))
-    {
-        log<level::INFO>(
-            fmt::format(
-                "Manager::statusCallBack(): {} OCCs will be polled every {} seconds",
-                activeCount, pollInterval)
-                .c_str());
-
-        // Send poll and start OCC poll timer
-        pollerTimerExpired();
-    }
-    else if ((_pollTimer->isEnabled()) && (activeCount == 0))
-    {
-        // Stop OCC poll timer
-        log<level::INFO>(
-            "Manager::statusCallBack(): OCCs are not running, stopping poll timer");
-        _pollTimer->setEnabled(false);
-
-#ifdef READ_OCC_SENSORS
-        for (auto& obj : statusObjects)
-        {
-            setSensorValueToNaN(obj->getOccInstanceID());
+            // First OCC went active (allow some time for all OCCs to go active)
+            waitForAllOccsTimer->restartOnce(30s);
         }
 #endif
+
+        if (activeCount == statusObjects.size())
+        {
+#ifdef POWER10
+            // All OCCs are now running
+            if (waitForAllOccsTimer->isEnabled())
+            {
+                // stop occ wait timer
+                waitForAllOccsTimer->setEnabled(false);
+            }
+#endif
+
+            // Verify master OCC and start presence monitor
+            validateOccMaster();
+        }
+
+        // Start poll timer if not already started
+        if (!_pollTimer->isEnabled())
+        {
+            log<level::INFO>(
+                fmt::format(
+                    "Manager::statusCallBack(): {} OCCs will be polled every {} seconds",
+                    activeCount, pollInterval)
+                    .c_str());
+
+            // Send poll and start OCC poll timer
+            pollerTimerExpired();
+        }
+    }
+    else
+    {
+        // OCC went away
+        --activeCount;
+
+        if (activeCount == 0)
+        {
+            // No OCCs are running
+
+            // Stop OCC poll timer
+            if (_pollTimer->isEnabled())
+            {
+                log<level::INFO>(
+                    "Manager::statusCallBack(): OCCs are not running, stopping poll timer");
+                _pollTimer->setEnabled(false);
+            }
+
+#ifdef POWER10
+            // stop wait timer
+            if (waitForAllOccsTimer->isEnabled())
+            {
+                waitForAllOccsTimer->setEnabled(false);
+            }
+#endif
+
+#ifdef READ_OCC_SENSORS
+            // Clear OCC sensors
+            for (auto& obj : statusObjects)
+            {
+                setSensorValueToNaN(obj->getOccInstanceID());
+            }
+#endif
+        }
     }
 }
 
@@ -410,13 +449,6 @@
 
 void Manager::pollerTimerExpired()
 {
-    if (activeCount == 0)
-    {
-        // No OCCs running, so poll timer will not be restarted
-        log<level::INFO>(
-            "Manager::pollerTimerExpire(): No OCCs running, poll timer not restarted");
-    }
-
     if (!_pollTimer)
     {
         log<level::ERR>(
@@ -426,24 +458,40 @@
 
     for (auto& obj : statusObjects)
     {
+#ifdef READ_OCC_SENSORS
+        auto id = obj->getOccInstanceID();
+#endif
+        if (!obj->occActive())
+        {
+            // OCC is not running yet
+#ifdef READ_OCC_SENSORS
+            setSensorValueToNaN(id);
+#endif
+            continue;
+        }
+
         // Read sysfs to force kernel to poll OCC
         obj->readOccState();
 
 #ifdef READ_OCC_SENSORS
         // Read occ sensor values
-        auto id = obj->getOccInstanceID();
-        if (!obj->occActive())
-        {
-            // Occ not activated
-            setSensorValueToNaN(id);
-            continue;
-        }
         getSensorValues(id, obj->isMasterOcc());
 #endif
     }
 
-    // Restart OCC poll timer
-    _pollTimer->restartOnce(std::chrono::seconds(pollInterval));
+    if (activeCount > 0)
+    {
+        // Restart OCC poll timer
+        _pollTimer->restartOnce(std::chrono::seconds(pollInterval));
+    }
+    else
+    {
+        // No OCCs running, so poll timer will not be restarted
+        log<level::INFO>(
+            fmt::format(
+                "Manager::pollerTimerExpired: poll timer will not be restarted")
+                .c_str());
+    }
 }
 
 #ifdef READ_OCC_SENSORS
@@ -909,5 +957,66 @@
     }
 }
 
+#ifdef POWER10
+void Manager::occsNotAllRunning()
+{
+    // This function also gets called when the occ-control app is restarted
+    // (the occ active sensors do not change, so the Status objects do not
+    //  call back into Manager for all OCCs).
+
+    if (activeCount != statusObjects.size())
+    {
+        // Not all OCCs went active: log the active vs. expected counts
+        log<level::WARNING>(
+            fmt::format(
+                "occsNotAllRunning: Active OCC count ({}) does not match expected count ({})",
+                activeCount, statusObjects.size())
+                .c_str());
+        // Procs may be garded, so no reset is requested for the mismatch.
+    }
+
+    validateOccMaster();
+}
+#endif // POWER10
+
+// Verify exactly one master OCC was found and start presence monitoring
+void Manager::validateOccMaster()
+{
+    int masterInstance = -1;
+    for (auto& obj : statusObjects)
+    {
+        obj->addPresenceWatchMaster();
+        if (obj->isMasterOcc())
+        {
+            if (masterInstance == -1)
+            {
+                masterInstance = obj->getOccInstanceID();
+            }
+            else
+            {
+                log<level::ERR>(
+                    fmt::format(
+                        "validateOccMaster: Multiple OCC masters! ({} and {})",
+                        masterInstance, obj->getOccInstanceID())
+                        .c_str());
+                // A master was already seen: request reset via this duplicate
+                obj->deviceError();
+            }
+        }
+    }
+    if (masterInstance < 0)
+    {
+        log<level::ERR>("validateOccMaster: Master OCC not found!");
+        // No master: request reset via first OCC (assumes non-empty list)
+        statusObjects.front()->deviceError();
+    }
+    else
+    {
+        log<level::INFO>(
+            fmt::format("validateOccMaster: OCC{} is master", masterInstance)
+                .c_str());
+    }
+}
+
 } // namespace occ
 } // namespace open_power
diff --git a/occ_manager.hpp b/occ_manager.hpp
index a137363..97bf2b1 100644
--- a/occ_manager.hpp
+++ b/occ_manager.hpp
@@ -101,7 +101,11 @@
         discoverTimer(
             std::make_unique<
                 sdeventplus::utility::Timer<sdeventplus::ClockId::Monotonic>>(
-                sdpEvent, std::bind(&Manager::findAndCreateObjects, this)))
+                sdpEvent, std::bind(&Manager::findAndCreateObjects, this))),
+        waitForAllOccsTimer(
+            std::make_unique<
+                sdeventplus::utility::Timer<sdeventplus::ClockId::Monotonic>>(
+                sdpEvent, std::bind(&Manager::occsNotAllRunning, this)))
 #endif
     {
 #ifdef I2C_OCC
@@ -295,6 +299,19 @@
      *        any were added since the last check.
      */
     std::vector<int> prevOCCSearch;
+
+    /**
+     * @brief Timer used when waiting for OCCs to go active.
+     */
+    std::unique_ptr<
+        sdeventplus::utility::Timer<sdeventplus::ClockId::Monotonic>>
+        waitForAllOccsTimer;
+
+    /** @brief Called when code times out waiting for all OCCs to be running or
+     *         after the app is restarted (Status does not call back into
+     *         Manager).
+     */
+    void occsNotAllRunning();
 #endif
 
     /**
@@ -380,6 +397,11 @@
      *  @param[in]  msg - Data associated with subscribed signal
      */
     void ambientCallback(sdbusplus::message::message& msg);
+
+    /** @brief Confirm that a single OCC master was found and start presence
+     * monitoring
+     */
+    void validateOccMaster();
 };
 
 } // namespace occ
diff --git a/occ_presence.cpp b/occ_presence.cpp
index ab9b322..e06ecb8 100644
--- a/occ_presence.cpp
+++ b/occ_presence.cpp
@@ -46,9 +46,9 @@
     auto occsPresent = std::stoi(data, nullptr, 0);
     if (manager.getNumOCCs() != occsPresent)
     {
-        log<level::INFO>("OCC presence mismatch",
-                         entry("BMC_OCCS=%d", manager.getNumOCCs()),
-                         entry("OCC_OCCS=%d", occsPresent));
+        log<level::ERR>(fmt::format("OCC presence mismatch - BMC: {}, OCC: {}",
+                                    manager.getNumOCCs(), occsPresent)
+                            .c_str());
         if (callBack)
         {
             callBack(true);
diff --git a/occ_status.cpp b/occ_status.cpp
index ad0d0ab..a2ab6ab 100644
--- a/occ_status.cpp
+++ b/occ_status.cpp
@@ -52,6 +52,14 @@
                 this->callBack(value);
             }
 
+#ifdef POWER10
+            if (safeStateDelayTimer.isEnabled())
+            {
+                // stop safe delay timer
+                safeStateDelayTimer.setEnabled(false);
+            }
+#endif
+
             // Stop watching for errors
             removeErrorWatch();
 
@@ -184,13 +192,14 @@
             lastState = state;
 
 #ifdef POWER10
-            if ((OccState(state) == OccState::ACTIVE) && (device.master()))
-            {
-                // Kernel detected that the master OCC went to active state
-                occsWentActive();
-            }
             if (OccState(state) == OccState::ACTIVE)
             {
+                if (device.master())
+                {
+                    // Special processing by master OCC when it goes active
+                    occsWentActive();
+                }
+
                 CmdStatus status = sendAmbient();
                 if (status != CmdStatus::SUCCESS)
                 {
@@ -201,6 +210,18 @@
                             .c_str());
                 }
             }
+
+            if (OccState(state) == OccState::SAFE)
+            {
+                // start safe delay timer (before requesting reset)
+                using namespace std::literals::chrono_literals;
+                safeStateDelayTimer.restartOnce(60s);
+            }
+            else if (safeStateDelayTimer.isEnabled())
+            {
+                // stop safe delay timer (no longer in SAFE state)
+                safeStateDelayTimer.setEnabled(false);
+            }
 #endif
         }
         file.close();
@@ -676,6 +697,21 @@
 
     return status;
 }
+
+// Safe delay timer expired: request a reset if this OCC is still active
+void Status::safeStateDelayExpired()
+{
+    if (this->occActive())
+    {
+        log<level::INFO>(
+            fmt::format(
+                "safeStateDelayExpired: OCC{} is in SAFE state, requesting reset",
+                instance)
+                .c_str());
+        // Disable and reset the OCC to try to recover
+        deviceError();
+    }
+}
 #endif // POWER10
 
 } // namespace occ
diff --git a/occ_status.hpp b/occ_status.hpp
index c449ec1..1cbcdb3 100644
--- a/occ_status.hpp
+++ b/occ_status.hpp
@@ -11,6 +11,10 @@
 #include <org/open_power/OCC/Status/server.hpp>
 #include <sdbusplus/bus.hpp>
 #include <sdbusplus/server/object.hpp>
+#ifdef POWER10
+#include <sdeventplus/event.hpp>
+#include <sdeventplus/utility/timer.hpp>
+#endif
 
 #include <functional>
 
@@ -100,6 +104,13 @@
         occCmd(instance, (fs::path(OCC_CONTROL_ROOT) /
                           (std::string(OCC_NAME) + std::to_string(instance)))
                              .c_str())
+#ifdef POWER10
+        ,
+        sdpEvent(sdeventplus::Event::get_default()),
+        safeStateDelayTimer(
+            sdeventplus::utility::Timer<sdeventplus::ClockId::Monotonic>(
+                sdpEvent, std::bind(&Status::safeStateDelayExpired, this)))
+#endif
 #ifdef PLDM
         ,
         resetCallBack(resetCallBack)
@@ -226,6 +237,11 @@
     /** @brief Command object to send commands to the OCC */
     OccCommand occCmd;
 
+#ifdef POWER10
+    /** @brief sdevent Event used to construct safeStateDelayTimer */
+    sdeventplus::Event sdpEvent;
+#endif
+
     /** @brief Callback function on host control signals
      *
      *  @param[in]  msg - Data associated with subscribed signal
@@ -261,6 +277,17 @@
      */
     bool getIPSParms(uint8_t& enterUtil, uint16_t& enterTime, uint8_t& exitUtil,
                      uint16_t& exitTime);
+
+    /**
+     * @brief Timer that is started when OCC is detected to be in safe mode
+     */
+    sdeventplus::utility::Timer<sdeventplus::ClockId::Monotonic>
+        safeStateDelayTimer;
+
+    /** @brief Callback for timer that is started when OCC was detected to be in
+     * safe mode. Called to verify and then disable and reset the OCCs.
+     */
+    void safeStateDelayExpired();
 #endif // POWER10
 
     /** @brief Override the sensor name with name from the definition.