Handle OCC active sensor updates prior to host runtime

On some systems, occ-control was getting notified that the OCCs were
active before the host reached runtime state. This would prevent
occ-control from starting communication with the OCCs.

The fix will ignore the early OCC Active sensor enabled messages and
once the host gets to runtime, it will re-query the sensors to ensure
they are still active.

Verified on fresh boot, warm boot, BMC reset, warm boot after BMC reset
on a system that exhibited the early sensors and one that did not.

Also removes an unnecessary InternalFailure when a sensor was cleared,
but no OCC objects were found.

Change-Id: Idb6c107cf83d12272aef9179045de73298e6d6b6
Signed-off-by: Chris Cain <cjcain@us.ibm.com>
diff --git a/occ_manager.cpp b/occ_manager.cpp
index 17cd403..0fb5f1d 100644
--- a/occ_manager.cpp
+++ b/occ_manager.cpp
@@ -160,14 +160,21 @@
 {
     static bool allActiveSensorAvailable = false;
     static bool tracedSensorWait = false;
+    static bool waitingForHost = false;
 
-    // Start with the assumption that all are available
-    allActiveSensorAvailable = true;
-    for (auto& obj : statusObjects)
+    if (open_power::occ::utils::isHostRunning())
     {
-        if (!obj->occActive())
+        if (waitingForHost)
         {
-            if (!obj->getPldmSensorReceived())
+            waitingForHost = false;
+            log<level::INFO>("checkAllActiveSensors(): Host is now running");
+        }
+
+        // Start with the assumption that all are available
+        allActiveSensorAvailable = true;
+        for (auto& obj : statusObjects)
+        {
+            if ((!obj->occActive()) && (!obj->getPldmSensorReceived()))
             {
                 auto instance = obj->getOccInstanceID();
                 // Check if sensor was queued while waiting for discovery
@@ -200,6 +207,15 @@
             }
         }
     }
+    else
+    {
+        if (!waitingForHost)
+        {
+            waitingForHost = true;
+            log<level::INFO>(
+                "checkAllActiveSensors(): Waiting for host to start");
+        }
+    }
 
     if (allActiveSensorAvailable)
     {
@@ -319,20 +335,6 @@
 
 void Manager::statusCallBack(instanceID instance, bool status)
 {
-    using InternalFailure =
-        sdbusplus::xyz::openbmc_project::Common::Error::InternalFailure;
-
-    // At this time, it won't happen but keeping it
-    // here just in case something changes in the future
-    if ((activeCount == 0) && (!status))
-    {
-        log<level::ERR>(
-            fmt::format("Invalid update on OCCActive with OCC{}", instance)
-                .c_str());
-
-        elog<InternalFailure>();
-    }
-
     if (status == true)
     {
         // OCC went active
@@ -376,7 +378,17 @@
     else
     {
         // OCC went away
-        --activeCount;
+        if (activeCount > 0)
+        {
+            --activeCount;
+        }
+        else
+        {
+            log<level::ERR>(
+                fmt::format("OCC{} disabled, but currently no active OCCs",
+                            instance)
+                    .c_str());
+        }
 
         if (activeCount == 0)
         {
@@ -469,18 +481,56 @@
                                 return instance == obj->getOccInstanceID();
                             });
 
+    const bool hostRunning = open_power::occ::utils::isHostRunning();
     if (obj != statusObjects.end())
     {
-        (*obj)->setPldmSensorReceived(true);
-        return (*obj)->occActive(status);
+        if (!hostRunning && (status == true))
+        {
+            log<level::WARNING>(
+                fmt::format(
+                    "updateOCCActive: Host is not running yet (OCC{} active={}), clearing sensor received",
+                    instance, status)
+                    .c_str());
+            (*obj)->setPldmSensorReceived(false);
+            if (!waitingForAllOccActiveSensors)
+            {
+                log<level::INFO>(
+                    "updateOCCActive: Waiting for Host and all OCC Active Sensors");
+                waitingForAllOccActiveSensors = true;
+            }
+            discoverTimer->restartOnce(30s);
+            return false;
+        }
+        else
+        {
+            log<level::INFO>(fmt::format("updateOCCActive: OCC{} active={}",
+                                         instance, status)
+                                 .c_str());
+            (*obj)->setPldmSensorReceived(true);
+            return (*obj)->occActive(status);
+        }
     }
     else
     {
-        log<level::WARNING>(
-            fmt::format(
-                "Manager::updateOCCActive: No status object to update for OCC{} (active={})",
-                instance, status)
-                .c_str());
+        if (hostRunning)
+        {
+            log<level::WARNING>(
+                fmt::format(
+                    "updateOCCActive: No status object to update for OCC{} (active={})",
+                    instance, status)
+                    .c_str());
+        }
+        else
+        {
+            if (status == true)
+            {
+                log<level::WARNING>(
+                    fmt::format(
+                        "updateOCCActive: No status objects and Host is not running yet (OCC{} active={})",
+                        instance, status)
+                        .c_str());
+            }
+        }
         if (status == true)
         {
             // OCC went active
diff --git a/pldm.cpp b/pldm.cpp
index fb2c245..1bcf7a1 100644
--- a/pldm.cpp
+++ b/pldm.cpp
@@ -197,11 +197,6 @@
                         .c_str());
             }
 
-            if (!open_power::occ::utils::isHostRunning())
-            {
-                log<level::INFO>("PLDM: HOST is not running");
-                isRunning = false;
-            }
             callBack(instance, isRunning);
 
             return;
@@ -274,6 +269,7 @@
                 fmt::format("clearData: OCC{} / sensorID: 0x{:04X}",
                             entry.second, entry.first)
                     .c_str());
+            callBack(entry.second, false);
         }
         sensorToOCCInstance.clear();
     }
@@ -656,7 +652,7 @@
 {
     if (!pldmResponseReceived)
     {
-        log<level::ERR>(
+        log<level::WARNING>(
             fmt::format(
                 "pldmRspExpired: timerCallback - timeout waiting for pldm response for OCC{}",
                 pldmResponseOcc)