openpower-occ-control:failure to read OCC state.

Failure to read OCC state set the OCCs sensors to Nan/Not Functional
Tested: cronus error inject on OCC with and without OCC resets.
Signed-off-by: Sheldon Bailey <baileysh@us.ibm.com>
Change-Id: I2a6bb6a431f09ea816979b3a482b54a28e21db53
Signed-off-by: Sheldon Bailey <baileysh@us.ibm.com>
diff --git a/.clang-format b/.clang-format
index abedc25..ddfe2d7 100644
--- a/.clang-format
+++ b/.clang-format
@@ -49,6 +49,7 @@
 ConstructorInitializerIndentWidth: 4
 ContinuationIndentWidth: 4
 Cpp11BracedListStyle: true
+DeriveLineEnding: false
 DerivePointerAlignment: false
 PointerAlignment: Left
 DisableFormat:   false
@@ -110,5 +111,6 @@
 SpacesInSquareBrackets: false
 Standard:        Latest
 TabWidth:        4
+UseCRLF: false
 UseTab:          Never
 ...
diff --git a/occ_command.hpp b/occ_command.hpp
index aee4c58..9dde698 100644
--- a/occ_command.hpp
+++ b/occ_command.hpp
@@ -39,7 +39,7 @@
     OBSERVATION = 0x02,
     ACTIVE = 0x03,
     SAFE = 0x04,
-    CHARACTERIZATION = 0x05,
+    CHARACTERIZATION = 0x05
 };
 
 enum class SysPwrMode
diff --git a/occ_manager.cpp b/occ_manager.cpp
index a3f246c..eb116e7 100644
--- a/occ_manager.cpp
+++ b/occ_manager.cpp
@@ -162,7 +162,7 @@
         pmode,
 #endif
         std::bind(std::mem_fn(&Manager::statusCallBack), this,
-                  std::placeholders::_1)
+                  std::placeholders::_1, std::placeholders::_2)
 #ifdef PLDM
             ,
         std::bind(std::mem_fn(&pldm::Interface::resetOCC), pldmHandle.get(),
@@ -201,7 +201,7 @@
                                                                   ));
 }
 
-void Manager::statusCallBack(bool status)
+void Manager::statusCallBack(instanceID instance, bool status)
 {
     using InternalFailure =
         sdbusplus::xyz::openbmc_project::Common::Error::InternalFailure;
@@ -210,7 +210,10 @@
     // here just in case something changes in the future
     if ((activeCount == 0) && (!status))
     {
-        log<level::ERR>("Invalid update on OCCActive");
+        log<level::ERR>(
+            fmt::format("Invalid update on OCCActive with OCC{}", instance)
+                .c_str());
+
         elog<InternalFailure>();
     }
 
@@ -278,15 +281,11 @@
                 waitForAllOccsTimer->setEnabled(false);
             }
 #endif
-
-#ifdef READ_OCC_SENSORS
-            // Clear OCC sensors
-            for (auto& obj : statusObjects)
-            {
-                setSensorValueToNaN(obj->getOccInstanceID());
-            }
-#endif
         }
+#ifdef READ_OCC_SENSORS
+        // Clear OCC sensors
+        setSensorValueToNonFunctional(instance);
+#endif
     }
 }
 
@@ -501,7 +500,7 @@
             // OCC is not running yet
 #ifdef READ_OCC_SENSORS
             auto id = obj->getOccInstanceID();
-            setSensorValueToNaN(id);
+            setSensorValueToNonFunctional(id);
 #endif
             continue;
         }
@@ -842,6 +841,22 @@
     return;
 }
 
+void Manager::setSensorValueToNonFunctional(uint32_t id) const
+{
+    for (const auto& [sensorPath, occId] : existingSensors)
+    {
+        if (occId == id)
+        {
+            dbus::OccDBusSensors::getOccDBus().setValue(
+                sensorPath, std::numeric_limits<double>::quiet_NaN());
+
+            dbus::OccDBusSensors::getOccDBus().setOperationalStatus(sensorPath,
+                                                                    false);
+        }
+    }
+    return;
+}
+
 void Manager::getSensorValues(std::unique_ptr<Status>& occ)
 {
     static bool tracedError[8] = {0};
diff --git a/occ_manager.hpp b/occ_manager.hpp
index c06f6d3..d636976 100644
--- a/occ_manager.hpp
+++ b/occ_manager.hpp
@@ -144,6 +144,12 @@
     /** @brief Notify pcap object to update bounds */
     void updatePcapBounds() const;
 
+    /** @brief Set all sensor values of this OCC to NaN and non functional.
+     *
+     *  @param[in] id - Id of the OCC.
+     */
+    void setSensorValueToNonFunctional(uint32_t id) const;
+
   private:
     /** @brief Creates the OCC D-Bus objects.
      */
@@ -173,7 +179,7 @@
      *
      *  @param[in] status - OccActive status
      */
-    void statusCallBack(bool status);
+    void statusCallBack(instanceID instance, bool status);
 
     /** @brief Sends a Heartbeat command to host control command handler */
     void sendHeartBeat();
diff --git a/occ_status.cpp b/occ_status.cpp
index 7c3658c..9e65155 100644
--- a/occ_status.cpp
+++ b/occ_status.cpp
@@ -49,7 +49,7 @@
             // Call into Manager to let know that we have bound
             if (this->managerCallBack)
             {
-                this->managerCallBack(value);
+                this->managerCallBack(instance, value);
             }
         }
         else
@@ -70,7 +70,7 @@
             // Call into Manager to let know that we will unbind.
             if (this->managerCallBack)
             {
-                this->managerCallBack(value);
+                this->managerCallBack(instance, value);
             }
 
             // Stop watching for errors
@@ -191,83 +191,11 @@
     return;
 }
 
+// Called from Manager::pollerTimerExpired() in preperation to POLL OCC.
 void Status::readOccState()
 {
-    unsigned int state;
-    const fs::path filename =
-        fs::path(DEV_PATH) /
-        fs::path(sysfsName + "." + std::to_string(instance + 1)) / "occ_state";
-
-    std::ifstream file(filename, std::ios::in);
-    const int open_errno = errno;
-    if (file)
-    {
-        file >> state;
-        if (state != lastState)
-        {
-            // Trace OCC state changes
-            log<level::INFO>(
-                fmt::format("Status::readOccState: OCC{} state 0x{:02X}",
-                            instance, state)
-                    .c_str());
-            if (state & 0xFFFFFFF8)
-            {
-                log<level::ERR>(
-                    fmt::format("Status::readOccState: INVALID STATE from {}!!",
-                                filename.c_str())
-                        .c_str());
-            }
-            lastState = state;
-
-#ifdef POWER10
-            if (OccState(state) == OccState::ACTIVE)
-            {
-                if (pmode && device.master())
-                {
-                    // Set the master OCC on the PowerMode object
-                    pmode->setMasterOcc(path);
-                    // Enable mode changes
-                    pmode->setMasterActive();
-
-                    // Special processing by master OCC when it goes active
-                    occsWentActive();
-                }
-
-                CmdStatus status = sendAmbient();
-                if (status != CmdStatus::SUCCESS)
-                {
-                    log<level::ERR>(
-                        fmt::format(
-                            "readOccState: Sending Ambient failed with status {}",
-                            status)
-                            .c_str());
-                }
-            }
-
-            if (OccState(state) == OccState::SAFE)
-            {
-                // start safe delay timer (before requesting reset)
-                using namespace std::literals::chrono_literals;
-                safeStateDelayTimer.restartOnce(60s);
-            }
-            else if (safeStateDelayTimer.isEnabled())
-            {
-                // stop safe delay timer (no longer in SAFE state)
-                safeStateDelayTimer.setEnabled(false);
-            }
-#endif
-        }
-        file.close();
-    }
-    else
-    {
-        // If not able to read, OCC may be offline
-        log<level::DEBUG>(
-            fmt::format("Status::readOccState: open failed (errno={})",
-                        open_errno)
-                .c_str());
-        lastState = 0;
-    }
+    currentOccReadRetriesCount = occReadRetries;
+    occReadStateNow();
 }
 
 #ifdef POWER10
@@ -450,5 +378,145 @@
     return hwmonPath;
 }
 
+// Called to read state and upon failure to read after occReadStateFailTimer.
+void Status::occReadStateNow()
+{
+    unsigned int state;
+    const fs::path filename =
+        fs::path(DEV_PATH) /
+        fs::path(sysfsName + "." + std::to_string(instance + 1)) / "occ_state";
+
+    std::ifstream file;
+    bool goodFile = false;
+
+    // open file.
+    file.open(filename, std::ios::in);
+    const int openErrno = errno;
+
+    // File is open and state can be used.
+    if (file.is_open() && file.good())
+    {
+        goodFile = true;
+        file >> state;
+
+        if (state != lastState)
+        {
+            // Trace OCC state changes
+            log<level::INFO>(
+                fmt::format("Status::readOccState: OCC{} state 0x{:02X}",
+                            instance, state)
+                    .c_str());
+            lastState = state;
+#ifdef POWER10
+            if (OccState(state) == OccState::ACTIVE)
+            {
+                if (pmode && device.master())
+                {
+                    // Set the master OCC on the PowerMode object
+                    pmode->setMasterOcc(path);
+                    // Enable mode changes
+                    pmode->setMasterActive();
+
+                    // Special processing by master OCC when it goes active
+                    occsWentActive();
+                }
+
+                CmdStatus status = sendAmbient();
+                if (status != CmdStatus::SUCCESS)
+                {
+                    log<level::ERR>(
+                        fmt::format(
+                            "readOccState: Sending Ambient failed with status {}",
+                            status)
+                            .c_str());
+                }
+            }
+
+            // If OCC in known Good State.
+            if ((OccState(state) == OccState::ACTIVE) ||
+                (OccState(state) == OccState::CHARACTERIZATION) ||
+                (OccState(state) == OccState::OBSERVATION))
+            {
+                // Good OCC State then sensors valid again
+                stateValid = true;
+
+                if (safeStateDelayTimer.isEnabled())
+                {
+                    // stop safe delay timer (no longer in SAFE state)
+                    safeStateDelayTimer.setEnabled(false);
+                }
+            }
+            // Else not Valid state We would be in SAFE mode.
+            // This captures both SAFE mode, and 0x00, or other invalid
+            // state values.
+            else
+            {
+                if (!safeStateDelayTimer.isEnabled())
+                {
+                    // start safe delay timer (before requesting reset)
+                    using namespace std::literals::chrono_literals;
+                    safeStateDelayTimer.restartOnce(60s);
+                }
+                // Not valid state, update sensors to Nan & not functional.
+                stateValid = false;
+            }
+#else
+            // Before P10 state not checked, only used good file open.
+            stateValid = true;
+#endif
+        }
+    }
+    file.close();
+
+    // if failed to Read a state or not a valid state -> Attempt retry
+    // after 1 Second delay if allowed.
+    if ((!goodFile) || (!stateValid))
+    {
+        if (!goodFile)
+        {
+            // If not able to read, OCC may be offline
+            log<level::ERR>(
+                fmt::format("Status::readOccState: open failed (errno={})",
+                            openErrno)
+                    .c_str());
+        }
+        else
+        {
+            // else this failed due to state not valid.
+            log<level::ERR>(
+                fmt::format(
+                    "Status::readOccState: OCC{} Invalid state 0x{:02X}",
+                    instance, state)
+                    .c_str());
+        }
+
+#ifdef READ_OCC_SENSORS
+        manager.setSensorValueToNonFunctional(instance);
+#endif
+
+        // See occReadRetries for number of retry attempts.
+        if (currentOccReadRetriesCount > 0)
+        {
+            --currentOccReadRetriesCount;
+#ifdef POWER10
+            using namespace std::chrono_literals;
+            occReadStateFailTimer.restartOnce(1s);
+#endif
+        }
+        else
+        {
+            // State could not be determined, set it to NO State.
+            lastState = 0;
+
+            // Disable the ability to send Failed actions until OCC is
+            // Active again.
+            stateValid = false;
+
+            // Disable and reset to try recovering
+            deviceError();
+        }
+    }
+}
+
 } // namespace occ
 } // namespace open_power
diff --git a/occ_status.hpp b/occ_status.hpp
index 6376b6c..e02aaea 100644
--- a/occ_status.hpp
+++ b/occ_status.hpp
@@ -79,7 +79,7 @@
 #ifdef POWER10
            std::unique_ptr<powermode::PowerMode>& powerModeRef,
 #endif
-           std::function<void(bool)> callBack = nullptr
+           std::function<void(instanceID, bool)> callBack = nullptr
 #ifdef PLDM
            ,
            std::function<void(instanceID)> resetCallBack = nullptr
@@ -117,8 +117,12 @@
         sdpEvent(sdeventplus::Event::get_default()),
         safeStateDelayTimer(
             sdeventplus::utility::Timer<sdeventplus::ClockId::Monotonic>(
-                sdpEvent, std::bind(&Status::safeStateDelayExpired, this)))
+                sdpEvent, std::bind(&Status::safeStateDelayExpired, this))),
+        occReadStateFailTimer(
+            sdeventplus::utility::Timer<sdeventplus::ClockId::Monotonic>(
+                sdpEvent, std::bind(&Status::occReadStateNow, this)))
 #endif
+
 #ifdef PLDM
         ,
         resetCallBack(resetCallBack)
@@ -213,7 +217,7 @@
     /** @brief Callback handler to be invoked during property change.
      *         This is a handler in Manager class
      */
-    std::function<void(bool)> managerCallBack;
+    std::function<void(instanceID, bool)> managerCallBack;
 
     /** @brief OCC instance number. Ex, 0,1, etc */
     unsigned int instance;
@@ -221,6 +225,15 @@
     /** @brief The last state read from the OCC */
     unsigned int lastState = 0;
 
+    /** @brief Number of retry attempts to open file and update state. */
+    const unsigned int occReadRetries = 1;
+
+    /** @brief Current number of retries attempted towards occReadRetries. */
+    size_t currentOccReadRetriesCount = 0;
+
+    /** @brief The Trigger to indicate OCC State is valid or not. */
+    bool stateValid = false;
+
     /** @brief OCC instance to Sensor definitions mapping */
     static const std::map<instanceID, sensorDefs> sensorMap;
 
@@ -284,7 +297,18 @@
      * safe mode. Called to verify and then disable and reset the OCCs.
      */
     void safeStateDelayExpired();
+
+    /**
+     * @brief Timer that is started when OCC read Valid state failed.
+     */
+    sdeventplus::utility::Timer<sdeventplus::ClockId::Monotonic>
+        occReadStateFailTimer;
+
 #endif // POWER10
+    /** @brief Callback for timer that is started when OCC state
+     * was not able to be read. Called to attempt another read when needed.
+     */
+    void occReadStateNow();
 
     /** @brief Override the sensor name with name from the definition.
      *  @param[in]  estimatedPath - Estimated OCC Dbus object path