Improve BMC error handling for OCC comm failures

- Delay starting the OCC reset until all OCCs have been detected (or the
wait times out). This prevents multiple resets from being triggered and
helps detect when the reset has completed (the active sensor is set
after the reset completes).
- Wait for PLDM response to OCC reset and HRESET requests and retry if
they fail
- If HRESET returns NOT_READY, collect SBE FFDC and try OCC reset. A
persistent failure will put the system in safe state.

- Prevent overwriting the dvfs over-temp filename for P10 and beyond,
since the old file is only present in older kernels
- Prevent assert when opening sysfs files. (added catch and then created
an OCC Comm failure PEL, which will force an OCC reset.)
- Check return code after reading sysfs files to confirm success. If
read fails, try reset to recover.

- Updated traces to include which processor/OCC encountered issues.
- Improved recovery to close windows that were leaving the system in a
partially good state.

JIRA: PFES-66
Change-Id: I0b087d0e05bd8562682062e1c662f9e18164a720
Signed-off-by: Chris Cain <cjcain@us.ibm.com>
diff --git a/occ_device.hpp b/occ_device.hpp
index 0f7aaaf..d44409b 100644
--- a/occ_device.hpp
+++ b/occ_device.hpp
@@ -105,6 +105,9 @@
      */
     inline void addErrorWatch(bool poll = true)
     {
+#ifdef POWER10
+        throttleProcTemp.addWatch(poll);
+#else
         try
         {
             throttleProcTemp.addWatch(poll);
@@ -115,6 +118,7 @@
             throttleProcTemp.setFile(devPath / "occ_dvfs_ot");
             throttleProcTemp.addWatch(poll);
         }
+#endif
 
 #ifdef POWER10
         if (master())
diff --git a/occ_errors.cpp b/occ_errors.cpp
index 5c9c48f..bfff111 100644
--- a/occ_errors.cpp
+++ b/occ_errors.cpp
@@ -32,7 +32,8 @@
     if (fd < 0)
     {
         log<level::ERR>(
-            std::format("Error::openFile: open failed (errno={})", open_errno)
+            std::format("Error::openFile: open of {} failed (errno={})",
+                        file.c_str(), open_errno)
                 .c_str());
         elog<OpenFailure>(phosphor::logging::org::open_power::OCC::Device::
                               OpenFailure::CALLOUT_ERRNO(open_errno),
diff --git a/occ_errors.hpp b/occ_errors.hpp
index 8cd97af..e3ae412 100644
--- a/occ_errors.hpp
+++ b/occ_errors.hpp
@@ -20,6 +20,8 @@
 constexpr auto SAFE_ERROR_PATH = "org.open_power.OCC.Device.Error.SafeState";
 constexpr auto MISSING_OCC_SENSORS_PATH =
     "org.open_power.OCC.Firmware.Error.MissingOCCSensors";
+constexpr auto OCC_COMM_ERROR_PATH =
+    "org.open_power.OCC.Device.Error.OpenFailure";
 
 /** @class Error
  *  @brief Monitors for OCC device error condition
diff --git a/occ_manager.cpp b/occ_manager.cpp
index ada95ef..308c67e 100644
--- a/occ_manager.cpp
+++ b/occ_manager.cpp
@@ -236,7 +236,11 @@
 #endif
                     }
 #ifdef PLDM
-                    pldmHandle->checkActiveSensor(obj->getOccInstanceID());
+                    // Ignore active sensor check if the OCCs are being reset
+                    if (!resetInProgress)
+                    {
+                        pldmHandle->checkActiveSensor(obj->getOccInstanceID());
+                    }
 #endif
                     break;
                 }
@@ -284,6 +288,20 @@
             log<level::INFO>(
                 "checkAllActiveSensors(): OCC Active sensors are available");
             waitingForAllOccActiveSensors = false;
+
+            if (resetRequired)
+            {
+                initiateOccRequest(resetInstance);
+
+                if (!waitForAllOccsTimer->isEnabled())
+                {
+                    log<level::WARNING>(
+                        "occsNotAllRunning: Restarting waitForAllOccTimer");
+                    // restart occ wait timer to check status after reset
+                    // completes
+                    waitForAllOccsTimer->restartOnce(60s);
+                }
+            }
         }
         queuedActiveState.clear();
         tracedSensorWait = false;
@@ -353,7 +371,9 @@
                   std::placeholders::_1, std::placeholders::_2)
 #ifdef PLDM
             ,
-        std::bind(std::mem_fn(&pldm::Interface::resetOCC), pldmHandle.get(),
+        // Callback will set flag indicating reset needs to be done
+        // instead of immediately issuing a reset via PLDM.
+        std::bind(std::mem_fn(&Manager::resetOccRequest), this,
                   std::placeholders::_1)
 #endif
             ));
@@ -388,10 +408,71 @@
         ));
 }
 
+// Latch a PM Complex reset request; only the first requesting OCC is recorded.
+void Manager::resetOccRequest(instanceID instance)
+{
+    if (resetRequired)
+    {
+        if (instance != resetInstance)
+        {
+            log<level::WARNING>(
+                std::format(
+                    "resetOccRequest: Ignoring PM Complex reset request for OCC{}, because reset already outstanding for OCC{}",
+                    instance, resetInstance)
+                    .c_str());
+        }
+        return;
+    }
+    resetRequired = true;
+    resetInstance = instance;
+    log<level::ERR>(
+        std::format(
+            "resetOccRequest: PM Complex reset was requested due to OCC{}",
+            instance)
+            .c_str());
+}
+
+// Start the PM Complex reset via PLDM unless one has already been initiated
+void Manager::initiateOccRequest(instanceID instance)
+{
+    if (resetInProgress)
+    {
+        log<level::WARNING>(
+            std::format(
+                "initiateOccRequest: Ignoring PM Complex reset request for OCC{}, because reset already in process for OCC{}",
+                instance, resetInstance)
+                .c_str());
+        return;
+    }
+
+    resetInProgress = true;
+    resetInstance = instance;
+    log<level::ERR>(
+        std::format(
+            "initiateOccRequest: Initiating PM Complex reset due to OCC{}",
+            instance)
+            .c_str());
+#ifdef PLDM
+    pldmHandle->resetOCC(instance);
+#endif
+    // The pending request is now being serviced, so drop the flag
+    resetRequired = false;
+}
+
 void Manager::statusCallBack(instanceID instance, bool status)
 {
     if (status == true)
     {
+        if (resetInProgress)
+        {
+            log<level::INFO>(
+                std::format(
+                    "statusCallBack: Ignoring OCC{} activate because a reset has been initiated due to OCC{}",
+                    instance, resetInstance)
+                    .c_str());
+            return;
+        }
+
         // OCC went active
         ++activeCount;
 
@@ -412,10 +493,29 @@
                 // stop occ wait timer
                 waitForAllOccsTimer->setEnabled(false);
             }
-#endif
 
+            // All OCCs have been found, check if we need a reset
+            if (resetRequired)
+            {
+                initiateOccRequest(resetInstance);
+
+                if (!waitForAllOccsTimer->isEnabled())
+                {
+                    log<level::WARNING>(
+                        "occsNotAllRunning: Restarting waitForAllOccTimer");
+                    // restart occ wait timer
+                    waitForAllOccsTimer->restartOnce(60s);
+                }
+            }
+            else
+            {
+                // Verify master OCC and start presence monitor
+                validateOccMaster();
+            }
+#else
             // Verify master OCC and start presence monitor
             validateOccMaster();
+#endif
         }
 
         // Start poll timer if not already started
@@ -439,7 +539,7 @@
         }
         else
         {
-            log<level::ERR>(
+            log<level::INFO>(
                 std::format("OCC{} disabled, but currently no active OCCs",
                             instance)
                     .c_str());
@@ -449,6 +549,19 @@
         {
             // No OCCs are running
 
+            if (resetInProgress)
+            {
+                // All OCC active sensors are clear (reset should be in
+                // progress)
+                log<level::INFO>(
+                    std::format(
+                        "statusCallBack: Clearing resetInProgress (activeCount={}, OCC{}, status={})",
+                        activeCount, instance, status)
+                        .c_str());
+                resetInProgress = false;
+                resetInstance = 255;
+            }
+
             // Stop OCC poll timer
             if (_pollTimer->isEnabled())
             {
@@ -465,6 +578,14 @@
             }
 #endif
         }
+        else if (resetInProgress)
+        {
+            log<level::INFO>(
+                std::format(
+                    "statusCallBack: Skipping clear of resetInProgress (activeCount={}, OCC{}, status={})",
+                    activeCount, instance, status)
+                    .c_str());
+        }
 #ifdef READ_OCC_SENSORS
         // Clear OCC sensors
         setSensorValueToNaN(instance);
@@ -680,6 +801,10 @@
             }
         }
     }
+
+    // SBE Reset failed, try PM Complex reset
+    log<level::ERR>("sbeHRESETResult: Forcing PM Complex reset");
+    resetOccRequest(instance);
 }
 
 bool Manager::sbeCanDump(unsigned int instance)
@@ -767,11 +892,27 @@
 {
     if (!_pollTimer)
     {
-        log<level::ERR>(
-            "Manager::pollerTimerExpired() ERROR: Timer not defined");
+        log<level::ERR>("pollerTimerExpired() ERROR: Timer not defined");
         return;
     }
 
+#ifdef POWER10
+    if (resetRequired)
+    {
+        log<level::ERR>("pollerTimerExpired() - Initiating PM Complex reset");
+        initiateOccRequest(resetInstance);
+
+        if (!waitForAllOccsTimer->isEnabled())
+        {
+            log<level::WARNING>(
+                "pollerTimerExpired: Restarting waitForAllOccTimer");
+            // restart occ wait timer
+            waitForAllOccsTimer->restartOnce(60s);
+        }
+        return;
+    }
+#endif
+
     for (auto& obj : statusObjects)
     {
         if (!obj->occActive())
@@ -1359,6 +1500,12 @@
 // After the first OCC goes active, this timer will be started (60 seconds)
 void Manager::occsNotAllRunning()
 {
+    if (resetInProgress)
+    {
+        log<level::WARNING>(
+            "occsNotAllRunning: Ignoring waitForAllOccsTimer because reset is in progress");
+        return;
+    }
     if (activeCount != statusObjects.size())
     {
         // Not all OCCs went active
@@ -1370,7 +1517,22 @@
         // Procs may be garded, so may be expected
     }
 
-    validateOccMaster();
+    if (resetRequired)
+    {
+        initiateOccRequest(resetInstance);
+
+        if (!waitForAllOccsTimer->isEnabled())
+        {
+            log<level::WARNING>(
+                "occsNotAllRunning: Restarting waitForAllOccTimer");
+            // restart occ wait timer
+            waitForAllOccsTimer->restartOnce(60s);
+        }
+    }
+    else
+    {
+        validateOccMaster();
+    }
 }
 
 #ifdef PLDM
diff --git a/occ_manager.hpp b/occ_manager.hpp
index 9745682..0c573b6 100644
--- a/occ_manager.hpp
+++ b/occ_manager.hpp
@@ -197,6 +197,13 @@
      */
     void statusCallBack(instanceID instance, bool status);
 
+    /** @brief Set flag that a PM Complex reset is needed (to be initiated
+     * later) */
+    void resetOccRequest(instanceID instance);
+
+    /** @brief Initiate the request to reset the PM Complex (PLDM -> HBRT) */
+    void initiateOccRequest(instanceID instance);
+
     /** @brief Sends a Heartbeat command to host control command handler */
     void sendHeartBeat();
 
@@ -254,6 +261,14 @@
     /** @brief Subscribe to ambient temperature changed events */
     sdbusplus::bus::match_t ambientPropChanged;
 
+    /** @brief Flag to indicate that a PM complex reset needs to happen */
+    bool resetRequired = false;
+    /** @brief Instance number of the OCC/processor that triggered the reset */
+    uint8_t resetInstance = 255;
+    /** @brief Set when a PM complex reset has been issued (to prevent multiple
+     * requests) */
+    bool resetInProgress = false;
+
 #ifdef I2C_OCC
     /** @brief Init Status objects for I2C OCC devices
      *
diff --git a/occ_status.cpp b/occ_status.cpp
index f8a3e8d..7d654b9 100644
--- a/occ_status.cpp
+++ b/occ_status.cpp
@@ -36,15 +36,30 @@
             // Set the device active
             device.setActive(true);
 
-            // Update the OCC active sensor
-            Base::Status::occActive(value);
-
-            // Start watching for errors
-            addErrorWatch();
-
             // Reset last OCC state
             lastState = 0;
 
+            // Start watching for errors (throttles, etc)
+            try
+            {
+                addErrorWatch();
+            }
+            catch (const OpenFailure& e)
+            {
+                // Failed to add watch for throttle events, request reset to try
+                // to recover comm
+                log<level::ERR>(
+                    std::format(
+                        "Status::occActive: Unable to add error watch(s) for OCC{} watch: {}",
+                        instance, e.what())
+                        .c_str());
+                deviceError(Error::Descriptor(OCC_COMM_ERROR_PATH));
+                return Base::Status::occActive(false);
+            }
+
+            // Update the OCC active sensor
+            Base::Status::occActive(value);
+
             if (device.master())
             {
                 // Update powercap bounds from OCC
@@ -104,7 +119,22 @@
         device.setActive(true);
 
         // Add error watch again
-        addErrorWatch();
+        try
+        {
+            addErrorWatch();
+        }
+        catch (const OpenFailure& e)
+        {
+            // Failed to add watch for throttle events, request reset to try to
+            // recover comm
+            log<level::ERR>(
+                std::format(
+                    "Status::occActive: Unable to add error watch(s) again for OCC{} watch: {}",
+                    instance, e.what())
+                    .c_str());
+            deviceError(Error::Descriptor(OCC_COMM_ERROR_PATH));
+            return Base::Status::occActive(false);
+        }
     }
     else if (!value && device.active())
     {
@@ -150,6 +180,7 @@
         std::format(">>Status::resetOCC() - requesting reset for OCC{}",
                     instance)
             .c_str());
+    this->occActive(false);
 #ifdef PLDM
     if (resetCallBack)
     {
@@ -206,7 +237,11 @@
 // Called from Manager::pollerTimerExpired() in preperation to POLL OCC.
 void Status::readOccState()
 {
-    currentOccReadRetriesCount = occReadRetries;
+    if (stateValid)
+    {
+        // Reset retry count (since state is good)
+        currentOccReadRetriesCount = occReadRetries;
+    }
     occReadStateNow();
 }
 
@@ -318,8 +353,8 @@
 
         if (status == CmdStatus::COMM_FAILURE)
         {
-            // Disable and reset to try recovering
-            deviceError();
+            // Disable due to OCC comm failure and reset to try recovering
+            deviceError(Error::Descriptor(OCC_COMM_ERROR_PATH));
         }
     }
 
@@ -333,7 +368,7 @@
     {
         log<level::INFO>(
             std::format(
-                "safeStateDelayExpired: OCC{} is in SAFE state, requesting reset",
+                "safeStateDelayExpired: OCC{} state missing or not valid, requesting reset",
                 instance)
                 .c_str());
         // Disable and reset to try recovering
@@ -352,7 +387,7 @@
 
         if (!hwmonPath.empty())
         {
-            log<level::ERR>(
+            log<level::WARNING>(
                 std::format("Status::getHwmonPath(): path no longer exists: {}",
                             hwmonPath.c_str())
                     .c_str());
@@ -405,7 +440,7 @@
     return hwmonPath;
 }
 
-// Called to read state and upon failure to read after occReadStateFailTimer.
+// Called to read state and handle any errors
 void Status::occReadStateNow()
 {
     unsigned int state;
@@ -425,8 +460,38 @@
     {
         goodFile = true;
         file >> state;
+        // Read the error code (if any) to check status of the read
+        std::ios_base::iostate readState = file.rdstate();
+        if (readState)
+        {
+            // There was a failure reading the file
+            if (lastOccReadStatus != -1)
+            {
+                // Trace error bits
+                std::string errorBits = "";
+                if (readState & std::ios_base::eofbit)
+                {
+                    errorBits += " EOF";
+                }
+                if (readState & std::ios_base::failbit)
+                {
+                    errorBits += " failbit";
+                }
+                if (readState & std::ios_base::badbit)
+                {
+                    errorBits += " badbit";
+                }
+                log<level::ERR>(
+                    std::format(
+                        "readOccState: Failed to read OCC{} state: Read error on I/O operation -{}",
+                        instance, errorBits)
+                        .c_str());
+                lastOccReadStatus = -1;
+            }
+            goodFile = false;
+        }
 
-        if (state != lastState)
+        if (goodFile && (state != lastState))
         {
             // Trace OCC state changes
             log<level::INFO>(
@@ -474,18 +539,22 @@
                     safeStateDelayTimer.setEnabled(false);
                 }
             }
-            // Else not Valid state We would be in SAFE mode.
-            // This captures both SAFE mode, and 0x00, or other invalid
-            // state values.
             else
             {
+                // OCC is in SAFE or some other unsupported state
                 if (!safeStateDelayTimer.isEnabled())
                 {
+                    log<level::ERR>(
+                        std::format(
+                            "readOccState: Invalid OCC{} state of {}, starting safe state delay timer",
+                            instance, state)
+                            .c_str());
                     // start safe delay timer (before requesting reset)
                     using namespace std::literals::chrono_literals;
                     safeStateDelayTimer.restartOnce(60s);
                 }
-                // Not valid state, update sensors to Nan & not functional.
+                // Not a supported state (update sensors to NaN and not
+                // functional)
                 stateValid = false;
             }
 #else
@@ -494,6 +563,13 @@
 #endif
         }
     }
+#ifdef POWER10
+    else
+    {
+        // Unable to read state
+        stateValid = false;
+    }
+#endif
     file.close();
 
     // if failed to Read a state or not a valid state -> Attempt retry
@@ -503,10 +579,15 @@
         if (!goodFile)
         {
             // If not able to read, OCC may be offline
-            log<level::ERR>(
-                std::format("Status::readOccState: open failed (errno={})",
-                            openErrno)
-                    .c_str());
+            if (openErrno != lastOccReadStatus)
+            {
+                log<level::ERR>(
+                    std::format(
+                        "Status::readOccState: open/read failed trying to read OCC{} state (open errno={})",
+                        instance, openErrno)
+                        .c_str());
+                lastOccReadStatus = openErrno;
+            }
         }
         else
         {
@@ -529,26 +610,14 @@
         if (currentOccReadRetriesCount > 0)
         {
             --currentOccReadRetriesCount;
-#ifdef POWER10
-            using namespace std::chrono_literals;
-            occReadStateFailTimer.restartOnce(1s);
-#endif
         }
         else
         {
-#ifdef POWER10
-            if (!stateValid && occActive())
-            {
-                if (!safeStateDelayTimer.isEnabled())
-                {
-                    log<level::ERR>(
-                        "Starting 60 sec delay timer before requesting a reset");
-                    // start safe delay timer (before requesting reset)
-                    using namespace std::literals::chrono_literals;
-                    safeStateDelayTimer.restartOnce(60s);
-                }
-            }
-#else
+            log<level::ERR>(
+                std::format("readOccState: failed to read OCC{} state!",
+                            instance)
+                    .c_str());
+
             // State could not be determined, set it to NO State.
             lastState = 0;
 
@@ -556,9 +625,23 @@
             // Active again.
             stateValid = false;
 
-            // Disable and reset to try recovering
-            deviceError();
-#endif
+            // Disable due to OCC comm failure and reset to try recovering
+            deviceError(Error::Descriptor(OCC_COMM_ERROR_PATH));
+
+            // Reset retry count (for next attempt after recovery)
+            currentOccReadRetriesCount = occReadRetries;
+        }
+    }
+    else
+    {
+        if (lastOccReadStatus != 0)
+        {
+            log<level::INFO>(
+                std::format(
+                    "Status::readOccState: successfully read OCC{} state: {}",
+                    instance, state)
+                    .c_str());
+            lastOccReadStatus = 0; // no error
         }
     }
 }
diff --git a/occ_status.hpp b/occ_status.hpp
index a07c272..6493040 100644
--- a/occ_status.hpp
+++ b/occ_status.hpp
@@ -132,10 +132,7 @@
         sdpEvent(sdeventplus::Event::get_default()),
         safeStateDelayTimer(
             sdeventplus::utility::Timer<sdeventplus::ClockId::Monotonic>(
-                sdpEvent, std::bind(&Status::safeStateDelayExpired, this))),
-        occReadStateFailTimer(
-            sdeventplus::utility::Timer<sdeventplus::ClockId::Monotonic>(
-                sdpEvent, std::bind(&Status::occReadStateNow, this)))
+                sdpEvent, std::bind(&Status::safeStateDelayExpired, this)))
 #endif
 
 #ifdef PLDM
@@ -278,6 +275,9 @@
     /** @brief The last state read from the OCC */
     unsigned int lastState = 0;
 
+    /** @brief The last OCC read status (0 = no error) */
+    int lastOccReadStatus = 0;
+
     /** @brief Number of retry attempts to open file and update state. */
     const unsigned int occReadRetries = 1;
 
@@ -353,14 +353,8 @@
      * safe mode. Called to verify and then disable and reset the OCCs.
      */
     void safeStateDelayExpired();
-
-    /**
-     * @brief Timer that is started when OCC read Valid state failed.
-     */
-    sdeventplus::utility::Timer<sdeventplus::ClockId::Monotonic>
-        occReadStateFailTimer;
-
 #endif // POWER10
+
     /** @brief Callback for timer that is started when OCC state
      * was not able to be read. Called to attempt another read when needed.
      */
diff --git a/pldm.cpp b/pldm.cpp
index 3703a02..4e57ece 100644
--- a/pldm.cpp
+++ b/pldm.cpp
@@ -34,6 +34,8 @@
 using Clock = sdeventplus::Clock<clockId>;
 using Timer = Time<clockId>;
 bool Interface::throttleTraces = false;
+enum pldm_msg_type Interface::msgType = MSG_UNDEFINED;
+open_power::occ::instanceID Interface::resetInstance = 0;
 
 void Interface::fetchSensorInfo(uint16_t stateSetId,
                                 SensorToInstance& sensorInstanceMap,
@@ -200,8 +202,9 @@
             else
             {
                 log<level::WARNING>(
-                    std::format("PLDM: Unexpected PLDM state {} for OCC{}",
-                                eventState, instance)
+                    std::format(
+                        "PLDM: Unexpected OCC Active sensor state {} for OCC{}",
+                        eventState, instance)
                         .c_str());
                 validEvent = false;
             }
@@ -212,7 +215,7 @@
                     // Waiting for a response for this OCC, can stop waiting
                     pldmClose();
                 }
-                callBack(instance, isRunning);
+                occActiveCallBack(instance, isRunning);
             }
             return;
         }
@@ -232,10 +235,21 @@
                 outstandingHResets.erase(match);
                 if (eventState == static_cast<EventState>(SBE_HRESET_NOT_READY))
                 {
-                    log<level::INFO>(
+                    log<level::ERR>(
                         std::format("pldm: HRESET is NOT READY (OCC{})",
                                     instance)
                             .c_str());
+                    // Stop OCC comm - OCC not usable until it becomes READY
+                    occActiveCallBack(instance, false);
+                    // Collect SBE FFDC
+                    sbeCallBack(instance, false);
+                    // Try PM Complex reset
+                    log<level::ERR>(
+                        std::format(
+                            "sensorEvent: Requesting OCC reset for OCC{}",
+                            instance)
+                            .c_str());
+                    resetOCC(resetInstance);
                 }
                 else if (eventState ==
                          static_cast<EventState>(SBE_HRESET_READY))
@@ -247,6 +261,17 @@
                 {
                     sbeCallBack(instance, false);
                 }
+                else
+                {
+                    if (eventState ==
+                        static_cast<EventState>(SBE_HRESET_FAILED))
+                        log<level::ERR>(
+                            std::format(
+                                "pldm: Unexpected HRESET state {} (OCC{})",
+                                eventState, instance)
+                                .c_str());
+                    sbeCallBack(instance, false);
+                }
             }
             // else request was not from us
         }
@@ -284,7 +309,7 @@
                 std::format("clearData: OCC{} / sensorID: 0x{:04X}",
                             entry.second, entry.first)
                     .c_str());
-            callBack(entry.second, false);
+            occActiveCallBack(entry.second, false);
         }
         sensorToOCCInstance.clear();
     }
@@ -466,8 +491,10 @@
             return;
         }
 
-        // Send request to reset the OCCs/PM Complex (ignore response)
-        sendPldm(request, occInstanceId, false);
+        // Send request to reset the OCCs/PM Complex (and wait for response)
+        msgType = MSG_OCC_RESET;
+        resetInstance = occInstanceId;
+        sendPldm(request, occInstanceId, true);
     }
     else
     {
@@ -510,8 +537,10 @@
             return;
         }
 
-        // Send request to issue HRESET of SBE (ignore response)
-        sendPldm(request, sbeInstanceId, false);
+        // Send request to issue HRESET of SBE (and wait for response)
+        msgType = MSG_HRESET;
+        resetInstance = sbeInstanceId;
+        sendPldm(request, sbeInstanceId, true);
         outstandingHResets.insert(sbeInstanceId);
     }
     else
@@ -691,6 +720,10 @@
 #elif defined(PLDM_TRANSPORT_WITH_AF_MCTP)
     return openAfMctpTransport();
 #else
+    log<level::ERR>(
+        std::format("pldmOpen: Undefined pldmTransport!, errno={}/{}", errno,
+                    strerror(errno))
+            .c_str());
     return -1;
 #endif
 
@@ -725,13 +758,20 @@
         // Register callback when response is available
         registerPldmRspCallback();
 
+        using namespace std::literals::chrono_literals;
+        std::chrono::duration timeout = 8s;
+        if ((msgType == MSG_OCC_RESET) || (msgType == MSG_HRESET))
+        {
+            timeout = 30s;
+        }
+
         // Send PLDM request
         if (!throttleTraces)
         {
             log<level::INFO>(
                 std::format(
-                    "sendPldm: calling pldm_transport_send_msg(OCC{}, instance:{}, {} bytes)",
-                    instance, pldmInstanceID.value(), request.size())
+                    "sendPldm: calling pldm_transport_send_msg(OCC{}, instance:{}, {} bytes, timeout {})",
+                    instance, pldmInstanceID.value(), request.size(), timeout)
                     .c_str());
         }
         pldmResponseReceived = false;
@@ -755,8 +795,7 @@
         }
 
         // start timer waiting for the response
-        using namespace std::literals::chrono_literals;
-        pldmRspTimer.restartOnce(8s);
+        pldmRspTimer.restartOnce(timeout);
 
         // Wait for response/timeout
     }
@@ -792,14 +831,23 @@
 void Interface::registerPldmRspCallback()
 {
     decltype(eventSource.get()) sourcePtr = nullptr;
-    auto rc = sd_event_add_io(event.get(), &sourcePtr, pldmFd, EPOLLIN,
-                              pldmRspCallback, this);
+    int rc = 0;
+    if ((msgType == MSG_OCC_RESET) || (msgType == MSG_HRESET))
+    {
+        rc = sd_event_add_io(event.get(), &sourcePtr, pldmFd, EPOLLIN,
+                             pldmResetCallback, this);
+    }
+    else
+    {
+        rc = sd_event_add_io(event.get(), &sourcePtr, pldmFd, EPOLLIN,
+                             pldmRspCallback, this);
+    }
     if (rc < 0)
     {
         log<level::ERR>(
             std::format(
-                "registerPldmRspCallback: sd_event_add_io: Error({})={} : fd={}",
-                rc, strerror(-rc), pldmFd)
+                "registerPldmRspCallback: sd_event_add_io: Error({})={} : fd={} (msgType={})",
+                rc, strerror(-rc), pldmFd, msgType)
                 .c_str());
     }
     else
@@ -818,8 +866,8 @@
         {
             log<level::WARNING>(
                 std::format(
-                    "pldmRspExpired: timerCallback - timeout waiting for pldm response for OCC{}",
-                    pldmResponseOcc)
+                    "pldmRspExpired: timerCallback - timeout waiting for pldm response to msg:{} for OCC{}",
+                    msgType, pldmResponseOcc)
                     .c_str());
         }
         pldmResponseTimeout = true;
@@ -827,6 +875,15 @@
         {
             pldmClose();
         }
+        if (msgType == MSG_OCC_RESET)
+        {
+            // reset not acked, try again
+            log<level::ERR>(
+                std::format("pldmRspExpired: retrying reset request for OCC{}",
+                            pldmResponseOcc)
+                    .c_str());
+            resetOCC(pldmResponseOcc);
+        }
     }
     return;
 };
@@ -963,7 +1020,7 @@
     {
         log<level::INFO>(
             std::format("pldmRspCallback: OCC{} is RUNNING", instance).c_str());
-        pldmIface->callBack(instance, true);
+        pldmIface->occActiveCallBack(instance, true);
     }
     else if (occSensorState ==
              PLDM_STATE_SET_OPERATIONAL_RUNNING_STATUS_DORMANT)
@@ -977,7 +1034,7 @@
         // Setting safe mode true
         pldmIface->safeModeCallBack(true);
 
-        pldmIface->callBack(instance, false);
+        pldmIface->occActiveCallBack(instance, false);
     }
     else if (occSensorState ==
              PLDM_STATE_SET_OPERATIONAL_RUNNING_STATUS_STOPPED)
@@ -985,7 +1042,7 @@
         log<level::INFO>(
             std::format("pldmRspCallback: OCC{} is not running", instance)
                 .c_str());
-        pldmIface->callBack(instance, false);
+        pldmIface->occActiveCallBack(instance, false);
     }
     else
     {
@@ -1007,6 +1064,136 @@
     return 0;
 };
 
+// Event handler for the response to an OCC reset / HRESET PLDM request.
+// Receives the response, stops the response timer, clears the outstanding
+// instance ID and checks the completion code in the first payload byte.
+// A failed MSG_OCC_RESET request is sent again via resetOCC().
+// Returns 0 (and sets pldmResponseReceived) on success and -1 otherwise.
+int Interface::pldmResetCallback(sd_event_source* /*es*/,
+                                 __attribute__((unused)) int fd,
+                                 uint32_t revents, void* userData)
+{
+    // Nothing to receive unless the descriptor is readable
+    if (!(revents & EPOLLIN))
+    {
+        log<level::INFO>(
+            std::format("pldmResetCallback - revents={:08X}", revents).c_str());
+        return -1;
+    }
+
+    auto pldmIface = static_cast<Interface*>(userData);
+
+    // Without an outstanding instance ID no response is expected
+    if (!pldmIface->pldmInstanceID)
+    {
+        log<level::ERR>(
+            "pldmResetCallback: No outstanding PLDM Instance ID found");
+        return -1;
+    }
+
+    uint8_t* responseMsg = nullptr;
+    size_t responseMsgSize{};
+    pldm_tid_t pldmTID = static_cast<pldm_tid_t>(mctpEid);
+
+    if (!throttleTraces)
+    {
+        log<level::INFO>(
+            std::format(
+                "pldmResetCallback: calling pldm_transport_recv_msg() instance:{}",
+                pldmIface->pldmInstanceID.value())
+                .c_str());
+    }
+    auto rc = pldm_transport_recv_msg(pldmIface->pldmTransport, &pldmTID,
+                                      (void**)&responseMsg, &responseMsgSize);
+    // Capture errno before another call can change it
+    int lastErrno = errno;
+    if (rc)
+    {
+        if (!throttleTraces)
+        {
+            log<level::ERR>(
+                std::format(
+                    "pldmResetCallback: pldm_transport_recv_msg failed with rc={}, errno={}/{}",
+                    static_cast<
+                        std::underlying_type_t<pldm_requester_error_codes>>(rc),
+                    lastErrno, strerror(lastErrno))
+                    .c_str());
+        }
+        return -1;
+    }
+
+    // We got the response for the PLDM request msg that was sent
+    if (!throttleTraces)
+    {
+        log<level::INFO>(
+            std::format(
+                "pldmResetCallback: pldm_transport_recv_msg() rsp was {} bytes",
+                responseMsgSize)
+                .c_str());
+    }
+
+    if (pldmIface->pldmRspTimer.isEnabled())
+    {
+        // stop PLDM response timer
+        pldmIface->pldmRspTimer.setEnabled(false);
+    }
+
+    // instance ID should be freed
+    pldmIface->pldmInstanceID = std::nullopt;
+
+    // Set pointer to autodelete
+    std::unique_ptr<uint8_t, decltype(std::free)*> responseMsgPtr{
+        responseMsg, std::free};
+
+    // The completion code is the first payload byte. A response that is
+    // missing or too short to carry one is treated as a failed request
+    // instead of being read past the end of the buffer.
+    auto response = reinterpret_cast<pldm_msg*>(responseMsgPtr.get());
+    uint8_t completionCode = PLDM_ERROR_INVALID_LENGTH;
+    if (response && (responseMsgSize > sizeof(pldm_msg_hdr)))
+    {
+        completionCode = response->payload[0];
+    }
+    else
+    {
+        log<level::ERR>(
+            std::format(
+                "pldmResetCallback: response too short for a completion code ({} bytes)",
+                responseMsgSize)
+                .c_str());
+    }
+
+    if (completionCode != PLDM_SUCCESS)
+    {
+        log<level::ERR>(
+            std::format(
+                "pldmResetCallback: Reset FAILED ({}) - payload[0] was not success: {}",
+                msgType, completionCode)
+                .c_str());
+        pldmIface->pldmClose();
+
+        if (msgType == MSG_OCC_RESET)
+        {
+            // Retry reset request
+            log<level::ERR>(
+                std::format(
+                    "pldmResetCallback: retrying reset request for OCC{}",
+                    resetInstance)
+                    .c_str());
+            pldmIface->resetOCC(resetInstance);
+        }
+        return -1;
+    }
+
+    log<level::INFO>("pldmResetCallback: Reset has been successfully started");
+
+    pldmIface->pldmClose();
+
+    pldmIface->pldmResponseReceived = true;
+
+    return 0;
+}
+
 std::vector<uint8_t>
     Interface::encodeGetStateSensorRequest(uint8_t instance, uint16_t sensorId)
 {
@@ -1084,6 +1271,7 @@
         }
 
         // Send request to PLDM and setup callback for response
+        msgType = MSG_SENSOR_STATUS;
         sendPldm(request, instance, true);
     }
     else
diff --git a/pldm.hpp b/pldm.hpp
index 7238c02..6907a93 100644
--- a/pldm.hpp
+++ b/pldm.hpp
@@ -14,6 +14,14 @@
 #include <sdeventplus/event.hpp>
 #include <sdeventplus/utility/timer.hpp>
 
+enum pldm_msg_type // PLDM request whose response is being awaited
+{
+    MSG_UNDEFINED = 0,
+    MSG_SENSOR_STATUS = 1, // set before the sensor status request is sent
+    MSG_OCC_RESET = 2,     // OCC reset request; retried when it fails
+    MSG_HRESET = 3         // presumably the HRESET request - TODO confirm
+};
+
 namespace pldm
 {
 
@@ -57,14 +65,19 @@
 
     /** @brief Constructs the PLDM Interface object for OCC functions
      *
-     *  @param[in] callBack - callBack handler to invoke when the OCC state
-     *                        changes.
+     *  @param[in] occActiveCallBack - callBack handler to invoke when the OCC
+     *                                 state changes.
+     *  @param[in] sbeCallBack       - callBack handler to invoke when the SBE
+     *                                 state changes.
+     *  @param[in] safeModeCallBack  - callBack handler to invoke when the
+     *                                 system is in safe mode.
      */
     explicit Interface(
-        std::function<bool(open_power::occ::instanceID, bool)> callBack,
+        std::function<bool(open_power::occ::instanceID, bool)>
+            occActiveCallBack,
         std::function<void(open_power::occ::instanceID, bool)> sbeCallBack,
         std::function<void(bool)> safeModeCallBack, EventPtr& event) :
-        callBack(callBack), sbeCallBack(sbeCallBack),
+        occActiveCallBack(occActiveCallBack), sbeCallBack(sbeCallBack),
         safeModeCallBack(safeModeCallBack), event(event),
         pldmEventSignal(
             open_power::occ::utils::getBus(),
@@ -178,7 +191,8 @@
     /** @brief Callback handler to be invoked when the state of the OCC
      *         changes
      */
-    std::function<bool(open_power::occ::instanceID, bool)> callBack = nullptr;
+    std::function<bool(open_power::occ::instanceID, bool)> occActiveCallBack =
+        nullptr;
 
     /** @brief Callback handler to be invoked when the maintenance state of the
      *         SBE changes
@@ -256,6 +270,8 @@
     /** pldm transport instance  */
     struct pldm_transport* pldmTransport = NULL;
 
+    static enum pldm_msg_type msgType; // type of the last PLDM request sent
+
     union TransportImpl
     {
         struct pldm_transport_mctp_demux* mctpDemux;
@@ -272,6 +288,9 @@
      */
     bool pldmResponseTimeout = false;
 
+    /** @brief The instance ID for the OCC/HRESET request */
+    static open_power::occ::instanceID resetInstance;
+
     /** @brief timer event */
     sdeventplus::Event sdpEvent;
 
@@ -396,6 +415,29 @@
      */
     static int pldmRspCallback(sd_event_source* es, int fd, uint32_t revents,
                                void* userData);
+
+    /** @brief callback for an OCC reset / HRESET response event
+     *
+     *  @param[in] es       - Populated event source
+     *  @param[in] fd       - Associated File descriptor
+     *  @param[in] revents  - Type of event
+     *  @param[in] userData - User data that was passed during registration
+     *
+     *  @return             - 0 when the response reports success and -1
+     *                        otherwise
+     */
+    static int pldmResetCallback(sd_event_source* /*es*/,
+                                 __attribute__((unused)) int fd,
+                                 uint32_t revents, void* userData);
 };
 
 } // namespace pldm
+
+template <> // std::format support: prints a pldm_msg_type as an integer
+struct std::formatter<enum pldm_msg_type> : formatter<int>
+{
+    auto format(enum pldm_msg_type m, format_context& ctx) const
+    {
+        return formatter<int>::format(std::to_underlying(m), ctx);
+    }
+};