Ignore HRESET_NOT_READY state until HRESET completes
After an HRESET has been requested, the code will wait for an HRESET_READY
or HRESET_FAILED status before attempting OCC communication again.
The code will also not clear the entry in outstandingHResets until
READY/FAILED is received, since the reset should still be in progress.
OCC communication will be disabled before the HRESET is requested and
re-enabled if the reset completes successfully. If the reset fails, no
further communication will work.
My testing found that PLDM instance IDs were not getting freed
automatically when receiving a response, so this change will also free
those IDs when the response is received.
Tested on Rainier with recoverable and unrecoverable SBE injects.
'''
Feb 13 18:33:29 p10bmc openpower-occ-control[22740]: readOccState: Failed to read OCC0 state: Read error on I/O operation - failbit badbit
Feb 13 18:33:29 p10bmc openpower-occ-control[22740]: Status::readOccState: open/read failed trying to read OCC0 state (open errno=0)
Feb 13 18:33:29 p10bmc openpower-occ-control[22740]: readOccState: Failed to read OCC0 state: Read error on I/O operation - failbit badbit
Feb 13 18:33:29 p10bmc openpower-occ-control[22740]: Status::readOccState: open/read failed trying to read OCC0 state (open errno=11)
Feb 13 18:33:29 p10bmc openpower-occ-control[22740]: SBE timeout, requesting HRESET (OCC0)
Feb 13 18:33:29 p10bmc openpower-occ-control[22740]: Status::occActive OCC0 changed to False
Feb 13 18:33:29 p10bmc openpower-occ-control[22740]: got id 15 and set PldmInstanceId to 15
Feb 13 18:33:29 p10bmc openpower-occ-control[22740]: openMctpDemuxTransport: pldmFd has fd=9
Feb 13 18:33:29 p10bmc openpower-occ-control[22740]: sendPldm: calling pldm_transport_send_msg(OCC0, instance:15, 8 bytes, timeout 30)
Feb 13 18:33:29 p10bmc openpower-occ-control[22740]: pldmResetCallback: calling pldm_transport_recv_msg() instance:15
Feb 13 18:33:29 p10bmc openpower-occ-control[22740]: pldmResetCallback: pldm_transport_recv_msg() rsp was 4 bytes
Feb 13 18:33:29 p10bmc openpower-occ-control[22740]: pldmResetCallback: Reset has been successfully started
Feb 13 18:33:29 p10bmc openpower-occ-control[22740]: Freed PLDM instance ID 15
Feb 13 18:33:29 p10bmc openpower-occ-control[22740]: pldm: HRESET is NOT READY (OCC0)
Feb 13 18:34:30 p10bmc openpower-occ-control[22740]: HRESET succeeded (OCC0)
Feb 13 18:34:30 p10bmc openpower-occ-control[22740]: Status::occActive OCC0 changed to True
Feb 13 18:34:30 p10bmc openpower-occ-control[22740]: validateOccMaster: OCC0 is master of 4 OCCs
Feb 13 18:34:34 p10bmc openpower-occ-control[22740]: Status::readOccState: OCC0 state 0x3 (lastState: 0x0)
Feb 13 18:34:34 p10bmc openpower-occ-control[22740]: PowerMode::sendModeChange: SET_MODE(12,0) command to OCC0 (9 bytes)
Feb 13 18:34:34 p10bmc openpower-occ-control[22740]: Idle Power Saver Parameters: enabled:True, enter:8%/240s, exit:12%/10s
Feb 13 18:34:34 p10bmc openpower-occ-control[22740]: PowerMode::sendIpsData: SET_CFG_DATA[IPS] command to OCC0 (12 bytes)
Feb 13 18:34:34 p10bmc openpower-occ-control[22740]: Status::readOccState: successfully read OCC0 state: 3
'''
Change-Id: I7e5bc60576e4e8fa6cba4253be535220cb8048ec
Signed-off-by: Chris Cain <cjcain@us.ibm.com>
diff --git a/occ_manager.cpp b/occ_manager.cpp
index 6b0282c..f89c530 100644
--- a/occ_manager.cpp
+++ b/occ_manager.cpp
@@ -457,7 +457,7 @@
if (resetInProgress)
{
lg2::info(
- "statusCallBack: Ignoring OCC{INST} activate because a reset has been initiated due to OCC{INST}",
+ "statusCallBack: Ignoring OCC{INST} activate because a reset has been initiated due to OCC{RINST}",
"INST", instance, "RINST", resetInstance);
return;
}
@@ -628,6 +628,9 @@
setSBEState(instance, SBE_STATE_NOT_USABLE);
#endif
+ // Stop communication with this OCC
+ (*obj)->occActive(false);
+
pldmHandle->sendHRESET(instance);
}
}
@@ -723,6 +726,16 @@
setSBEState(instance, SBE_STATE_BOOTED);
#endif
+ // Re-enable communication with this OCC
+ auto obj = std::find_if(statusObjects.begin(), statusObjects.end(),
+ [instance](const auto& obj) {
+ return instance == obj->getOccInstanceID();
+ });
+ if (obj != statusObjects.end() && (!(*obj)->occActive()))
+ {
+ (*obj)->occActive(true);
+ }
+
return;
}
diff --git a/pldm.cpp b/pldm.cpp
index 6893845..4b1582d 100644
--- a/pldm.cpp
+++ b/pldm.cpp
@@ -216,42 +216,45 @@
outstandingHResets.end(), instance);
if (match != outstandingHResets.end())
{
- outstandingHResets.erase(match);
if (eventState == static_cast<EventState>(SBE_HRESET_NOT_READY))
{
- lg2::error("pldm: HRESET is NOT READY (OCC{INST})", "INST",
- instance);
- // Stop OCC comm - OCC not usable until it becomes READY
- occActiveCallBack(instance, false);
- // Collect SBE FFDC
- sbeCallBack(instance, false);
- // Try PM Complex reset
- lg2::error(
- "sensorEvent: Requesting OCC reset for OCC{INST}",
- "INST", instance);
- resetOCC(resetInstance);
+ lg2::warning("pldm: HRESET is NOT READY (OCC{INST})",
+ "INST", instance);
+ // Keep waiting for status from HRESET
}
else if (eventState ==
static_cast<EventState>(SBE_HRESET_READY))
{
+ // Reset success, clear reset request
+ outstandingHResets.erase(match);
sbeCallBack(instance, true);
}
else if (eventState ==
static_cast<EventState>(SBE_HRESET_FAILED))
{
+ // Reset failed, clear reset request and collect SBE dump
+ outstandingHResets.erase(match);
sbeCallBack(instance, false);
}
else
{
- if (eventState ==
- static_cast<EventState>(SBE_HRESET_FAILED))
- lg2::error(
- "pldm: Unexpected HRESET state {STATE} (OCC{INST})",
- "STATE", eventState, "INST", instance);
- sbeCallBack(instance, false);
+ lg2::warning(
+ "pldm: Unexpected HRESET state {STATE} (OCC{INST})",
+ "STATE", eventState, "INST", instance);
}
}
- // else request was not from us
+ else // request was not due to our HRESET request
+ {
+ if (eventState == static_cast<EventState>(SBE_HRESET_FAILED))
+ {
+ lg2::error(
+ "pldm: Unexpected HRESET state {FAILED} (OCC{INST}) when HRESET not outstanding",
+ "INST", instance);
+
+ // No recovery from failed state, so ensure comm was stopped
+ occActiveCallBack(instance, false);
+ }
+ }
}
}
}
@@ -891,8 +894,7 @@
pldmIface->pldmRspTimer.setEnabled(false);
}
- // instance ID should be freed
- pldmIface->pldmInstanceID = std::nullopt;
+ // instance ID will get freed on pldmClose()
// Set pointer to autodelete
std::unique_ptr<uint8_t, decltype(std::free)*> responseMsgPtr{
@@ -1031,8 +1033,7 @@
pldmIface->pldmRspTimer.setEnabled(false);
}
- // instance ID should be freed
- pldmIface->pldmInstanceID = std::nullopt;
+ // instance ID will get freed on pldmClose()
// Set pointer to autodelete
std::unique_ptr<uint8_t, decltype(std::free)*> responseMsgPtr{