Prevent multiple PM complex resets from being queued
- Clear any prior reset request when notified that OCCs are active
- If the OCC state is safe/not valid, do not request a reset
immediately. Start a safe state delay timer and request a reset only
if the OCC does not recover before the timer expires.
- If unable to read the OCC state after a retry, then request a reset.
(no change to this behavior)
Problem: On a system where the OCC went to safe state, the BMC requested
a reset even though HTMT had already requested one, so the PM complex
was reset multiple times unnecessarily.
Tested on Rainier/Fuji
Change-Id: Id40b00e6d3708358478271bb6d5acef804715d4a
Signed-off-by: Chris Cain <cjcain@us.ibm.com>
diff --git a/occ_manager.cpp b/occ_manager.cpp
index 699f4a8..3ae11f6 100644
--- a/occ_manager.cpp
+++ b/occ_manager.cpp
@@ -437,6 +437,16 @@
lg2::error(
"initiateOccRequest: Initiating PM Complex reset due to OCC{INST}",
"INST", instance);
+
+ // Make sure ALL OCC comm stops to all OCCs before the reset
+ for (auto& obj : statusObjects)
+ {
+ if (obj->occActive())
+ {
+ obj->occActive(false);
+ }
+ }
+
#ifdef PLDM
pldmHandle->resetOCC(instance);
#endif
@@ -507,9 +517,20 @@
#endif
}
- // Start poll timer if not already started
+ // Start poll timer if not already started (since at least one OCC is
+ // running)
if (!_pollTimer->isEnabled())
{
+ // An OCC just went active, PM Complex is just coming online so
+ // clear any outstanding reset requests
+ if (resetRequired)
+ {
+ resetRequired = false;
+ lg2::error(
+ "statusCallBack: clearing resetRequired (since OCC{INST} went active, resetInProgress={RIP})",
+ "INST", instance, "RIP", resetInProgress);
+ }
+
lg2::info("Manager: OCCs will be polled every {TIME} seconds",
"TIME", pollInterval);
diff --git a/occ_status.cpp b/occ_status.cpp
index 220fcdd..ed44403 100644
--- a/occ_status.cpp
+++ b/occ_status.cpp
@@ -28,6 +28,7 @@
instance, "STATE", value);
if (value)
{
+ // OCC is active
// Clear prior throttle reason (before setting device active)
updateThrottle(false, THROTTLED_ALL);
@@ -70,7 +71,15 @@
}
else
{
+ // OCC is no longer active
#ifdef POWER10
+ if (sensorsValid)
+ {
+ sensorsValid = false;
+ // Sensors not supported (update to NaN and not functional)
+ manager.setSensorValueToNaN(instance);
+ }
+
if (pmode && device.master())
{
// Prevent mode changes
@@ -411,7 +420,7 @@
fs::path(sysfsName + "." + std::to_string(instance + 1)) / "occ_state";
std::ifstream file;
- bool goodFile = false;
+ bool stateWasRead = false;
// open file.
file.open(filename, std::ios::in);
@@ -420,7 +429,7 @@
// File is open and state can be used.
if (file.is_open() && file.good())
{
- goodFile = true;
+ stateWasRead = true;
file >> state;
// Read the error code (if any) to check status of the read
std::ios_base::iostate readState = file.rdstate();
@@ -448,10 +457,10 @@
"INST", instance, "ERROR", errorBits);
lastOccReadStatus = -1;
}
- goodFile = false;
+ stateWasRead = false;
}
- if (goodFile && (state != lastState))
+ if (stateWasRead && (state != lastState))
{
// Trace OCC state changes
lg2::info(
@@ -489,6 +498,7 @@
{
// Good OCC State then sensors valid again
stateValid = true;
+ sensorsValid = true;
if (safeStateDelayTimer.isEnabled())
{
@@ -502,15 +512,20 @@
if (!safeStateDelayTimer.isEnabled())
{
lg2::error(
- "readOccState: Invalid OCC{INST} state of {STATE}, starting safe state delay timer",
- "INST", instance, "STATE", state);
+ "readOccState: Invalid OCC{INST} state of {STATE} (last state: {PRIOR}), starting safe state delay timer",
+ "INST", instance, "STATE", lg2::hex, state, "PRIOR",
+ lg2::hex, lastState);
// start safe delay timer (before requesting reset)
using namespace std::literals::chrono_literals;
safeStateDelayTimer.restartOnce(60s);
}
- // Not a supported state (update sensors to NaN and not
- // functional)
- stateValid = false;
+
+ if (sensorsValid)
+ {
+ sensorsValid = false;
+ // Sensors not supported (update to NaN and not functional)
+ manager.setSensorValueToNaN(instance);
+ }
}
#else
// Before P10 state not checked, only used good file open.
@@ -527,37 +542,26 @@
#endif
file.close();
- // if failed to Read a state or not a valid state -> Attempt retry
- // after 1 Second delay if allowed.
- if ((!goodFile) || (!stateValid))
+ // if failed to read the OCC state -> Attempt retry
+ if (!stateWasRead)
{
- if (!goodFile)
- {
- // If not able to read, OCC may be offline
- if (openErrno != lastOccReadStatus)
- {
- lg2::error(
- "Status::readOccState: open/read failed trying to read OCC{INST} state (open errno={ERROR})",
- "INST", instance, "ERROR", openErrno);
- lastOccReadStatus = openErrno;
- }
- }
- else
- {
- // else this failed due to state not valid.
- if (state != lastState)
- {
- lg2::error(
- "Status::readOccState: OCC{INST} Invalid state {STATE} (last state: {PRIOR})",
- "INST", instance, "STATE", lg2::hex, state, "PRIOR",
- lg2::hex, lastState);
- }
- }
-
#ifdef READ_OCC_SENSORS
- manager.setSensorValueToNaN(instance);
+ if (sensorsValid)
+ {
+ sensorsValid = false;
+ manager.setSensorValueToNaN(instance);
+ }
#endif
+ // If not able to read, OCC may be offline
+ if (openErrno != lastOccReadStatus)
+ {
+ lg2::error(
+ "Status::readOccState: open/read failed trying to read OCC{INST} state (open errno={ERROR})",
+ "INST", instance, "ERROR", openErrno);
+ lastOccReadStatus = openErrno;
+ }
+
// See occReadRetries for number of retry attempts.
if (currentOccReadRetriesCount > 0)
{
@@ -565,8 +569,9 @@
}
else
{
- lg2::error("readOccState: failed to read OCC{INST} state!", "INST",
- instance);
+ lg2::error(
+ "readOccState: failed to read OCC{INST} state! (last state: {PRIOR})",
+ "INST", instance, "PRIOR", lg2::hex, lastState);
// State could not be determined, set it to NO State.
lastState = 0;
@@ -584,15 +589,11 @@
currentOccReadRetriesCount = occReadRetries;
}
}
- else
+ else if (lastOccReadStatus != 0)
{
- if (lastOccReadStatus != 0)
- {
- lg2::info(
- "Status::readOccState: successfully read OCC{INST} state: {STATE}",
- "INST", instance, "STATE", state);
- lastOccReadStatus = 0; // no error
- }
+ lg2::info("readOccState: successfully read OCC{INST} state: {STATE}",
+ "INST", instance, "STATE", state);
+ lastOccReadStatus = 0; // no error
}
}
diff --git a/occ_status.hpp b/occ_status.hpp
index 6493040..f7ea02c 100644
--- a/occ_status.hpp
+++ b/occ_status.hpp
@@ -287,6 +287,11 @@
/** @brief The Trigger to indicate OCC State is valid or not. */
bool stateValid = false;
+#ifdef POWER10
+ /** @brief The Trigger to indicate OCC sensors are valid or not. */
+ bool sensorsValid = false;
+#endif
+
/** @brief OCC instance to Sensor definitions mapping */
static const std::map<instanceID, sensorDefs> sensorMap;