Clean up error cases on boot or when the app is restarted
- Detect when no master OCC or duplicate master OCCs are found (force reset)
- Prevent sending commands to OCC when OCC is disabled
- Detect OCC state change to SAFE (force reset if still in SAFE after 60 sec)
Tested on Everest and Rainier hardware
Signed-off-by: Chris Cain <cjcain@us.ibm.com>
Change-Id: I490f182405e11da207b42a0607a532566479bfd9
diff --git a/occ_status.cpp b/occ_status.cpp
index ad0d0ab..a2ab6ab 100644
--- a/occ_status.cpp
+++ b/occ_status.cpp
@@ -52,6 +52,14 @@
this->callBack(value);
}
+#ifdef POWER10
+ if (safeStateDelayTimer.isEnabled())
+ {
+ // stop safe delay timer
+ safeStateDelayTimer.setEnabled(false);
+ }
+#endif
+
// Stop watching for errors
removeErrorWatch();
@@ -184,13 +192,14 @@
lastState = state;
#ifdef POWER10
- if ((OccState(state) == OccState::ACTIVE) && (device.master()))
- {
- // Kernel detected that the master OCC went to active state
- occsWentActive();
- }
if (OccState(state) == OccState::ACTIVE)
{
+ if (device.master())
+ {
+ // Special processing by master OCC when it goes active
+ occsWentActive();
+ }
+
CmdStatus status = sendAmbient();
if (status != CmdStatus::SUCCESS)
{
@@ -201,6 +210,18 @@
.c_str());
}
}
+
+ if (OccState(state) == OccState::SAFE)
+ {
+ // start safe delay timer (before requesting reset)
+ using namespace std::literals::chrono_literals;
+ safeStateDelayTimer.restartOnce(60s);
+ }
+ else if (safeStateDelayTimer.isEnabled())
+ {
+ // stop safe delay timer (no longer in SAFE state)
+ safeStateDelayTimer.setEnabled(false);
+ }
#endif
}
file.close();
@@ -676,6 +697,21 @@
return status;
}
+
+// Called when SAFE delay timer expires; requests reset if OCC still active
+void Status::safeStateDelayExpired()
+{
+ if (this->occActive())
+ {
+ log<level::INFO>(
+ fmt::format(
+ "safeStateDelayExpired: OCC{} is in SAFE state, requesting reset",
+ instance)
+ .c_str());
+ // Disable and reset the OCC to try recovering
+ deviceError();
+ }
+}
#endif // POWER10
} // namespace occ