Clean up error cases on boot or when app restarted
- Detect when no or duplicate masters are found (force reset)
- Prevent sending commands to OCC when OCC is disabled
- Detect OCC state changes to SAFE (after 60 sec force reset)
Tested on Everest and Rainier hardware
Signed-off-by: Chris Cain <cjcain@us.ibm.com>
Change-Id: I490f182405e11da207b42a0607a532566479bfd9
diff --git a/occ_manager.cpp b/occ_manager.cpp
index c1ecf5b..33711be 100644
--- a/occ_manager.cpp
+++ b/occ_manager.cpp
@@ -27,6 +27,7 @@
constexpr auto maxSuffix = "max";
using namespace phosphor::logging;
+using namespace std::literals::chrono_literals;
template <typename T>
T readFile(const std::string& path)
@@ -72,7 +73,6 @@
// a chance to settle.
prevOCCSearch = occs;
- using namespace std::literals::chrono_literals;
discoverTimer->restartOnce(10s);
}
else
@@ -182,41 +182,80 @@
elog<InternalFailure>();
}
- activeCount += status ? 1 : -1;
-
- // Only start presence detection if all the OCCs are bound
- if (activeCount == statusObjects.size())
+ if (status == true)
{
- for (auto& obj : statusObjects)
+ // OCC went active
+ ++activeCount;
+
+#ifdef POWER10
+ if (activeCount == 1)
{
- obj->addPresenceWatchMaster();
- }
- }
-
- if ((!_pollTimer->isEnabled()) && (activeCount > 0))
- {
- log<level::INFO>(
- fmt::format(
- "Manager::statusCallBack(): {} OCCs will be polled every {} seconds",
- activeCount, pollInterval)
- .c_str());
-
- // Send poll and start OCC poll timer
- pollerTimerExpired();
- }
- else if ((_pollTimer->isEnabled()) && (activeCount == 0))
- {
- // Stop OCC poll timer
- log<level::INFO>(
- "Manager::statusCallBack(): OCCs are not running, stopping poll timer");
- _pollTimer->setEnabled(false);
-
-#ifdef READ_OCC_SENSORS
- for (auto& obj : statusObjects)
- {
- setSensorValueToNaN(obj->getOccInstanceID());
+ // First OCC went active (allow some time for all OCCs to go active)
+ waitForAllOccsTimer->restartOnce(30s);
}
#endif
+
+ if (activeCount == statusObjects.size())
+ {
+#ifdef POWER10
+ // All OCCs are now running
+ if (waitForAllOccsTimer->isEnabled())
+ {
+ // stop occ wait timer
+ waitForAllOccsTimer->setEnabled(false);
+ }
+#endif
+
+ // Verify master OCC and start presence monitor
+ validateOccMaster();
+ }
+
+ // Start poll timer if not already started
+ if (!_pollTimer->isEnabled())
+ {
+ log<level::INFO>(
+ fmt::format(
+ "Manager::statusCallBack(): {} OCCs will be polled every {} seconds",
+ activeCount, pollInterval)
+ .c_str());
+
+ // Send poll and start OCC poll timer
+ pollerTimerExpired();
+ }
+ }
+ else
+ {
+ // OCC went away
+ --activeCount;
+
+ if (activeCount == 0)
+ {
+ // No OCCs are running
+
+ // Stop OCC poll timer
+ if (_pollTimer->isEnabled())
+ {
+ log<level::INFO>(
+ "Manager::statusCallBack(): OCCs are not running, stopping poll timer");
+ _pollTimer->setEnabled(false);
+ }
+
+#ifdef POWER10
+ // stop wait timer
+ if (waitForAllOccsTimer->isEnabled())
+ {
+ waitForAllOccsTimer->setEnabled(false);
+ }
+#endif
+
+#ifdef READ_OCC_SENSORS
+ // Clear OCC sensors
+ for (auto& obj : statusObjects)
+ {
+ setSensorValueToNaN(obj->getOccInstanceID());
+ }
+#endif
+ }
}
}
@@ -410,13 +449,6 @@
void Manager::pollerTimerExpired()
{
- if (activeCount == 0)
- {
- // No OCCs running, so poll timer will not be restarted
- log<level::INFO>(
- "Manager::pollerTimerExpire(): No OCCs running, poll timer not restarted");
- }
-
if (!_pollTimer)
{
log<level::ERR>(
@@ -426,24 +458,40 @@
for (auto& obj : statusObjects)
{
+#ifdef READ_OCC_SENSORS
+ auto id = obj->getOccInstanceID();
+#endif
+ if (!obj->occActive())
+ {
+ // OCC is not running yet
+#ifdef READ_OCC_SENSORS
+ setSensorValueToNaN(id);
+#endif
+ continue;
+ }
+
// Read sysfs to force kernel to poll OCC
obj->readOccState();
#ifdef READ_OCC_SENSORS
// Read occ sensor values
- auto id = obj->getOccInstanceID();
- if (!obj->occActive())
- {
- // Occ not activated
- setSensorValueToNaN(id);
- continue;
- }
getSensorValues(id, obj->isMasterOcc());
#endif
}
- // Restart OCC poll timer
- _pollTimer->restartOnce(std::chrono::seconds(pollInterval));
+ if (activeCount > 0)
+ {
+ // Restart OCC poll timer
+ _pollTimer->restartOnce(std::chrono::seconds(pollInterval));
+ }
+ else
+ {
+ // No OCCs running, so poll timer will not be restarted
+ log<level::INFO>(
+ fmt::format(
+ "Manager::pollerTimerExpired: poll timer will not be restarted")
+ .c_str());
+ }
}
#ifdef READ_OCC_SENSORS
@@ -909,5 +957,66 @@
}
}
+#ifdef POWER10
+void Manager::occsNotAllRunning()
+{
+ // Function will also gets called when occ-control app gets restarted.
+ // (occ active sensors do not change, so the Status object does not
+ // call Manager back for all OCCs)
+
+ if (activeCount != statusObjects.size())
+ {
+ // Not all OCCs went active
+ log<level::WARNING>(
+ fmt::format(
+ "occsNotAllRunning: Active OCC count ({}) does not match expected count ({})",
+ activeCount, statusObjects.size())
+ .c_str());
+ // Procs may be garded, so may not need reset.
+ }
+
+ validateOccMaster();
+}
+#endif // POWER10
+
+// Verify single master OCC and start presence monitor
+void Manager::validateOccMaster()
+{
+ int masterInstance = -1;
+ for (auto& obj : statusObjects)
+ {
+ obj->addPresenceWatchMaster();
+ if (obj->isMasterOcc())
+ {
+ if (masterInstance == -1)
+ {
+ masterInstance = obj->getOccInstanceID();
+ }
+ else
+ {
+ log<level::ERR>(
+ fmt::format(
+ "validateOccMaster: Multiple OCC masters! ({} and {})",
+ masterInstance, obj->getOccInstanceID())
+ .c_str());
+ // request reset
+ obj->deviceError();
+ }
+ }
+ }
+ if (masterInstance < 0)
+ {
+ log<level::ERR>("validateOccMaster: Master OCC not found!");
+ // request reset
+ statusObjects.front()->deviceError();
+ }
+ else
+ {
+ log<level::INFO>(
+ fmt::format("validateOccMaster: OCC{} is master", masterInstance)
+ .c_str());
+ }
+}
+
} // namespace occ
} // namespace open_power