Clean up error cases on boot or when app restarted
- Detect when no master or duplicate masters are found (force reset)
- Prevent sending commands to OCC when OCC is disabled
- Detect OCC state changes to SAFE (force reset if still in SAFE after 60 sec)
Tested on Everest and Rainier hardware
Signed-off-by: Chris Cain <cjcain@us.ibm.com>
Change-Id: I490f182405e11da207b42a0607a532566479bfd9
diff --git a/occ_manager.cpp b/occ_manager.cpp
index c1ecf5b..33711be 100644
--- a/occ_manager.cpp
+++ b/occ_manager.cpp
@@ -27,6 +27,7 @@
constexpr auto maxSuffix = "max";
using namespace phosphor::logging;
+using namespace std::literals::chrono_literals;
template <typename T>
T readFile(const std::string& path)
@@ -72,7 +73,6 @@
// a chance to settle.
prevOCCSearch = occs;
- using namespace std::literals::chrono_literals;
discoverTimer->restartOnce(10s);
}
else
@@ -182,41 +182,80 @@
elog<InternalFailure>();
}
- activeCount += status ? 1 : -1;
-
- // Only start presence detection if all the OCCs are bound
- if (activeCount == statusObjects.size())
+ if (status == true)
{
- for (auto& obj : statusObjects)
+ // OCC went active
+ ++activeCount;
+
+#ifdef POWER10
+ if (activeCount == 1)
{
- obj->addPresenceWatchMaster();
- }
- }
-
- if ((!_pollTimer->isEnabled()) && (activeCount > 0))
- {
- log<level::INFO>(
- fmt::format(
- "Manager::statusCallBack(): {} OCCs will be polled every {} seconds",
- activeCount, pollInterval)
- .c_str());
-
- // Send poll and start OCC poll timer
- pollerTimerExpired();
- }
- else if ((_pollTimer->isEnabled()) && (activeCount == 0))
- {
- // Stop OCC poll timer
- log<level::INFO>(
- "Manager::statusCallBack(): OCCs are not running, stopping poll timer");
- _pollTimer->setEnabled(false);
-
-#ifdef READ_OCC_SENSORS
- for (auto& obj : statusObjects)
- {
- setSensorValueToNaN(obj->getOccInstanceID());
+ // First OCC went active (allow some time for all OCCs to go active)
+ waitForAllOccsTimer->restartOnce(30s);
}
#endif
+
+ if (activeCount == statusObjects.size())
+ {
+#ifdef POWER10
+ // All OCCs are now running
+ if (waitForAllOccsTimer->isEnabled())
+ {
+ // stop occ wait timer
+ waitForAllOccsTimer->setEnabled(false);
+ }
+#endif
+
+ // Verify master OCC and start presence monitor
+ validateOccMaster();
+ }
+
+ // Start poll timer if not already started
+ if (!_pollTimer->isEnabled())
+ {
+ log<level::INFO>(
+ fmt::format(
+ "Manager::statusCallBack(): {} OCCs will be polled every {} seconds",
+ activeCount, pollInterval)
+ .c_str());
+
+ // Send poll and start OCC poll timer
+ pollerTimerExpired();
+ }
+ }
+ else
+ {
+ // OCC went away
+ --activeCount;
+
+ if (activeCount == 0)
+ {
+ // No OCCs are running
+
+ // Stop OCC poll timer
+ if (_pollTimer->isEnabled())
+ {
+ log<level::INFO>(
+ "Manager::statusCallBack(): OCCs are not running, stopping poll timer");
+ _pollTimer->setEnabled(false);
+ }
+
+#ifdef POWER10
+ // stop wait timer
+ if (waitForAllOccsTimer->isEnabled())
+ {
+ waitForAllOccsTimer->setEnabled(false);
+ }
+#endif
+
+#ifdef READ_OCC_SENSORS
+ // Clear OCC sensors
+ for (auto& obj : statusObjects)
+ {
+ setSensorValueToNaN(obj->getOccInstanceID());
+ }
+#endif
+ }
}
}
@@ -410,13 +449,6 @@
void Manager::pollerTimerExpired()
{
- if (activeCount == 0)
- {
- // No OCCs running, so poll timer will not be restarted
- log<level::INFO>(
- "Manager::pollerTimerExpire(): No OCCs running, poll timer not restarted");
- }
-
if (!_pollTimer)
{
log<level::ERR>(
@@ -426,24 +458,40 @@
for (auto& obj : statusObjects)
{
+#ifdef READ_OCC_SENSORS
+ auto id = obj->getOccInstanceID();
+#endif
+ if (!obj->occActive())
+ {
+ // OCC is not running yet
+#ifdef READ_OCC_SENSORS
+ setSensorValueToNaN(id);
+#endif
+ continue;
+ }
+
// Read sysfs to force kernel to poll OCC
obj->readOccState();
#ifdef READ_OCC_SENSORS
// Read occ sensor values
- auto id = obj->getOccInstanceID();
- if (!obj->occActive())
- {
- // Occ not activated
- setSensorValueToNaN(id);
- continue;
- }
getSensorValues(id, obj->isMasterOcc());
#endif
}
- // Restart OCC poll timer
- _pollTimer->restartOnce(std::chrono::seconds(pollInterval));
+ if (activeCount > 0)
+ {
+ // Restart OCC poll timer
+ _pollTimer->restartOnce(std::chrono::seconds(pollInterval));
+ }
+ else
+ {
+ // No OCCs running, so poll timer will not be restarted
+ log<level::INFO>(
+ fmt::format(
+ "Manager::pollerTimerExpired: poll timer will not be restarted")
+ .c_str());
+ }
}
#ifdef READ_OCC_SENSORS
@@ -909,5 +957,66 @@
}
}
+#ifdef POWER10
+// Callback bound to waitForAllOccsTimer: warns when the number of active
+// OCCs differs from the number of Status objects, then validates the master.
+void Manager::occsNotAllRunning()
+{
+    // This also gets called when the occ-control app is restarted: the occ
+    // active sensors do not change in that case, so the Status objects do
+    // not call back into Manager for every OCC.
+
+    const auto expectedCount = statusObjects.size();
+    if (activeCount != expectedCount)
+    {
+        // Only warn on a mismatch. Procs may be garded, so a reset may not
+        // be needed.
+        const auto msg = fmt::format(
+            "occsNotAllRunning: Active OCC count ({}) does not match expected count ({})",
+            activeCount, expectedCount);
+        log<level::WARNING>(msg.c_str());
+    }
+
+    // Runs regardless of the count check above
+    validateOccMaster();
+}
+#endif // POWER10
+
+// Verify that a single master OCC exists and start the presence monitor.
+//
+// addPresenceWatchMaster() is called on every Status object. If more than
+// one object reports isMasterOcc(), a reset is requested through each extra
+// master; if none does, a reset is requested through the first Status
+// object. Nothing can be validated (or reset) when there are no Status
+// objects, so that case is only logged.
+void Manager::validateOccMaster()
+{
+    if (statusObjects.empty())
+    {
+        // front() on an empty container is undefined behavior, and there is
+        // no Status object through which a reset could be requested.
+        log<level::ERR>("validateOccMaster: No OCC status objects found!");
+        return;
+    }
+
+    int masterInstance = -1;
+    for (auto& obj : statusObjects)
+    {
+        obj->addPresenceWatchMaster();
+        if (obj->isMasterOcc())
+        {
+            if (masterInstance == -1)
+            {
+                // First master seen; remember it for later comparison
+                masterInstance = obj->getOccInstanceID();
+            }
+            else
+            {
+                log<level::ERR>(
+                    fmt::format(
+                        "validateOccMaster: Multiple OCC masters! ({} and {})",
+                        masterInstance, obj->getOccInstanceID())
+                        .c_str());
+                // request reset
+                obj->deviceError();
+            }
+        }
+    }
+
+    if (masterInstance < 0)
+    {
+        log<level::ERR>("validateOccMaster: Master OCC not found!");
+        // request reset (statusObjects is known to be non-empty here)
+        statusObjects.front()->deviceError();
+    }
+    else
+    {
+        log<level::INFO>(
+            fmt::format("validateOccMaster: OCC{} is master", masterInstance)
+                .c_str());
+    }
+}
+
} // namespace occ
} // namespace open_power
diff --git a/occ_manager.hpp b/occ_manager.hpp
index a137363..97bf2b1 100644
--- a/occ_manager.hpp
+++ b/occ_manager.hpp
@@ -101,7 +101,11 @@
discoverTimer(
std::make_unique<
sdeventplus::utility::Timer<sdeventplus::ClockId::Monotonic>>(
- sdpEvent, std::bind(&Manager::findAndCreateObjects, this)))
+ sdpEvent, std::bind(&Manager::findAndCreateObjects, this))),
+ waitForAllOccsTimer(
+ std::make_unique<
+ sdeventplus::utility::Timer<sdeventplus::ClockId::Monotonic>>(
+ sdpEvent, std::bind(&Manager::occsNotAllRunning, this)))
#endif
{
#ifdef I2C_OCC
@@ -295,6 +299,19 @@
* any were added since the last check.
*/
std::vector<int> prevOCCSearch;
+
+ /**
+ * @brief Timer used when waiting for OCCs to go active.
+ */
+ std::unique_ptr<
+ sdeventplus::utility::Timer<sdeventplus::ClockId::Monotonic>>
+ waitForAllOccsTimer;
+
+ /** @brief Called when code times out waiting for all OCCs to be running or
+ * after the app is restarted (Status does not callback into
+ * Manager).
+ */
+ void occsNotAllRunning();
#endif
/**
@@ -380,6 +397,11 @@
* @param[in] msg - Data associated with subscribed signal
*/
void ambientCallback(sdbusplus::message::message& msg);
+
+ /** @brief Confirm that a single OCC master was found and start presence
+ * monitoring
+ */
+ void validateOccMaster();
};
} // namespace occ
diff --git a/occ_presence.cpp b/occ_presence.cpp
index ab9b322..e06ecb8 100644
--- a/occ_presence.cpp
+++ b/occ_presence.cpp
@@ -46,9 +46,9 @@
auto occsPresent = std::stoi(data, nullptr, 0);
if (manager.getNumOCCs() != occsPresent)
{
- log<level::INFO>("OCC presence mismatch",
- entry("BMC_OCCS=%d", manager.getNumOCCs()),
- entry("OCC_OCCS=%d", occsPresent));
+ log<level::ERR>(fmt::format("OCC presence mismatch - BMC: {}, OCC: {}",
+ manager.getNumOCCs(), occsPresent)
+ .c_str());
if (callBack)
{
callBack(true);
diff --git a/occ_status.cpp b/occ_status.cpp
index ad0d0ab..a2ab6ab 100644
--- a/occ_status.cpp
+++ b/occ_status.cpp
@@ -52,6 +52,14 @@
this->callBack(value);
}
+#ifdef POWER10
+ if (safeStateDelayTimer.isEnabled())
+ {
+ // stop safe delay timer
+ safeStateDelayTimer.setEnabled(false);
+ }
+#endif
+
// Stop watching for errors
removeErrorWatch();
@@ -184,13 +192,14 @@
lastState = state;
#ifdef POWER10
- if ((OccState(state) == OccState::ACTIVE) && (device.master()))
- {
- // Kernel detected that the master OCC went to active state
- occsWentActive();
- }
if (OccState(state) == OccState::ACTIVE)
{
+ if (device.master())
+ {
+ // Special processing by master OCC when it goes active
+ occsWentActive();
+ }
+
CmdStatus status = sendAmbient();
if (status != CmdStatus::SUCCESS)
{
@@ -201,6 +210,18 @@
.c_str());
}
}
+
+ if (OccState(state) == OccState::SAFE)
+ {
+ // start safe delay timer (before requesting reset)
+ using namespace std::literals::chrono_literals;
+ safeStateDelayTimer.restartOnce(60s);
+ }
+ else if (safeStateDelayTimer.isEnabled())
+ {
+ // stop safe delay timer (no longer in SAFE state)
+ safeStateDelayTimer.setEnabled(false);
+ }
#endif
}
file.close();
@@ -676,6 +697,21 @@
return status;
}
+
+// Callback bound to safeStateDelayTimer: requests an OCC reset, but only
+// when this OCC is still reported as active.
+void Status::safeStateDelayExpired()
+{
+    if (!this->occActive())
+    {
+        // OCC is not active, so no reset is requested
+        return;
+    }
+
+    const auto msg = fmt::format(
+        "safeStateDelayExpired: OCC{} is in SAFE state, requesting reset",
+        instance);
+    log<level::INFO>(msg.c_str());
+
+    // Disable and reset to try recovering
+    deviceError();
+}
#endif // POWER10
} // namespace occ
diff --git a/occ_status.hpp b/occ_status.hpp
index c449ec1..1cbcdb3 100644
--- a/occ_status.hpp
+++ b/occ_status.hpp
@@ -11,6 +11,10 @@
#include <org/open_power/OCC/Status/server.hpp>
#include <sdbusplus/bus.hpp>
#include <sdbusplus/server/object.hpp>
+#ifdef POWER10
+#include <sdeventplus/event.hpp>
+#include <sdeventplus/utility/timer.hpp>
+#endif
#include <functional>
@@ -100,6 +104,13 @@
occCmd(instance, (fs::path(OCC_CONTROL_ROOT) /
(std::string(OCC_NAME) + std::to_string(instance)))
.c_str())
+#ifdef POWER10
+ ,
+ sdpEvent(sdeventplus::Event::get_default()),
+ safeStateDelayTimer(
+ sdeventplus::utility::Timer<sdeventplus::ClockId::Monotonic>(
+ sdpEvent, std::bind(&Status::safeStateDelayExpired, this)))
+#endif
#ifdef PLDM
,
resetCallBack(resetCallBack)
@@ -226,6 +237,11 @@
/** @brief Command object to send commands to the OCC */
OccCommand occCmd;
+#ifdef POWER10
+ /** @brief Event handle passed to safeStateDelayTimer at construction */
+ sdeventplus::Event sdpEvent;
+#endif
+
/** @brief Callback function on host control signals
*
* @param[in] msg - Data associated with subscribed signal
@@ -261,6 +277,17 @@
*/
bool getIPSParms(uint8_t& enterUtil, uint16_t& enterTime, uint8_t& exitUtil,
uint16_t& exitTime);
+
+ /**
+ * @brief Timer that is started when OCC is detected to be in safe mode
+ */
+ sdeventplus::utility::Timer<sdeventplus::ClockId::Monotonic>
+ safeStateDelayTimer;
+
+ /** @brief Callback for timer that is started when OCC was detected to be in
+ * safe mode. Called to verify and then disable and reset the OCCs.
+ */
+ void safeStateDelayExpired();
#endif // POWER10
/** @brief Override the sensor name with name from the definition.