Handle OCC active sensor updates prior to host runtime
On some systems, occ-control was getting notified that the OCCs were
active before the host reached runtime state. This would prevent
occ-control from starting communication with the OCCs.
The fix will ignore the early OCC Active sensor enabled messages and
once the host gets to runtime, it will re-query the sensors to ensure
they are still active.
Verified on fresh boot, warm boot, BMC reset, warm boot after BMC reset
on a system that exhibited the early sensors and one that did not.
Also removes an unnecessary InternalFailure when a sensor was cleared,
but no OCC objects were found.
Change-Id: Idb6c107cf83d12272aef9179045de73298e6d6b6
Signed-off-by: Chris Cain <cjcain@us.ibm.com>
diff --git a/occ_manager.cpp b/occ_manager.cpp
index 17cd403..0fb5f1d 100644
--- a/occ_manager.cpp
+++ b/occ_manager.cpp
@@ -160,14 +160,21 @@
{
static bool allActiveSensorAvailable = false;
static bool tracedSensorWait = false;
+ static bool waitingForHost = false;
- // Start with the assumption that all are available
- allActiveSensorAvailable = true;
- for (auto& obj : statusObjects)
+ if (open_power::occ::utils::isHostRunning())
{
- if (!obj->occActive())
+ if (waitingForHost)
{
- if (!obj->getPldmSensorReceived())
+ waitingForHost = false;
+ log<level::INFO>("checkAllActiveSensors(): Host is now running");
+ }
+
+ // Start with the assumption that all are available
+ allActiveSensorAvailable = true;
+ for (auto& obj : statusObjects)
+ {
+ if ((!obj->occActive()) && (!obj->getPldmSensorReceived()))
{
auto instance = obj->getOccInstanceID();
// Check if sensor was queued while waiting for discovery
@@ -200,6 +207,15 @@
}
}
}
+ else
+ {
+ if (!waitingForHost)
+ {
+ waitingForHost = true;
+ log<level::INFO>(
+ "checkAllActiveSensors(): Waiting for host to start");
+ }
+ }
if (allActiveSensorAvailable)
{
@@ -319,20 +335,6 @@
void Manager::statusCallBack(instanceID instance, bool status)
{
- using InternalFailure =
- sdbusplus::xyz::openbmc_project::Common::Error::InternalFailure;
-
- // At this time, it won't happen but keeping it
- // here just in case something changes in the future
- if ((activeCount == 0) && (!status))
- {
- log<level::ERR>(
- fmt::format("Invalid update on OCCActive with OCC{}", instance)
- .c_str());
-
- elog<InternalFailure>();
- }
-
if (status == true)
{
// OCC went active
@@ -376,7 +378,17 @@
else
{
// OCC went away
- --activeCount;
+ if (activeCount > 0)
+ {
+ --activeCount;
+ }
+ else
+ {
+ log<level::ERR>(
+ fmt::format("OCC{} disabled, but currently no active OCCs",
+ instance)
+ .c_str());
+ }
if (activeCount == 0)
{
@@ -469,18 +481,56 @@
return instance == obj->getOccInstanceID();
});
+ const bool hostRunning = open_power::occ::utils::isHostRunning();
if (obj != statusObjects.end())
{
- (*obj)->setPldmSensorReceived(true);
- return (*obj)->occActive(status);
+ if (!hostRunning && (status == true))
+ {
+ log<level::WARNING>(
+ fmt::format(
+ "updateOCCActive: Host is not running yet (OCC{} active={}), clearing sensor received",
+ instance, status)
+ .c_str());
+ (*obj)->setPldmSensorReceived(false);
+ if (!waitingForAllOccActiveSensors)
+ {
+ log<level::INFO>(
+ "updateOCCActive: Waiting for Host and all OCC Active Sensors");
+ waitingForAllOccActiveSensors = true;
+ }
+ discoverTimer->restartOnce(30s);
+ return false;
+ }
+ else
+ {
+ log<level::INFO>(fmt::format("updateOCCActive: OCC{} active={}",
+ instance, status)
+ .c_str());
+ (*obj)->setPldmSensorReceived(true);
+ return (*obj)->occActive(status);
+ }
}
else
{
- log<level::WARNING>(
- fmt::format(
- "Manager::updateOCCActive: No status object to update for OCC{} (active={})",
- instance, status)
- .c_str());
+ if (hostRunning)
+ {
+ log<level::WARNING>(
+ fmt::format(
+ "updateOCCActive: No status object to update for OCC{} (active={})",
+ instance, status)
+ .c_str());
+ }
+ else
+ {
+ if (status == true)
+ {
+ log<level::WARNING>(
+ fmt::format(
+ "updateOCCActive: No status objects and Host is not running yet (OCC{} active={})",
+ instance, status)
+ .c_str());
+ }
+ }
if (status == true)
{
// OCC went active
diff --git a/pldm.cpp b/pldm.cpp
index fb2c245..1bcf7a1 100644
--- a/pldm.cpp
+++ b/pldm.cpp
@@ -197,11 +197,6 @@
.c_str());
}
- if (!open_power::occ::utils::isHostRunning())
- {
- log<level::INFO>("PLDM: HOST is not running");
- isRunning = false;
- }
callBack(instance, isRunning);
return;
@@ -274,6 +269,7 @@
fmt::format("clearData: OCC{} / sensorID: 0x{:04X}",
entry.second, entry.first)
.c_str());
+ callBack(entry.second, false);
}
sensorToOCCInstance.clear();
}
@@ -656,7 +652,7 @@
{
if (!pldmResponseReceived)
{
- log<level::ERR>(
+ log<level::WARNING>(
fmt::format(
"pldmRspExpired: timerCallback - timeout waiting for pldm response for OCC{}",
pldmResponseOcc)