openpower-occ-control:failure to read OCC state.
Failure to read OCC state set the OCCs sensors to Nan/Not Functional
Tested: cronus error inject on OCC with and without OCC resets.
Signed-off-by: Sheldon Bailey <baileysh@us.ibm.com>
Change-Id: I2a6bb6a431f09ea816979b3a482b54a28e21db53
Signed-off-by: Sheldon Bailey <baileysh@us.ibm.com>
diff --git a/.clang-format b/.clang-format
index abedc25..ddfe2d7 100644
--- a/.clang-format
+++ b/.clang-format
@@ -49,6 +49,7 @@
ConstructorInitializerIndentWidth: 4
ContinuationIndentWidth: 4
Cpp11BracedListStyle: true
+DeriveLineEnding: false
DerivePointerAlignment: false
PointerAlignment: Left
DisableFormat: false
@@ -110,5 +111,6 @@
SpacesInSquareBrackets: false
Standard: Latest
TabWidth: 4
+UseCRLF: false
UseTab: Never
...
diff --git a/occ_command.hpp b/occ_command.hpp
index aee4c58..9dde698 100644
--- a/occ_command.hpp
+++ b/occ_command.hpp
@@ -39,7 +39,7 @@
OBSERVATION = 0x02,
ACTIVE = 0x03,
SAFE = 0x04,
- CHARACTERIZATION = 0x05,
+ CHARACTERIZATION = 0x05
};
enum class SysPwrMode
diff --git a/occ_manager.cpp b/occ_manager.cpp
index a3f246c..eb116e7 100644
--- a/occ_manager.cpp
+++ b/occ_manager.cpp
@@ -162,7 +162,7 @@
pmode,
#endif
std::bind(std::mem_fn(&Manager::statusCallBack), this,
- std::placeholders::_1)
+ std::placeholders::_1, std::placeholders::_2)
#ifdef PLDM
,
std::bind(std::mem_fn(&pldm::Interface::resetOCC), pldmHandle.get(),
@@ -201,7 +201,7 @@
));
}
-void Manager::statusCallBack(bool status)
+void Manager::statusCallBack(instanceID instance, bool status)
{
using InternalFailure =
sdbusplus::xyz::openbmc_project::Common::Error::InternalFailure;
@@ -210,7 +210,10 @@
// here just in case something changes in the future
if ((activeCount == 0) && (!status))
{
- log<level::ERR>("Invalid update on OCCActive");
+ log<level::ERR>(
+ fmt::format("Invalid update on OCCActive with OCC{}", instance)
+ .c_str());
+
elog<InternalFailure>();
}
@@ -278,15 +281,11 @@
waitForAllOccsTimer->setEnabled(false);
}
#endif
-
-#ifdef READ_OCC_SENSORS
- // Clear OCC sensors
- for (auto& obj : statusObjects)
- {
- setSensorValueToNaN(obj->getOccInstanceID());
- }
-#endif
}
+#ifdef READ_OCC_SENSORS
+ // Clear OCC sensors
+ setSensorValueToNonFunctional(instance);
+#endif
}
}
@@ -501,7 +500,7 @@
// OCC is not running yet
#ifdef READ_OCC_SENSORS
auto id = obj->getOccInstanceID();
- setSensorValueToNaN(id);
+ setSensorValueToNonFunctional(id);
#endif
continue;
}
@@ -842,6 +841,22 @@
return;
}
+void Manager::setSensorValueToNonFunctional(uint32_t id) const
+{
+ for (const auto& [sensorPath, occId] : existingSensors)
+ {
+ if (occId == id)
+ {
+ dbus::OccDBusSensors::getOccDBus().setValue(
+ sensorPath, std::numeric_limits<double>::quiet_NaN());
+
+ dbus::OccDBusSensors::getOccDBus().setOperationalStatus(sensorPath,
+ false);
+ }
+ }
+ return;
+}
+
void Manager::getSensorValues(std::unique_ptr<Status>& occ)
{
static bool tracedError[8] = {0};
diff --git a/occ_manager.hpp b/occ_manager.hpp
index c06f6d3..d636976 100644
--- a/occ_manager.hpp
+++ b/occ_manager.hpp
@@ -144,6 +144,12 @@
/** @brief Notify pcap object to update bounds */
void updatePcapBounds() const;
+ /** @brief Set all sensor values of this OCC to NaN and non functional.
+ *
+ * @param[in] id - Id of the OCC.
+ */
+ void setSensorValueToNonFunctional(uint32_t id) const;
+
private:
/** @brief Creates the OCC D-Bus objects.
*/
@@ -173,7 +179,7 @@
*
* @param[in] status - OccActive status
*/
- void statusCallBack(bool status);
+ void statusCallBack(instanceID instance, bool status);
/** @brief Sends a Heartbeat command to host control command handler */
void sendHeartBeat();
diff --git a/occ_status.cpp b/occ_status.cpp
index 7c3658c..9e65155 100644
--- a/occ_status.cpp
+++ b/occ_status.cpp
@@ -49,7 +49,7 @@
// Call into Manager to let know that we have bound
if (this->managerCallBack)
{
- this->managerCallBack(value);
+ this->managerCallBack(instance, value);
}
}
else
@@ -70,7 +70,7 @@
// Call into Manager to let know that we will unbind.
if (this->managerCallBack)
{
- this->managerCallBack(value);
+ this->managerCallBack(instance, value);
}
// Stop watching for errors
@@ -191,83 +191,11 @@
return;
}
+// Called from Manager::pollerTimerExpired() in preperation to POLL OCC.
void Status::readOccState()
{
- unsigned int state;
- const fs::path filename =
- fs::path(DEV_PATH) /
- fs::path(sysfsName + "." + std::to_string(instance + 1)) / "occ_state";
-
- std::ifstream file(filename, std::ios::in);
- const int open_errno = errno;
- if (file)
- {
- file >> state;
- if (state != lastState)
- {
- // Trace OCC state changes
- log<level::INFO>(
- fmt::format("Status::readOccState: OCC{} state 0x{:02X}",
- instance, state)
- .c_str());
- if (state & 0xFFFFFFF8)
- {
- log<level::ERR>(
- fmt::format("Status::readOccState: INVALID STATE from {}!!",
- filename.c_str())
- .c_str());
- }
- lastState = state;
-
-#ifdef POWER10
- if (OccState(state) == OccState::ACTIVE)
- {
- if (pmode && device.master())
- {
- // Set the master OCC on the PowerMode object
- pmode->setMasterOcc(path);
- // Enable mode changes
- pmode->setMasterActive();
-
- // Special processing by master OCC when it goes active
- occsWentActive();
- }
-
- CmdStatus status = sendAmbient();
- if (status != CmdStatus::SUCCESS)
- {
- log<level::ERR>(
- fmt::format(
- "readOccState: Sending Ambient failed with status {}",
- status)
- .c_str());
- }
- }
-
- if (OccState(state) == OccState::SAFE)
- {
- // start safe delay timer (before requesting reset)
- using namespace std::literals::chrono_literals;
- safeStateDelayTimer.restartOnce(60s);
- }
- else if (safeStateDelayTimer.isEnabled())
- {
- // stop safe delay timer (no longer in SAFE state)
- safeStateDelayTimer.setEnabled(false);
- }
-#endif
- }
- file.close();
- }
- else
- {
- // If not able to read, OCC may be offline
- log<level::DEBUG>(
- fmt::format("Status::readOccState: open failed (errno={})",
- open_errno)
- .c_str());
- lastState = 0;
- }
+ currentOccReadRetriesCount = occReadRetries;
+ occReadStateNow();
}
#ifdef POWER10
@@ -450,5 +378,145 @@
return hwmonPath;
}
+// Called to read state and upon failure to read after occReadStateFailTimer.
+void Status::occReadStateNow()
+{
+ unsigned int state;
+ const fs::path filename =
+ fs::path(DEV_PATH) /
+ fs::path(sysfsName + "." + std::to_string(instance + 1)) / "occ_state";
+
+ std::ifstream file;
+ bool goodFile = false;
+
+ // open file.
+ file.open(filename, std::ios::in);
+ const int openErrno = errno;
+
+ // File is open and state can be used.
+ if (file.is_open() && file.good())
+ {
+ goodFile = true;
+ file >> state;
+
+ if (state != lastState)
+ {
+ // Trace OCC state changes
+ log<level::INFO>(
+ fmt::format("Status::readOccState: OCC{} state 0x{:02X}",
+ instance, state)
+ .c_str());
+ lastState = state;
+#ifdef POWER10
+ if (OccState(state) == OccState::ACTIVE)
+ {
+ if (pmode && device.master())
+ {
+ // Set the master OCC on the PowerMode object
+ pmode->setMasterOcc(path);
+ // Enable mode changes
+ pmode->setMasterActive();
+
+ // Special processing by master OCC when it goes active
+ occsWentActive();
+ }
+
+ CmdStatus status = sendAmbient();
+ if (status != CmdStatus::SUCCESS)
+ {
+ log<level::ERR>(
+ fmt::format(
+ "readOccState: Sending Ambient failed with status {}",
+ status)
+ .c_str());
+ }
+ }
+
+ // If OCC in known Good State.
+ if ((OccState(state) == OccState::ACTIVE) ||
+ (OccState(state) == OccState::CHARACTERIZATION) ||
+ (OccState(state) == OccState::OBSERVATION))
+ {
+ // Good OCC State then sensors valid again
+ stateValid = true;
+
+ if (safeStateDelayTimer.isEnabled())
+ {
+ // stop safe delay timer (no longer in SAFE state)
+ safeStateDelayTimer.setEnabled(false);
+ }
+ }
+ // Else not Valid state We would be in SAFE mode.
+ // This captures both SAFE mode, and 0x00, or other invalid
+ // state values.
+ else
+ {
+ if (!safeStateDelayTimer.isEnabled())
+ {
+ // start safe delay timer (before requesting reset)
+ using namespace std::literals::chrono_literals;
+ safeStateDelayTimer.restartOnce(60s);
+ }
+ // Not valid state, update sensors to Nan & not functional.
+ stateValid = false;
+ }
+#else
+ // Before P10 state not checked, only used good file open.
+ stateValid = true;
+#endif
+ }
+ }
+ file.close();
+
+ // if failed to Read a state or not a valid state -> Attempt retry
+ // after 1 Second delay if allowed.
+ if ((!goodFile) || (!stateValid))
+ {
+ if (!goodFile)
+ {
+ // If not able to read, OCC may be offline
+ log<level::ERR>(
+ fmt::format("Status::readOccState: open failed (errno={})",
+ openErrno)
+ .c_str());
+ }
+ else
+ {
+ // else this failed due to state not valid.
+ log<level::ERR>(
+ fmt::format(
+ "Status::readOccState: OCC{} Invalid state 0x{:02X}",
+ instance, state)
+ .c_str());
+ }
+
+#ifdef READ_OCC_SENSORS
+ manager.setSensorValueToNonFunctional(instance);
+#endif
+
+ // See occReadRetries for number of retry attempts.
+ if (currentOccReadRetriesCount > 0)
+ {
+ --currentOccReadRetriesCount;
+#ifdef POWER10
+ using namespace std::chrono_literals;
+ occReadStateFailTimer.restartOnce(1s);
+#endif
+ }
+ else
+ {
+ // State could not be determined, set it to NO State.
+ lastState = 0;
+
+ // Disable the ability to send Failed actions until OCC is
+ // Active again.
+ stateValid = false;
+
+ // Disable and reset to try recovering
+ deviceError();
+ }
+ }
+}
+
} // namespace occ
} // namespace open_power
diff --git a/occ_status.hpp b/occ_status.hpp
index 6376b6c..e02aaea 100644
--- a/occ_status.hpp
+++ b/occ_status.hpp
@@ -79,7 +79,7 @@
#ifdef POWER10
std::unique_ptr<powermode::PowerMode>& powerModeRef,
#endif
- std::function<void(bool)> callBack = nullptr
+ std::function<void(instanceID, bool)> callBack = nullptr
#ifdef PLDM
,
std::function<void(instanceID)> resetCallBack = nullptr
@@ -117,8 +117,12 @@
sdpEvent(sdeventplus::Event::get_default()),
safeStateDelayTimer(
sdeventplus::utility::Timer<sdeventplus::ClockId::Monotonic>(
- sdpEvent, std::bind(&Status::safeStateDelayExpired, this)))
+ sdpEvent, std::bind(&Status::safeStateDelayExpired, this))),
+ occReadStateFailTimer(
+ sdeventplus::utility::Timer<sdeventplus::ClockId::Monotonic>(
+ sdpEvent, std::bind(&Status::occReadStateNow, this)))
#endif
+
#ifdef PLDM
,
resetCallBack(resetCallBack)
@@ -213,7 +217,7 @@
/** @brief Callback handler to be invoked during property change.
* This is a handler in Manager class
*/
- std::function<void(bool)> managerCallBack;
+ std::function<void(instanceID, bool)> managerCallBack;
/** @brief OCC instance number. Ex, 0,1, etc */
unsigned int instance;
@@ -221,6 +225,15 @@
/** @brief The last state read from the OCC */
unsigned int lastState = 0;
+ /** @brief Number of retry attempts to open file and update state. */
+ const unsigned int occReadRetries = 1;
+
+ /** @brief Current number of retries attempted towards occReadRetries. */
+ size_t currentOccReadRetriesCount = 0;
+
+ /** @brief The Trigger to indicate OCC State is valid or not. */
+ bool stateValid = false;
+
/** @brief OCC instance to Sensor definitions mapping */
static const std::map<instanceID, sensorDefs> sensorMap;
@@ -284,7 +297,18 @@
* safe mode. Called to verify and then disable and reset the OCCs.
*/
void safeStateDelayExpired();
+
+ /**
+ * @brief Timer that is started when OCC read Valid state failed.
+ */
+ sdeventplus::utility::Timer<sdeventplus::ClockId::Monotonic>
+ occReadStateFailTimer;
+
#endif // POWER10
+ /** @brief Callback for timer that is started when OCC state
+ * was not able to be read. Called to attempt another read when needed.
+ */
+ void occReadStateNow();
/** @brief Override the sensor name with name from the definition.
* @param[in] estimatedPath - Estimated OCC Dbus object path