openpower-occ-control:failure to read OCC state.
Failure to read OCC state set the OCCs sensors to Nan/Not Functional
Tested: cronus error inject on OCC with and without OCC resets.
Signed-off-by: Sheldon Bailey <baileysh@us.ibm.com>
Change-Id: I2a6bb6a431f09ea816979b3a482b54a28e21db53
Signed-off-by: Sheldon Bailey <baileysh@us.ibm.com>
diff --git a/occ_status.cpp b/occ_status.cpp
index 7c3658c..9e65155 100644
--- a/occ_status.cpp
+++ b/occ_status.cpp
@@ -49,7 +49,7 @@
// Call into Manager to let know that we have bound
if (this->managerCallBack)
{
- this->managerCallBack(value);
+ this->managerCallBack(instance, value);
}
}
else
@@ -70,7 +70,7 @@
// Call into Manager to let know that we will unbind.
if (this->managerCallBack)
{
- this->managerCallBack(value);
+ this->managerCallBack(instance, value);
}
// Stop watching for errors
@@ -191,83 +191,11 @@
return;
}
+// Called from Manager::pollerTimerExpired() in preperation to POLL OCC.
void Status::readOccState()
{
- unsigned int state;
- const fs::path filename =
- fs::path(DEV_PATH) /
- fs::path(sysfsName + "." + std::to_string(instance + 1)) / "occ_state";
-
- std::ifstream file(filename, std::ios::in);
- const int open_errno = errno;
- if (file)
- {
- file >> state;
- if (state != lastState)
- {
- // Trace OCC state changes
- log<level::INFO>(
- fmt::format("Status::readOccState: OCC{} state 0x{:02X}",
- instance, state)
- .c_str());
- if (state & 0xFFFFFFF8)
- {
- log<level::ERR>(
- fmt::format("Status::readOccState: INVALID STATE from {}!!",
- filename.c_str())
- .c_str());
- }
- lastState = state;
-
-#ifdef POWER10
- if (OccState(state) == OccState::ACTIVE)
- {
- if (pmode && device.master())
- {
- // Set the master OCC on the PowerMode object
- pmode->setMasterOcc(path);
- // Enable mode changes
- pmode->setMasterActive();
-
- // Special processing by master OCC when it goes active
- occsWentActive();
- }
-
- CmdStatus status = sendAmbient();
- if (status != CmdStatus::SUCCESS)
- {
- log<level::ERR>(
- fmt::format(
- "readOccState: Sending Ambient failed with status {}",
- status)
- .c_str());
- }
- }
-
- if (OccState(state) == OccState::SAFE)
- {
- // start safe delay timer (before requesting reset)
- using namespace std::literals::chrono_literals;
- safeStateDelayTimer.restartOnce(60s);
- }
- else if (safeStateDelayTimer.isEnabled())
- {
- // stop safe delay timer (no longer in SAFE state)
- safeStateDelayTimer.setEnabled(false);
- }
-#endif
- }
- file.close();
- }
- else
- {
- // If not able to read, OCC may be offline
- log<level::DEBUG>(
- fmt::format("Status::readOccState: open failed (errno={})",
- open_errno)
- .c_str());
- lastState = 0;
- }
+ currentOccReadRetriesCount = occReadRetries;
+ occReadStateNow();
}
#ifdef POWER10
@@ -450,5 +378,145 @@
return hwmonPath;
}
+// Called to read state and upon failure to read after occReadStateFailTimer.
+void Status::occReadStateNow()
+{
+ unsigned int state;
+ const fs::path filename =
+ fs::path(DEV_PATH) /
+ fs::path(sysfsName + "." + std::to_string(instance + 1)) / "occ_state";
+
+ std::ifstream file;
+ bool goodFile = false;
+
+ // open file.
+ file.open(filename, std::ios::in);
+ const int openErrno = errno;
+
+ // File is open and state can be used.
+ if (file.is_open() && file.good())
+ {
+ goodFile = true;
+ file >> state;
+
+ if (state != lastState)
+ {
+ // Trace OCC state changes
+ log<level::INFO>(
+ fmt::format("Status::readOccState: OCC{} state 0x{:02X}",
+ instance, state)
+ .c_str());
+ lastState = state;
+#ifdef POWER10
+ if (OccState(state) == OccState::ACTIVE)
+ {
+ if (pmode && device.master())
+ {
+ // Set the master OCC on the PowerMode object
+ pmode->setMasterOcc(path);
+ // Enable mode changes
+ pmode->setMasterActive();
+
+ // Special processing by master OCC when it goes active
+ occsWentActive();
+ }
+
+ CmdStatus status = sendAmbient();
+ if (status != CmdStatus::SUCCESS)
+ {
+ log<level::ERR>(
+ fmt::format(
+ "readOccState: Sending Ambient failed with status {}",
+ status)
+ .c_str());
+ }
+ }
+
+ // If OCC in known Good State.
+ if ((OccState(state) == OccState::ACTIVE) ||
+ (OccState(state) == OccState::CHARACTERIZATION) ||
+ (OccState(state) == OccState::OBSERVATION))
+ {
+ // Good OCC State then sensors valid again
+ stateValid = true;
+
+ if (safeStateDelayTimer.isEnabled())
+ {
+ // stop safe delay timer (no longer in SAFE state)
+ safeStateDelayTimer.setEnabled(false);
+ }
+ }
+ // Else not Valid state We would be in SAFE mode.
+ // This captures both SAFE mode, and 0x00, or other invalid
+ // state values.
+ else
+ {
+ if (!safeStateDelayTimer.isEnabled())
+ {
+ // start safe delay timer (before requesting reset)
+ using namespace std::literals::chrono_literals;
+ safeStateDelayTimer.restartOnce(60s);
+ }
+ // Not valid state, update sensors to Nan & not functional.
+ stateValid = false;
+ }
+#else
+ // Before P10 state not checked, only used good file open.
+ stateValid = true;
+#endif
+ }
+ }
+ file.close();
+
+ // if failed to Read a state or not a valid state -> Attempt retry
+ // after 1 Second delay if allowed.
+ if ((!goodFile) || (!stateValid))
+ {
+ if (!goodFile)
+ {
+ // If not able to read, OCC may be offline
+ log<level::ERR>(
+ fmt::format("Status::readOccState: open failed (errno={})",
+ openErrno)
+ .c_str());
+ }
+ else
+ {
+ // else this failed due to state not valid.
+ log<level::ERR>(
+ fmt::format(
+ "Status::readOccState: OCC{} Invalid state 0x{:02X}",
+ instance, state)
+ .c_str());
+ }
+
+#ifdef READ_OCC_SENSORS
+ manager.setSensorValueToNonFunctional(instance);
+#endif
+
+ // See occReadRetries for number of retry attempts.
+ if (currentOccReadRetriesCount > 0)
+ {
+ --currentOccReadRetriesCount;
+#ifdef POWER10
+ using namespace std::chrono_literals;
+ occReadStateFailTimer.restartOnce(1s);
+#endif
+ }
+ else
+ {
+ // State could not be determined, set it to NO State.
+ lastState = 0;
+
+ // Disable the ability to send Failed actions until OCC is
+ // Active again.
+ stateValid = false;
+
+ // Disable and reset to try recovering
+ deviceError();
+ }
+ }
+}
+
} // namespace occ
} // namespace open_power