| From 98fc2914b15e89c2324c1636af62225c653e45f9 Mon Sep 17 00:00:00 2001 |
| From: Doug Gilbert <dgilbert@us.ibm.com> |
| Date: Tue, 3 Mar 2015 16:00:29 -0600 |
| Subject: [PATCH] HTMGT add attempt to reset OCC when OCC Activate fails |
| |
| Change-Id: I964d2b68216c3ddabae73ce3b851bbc468ec96a7 |
| RTC: 123180 |
| --- |
| src/include/usr/htmgt/htmgt_reasoncodes.H | 1 + |
| src/usr/htmgt/htmgt.C | 123 ++++++++++++++++++------------ |
| src/usr/htmgt/htmgt_activate.C | 9 +++ |
| src/usr/htmgt/htmgt_occ.C | 32 +++++++- |
| src/usr/htmgt/htmgt_occ.H | 1 + |
| 5 files changed, 116 insertions(+), 50 deletions(-) |
| |
| diff --git a/src/include/usr/htmgt/htmgt_reasoncodes.H b/src/include/usr/htmgt/htmgt_reasoncodes.H |
| index ade192d..6fe269d 100644 |
| --- a/src/include/usr/htmgt/htmgt_reasoncodes.H |
| +++ b/src/include/usr/htmgt/htmgt_reasoncodes.H |
| @@ -48,6 +48,7 @@ namespace HTMGT |
| HTMGT_MOD_CHECK_OCC_RSP = 0x92, |
| HTMGT_MOD_PARSE_OCC_RSP = 0x94, |
| HTMGT_MOD_HANLDE_OCC_EXCEPTION = 0xE0, |
| + HTMGT_MOD_ENABLE_OCC_ACTUATION = 0xE1, |
| }; |
| |
| enum htmgtReasonCode |
| diff --git a/src/usr/htmgt/htmgt.C b/src/usr/htmgt/htmgt.C |
| index aff2500..a2f556f 100644 |
| --- a/src/usr/htmgt/htmgt.C |
| +++ b/src/usr/htmgt/htmgt.C |
| @@ -184,42 +184,30 @@ namespace HTMGT |
| |
| if (NULL != l_err) |
| { |
| - TMGT_ERR("OCCs not all active. System will stay in safe mode"); |
| + TMGT_ERR("OCCs not all active. Attempting OCC Reset"); |
| TMGT_CONSOLE("OCCs are not active (rc=0x%04X). " |
| - "System will remain in safe mode", |
| + "Attempting OCC Reset", |
| l_err->reasonCode()); |
| - TMGT_INF("Calling HBOCC::stopAllOCCs"); |
| - errlHndl_t err2 = HBOCC::stopAllOCCs(); |
| + TMGT_INF("Calling resetOccs"); |
| + errlHndl_t err2 = OccManager::resetOccs(NULL); |
| if(err2) |
| { |
| - TMGT_ERR("stopAllOCCs() failed with 0x%04X", |
| + TMGT_ERR("OccManager:;resetOccs failed with 0x%04X", |
| err2->reasonCode()); |
| - ERRORLOG::errlCommit(err2, HTMGT_COMP_ID); |
| - } |
| - |
| - // Update error log to unrecoverable and set SRC |
| - // to indicate the system will remain in safe mode |
| - /*@ |
| - * @errortype |
| - * @reasoncode HTMGT_RC_OCC_CRIT_FAILURE |
| - * @moduleid HTMGT_MOD_LOAD_START_STATUS |
| - * @userdata1[0:7] load/start completed |
| - * @devdesc OCCs did not all reach active state, |
| - * system will be in Safe Mode |
| - */ |
| - bldErrLog(l_err, HTMGT_MOD_LOAD_START_STATUS, |
| - HTMGT_RC_OCC_CRIT_FAILURE, |
| - i_startCompleted, 0, 0, 1, |
| - ERRORLOG::ERRL_SEV_UNRECOVERABLE); |
| |
| - // Add level 2 support callout |
| - l_err->addProcedureCallout(HWAS::EPUB_PRC_LVL_SUPP, |
| - HWAS::SRCI_PRIORITY_MED); |
| - // Add HB firmware callout |
| - l_err->addProcedureCallout(HWAS::EPUB_PRC_HB_CODE, |
| - HWAS::SRCI_PRIORITY_MED); |
| + // Set original error log as unrecoverable and commit |
| + l_err->setSev(ERRORLOG::ERRL_SEV_UNRECOVERABLE); |
| + ERRORLOG::errlCommit(l_err, HTMGT_COMP_ID); |
| |
| - ERRORLOG::errlCommit(l_err, HTMGT_COMP_ID); |
| + // Commit occReset error |
| + ERRORLOG::errlCommit(err2, HTMGT_COMP_ID); |
| + } |
| + else |
| + { |
| + // retry worked - commit original error as informational |
| + l_err->setSev(ERRORLOG::ERRL_SEV_INFORMATIONAL); |
| + ERRORLOG::errlCommit(l_err, HTMGT_COMP_ID); |
| + } |
| } |
| |
| } // end processOccStartStatus() |
| @@ -229,6 +217,19 @@ namespace HTMGT |
| // Notify HTMGT that an OCC has an error to report |
| void processOccError(TARGETING::Target * i_procTarget) |
| { |
| + TARGETING::Target* sys = NULL; |
| + TARGETING::targetService().getTopLevelTarget(sys); |
| + uint8_t safeMode = 0; |
| + |
| + // If the system is in safemode then can't talk to OCCs - |
| + // ignore call to processOccError |
| + if(sys && |
| + sys->tryGetAttr<TARGETING::ATTR_HTMGT_SAFEMODE>(safeMode) && |
| + safeMode) |
| + { |
| + return; |
| + } |
| + |
| bool polledOneOcc = false; |
| OccManager::buildOccs(); |
| |
| @@ -347,29 +348,57 @@ namespace HTMGT |
| // Set the OCC state |
| errlHndl_t enableOccActuation(bool i_occActivation) |
| { |
| - occStateId targetState = OCC_STATE_ACTIVE; |
| - if (false == i_occActivation) |
| - { |
| - targetState = OCC_STATE_OBSERVATION; |
| - } |
| + errlHndl_t l_err = NULL; |
| + TARGETING::Target* sys = NULL; |
| + |
| + TARGETING::targetService().getTopLevelTarget(sys); |
| + uint8_t safeMode = 0; |
| |
| - // Set state for all OCCs |
| - errlHndl_t l_err = OccManager::setOccState(targetState); |
| - if (NULL == l_err) |
| + // If the system is in safemode then can't talk to OCCs - |
| + // ignore call to enableOccActuation |
| + if(sys && |
| + sys->tryGetAttr<TARGETING::ATTR_HTMGT_SAFEMODE>(safeMode) && |
| + safeMode) |
| { |
| - TMGT_INF("enableOccActuation: OCC states updated to 0x%02X", |
| - targetState); |
| + /*@ |
| + * @errortype |
| + * @reasoncode HTMGT_RC_OCC_CRIT_FAILURE |
| + * @moduleid HTMGT_MOD_ENABLE_OCC_ACTUATION |
| + * @userdata1[0:7] OCC activate [1==true][0==false] |
| + * @devdesc Invalid operation when OCCs are in safemode |
| + */ |
| + bldErrLog(l_err, |
| + HTMGT_MOD_ENABLE_OCC_ACTUATION, |
| + HTMGT_RC_OCC_CRIT_FAILURE, |
| + i_occActivation, 0, 0, 1, |
| + ERRORLOG::ERRL_SEV_UNRECOVERABLE); |
| } |
| - |
| - if (OccManager::occNeedsReset()) |
| + else |
| { |
| - TMGT_ERR("enableOccActuation(): OCCs need to be reset"); |
| - // Don't pass failed target as OCC should have already |
| - // been marked as failed during the poll. |
| - errlHndl_t err2 = OccManager::resetOccs(NULL); |
| - if(err2) |
| + occStateId targetState = OCC_STATE_ACTIVE; |
| + if (false == i_occActivation) |
| { |
| - ERRORLOG::errlCommit(err2, HTMGT_COMP_ID); |
| + targetState = OCC_STATE_OBSERVATION; |
| + } |
| + |
| + // Set state for all OCCs |
| + l_err = OccManager::setOccState(targetState); |
| + if (NULL == l_err) |
| + { |
| + TMGT_INF("enableOccActuation: OCC states updated to 0x%02X", |
| + targetState); |
| + } |
| + |
| + if (OccManager::occNeedsReset()) |
| + { |
| + TMGT_ERR("enableOccActuation(): OCCs need to be reset"); |
| + // Don't pass failed target as OCC should have already |
| + // been marked as failed during the poll. |
| + errlHndl_t err2 = OccManager::resetOccs(NULL); |
| + if(err2) |
| + { |
| + ERRORLOG::errlCommit(err2, HTMGT_COMP_ID); |
| + } |
| } |
| } |
| |
| diff --git a/src/usr/htmgt/htmgt_activate.C b/src/usr/htmgt/htmgt_activate.C |
| index 7f54d6d..4cb46f0 100644 |
| --- a/src/usr/htmgt/htmgt_activate.C |
| +++ b/src/usr/htmgt/htmgt_activate.C |
| @@ -39,6 +39,7 @@ |
| |
| #include <ipmi/ipmisensor.H> |
| #include <sys/time.h> |
| +#include <console/consoleif.H> |
| |
| using namespace TARGETING; |
| |
| @@ -163,6 +164,14 @@ namespace HTMGT |
| l_err = occ->ipmiSensor(i_activate); |
| if( l_err ) |
| { |
| + TMGT_ERR("setOccActiveSensors failed. (OCC%d state:%d)", |
| + occ->getInstance(), |
| + i_activate); |
| + |
| + TMGT_CONSOLE("setOccActiveSensors failed. (OCC%d state:%d)", |
| + occ->getInstance(), |
| + i_activate); |
| + |
| ERRORLOG::errlCommit(l_err, HTMGT_COMP_ID); |
| } |
| } |
| diff --git a/src/usr/htmgt/htmgt_occ.C b/src/usr/htmgt/htmgt_occ.C |
| index 8a539f4..bd95987 100644 |
| --- a/src/usr/htmgt/htmgt_occ.C |
| +++ b/src/usr/htmgt/htmgt_occ.C |
| @@ -248,7 +248,8 @@ namespace HTMGT |
| OccManager::OccManager() |
| :iv_occMaster(NULL), |
| iv_state(OCC_STATE_UNKNOWN), |
| - iv_targetState(OCC_STATE_ACTIVE) |
| + iv_targetState(OCC_STATE_ACTIVE), |
| + iv_resetCount(0) |
| { |
| } |
| |
| @@ -590,6 +591,19 @@ namespace HTMGT |
| ERRORLOG::errlCommit(err, HTMGT_COMP_ID); |
| } |
| |
| + if(NULL == i_failedOccTarget) |
| + { |
| + ++iv_resetCount; // increment system reset count |
| + |
| + TMGT_INF("resetOCCs: Incrementing system OCC reset count to %d", |
| + iv_resetCount); |
| + |
| + if(iv_resetCount > OCC_RESET_COUNT_THRESHOLD) |
| + { |
| + atThreshold = true; |
| + } |
| + } |
| + |
| for(occList_t::const_iterator occ = iv_occArray.begin(); |
| occ != iv_occArray.end(); |
| ++occ) |
| @@ -663,7 +677,7 @@ namespace HTMGT |
| */ |
| bldErrLog(err, |
| HTMTG_MOD_OCC_RESET, |
| - HTMGT_RC_OCC_RESET_THREHOLD, |
| + HTMGT_RC_OCC_CRIT_FAILURE, |
| 0, 0, 0, 0, |
| ERRORLOG::ERRL_SEV_UNRECOVERABLE); |
| } |
| @@ -673,6 +687,13 @@ namespace HTMGT |
| { |
| err->setSev(ERRORLOG::ERRL_SEV_UNRECOVERABLE); |
| |
| + // Add level 2 support callout |
| + err->addProcedureCallout(HWAS::EPUB_PRC_LVL_SUPP, |
| + HWAS::SRCI_PRIORITY_MED); |
| + // Add HB firmware callout |
| + err->addProcedureCallout(HWAS::EPUB_PRC_HB_CODE, |
| + HWAS::SRCI_PRIORITY_MED); |
| + |
| TARGETING::Target* sys = NULL; |
| TARGETING::targetService().getTopLevelTarget(sys); |
| uint8_t safeMode = 1; |
| @@ -683,8 +704,13 @@ namespace HTMGT |
| sys->setAttr<TARGETING::ATTR_HTMGT_SAFEMODE>(safeMode); |
| } |
| |
| - TMGT_ERR("_resetOccs: Safe Mode RC: 0x%04X (OCC%d)", |
| + TMGT_ERR("_resetOccs: Safe Mode (RC: 0x%04X OCC%d)", |
| cv_safeReturnCode, cv_safeOccInstance); |
| + |
| + TMGT_CONSOLE("OCCs are not active. The system will remain in " |
| + "safe mode (RC: 0x%04x for OCC%d)", |
| + cv_safeReturnCode, |
| + cv_safeOccInstance); |
| } |
| |
| return err; |
| diff --git a/src/usr/htmgt/htmgt_occ.H b/src/usr/htmgt/htmgt_occ.H |
| index dec19b8..5ac545a 100644 |
| --- a/src/usr/htmgt/htmgt_occ.H |
| +++ b/src/usr/htmgt/htmgt_occ.H |
| @@ -507,6 +507,7 @@ namespace HTMGT |
| occList_t iv_occArray; |
| occStateId iv_state; |
| occStateId iv_targetState; |
| + uint8_t iv_resetCount; |
| |
| /** |
| * @brief SRC that caused system to enter safe mode |
| -- |
| 1.8.2.2 |
| |