blob: 4b666b0886ee993acef45952fe013cffb2c7e8e0 [file] [log] [blame]
From 98fc2914b15e89c2324c1636af62225c653e45f9 Mon Sep 17 00:00:00 2001
From: Doug Gilbert <dgilbert@us.ibm.com>
Date: Tue, 3 Mar 2015 16:00:29 -0600
Subject: [PATCH] HTMGT add attempt to reset OCC when OCC Activate fails
Change-Id: I964d2b68216c3ddabae73ce3b851bbc468ec96a7
RTC: 123180
---
src/include/usr/htmgt/htmgt_reasoncodes.H | 1 +
src/usr/htmgt/htmgt.C | 123 ++++++++++++++++++------------
src/usr/htmgt/htmgt_activate.C | 9 +++
src/usr/htmgt/htmgt_occ.C | 32 +++++++-
src/usr/htmgt/htmgt_occ.H | 1 +
5 files changed, 116 insertions(+), 50 deletions(-)
diff --git a/src/include/usr/htmgt/htmgt_reasoncodes.H b/src/include/usr/htmgt/htmgt_reasoncodes.H
index ade192d..6fe269d 100644
--- a/src/include/usr/htmgt/htmgt_reasoncodes.H
+++ b/src/include/usr/htmgt/htmgt_reasoncodes.H
@@ -48,6 +48,7 @@ namespace HTMGT
HTMGT_MOD_CHECK_OCC_RSP = 0x92,
HTMGT_MOD_PARSE_OCC_RSP = 0x94,
HTMGT_MOD_HANLDE_OCC_EXCEPTION = 0xE0,
+ HTMGT_MOD_ENABLE_OCC_ACTUATION = 0xE1,
};
enum htmgtReasonCode
diff --git a/src/usr/htmgt/htmgt.C b/src/usr/htmgt/htmgt.C
index aff2500..a2f556f 100644
--- a/src/usr/htmgt/htmgt.C
+++ b/src/usr/htmgt/htmgt.C
@@ -184,42 +184,30 @@ namespace HTMGT
if (NULL != l_err)
{
- TMGT_ERR("OCCs not all active. System will stay in safe mode");
+ TMGT_ERR("OCCs not all active. Attempting OCC Reset");
TMGT_CONSOLE("OCCs are not active (rc=0x%04X). "
- "System will remain in safe mode",
+ "Attempting OCC Reset",
l_err->reasonCode());
- TMGT_INF("Calling HBOCC::stopAllOCCs");
- errlHndl_t err2 = HBOCC::stopAllOCCs();
+ TMGT_INF("Calling resetOccs");
+ errlHndl_t err2 = OccManager::resetOccs(NULL);
if(err2)
{
- TMGT_ERR("stopAllOCCs() failed with 0x%04X",
+ TMGT_ERR("OccManager::resetOccs failed with 0x%04X",
err2->reasonCode());
- ERRORLOG::errlCommit(err2, HTMGT_COMP_ID);
- }
-
- // Update error log to unrecoverable and set SRC
- // to indicate the system will remain in safe mode
- /*@
- * @errortype
- * @reasoncode HTMGT_RC_OCC_CRIT_FAILURE
- * @moduleid HTMGT_MOD_LOAD_START_STATUS
- * @userdata1[0:7] load/start completed
- * @devdesc OCCs did not all reach active state,
- * system will be in Safe Mode
- */
- bldErrLog(l_err, HTMGT_MOD_LOAD_START_STATUS,
- HTMGT_RC_OCC_CRIT_FAILURE,
- i_startCompleted, 0, 0, 1,
- ERRORLOG::ERRL_SEV_UNRECOVERABLE);
- // Add level 2 support callout
- l_err->addProcedureCallout(HWAS::EPUB_PRC_LVL_SUPP,
- HWAS::SRCI_PRIORITY_MED);
- // Add HB firmware callout
- l_err->addProcedureCallout(HWAS::EPUB_PRC_HB_CODE,
- HWAS::SRCI_PRIORITY_MED);
+ // Set original error log as unrecoverable and commit
+ l_err->setSev(ERRORLOG::ERRL_SEV_UNRECOVERABLE);
+ ERRORLOG::errlCommit(l_err, HTMGT_COMP_ID);
- ERRORLOG::errlCommit(l_err, HTMGT_COMP_ID);
+ // Commit occReset error
+ ERRORLOG::errlCommit(err2, HTMGT_COMP_ID);
+ }
+ else
+ {
+ // retry worked - commit original error as informational
+ l_err->setSev(ERRORLOG::ERRL_SEV_INFORMATIONAL);
+ ERRORLOG::errlCommit(l_err, HTMGT_COMP_ID);
+ }
}
} // end processOccStartStatus()
@@ -229,6 +217,19 @@ namespace HTMGT
// Notify HTMGT that an OCC has an error to report
void processOccError(TARGETING::Target * i_procTarget)
{
+ TARGETING::Target* sys = NULL;
+ TARGETING::targetService().getTopLevelTarget(sys);
+ uint8_t safeMode = 0;
+
+ // If the system is in safemode then can't talk to OCCs -
+ // ignore call to processOccError
+ if(sys &&
+ sys->tryGetAttr<TARGETING::ATTR_HTMGT_SAFEMODE>(safeMode) &&
+ safeMode)
+ {
+ return;
+ }
+
bool polledOneOcc = false;
OccManager::buildOccs();
@@ -347,29 +348,57 @@ namespace HTMGT
// Set the OCC state
errlHndl_t enableOccActuation(bool i_occActivation)
{
- occStateId targetState = OCC_STATE_ACTIVE;
- if (false == i_occActivation)
- {
- targetState = OCC_STATE_OBSERVATION;
- }
+ errlHndl_t l_err = NULL;
+ TARGETING::Target* sys = NULL;
+
+ TARGETING::targetService().getTopLevelTarget(sys);
+ uint8_t safeMode = 0;
- // Set state for all OCCs
- errlHndl_t l_err = OccManager::setOccState(targetState);
- if (NULL == l_err)
+ // If the system is in safemode then can't talk to OCCs -
+ // reject call to enableOccActuation with an error log
+ if(sys &&
+ sys->tryGetAttr<TARGETING::ATTR_HTMGT_SAFEMODE>(safeMode) &&
+ safeMode)
{
- TMGT_INF("enableOccActuation: OCC states updated to 0x%02X",
- targetState);
+ /*@
+ * @errortype
+ * @reasoncode HTMGT_RC_OCC_CRIT_FAILURE
+ * @moduleid HTMGT_MOD_ENABLE_OCC_ACTUATION
+ * @userdata1[0:7] OCC activate [1==true][0==false]
+ * @devdesc Invalid operation when OCCs are in safemode
+ */
+ bldErrLog(l_err,
+ HTMGT_MOD_ENABLE_OCC_ACTUATION,
+ HTMGT_RC_OCC_CRIT_FAILURE,
+ i_occActivation, 0, 0, 1,
+ ERRORLOG::ERRL_SEV_UNRECOVERABLE);
}
-
- if (OccManager::occNeedsReset())
+ else
{
- TMGT_ERR("enableOccActuation(): OCCs need to be reset");
- // Don't pass failed target as OCC should have already
- // been marked as failed during the poll.
- errlHndl_t err2 = OccManager::resetOccs(NULL);
- if(err2)
+ occStateId targetState = OCC_STATE_ACTIVE;
+ if (false == i_occActivation)
{
- ERRORLOG::errlCommit(err2, HTMGT_COMP_ID);
+ targetState = OCC_STATE_OBSERVATION;
+ }
+
+ // Set state for all OCCs
+ l_err = OccManager::setOccState(targetState);
+ if (NULL == l_err)
+ {
+ TMGT_INF("enableOccActuation: OCC states updated to 0x%02X",
+ targetState);
+ }
+
+ if (OccManager::occNeedsReset())
+ {
+ TMGT_ERR("enableOccActuation(): OCCs need to be reset");
+ // Don't pass failed target as OCC should have already
+ // been marked as failed during the poll.
+ errlHndl_t err2 = OccManager::resetOccs(NULL);
+ if(err2)
+ {
+ ERRORLOG::errlCommit(err2, HTMGT_COMP_ID);
+ }
}
}
diff --git a/src/usr/htmgt/htmgt_activate.C b/src/usr/htmgt/htmgt_activate.C
index 7f54d6d..4cb46f0 100644
--- a/src/usr/htmgt/htmgt_activate.C
+++ b/src/usr/htmgt/htmgt_activate.C
@@ -39,6 +39,7 @@
#include <ipmi/ipmisensor.H>
#include <sys/time.h>
+#include <console/consoleif.H>
using namespace TARGETING;
@@ -163,6 +164,14 @@ namespace HTMGT
l_err = occ->ipmiSensor(i_activate);
if( l_err )
{
+ TMGT_ERR("setOccActiveSensors failed. (OCC%d state:%d)",
+ occ->getInstance(),
+ i_activate);
+
+ TMGT_CONSOLE("setOccActiveSensors failed. (OCC%d state:%d)",
+ occ->getInstance(),
+ i_activate);
+
ERRORLOG::errlCommit(l_err, HTMGT_COMP_ID);
}
}
diff --git a/src/usr/htmgt/htmgt_occ.C b/src/usr/htmgt/htmgt_occ.C
index 8a539f4..bd95987 100644
--- a/src/usr/htmgt/htmgt_occ.C
+++ b/src/usr/htmgt/htmgt_occ.C
@@ -248,7 +248,8 @@ namespace HTMGT
OccManager::OccManager()
:iv_occMaster(NULL),
iv_state(OCC_STATE_UNKNOWN),
- iv_targetState(OCC_STATE_ACTIVE)
+ iv_targetState(OCC_STATE_ACTIVE),
+ iv_resetCount(0)
{
}
@@ -590,6 +591,19 @@ namespace HTMGT
ERRORLOG::errlCommit(err, HTMGT_COMP_ID);
}
+ if(NULL == i_failedOccTarget)
+ {
+ ++iv_resetCount; // increment system reset count
+
+ TMGT_INF("resetOCCs: Incrementing system OCC reset count to %d",
+ iv_resetCount);
+
+ if(iv_resetCount > OCC_RESET_COUNT_THRESHOLD)
+ {
+ atThreshold = true;
+ }
+ }
+
for(occList_t::const_iterator occ = iv_occArray.begin();
occ != iv_occArray.end();
++occ)
@@ -663,7 +677,7 @@ namespace HTMGT
*/
bldErrLog(err,
HTMTG_MOD_OCC_RESET,
- HTMGT_RC_OCC_RESET_THREHOLD,
+ HTMGT_RC_OCC_CRIT_FAILURE,
0, 0, 0, 0,
ERRORLOG::ERRL_SEV_UNRECOVERABLE);
}
@@ -673,6 +687,13 @@ namespace HTMGT
{
err->setSev(ERRORLOG::ERRL_SEV_UNRECOVERABLE);
+ // Add level 2 support callout
+ err->addProcedureCallout(HWAS::EPUB_PRC_LVL_SUPP,
+ HWAS::SRCI_PRIORITY_MED);
+ // Add HB firmware callout
+ err->addProcedureCallout(HWAS::EPUB_PRC_HB_CODE,
+ HWAS::SRCI_PRIORITY_MED);
+
TARGETING::Target* sys = NULL;
TARGETING::targetService().getTopLevelTarget(sys);
uint8_t safeMode = 1;
@@ -683,8 +704,13 @@ namespace HTMGT
sys->setAttr<TARGETING::ATTR_HTMGT_SAFEMODE>(safeMode);
}
- TMGT_ERR("_resetOccs: Safe Mode RC: 0x%04X (OCC%d)",
+ TMGT_ERR("_resetOccs: Safe Mode (RC: 0x%04X OCC%d)",
cv_safeReturnCode, cv_safeOccInstance);
+
+ TMGT_CONSOLE("OCCs are not active. The system will remain in "
+ "safe mode (RC: 0x%04x for OCC%d)",
+ cv_safeReturnCode,
+ cv_safeOccInstance);
}
return err;
diff --git a/src/usr/htmgt/htmgt_occ.H b/src/usr/htmgt/htmgt_occ.H
index dec19b8..5ac545a 100644
--- a/src/usr/htmgt/htmgt_occ.H
+++ b/src/usr/htmgt/htmgt_occ.H
@@ -507,6 +507,7 @@ namespace HTMGT
occList_t iv_occArray;
occStateId iv_state;
occStateId iv_targetState;
+ uint8_t iv_resetCount;
/**
* @brief SRC that caused system to enter safe mode
--
1.8.2.2