Matt Ploetz | 50ec2fb | 2015-03-05 10:16:24 -0600 | [diff] [blame^] | 1 | From 98fc2914b15e89c2324c1636af62225c653e45f9 Mon Sep 17 00:00:00 2001 |
| 2 | From: Doug Gilbert <dgilbert@us.ibm.com> |
| 3 | Date: Tue, 3 Mar 2015 16:00:29 -0600 |
| 4 | Subject: [PATCH] HTMGT add attempt to reset OCC when OCC Activate fails |
| 5 | |
| 6 | Change-Id: I964d2b68216c3ddabae73ce3b851bbc468ec96a7 |
| 7 | RTC: 123180 |
| 8 | --- |
| 9 | src/include/usr/htmgt/htmgt_reasoncodes.H | 1 + |
| 10 | src/usr/htmgt/htmgt.C | 123 ++++++++++++++++++------------ |
| 11 | src/usr/htmgt/htmgt_activate.C | 9 +++ |
| 12 | src/usr/htmgt/htmgt_occ.C | 32 +++++++- |
| 13 | src/usr/htmgt/htmgt_occ.H | 1 + |
| 14 | 5 files changed, 116 insertions(+), 50 deletions(-) |
| 15 | |
| 16 | diff --git a/src/include/usr/htmgt/htmgt_reasoncodes.H b/src/include/usr/htmgt/htmgt_reasoncodes.H |
| 17 | index ade192d..6fe269d 100644 |
| 18 | --- a/src/include/usr/htmgt/htmgt_reasoncodes.H |
| 19 | +++ b/src/include/usr/htmgt/htmgt_reasoncodes.H |
| 20 | @@ -48,6 +48,7 @@ namespace HTMGT |
| 21 | HTMGT_MOD_CHECK_OCC_RSP = 0x92, |
| 22 | HTMGT_MOD_PARSE_OCC_RSP = 0x94, |
| 23 | HTMGT_MOD_HANLDE_OCC_EXCEPTION = 0xE0, |
| 24 | + HTMGT_MOD_ENABLE_OCC_ACTUATION = 0xE1, |
| 25 | }; |
| 26 | |
| 27 | enum htmgtReasonCode |
| 28 | diff --git a/src/usr/htmgt/htmgt.C b/src/usr/htmgt/htmgt.C |
| 29 | index aff2500..a2f556f 100644 |
| 30 | --- a/src/usr/htmgt/htmgt.C |
| 31 | +++ b/src/usr/htmgt/htmgt.C |
| 32 | @@ -184,42 +184,30 @@ namespace HTMGT |
| 33 | |
| 34 | if (NULL != l_err) |
| 35 | { |
| 36 | - TMGT_ERR("OCCs not all active. System will stay in safe mode"); |
| 37 | + TMGT_ERR("OCCs not all active. Attempting OCC Reset"); |
| 38 | TMGT_CONSOLE("OCCs are not active (rc=0x%04X). " |
| 39 | - "System will remain in safe mode", |
| 40 | + "Attempting OCC Reset", |
| 41 | l_err->reasonCode()); |
| 42 | - TMGT_INF("Calling HBOCC::stopAllOCCs"); |
| 43 | - errlHndl_t err2 = HBOCC::stopAllOCCs(); |
| 44 | + TMGT_INF("Calling resetOccs"); |
| 45 | + errlHndl_t err2 = OccManager::resetOccs(NULL); |
| 46 | if(err2) |
| 47 | { |
| 48 | - TMGT_ERR("stopAllOCCs() failed with 0x%04X", |
| 49 | + TMGT_ERR("OccManager::resetOccs failed with 0x%04X", |
| 50 | err2->reasonCode()); |
| 51 | - ERRORLOG::errlCommit(err2, HTMGT_COMP_ID); |
| 52 | - } |
| 53 | - |
| 54 | - // Update error log to unrecoverable and set SRC |
| 55 | - // to indicate the system will remain in safe mode |
| 56 | - /*@ |
| 57 | - * @errortype |
| 58 | - * @reasoncode HTMGT_RC_OCC_CRIT_FAILURE |
| 59 | - * @moduleid HTMGT_MOD_LOAD_START_STATUS |
| 60 | - * @userdata1[0:7] load/start completed |
| 61 | - * @devdesc OCCs did not all reach active state, |
| 62 | - * system will be in Safe Mode |
| 63 | - */ |
| 64 | - bldErrLog(l_err, HTMGT_MOD_LOAD_START_STATUS, |
| 65 | - HTMGT_RC_OCC_CRIT_FAILURE, |
| 66 | - i_startCompleted, 0, 0, 1, |
| 67 | - ERRORLOG::ERRL_SEV_UNRECOVERABLE); |
| 68 | |
| 69 | - // Add level 2 support callout |
| 70 | - l_err->addProcedureCallout(HWAS::EPUB_PRC_LVL_SUPP, |
| 71 | - HWAS::SRCI_PRIORITY_MED); |
| 72 | - // Add HB firmware callout |
| 73 | - l_err->addProcedureCallout(HWAS::EPUB_PRC_HB_CODE, |
| 74 | - HWAS::SRCI_PRIORITY_MED); |
| 75 | + // Set original error log as unrecoverable and commit |
| 76 | + l_err->setSev(ERRORLOG::ERRL_SEV_UNRECOVERABLE); |
| 77 | + ERRORLOG::errlCommit(l_err, HTMGT_COMP_ID); |
| 78 | |
| 79 | - ERRORLOG::errlCommit(l_err, HTMGT_COMP_ID); |
| 80 | + // Commit occReset error |
| 81 | + ERRORLOG::errlCommit(err2, HTMGT_COMP_ID); |
| 82 | + } |
| 83 | + else |
| 84 | + { |
| 85 | + // retry worked - commit original error as informational |
| 86 | + l_err->setSev(ERRORLOG::ERRL_SEV_INFORMATIONAL); |
| 87 | + ERRORLOG::errlCommit(l_err, HTMGT_COMP_ID); |
| 88 | + } |
| 89 | } |
| 90 | |
| 91 | } // end processOccStartStatus() |
| 92 | @@ -229,6 +217,19 @@ namespace HTMGT |
| 93 | // Notify HTMGT that an OCC has an error to report |
| 94 | void processOccError(TARGETING::Target * i_procTarget) |
| 95 | { |
| 96 | + TARGETING::Target* sys = NULL; |
| 97 | + TARGETING::targetService().getTopLevelTarget(sys); |
| 98 | + uint8_t safeMode = 0; |
| 99 | + |
| 100 | + // If the system is in safe mode we cannot talk to the OCCs - |
| 101 | + // ignore the call to processOccError |
| 102 | + if(sys && |
| 103 | + sys->tryGetAttr<TARGETING::ATTR_HTMGT_SAFEMODE>(safeMode) && |
| 104 | + safeMode) |
| 105 | + { |
| 106 | + return; |
| 107 | + } |
| 108 | + |
| 109 | bool polledOneOcc = false; |
| 110 | OccManager::buildOccs(); |
| 111 | |
| 112 | @@ -347,29 +348,57 @@ namespace HTMGT |
| 113 | // Set the OCC state |
| 114 | errlHndl_t enableOccActuation(bool i_occActivation) |
| 115 | { |
| 116 | - occStateId targetState = OCC_STATE_ACTIVE; |
| 117 | - if (false == i_occActivation) |
| 118 | - { |
| 119 | - targetState = OCC_STATE_OBSERVATION; |
| 120 | - } |
| 121 | + errlHndl_t l_err = NULL; |
| 122 | + TARGETING::Target* sys = NULL; |
| 123 | + |
| 124 | + TARGETING::targetService().getTopLevelTarget(sys); |
| 125 | + uint8_t safeMode = 0; |
| 126 | |
| 127 | - // Set state for all OCCs |
| 128 | - errlHndl_t l_err = OccManager::setOccState(targetState); |
| 129 | - if (NULL == l_err) |
| 130 | + // If the system is in safe mode we cannot talk to the OCCs - |
| 131 | + // ignore the call to enableOccActuation |
| 132 | + if(sys && |
| 133 | + sys->tryGetAttr<TARGETING::ATTR_HTMGT_SAFEMODE>(safeMode) && |
| 134 | + safeMode) |
| 135 | { |
| 136 | - TMGT_INF("enableOccActuation: OCC states updated to 0x%02X", |
| 137 | - targetState); |
| 138 | + /*@ |
| 139 | + * @errortype |
| 140 | + * @reasoncode HTMGT_RC_OCC_CRIT_FAILURE |
| 141 | + * @moduleid HTMGT_MOD_ENABLE_OCC_ACTUATION |
| 142 | + * @userdata1[0:7] OCC activate [1==true][0==false] |
| 143 | + * @devdesc Invalid operation when OCCs are in safemode |
| 144 | + */ |
| 145 | + bldErrLog(l_err, |
| 146 | + HTMGT_MOD_ENABLE_OCC_ACTUATION, |
| 147 | + HTMGT_RC_OCC_CRIT_FAILURE, |
| 148 | + i_occActivation, 0, 0, 1, |
| 149 | + ERRORLOG::ERRL_SEV_UNRECOVERABLE); |
| 150 | } |
| 151 | - |
| 152 | - if (OccManager::occNeedsReset()) |
| 153 | + else |
| 154 | { |
| 155 | - TMGT_ERR("enableOccActuation(): OCCs need to be reset"); |
| 156 | - // Don't pass failed target as OCC should have already |
| 157 | - // been marked as failed during the poll. |
| 158 | - errlHndl_t err2 = OccManager::resetOccs(NULL); |
| 159 | - if(err2) |
| 160 | + occStateId targetState = OCC_STATE_ACTIVE; |
| 161 | + if (false == i_occActivation) |
| 162 | { |
| 163 | - ERRORLOG::errlCommit(err2, HTMGT_COMP_ID); |
| 164 | + targetState = OCC_STATE_OBSERVATION; |
| 165 | + } |
| 166 | + |
| 167 | + // Set state for all OCCs |
| 168 | + l_err = OccManager::setOccState(targetState); |
| 169 | + if (NULL == l_err) |
| 170 | + { |
| 171 | + TMGT_INF("enableOccActuation: OCC states updated to 0x%02X", |
| 172 | + targetState); |
| 173 | + } |
| 174 | + |
| 175 | + if (OccManager::occNeedsReset()) |
| 176 | + { |
| 177 | + TMGT_ERR("enableOccActuation(): OCCs need to be reset"); |
| 178 | + // Don't pass failed target as OCC should have already |
| 179 | + // been marked as failed during the poll. |
| 180 | + errlHndl_t err2 = OccManager::resetOccs(NULL); |
| 181 | + if(err2) |
| 182 | + { |
| 183 | + ERRORLOG::errlCommit(err2, HTMGT_COMP_ID); |
| 184 | + } |
| 185 | } |
| 186 | } |
| 187 | |
| 188 | diff --git a/src/usr/htmgt/htmgt_activate.C b/src/usr/htmgt/htmgt_activate.C |
| 189 | index 7f54d6d..4cb46f0 100644 |
| 190 | --- a/src/usr/htmgt/htmgt_activate.C |
| 191 | +++ b/src/usr/htmgt/htmgt_activate.C |
| 192 | @@ -39,6 +39,7 @@ |
| 193 | |
| 194 | #include <ipmi/ipmisensor.H> |
| 195 | #include <sys/time.h> |
| 196 | +#include <console/consoleif.H> |
| 197 | |
| 198 | using namespace TARGETING; |
| 199 | |
| 200 | @@ -163,6 +164,14 @@ namespace HTMGT |
| 201 | l_err = occ->ipmiSensor(i_activate); |
| 202 | if( l_err ) |
| 203 | { |
| 204 | + TMGT_ERR("setOccActiveSensors failed. (OCC%d state:%d)", |
| 205 | + occ->getInstance(), |
| 206 | + i_activate); |
| 207 | + |
| 208 | + TMGT_CONSOLE("setOccActiveSensors failed. (OCC%d state:%d)", |
| 209 | + occ->getInstance(), |
| 210 | + i_activate); |
| 211 | + |
| 212 | ERRORLOG::errlCommit(l_err, HTMGT_COMP_ID); |
| 213 | } |
| 214 | } |
| 215 | diff --git a/src/usr/htmgt/htmgt_occ.C b/src/usr/htmgt/htmgt_occ.C |
| 216 | index 8a539f4..bd95987 100644 |
| 217 | --- a/src/usr/htmgt/htmgt_occ.C |
| 218 | +++ b/src/usr/htmgt/htmgt_occ.C |
| 219 | @@ -248,7 +248,8 @@ namespace HTMGT |
| 220 | OccManager::OccManager() |
| 221 | :iv_occMaster(NULL), |
| 222 | iv_state(OCC_STATE_UNKNOWN), |
| 223 | - iv_targetState(OCC_STATE_ACTIVE) |
| 224 | + iv_targetState(OCC_STATE_ACTIVE), |
| 225 | + iv_resetCount(0) |
| 226 | { |
| 227 | } |
| 228 | |
| 229 | @@ -590,6 +591,19 @@ namespace HTMGT |
| 230 | ERRORLOG::errlCommit(err, HTMGT_COMP_ID); |
| 231 | } |
| 232 | |
| 233 | + if(NULL == i_failedOccTarget) |
| 234 | + { |
| 235 | + ++iv_resetCount; // increment system reset count |
| 236 | + |
| 237 | + TMGT_INF("resetOCCs: Incrementing system OCC reset count to %d", |
| 238 | + iv_resetCount); |
| 239 | + |
| 240 | + if(iv_resetCount > OCC_RESET_COUNT_THRESHOLD) |
| 241 | + { |
| 242 | + atThreshold = true; |
| 243 | + } |
| 244 | + } |
| 245 | + |
| 246 | for(occList_t::const_iterator occ = iv_occArray.begin(); |
| 247 | occ != iv_occArray.end(); |
| 248 | ++occ) |
| 249 | @@ -663,7 +677,7 @@ namespace HTMGT |
| 250 | */ |
| 251 | bldErrLog(err, |
| 252 | HTMTG_MOD_OCC_RESET, |
| 253 | - HTMGT_RC_OCC_RESET_THREHOLD, |
| 254 | + HTMGT_RC_OCC_CRIT_FAILURE, |
| 255 | 0, 0, 0, 0, |
| 256 | ERRORLOG::ERRL_SEV_UNRECOVERABLE); |
| 257 | } |
| 258 | @@ -673,6 +687,13 @@ namespace HTMGT |
| 259 | { |
| 260 | err->setSev(ERRORLOG::ERRL_SEV_UNRECOVERABLE); |
| 261 | |
| 262 | + // Add level 2 support callout |
| 263 | + err->addProcedureCallout(HWAS::EPUB_PRC_LVL_SUPP, |
| 264 | + HWAS::SRCI_PRIORITY_MED); |
| 265 | + // Add HB firmware callout |
| 266 | + err->addProcedureCallout(HWAS::EPUB_PRC_HB_CODE, |
| 267 | + HWAS::SRCI_PRIORITY_MED); |
| 268 | + |
| 269 | TARGETING::Target* sys = NULL; |
| 270 | TARGETING::targetService().getTopLevelTarget(sys); |
| 271 | uint8_t safeMode = 1; |
| 272 | @@ -683,8 +704,13 @@ namespace HTMGT |
| 273 | sys->setAttr<TARGETING::ATTR_HTMGT_SAFEMODE>(safeMode); |
| 274 | } |
| 275 | |
| 276 | - TMGT_ERR("_resetOccs: Safe Mode RC: 0x%04X (OCC%d)", |
| 277 | + TMGT_ERR("_resetOccs: Safe Mode (RC: 0x%04X OCC%d)", |
| 278 | cv_safeReturnCode, cv_safeOccInstance); |
| 279 | + |
| 280 | + TMGT_CONSOLE("OCCs are not active. The system will remain in " |
| 281 | + "safe mode (RC: 0x%04x for OCC%d)", |
| 282 | + cv_safeReturnCode, |
| 283 | + cv_safeOccInstance); |
| 284 | } |
| 285 | |
| 286 | return err; |
| 287 | diff --git a/src/usr/htmgt/htmgt_occ.H b/src/usr/htmgt/htmgt_occ.H |
| 288 | index dec19b8..5ac545a 100644 |
| 289 | --- a/src/usr/htmgt/htmgt_occ.H |
| 290 | +++ b/src/usr/htmgt/htmgt_occ.H |
| 291 | @@ -507,6 +507,7 @@ namespace HTMGT |
| 292 | occList_t iv_occArray; |
| 293 | occStateId iv_state; |
| 294 | occStateId iv_targetState; |
| 295 | + uint8_t iv_resetCount; |
| 296 | |
| 297 | /** |
| 298 | * @brief SRC that caused system to enter safe mode |
| 299 | -- |
| 300 | 1.8.2.2 |
| 301 | |