blob: 4b666b0886ee993acef45952fe013cffb2c7e8e0 [file] [log] [blame]
Matt Ploetz50ec2fb2015-03-05 10:16:24 -06001From 98fc2914b15e89c2324c1636af62225c653e45f9 Mon Sep 17 00:00:00 2001
2From: Doug Gilbert <dgilbert@us.ibm.com>
3Date: Tue, 3 Mar 2015 16:00:29 -0600
4Subject: [PATCH] HTMGT add attempt to reset OCC when OCC Activate fails
5
6Change-Id: I964d2b68216c3ddabae73ce3b851bbc468ec96a7
7RTC: 123180
8---
9 src/include/usr/htmgt/htmgt_reasoncodes.H | 1 +
10 src/usr/htmgt/htmgt.C | 123 ++++++++++++++++++------------
11 src/usr/htmgt/htmgt_activate.C | 9 +++
12 src/usr/htmgt/htmgt_occ.C | 32 +++++++-
13 src/usr/htmgt/htmgt_occ.H | 1 +
14 5 files changed, 116 insertions(+), 50 deletions(-)
15
16diff --git a/src/include/usr/htmgt/htmgt_reasoncodes.H b/src/include/usr/htmgt/htmgt_reasoncodes.H
17index ade192d..6fe269d 100644
18--- a/src/include/usr/htmgt/htmgt_reasoncodes.H
19+++ b/src/include/usr/htmgt/htmgt_reasoncodes.H
20@@ -48,6 +48,7 @@ namespace HTMGT
21 HTMGT_MOD_CHECK_OCC_RSP = 0x92,
22 HTMGT_MOD_PARSE_OCC_RSP = 0x94,
23 HTMGT_MOD_HANLDE_OCC_EXCEPTION = 0xE0,
24+ HTMGT_MOD_ENABLE_OCC_ACTUATION = 0xE1,
25 };
26
27 enum htmgtReasonCode
28diff --git a/src/usr/htmgt/htmgt.C b/src/usr/htmgt/htmgt.C
29index aff2500..a2f556f 100644
30--- a/src/usr/htmgt/htmgt.C
31+++ b/src/usr/htmgt/htmgt.C
32@@ -184,42 +184,30 @@ namespace HTMGT
33
34 if (NULL != l_err)
35 {
36- TMGT_ERR("OCCs not all active. System will stay in safe mode");
37+ TMGT_ERR("OCCs not all active. Attempting OCC Reset");
38 TMGT_CONSOLE("OCCs are not active (rc=0x%04X). "
39- "System will remain in safe mode",
40+ "Attempting OCC Reset",
41 l_err->reasonCode());
42- TMGT_INF("Calling HBOCC::stopAllOCCs");
43- errlHndl_t err2 = HBOCC::stopAllOCCs();
44+ TMGT_INF("Calling resetOccs");
45+ errlHndl_t err2 = OccManager::resetOccs(NULL);
46 if(err2)
47 {
48- TMGT_ERR("stopAllOCCs() failed with 0x%04X",
49+ TMGT_ERR("OccManager::resetOccs failed with 0x%04X",
50 err2->reasonCode());
51- ERRORLOG::errlCommit(err2, HTMGT_COMP_ID);
52- }
53-
54- // Update error log to unrecoverable and set SRC
55- // to indicate the system will remain in safe mode
56- /*@
57- * @errortype
58- * @reasoncode HTMGT_RC_OCC_CRIT_FAILURE
59- * @moduleid HTMGT_MOD_LOAD_START_STATUS
60- * @userdata1[0:7] load/start completed
61- * @devdesc OCCs did not all reach active state,
62- * system will be in Safe Mode
63- */
64- bldErrLog(l_err, HTMGT_MOD_LOAD_START_STATUS,
65- HTMGT_RC_OCC_CRIT_FAILURE,
66- i_startCompleted, 0, 0, 1,
67- ERRORLOG::ERRL_SEV_UNRECOVERABLE);
68
69- // Add level 2 support callout
70- l_err->addProcedureCallout(HWAS::EPUB_PRC_LVL_SUPP,
71- HWAS::SRCI_PRIORITY_MED);
72- // Add HB firmware callout
73- l_err->addProcedureCallout(HWAS::EPUB_PRC_HB_CODE,
74- HWAS::SRCI_PRIORITY_MED);
75+ // Set original error log as unrecoverable and commit
76+ l_err->setSev(ERRORLOG::ERRL_SEV_UNRECOVERABLE);
77+ ERRORLOG::errlCommit(l_err, HTMGT_COMP_ID);
78
79- ERRORLOG::errlCommit(l_err, HTMGT_COMP_ID);
80+ // Commit occReset error
81+ ERRORLOG::errlCommit(err2, HTMGT_COMP_ID);
82+ }
83+ else
84+ {
85+ // retry worked - commit original error as informational
86+ l_err->setSev(ERRORLOG::ERRL_SEV_INFORMATIONAL);
87+ ERRORLOG::errlCommit(l_err, HTMGT_COMP_ID);
88+ }
89 }
90
91 } // end processOccStartStatus()
92@@ -229,6 +217,19 @@ namespace HTMGT
93 // Notify HTMGT that an OCC has an error to report
94 void processOccError(TARGETING::Target * i_procTarget)
95 {
96+ TARGETING::Target* sys = NULL;
97+ TARGETING::targetService().getTopLevelTarget(sys);
98+ uint8_t safeMode = 0;
99+
100+ // If the system is in safemode then can't talk to OCCs -
101+ // ignore call to processOccError
102+ if(sys &&
103+ sys->tryGetAttr<TARGETING::ATTR_HTMGT_SAFEMODE>(safeMode) &&
104+ safeMode)
105+ {
106+ return;
107+ }
108+
109 bool polledOneOcc = false;
110 OccManager::buildOccs();
111
112@@ -347,29 +348,57 @@ namespace HTMGT
113 // Set the OCC state
114 errlHndl_t enableOccActuation(bool i_occActivation)
115 {
116- occStateId targetState = OCC_STATE_ACTIVE;
117- if (false == i_occActivation)
118- {
119- targetState = OCC_STATE_OBSERVATION;
120- }
121+ errlHndl_t l_err = NULL;
122+ TARGETING::Target* sys = NULL;
123+
124+ TARGETING::targetService().getTopLevelTarget(sys);
125+ uint8_t safeMode = 0;
126
127- // Set state for all OCCs
128- errlHndl_t l_err = OccManager::setOccState(targetState);
129- if (NULL == l_err)
130+ // If the system is in safemode then can't talk to OCCs -
131+ // ignore call to enableOccActuation
132+ if(sys &&
133+ sys->tryGetAttr<TARGETING::ATTR_HTMGT_SAFEMODE>(safeMode) &&
134+ safeMode)
135 {
136- TMGT_INF("enableOccActuation: OCC states updated to 0x%02X",
137- targetState);
138+ /*@
139+ * @errortype
140+ * @reasoncode HTMGT_RC_OCC_CRIT_FAILURE
141+ * @moduleid HTMGT_MOD_ENABLE_OCC_ACTUATION
142+ * @userdata1[0:7] OCC activate [1==true][0==false]
143+ * @devdesc Invalid operation when OCCs are in safemode
144+ */
145+ bldErrLog(l_err,
146+ HTMGT_MOD_ENABLE_OCC_ACTUATION,
147+ HTMGT_RC_OCC_CRIT_FAILURE,
148+ i_occActivation, 0, 0, 1,
149+ ERRORLOG::ERRL_SEV_UNRECOVERABLE);
150 }
151-
152- if (OccManager::occNeedsReset())
153+ else
154 {
155- TMGT_ERR("enableOccActuation(): OCCs need to be reset");
156- // Don't pass failed target as OCC should have already
157- // been marked as failed during the poll.
158- errlHndl_t err2 = OccManager::resetOccs(NULL);
159- if(err2)
160+ occStateId targetState = OCC_STATE_ACTIVE;
161+ if (false == i_occActivation)
162 {
163- ERRORLOG::errlCommit(err2, HTMGT_COMP_ID);
164+ targetState = OCC_STATE_OBSERVATION;
165+ }
166+
167+ // Set state for all OCCs
168+ l_err = OccManager::setOccState(targetState);
169+ if (NULL == l_err)
170+ {
171+ TMGT_INF("enableOccActuation: OCC states updated to 0x%02X",
172+ targetState);
173+ }
174+
175+ if (OccManager::occNeedsReset())
176+ {
177+ TMGT_ERR("enableOccActuation(): OCCs need to be reset");
178+ // Don't pass failed target as OCC should have already
179+ // been marked as failed during the poll.
180+ errlHndl_t err2 = OccManager::resetOccs(NULL);
181+ if(err2)
182+ {
183+ ERRORLOG::errlCommit(err2, HTMGT_COMP_ID);
184+ }
185 }
186 }
187
188diff --git a/src/usr/htmgt/htmgt_activate.C b/src/usr/htmgt/htmgt_activate.C
189index 7f54d6d..4cb46f0 100644
190--- a/src/usr/htmgt/htmgt_activate.C
191+++ b/src/usr/htmgt/htmgt_activate.C
192@@ -39,6 +39,7 @@
193
194 #include <ipmi/ipmisensor.H>
195 #include <sys/time.h>
196+#include <console/consoleif.H>
197
198 using namespace TARGETING;
199
200@@ -163,6 +164,14 @@ namespace HTMGT
201 l_err = occ->ipmiSensor(i_activate);
202 if( l_err )
203 {
204+ TMGT_ERR("setOccActiveSensors failed. (OCC%d state:%d)",
205+ occ->getInstance(),
206+ i_activate);
207+
208+ TMGT_CONSOLE("setOccActiveSensors failed. (OCC%d state:%d)",
209+ occ->getInstance(),
210+ i_activate);
211+
212 ERRORLOG::errlCommit(l_err, HTMGT_COMP_ID);
213 }
214 }
215diff --git a/src/usr/htmgt/htmgt_occ.C b/src/usr/htmgt/htmgt_occ.C
216index 8a539f4..bd95987 100644
217--- a/src/usr/htmgt/htmgt_occ.C
218+++ b/src/usr/htmgt/htmgt_occ.C
219@@ -248,7 +248,8 @@ namespace HTMGT
220 OccManager::OccManager()
221 :iv_occMaster(NULL),
222 iv_state(OCC_STATE_UNKNOWN),
223- iv_targetState(OCC_STATE_ACTIVE)
224+ iv_targetState(OCC_STATE_ACTIVE),
225+ iv_resetCount(0)
226 {
227 }
228
229@@ -590,6 +591,19 @@ namespace HTMGT
230 ERRORLOG::errlCommit(err, HTMGT_COMP_ID);
231 }
232
233+ if(NULL == i_failedOccTarget)
234+ {
235+ ++iv_resetCount; // increment system reset count
236+
237+ TMGT_INF("resetOCCs: Incrementing system OCC reset count to %d",
238+ iv_resetCount);
239+
240+ if(iv_resetCount > OCC_RESET_COUNT_THRESHOLD)
241+ {
242+ atThreshold = true;
243+ }
244+ }
245+
246 for(occList_t::const_iterator occ = iv_occArray.begin();
247 occ != iv_occArray.end();
248 ++occ)
249@@ -663,7 +677,7 @@ namespace HTMGT
250 */
251 bldErrLog(err,
252 HTMTG_MOD_OCC_RESET,
253- HTMGT_RC_OCC_RESET_THREHOLD,
254+ HTMGT_RC_OCC_CRIT_FAILURE,
255 0, 0, 0, 0,
256 ERRORLOG::ERRL_SEV_UNRECOVERABLE);
257 }
258@@ -673,6 +687,13 @@ namespace HTMGT
259 {
260 err->setSev(ERRORLOG::ERRL_SEV_UNRECOVERABLE);
261
262+ // Add level 2 support callout
263+ err->addProcedureCallout(HWAS::EPUB_PRC_LVL_SUPP,
264+ HWAS::SRCI_PRIORITY_MED);
265+ // Add HB firmware callout
266+ err->addProcedureCallout(HWAS::EPUB_PRC_HB_CODE,
267+ HWAS::SRCI_PRIORITY_MED);
268+
269 TARGETING::Target* sys = NULL;
270 TARGETING::targetService().getTopLevelTarget(sys);
271 uint8_t safeMode = 1;
272@@ -683,8 +704,13 @@ namespace HTMGT
273 sys->setAttr<TARGETING::ATTR_HTMGT_SAFEMODE>(safeMode);
274 }
275
276- TMGT_ERR("_resetOccs: Safe Mode RC: 0x%04X (OCC%d)",
277+ TMGT_ERR("_resetOccs: Safe Mode (RC: 0x%04X OCC%d)",
278 cv_safeReturnCode, cv_safeOccInstance);
279+
280+ TMGT_CONSOLE("OCCs are not active. The system will remain in "
281+ "safe mode (RC: 0x%04x for OCC%d)",
282+ cv_safeReturnCode,
283+ cv_safeOccInstance);
284 }
285
286 return err;
287diff --git a/src/usr/htmgt/htmgt_occ.H b/src/usr/htmgt/htmgt_occ.H
288index dec19b8..5ac545a 100644
289--- a/src/usr/htmgt/htmgt_occ.H
290+++ b/src/usr/htmgt/htmgt_occ.H
291@@ -507,6 +507,7 @@ namespace HTMGT
292 occList_t iv_occArray;
293 occStateId iv_state;
294 occStateId iv_targetState;
295+ uint8_t iv_resetCount;
296
297 /**
298 * @brief SRC that caused system to enter safe mode
299--
3001.8.2.2
301