blob: 2d5d90012ae627e60e97062caea62038eb894dd1 [file] [log] [blame]
/*
// Copyright (c) 2019 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
*/
Jason M. Bills6a2cb692019-08-06 11:03:49 -070016#include <peci.h>
Chen Yugange6c0f1c2019-08-02 20:36:42 +080017#include <systemd/sd-journal.h>
18
Jason M. Bills6a2cb692019-08-06 11:03:49 -070019#include <bitset>
Jason M. Bills1490b142019-07-01 15:48:43 -070020#include <boost/asio/posix/stream_descriptor.hpp>
21#include <gpiod.hpp>
22#include <iostream>
23#include <sdbusplus/asio/object_server.hpp>
24
25namespace host_error_monitor
26{
27static boost::asio::io_service io;
28static std::shared_ptr<sdbusplus::asio::connection> conn;
29
30static bool hostOff = true;
31
32const static constexpr size_t caterrTimeoutMs = 2000;
Jason M. Bills6a2cb692019-08-06 11:03:49 -070033const static constexpr size_t err2TimeoutMs = 90000;
Jason M. Bills1490b142019-07-01 15:48:43 -070034const static constexpr size_t crashdumpTimeoutS = 300;
35
36// Timers
37// Timer for CATERR asserted
38static boost::asio::steady_timer caterrAssertTimer(io);
Jason M. Bills6a2cb692019-08-06 11:03:49 -070039// Timer for ERR2 asserted
40static boost::asio::steady_timer err2AssertTimer(io);
Jason M. Bills1490b142019-07-01 15:48:43 -070041
42// GPIO Lines and Event Descriptors
43static gpiod::line caterrLine;
44static boost::asio::posix::stream_descriptor caterrEvent(io);
Jason M. Bills6a2cb692019-08-06 11:03:49 -070045static gpiod::line err2Line;
46static boost::asio::posix::stream_descriptor err2Event(io);
Chen Yugange6c0f1c2019-08-02 20:36:42 +080047//----------------------------------
48// PCH_BMC_THERMTRIP function related definition
49//----------------------------------
50// GPIO Lines and Event Descriptors
51static gpiod::line pchThermtripLine;
52static boost::asio::posix::stream_descriptor pchThermtripEvent(io);
Jason M. Bills1490b142019-07-01 15:48:43 -070053
Jason M. Billsa3397932019-08-06 11:07:21 -070054static void cpuIERRLog()
55{
56 sd_journal_send("MESSAGE=HostError: IERR", "PRIORITY=%i", LOG_INFO,
57 "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError",
58 "REDFISH_MESSAGE_ARGS=%s", "IERR", NULL);
59}
60
61static void cpuIERRLog(const int cpuNum)
62{
63 std::string msg = "IERR on CPU " + std::to_string(cpuNum + 1);
64
65 sd_journal_send("MESSAGE=HostError: %s", msg.c_str(), "PRIORITY=%i",
66 LOG_INFO, "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError",
67 "REDFISH_MESSAGE_ARGS=%s", msg.c_str(), NULL);
68}
69
70static void cpuIERRLog(const int cpuNum, const std::string& type)
71{
72 std::string msg = type + " IERR on CPU " + std::to_string(cpuNum + 1);
73
74 sd_journal_send("MESSAGE=HostError: %s", msg.c_str(), "PRIORITY=%i",
75 LOG_INFO, "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError",
76 "REDFISH_MESSAGE_ARGS=%s", msg.c_str(), NULL);
77}
78
Jason M. Bills6a2cb692019-08-06 11:03:49 -070079static void cpuERR2Log()
80{
81 sd_journal_send("MESSAGE=HostError: ERR2 Timeout", "PRIORITY=%i", LOG_INFO,
82 "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError",
83 "REDFISH_MESSAGE_ARGS=%s", "ERR2 Timeout", NULL);
84}
85
86static void cpuERR2Log(const int cpuNum)
87{
88 std::string msg = "ERR2 Timeout on CPU " + std::to_string(cpuNum + 1);
89
90 sd_journal_send("MESSAGE=HostError: %s", msg.c_str(), "PRIORITY=%i",
91 LOG_INFO, "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError",
92 "REDFISH_MESSAGE_ARGS=%s", msg.c_str(), NULL);
93}
94
Jason M. Billsa15c2522019-08-16 10:01:44 -070095static void initializeErrorState();
Jason M. Bills1490b142019-07-01 15:48:43 -070096static void initializeHostState()
97{
98 conn->async_method_call(
99 [](boost::system::error_code ec,
100 const std::variant<std::string>& property) {
101 if (ec)
102 {
103 return;
104 }
105 const std::string* state = std::get_if<std::string>(&property);
106 if (state == nullptr)
107 {
108 std::cerr << "Unable to read host state value\n";
109 return;
110 }
111 hostOff = *state == "xyz.openbmc_project.State.Host.HostState.Off";
Jason M. Billsa15c2522019-08-16 10:01:44 -0700112 // If the system is on, initialize the error state
113 if (!hostOff)
114 {
115 initializeErrorState();
116 }
Jason M. Bills1490b142019-07-01 15:48:43 -0700117 },
118 "xyz.openbmc_project.State.Host", "/xyz/openbmc_project/state/host0",
119 "org.freedesktop.DBus.Properties", "Get",
120 "xyz.openbmc_project.State.Host", "CurrentHostState");
121}
122
123static std::shared_ptr<sdbusplus::bus::match::match> startHostStateMonitor()
124{
125 return std::make_shared<sdbusplus::bus::match::match>(
126 *conn,
127 "type='signal',interface='org.freedesktop.DBus.Properties',"
128 "member='PropertiesChanged',arg0namespace='xyz.openbmc_project.State."
129 "Host'",
130 [](sdbusplus::message::message& msg) {
131 std::string interfaceName;
132 boost::container::flat_map<std::string, std::variant<std::string>>
133 propertiesChanged;
134 std::string state;
135 try
136 {
137 msg.read(interfaceName, propertiesChanged);
138 state =
139 std::get<std::string>(propertiesChanged.begin()->second);
140 }
141 catch (std::exception& e)
142 {
143 std::cerr << "Unable to read host state\n";
144 return;
145 }
146 hostOff = state == "xyz.openbmc_project.State.Host.HostState.Off";
147
148 // No host events should fire while off, so cancel any pending
149 // timers
150 if (hostOff)
151 {
152 caterrAssertTimer.cancel();
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700153 err2AssertTimer.cancel();
Jason M. Bills1490b142019-07-01 15:48:43 -0700154 }
155 });
156}
157
158static bool requestGPIOEvents(
159 const std::string& name, const std::function<void()>& handler,
160 gpiod::line& gpioLine,
161 boost::asio::posix::stream_descriptor& gpioEventDescriptor)
162{
163 // Find the GPIO line
164 gpioLine = gpiod::find_line(name);
165 if (!gpioLine)
166 {
167 std::cerr << "Failed to find the " << name << " line\n";
168 return false;
169 }
170
171 try
172 {
173 gpioLine.request(
174 {"host-error-monitor", gpiod::line_request::EVENT_BOTH_EDGES});
175 }
176 catch (std::exception&)
177 {
178 std::cerr << "Failed to request events for " << name << "\n";
179 return false;
180 }
181
182 int gpioLineFd = gpioLine.event_get_fd();
183 if (gpioLineFd < 0)
184 {
185 std::cerr << "Failed to get " << name << " fd\n";
186 return false;
187 }
188
189 gpioEventDescriptor.assign(gpioLineFd);
190
191 gpioEventDescriptor.async_wait(
192 boost::asio::posix::stream_descriptor::wait_read,
193 [&name, handler](const boost::system::error_code ec) {
194 if (ec)
195 {
196 std::cerr << name << " fd handler error: " << ec.message()
197 << "\n";
198 return;
199 }
200 handler();
201 });
202 return true;
203}
204
205static void startPowerCycle()
206{
207 conn->async_method_call(
208 [](boost::system::error_code ec) {
209 if (ec)
210 {
211 std::cerr << "failed to set Chassis State\n";
212 }
213 },
214 "xyz.openbmc_project.State.Chassis",
215 "/xyz/openbmc_project/state/chassis0",
216 "org.freedesktop.DBus.Properties", "Set",
217 "xyz.openbmc_project.State.Chassis", "RequestedPowerTransition",
218 std::variant<std::string>{
219 "xyz.openbmc_project.State.Chassis.Transition.PowerCycle"});
220}
221
222static void startCrashdumpAndRecovery(bool recoverSystem)
223{
224 std::cout << "Starting crashdump\n";
225 static std::shared_ptr<sdbusplus::bus::match::match> crashdumpCompleteMatch;
226 static boost::asio::steady_timer crashdumpTimer(io);
227
228 crashdumpCompleteMatch = std::make_shared<sdbusplus::bus::match::match>(
229 *conn,
230 "type='signal',interface='org.freedesktop.DBus.Properties',"
231 "member='PropertiesChanged',arg0namespace='com.intel.crashdump'",
232 [recoverSystem](sdbusplus::message::message& msg) {
233 crashdumpTimer.cancel();
234 std::cout << "Crashdump completed\n";
235 if (recoverSystem)
236 {
237 std::cout << "Recovering the system\n";
238 startPowerCycle();
239 }
240 crashdumpCompleteMatch.reset();
241 });
242
243 crashdumpTimer.expires_after(std::chrono::seconds(crashdumpTimeoutS));
244 crashdumpTimer.async_wait([](const boost::system::error_code ec) {
245 if (ec)
246 {
247 // operation_aborted is expected if timer is canceled
248 if (ec != boost::asio::error::operation_aborted)
249 {
250 std::cerr << "Crashdump async_wait failed: " << ec.message()
251 << "\n";
252 }
253 std::cout << "Crashdump timer canceled\n";
254 return;
255 }
256 std::cerr << "Crashdump failed to complete before timeout\n";
257 crashdumpCompleteMatch.reset();
258 });
259
260 conn->async_method_call(
261 [](boost::system::error_code ec) {
262 if (ec)
263 {
264 std::cerr << "failed to start Crashdump\n";
265 crashdumpTimer.cancel();
266 crashdumpCompleteMatch.reset();
267 }
268 },
269 "com.intel.crashdump", "/com/intel/crashdump",
270 "com.intel.crashdump.Stored", "GenerateStoredLog");
271}
272
Jason M. Billsa3397932019-08-06 11:07:21 -0700273static bool checkIERRCPUs()
274{
275 bool cpuIERRFound = false;
276 for (int cpu = 0, addr = MIN_CLIENT_ADDR; addr <= MAX_CLIENT_ADDR;
277 cpu++, addr++)
278 {
279 uint8_t cc = 0;
280 CPUModel model{};
281 uint8_t stepping = 0;
282 if (peci_GetCPUID(addr, &model, &stepping, &cc) != PECI_CC_SUCCESS)
283 {
284 std::cerr << "Cannot get CPUID!\n";
285 continue;
286 }
287
288 switch (model)
289 {
290 case skx:
291 {
292 // First check the MCA_ERR_SRC_LOG to see if this is the CPU
293 // that caused the IERR
294 uint32_t mcaErrSrcLog = 0;
295 if (peci_RdPkgConfig(addr, 0, 5, 4, (uint8_t*)&mcaErrSrcLog,
296 &cc) != PECI_CC_SUCCESS)
297 {
298 continue;
299 }
300 // Check MSMI_INTERNAL (20) and IERR_INTERNAL (27)
301 if ((mcaErrSrcLog & (1 << 20)) || (mcaErrSrcLog & (1 << 27)))
302 {
303 // TODO: Light the CPU fault LED?
304 cpuIERRFound = true;
305 // Next check if it's a CPU/VR mismatch by reading the
306 // IA32_MC4_STATUS MSR (0x411)
307 uint64_t mc4Status = 0;
308 if (peci_RdIAMSR(addr, 0, 0x411, &mc4Status, &cc) !=
309 PECI_CC_SUCCESS)
310 {
311 continue;
312 }
313 // Check MSEC bits 31:24 for
314 // MCA_SVID_VCCIN_VR_ICC_MAX_FAILURE (0x40),
315 // MCA_SVID_VCCIN_VR_VOUT_FAILURE (0x42), or
316 // MCA_SVID_CPU_VR_CAPABILITY_ERROR (0x43)
317 if ((mc4Status & (0x40 << 24)) ||
318 (mc4Status & (0x42 << 24)) ||
319 (mc4Status & (0x43 << 24)))
320 {
321 cpuIERRLog(cpu, "CPU/VR Mismatch");
322 continue;
323 }
324
325 // Next check if it's a Core FIVR fault by looking for a
326 // non-zero value of CORE_FIVR_ERR_LOG (B(1) D30 F2 offset
327 // 80h)
328 uint32_t coreFIVRErrLog = 0;
329 if (peci_RdPCIConfigLocal(
330 addr, 1, 30, 2, 0x80, sizeof(uint32_t),
331 (uint8_t*)&coreFIVRErrLog, &cc) != PECI_CC_SUCCESS)
332 {
333 continue;
334 }
335 if (coreFIVRErrLog)
336 {
337 cpuIERRLog(cpu, "Core FIVR Fault");
338 continue;
339 }
340
341 // Next check if it's an Uncore FIVR fault by looking for a
342 // non-zero value of UNCORE_FIVR_ERR_LOG (B(1) D30 F2 offset
343 // 84h)
344 uint32_t uncoreFIVRErrLog = 0;
345 if (peci_RdPCIConfigLocal(addr, 1, 30, 2, 0x84,
346 sizeof(uint32_t),
347 (uint8_t*)&uncoreFIVRErrLog,
348 &cc) != PECI_CC_SUCCESS)
349 {
350 continue;
351 }
352 if (uncoreFIVRErrLog)
353 {
354 cpuIERRLog(cpu, "Uncore FIVR Fault");
355 continue;
356 }
357
358 // Last if CORE_FIVR_ERR_LOG and UNCORE_FIVR_ERR_LOG are
359 // both zero, but MSEC bits 31:24 have either
360 // MCA_FIVR_CATAS_OVERVOL_FAULT (0x51) or
361 // MCA_FIVR_CATAS_OVERCUR_FAULT (0x52), then log it as an
362 // uncore FIVR fault
363 if (!coreFIVRErrLog && !uncoreFIVRErrLog &&
364 ((mc4Status & (0x51 << 24)) ||
365 (mc4Status & (0x52 << 24))))
366 {
367 cpuIERRLog(cpu, "Uncore FIVR Fault");
368 continue;
369 }
370 cpuIERRLog(cpu);
371 }
372 break;
373 }
374 case icx:
375 {
376 // First check the MCA_ERR_SRC_LOG to see if this is the CPU
377 // that caused the IERR
378 uint32_t mcaErrSrcLog = 0;
379 if (peci_RdPkgConfig(addr, 0, 5, 4, (uint8_t*)&mcaErrSrcLog,
380 &cc) != PECI_CC_SUCCESS)
381 {
382 continue;
383 }
384 // Check MSMI_INTERNAL (20) and IERR_INTERNAL (27)
385 if ((mcaErrSrcLog & (1 << 20)) || (mcaErrSrcLog & (1 << 27)))
386 {
387 // TODO: Light the CPU fault LED?
388 cpuIERRFound = true;
389 // Next check if it's a CPU/VR mismatch by reading the
390 // IA32_MC4_STATUS MSR (0x411)
391 uint64_t mc4Status = 0;
392 if (peci_RdIAMSR(addr, 0, 0x411, &mc4Status, &cc) !=
393 PECI_CC_SUCCESS)
394 {
395 continue;
396 }
397 // TODO: Update MSEC/MSCOD_31_24 check
398 // Check MSEC bits 31:24 for
399 // MCA_SVID_VCCIN_VR_ICC_MAX_FAILURE (0x40),
400 // MCA_SVID_VCCIN_VR_VOUT_FAILURE (0x42), or
401 // MCA_SVID_CPU_VR_CAPABILITY_ERROR (0x43)
402 if ((mc4Status & (0x40 << 24)) ||
403 (mc4Status & (0x42 << 24)) ||
404 (mc4Status & (0x43 << 24)))
405 {
406 cpuIERRLog(cpu, "CPU/VR Mismatch");
407 continue;
408 }
409
410 // Next check if it's a Core FIVR fault by looking for a
411 // non-zero value of CORE_FIVR_ERR_LOG (B(31) D30 F2 offsets
412 // C0h and C4h) (Note: Bus 31 is accessed on PECI as bus 14)
413 uint32_t coreFIVRErrLog0 = 0;
414 uint32_t coreFIVRErrLog1 = 0;
415 if (peci_RdEndPointConfigPciLocal(
416 addr, 0, 14, 30, 2, 0xC0, sizeof(uint32_t),
417 (uint8_t*)&coreFIVRErrLog0, &cc) != PECI_CC_SUCCESS)
418 {
419 continue;
420 }
421 if (peci_RdEndPointConfigPciLocal(
422 addr, 0, 14, 30, 2, 0xC4, sizeof(uint32_t),
423 (uint8_t*)&coreFIVRErrLog1, &cc) != PECI_CC_SUCCESS)
424 {
425 continue;
426 }
427 if (coreFIVRErrLog0 || coreFIVRErrLog1)
428 {
429 cpuIERRLog(cpu, "Core FIVR Fault");
430 continue;
431 }
432
433 // Next check if it's an Uncore FIVR fault by looking for a
434 // non-zero value of UNCORE_FIVR_ERR_LOG (B(31) D30 F2
435 // offset 84h) (Note: Bus 31 is accessed on PECI as bus 14)
436 uint32_t uncoreFIVRErrLog = 0;
437 if (peci_RdEndPointConfigPciLocal(
438 addr, 0, 14, 30, 2, 0x84, sizeof(uint32_t),
439 (uint8_t*)&uncoreFIVRErrLog,
440 &cc) != PECI_CC_SUCCESS)
441 {
442 continue;
443 }
444 if (uncoreFIVRErrLog)
445 {
446 cpuIERRLog(cpu, "Uncore FIVR Fault");
447 continue;
448 }
449
450 // TODO: Update MSEC/MSCOD_31_24 check
451 // Last if CORE_FIVR_ERR_LOG and UNCORE_FIVR_ERR_LOG are
452 // both zero, but MSEC bits 31:24 have either
453 // MCA_FIVR_CATAS_OVERVOL_FAULT (0x51) or
454 // MCA_FIVR_CATAS_OVERCUR_FAULT (0x52), then log it as an
455 // uncore FIVR fault
456 if (!coreFIVRErrLog0 && !coreFIVRErrLog1 &&
457 !uncoreFIVRErrLog &&
458 ((mc4Status & (0x51 << 24)) ||
459 (mc4Status & (0x52 << 24))))
460 {
461 cpuIERRLog(cpu, "Uncore FIVR Fault");
462 continue;
463 }
464 cpuIERRLog(cpu);
465 }
466 break;
467 }
468 }
469 }
470 return cpuIERRFound;
471}
472
Jason M. Billsa15c2522019-08-16 10:01:44 -0700473static void caterrAssertHandler()
474{
Jason M. Billsa15c2522019-08-16 10:01:44 -0700475 caterrAssertTimer.expires_after(std::chrono::milliseconds(caterrTimeoutMs));
476 caterrAssertTimer.async_wait([](const boost::system::error_code ec) {
477 if (ec)
478 {
479 // operation_aborted is expected if timer is canceled
480 // before completion.
481 if (ec != boost::asio::error::operation_aborted)
482 {
483 std::cerr << "caterr timeout async_wait failed: "
484 << ec.message() << "\n";
485 }
Jason M. Billsa15c2522019-08-16 10:01:44 -0700486 return;
487 }
Jason M. Billsa3397932019-08-06 11:07:21 -0700488 std::cerr << "CATERR asserted for " << std::to_string(caterrTimeoutMs)
489 << " ms\n";
490 if (!checkIERRCPUs())
491 {
492 cpuIERRLog();
493 }
Jason M. Billsa15c2522019-08-16 10:01:44 -0700494 conn->async_method_call(
495 [](boost::system::error_code ec,
496 const std::variant<bool>& property) {
497 if (ec)
498 {
499 return;
500 }
501 const bool* reset = std::get_if<bool>(&property);
502 if (reset == nullptr)
503 {
504 std::cerr << "Unable to read reset on CATERR value\n";
505 return;
506 }
507 startCrashdumpAndRecovery(*reset);
508 },
509 "xyz.openbmc_project.Settings",
510 "/xyz/openbmc_project/control/processor_error_config",
511 "org.freedesktop.DBus.Properties", "Get",
512 "xyz.openbmc_project.Control.Processor.ErrConfig", "ResetOnCATERR");
513 });
514}
515
Jason M. Bills1490b142019-07-01 15:48:43 -0700516static void caterrHandler()
517{
518 if (!hostOff)
519 {
520 gpiod::line_event gpioLineEvent = caterrLine.event_read();
521
522 bool caterr =
523 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
524 if (caterr)
525 {
Jason M. Billsa15c2522019-08-16 10:01:44 -0700526 caterrAssertHandler();
Jason M. Bills1490b142019-07-01 15:48:43 -0700527 }
528 else
529 {
530 caterrAssertTimer.cancel();
531 }
532 }
533 caterrEvent.async_wait(boost::asio::posix::stream_descriptor::wait_read,
534 [](const boost::system::error_code ec) {
535 if (ec)
536 {
537 std::cerr << "caterr handler error: "
538 << ec.message() << "\n";
539 return;
540 }
541 caterrHandler();
542 });
543}
Chen Yugange6c0f1c2019-08-02 20:36:42 +0800544static void pchThermtripHandler()
545{
546 if (!hostOff)
547 {
548 gpiod::line_event gpioLineEvent = pchThermtripLine.event_read();
549
550 bool pchThermtrip =
551 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
552 if (pchThermtrip)
553 {
554 std::cout << "PCH Thermal trip detected \n";
555 // log to redfish, call API
556 sd_journal_send("MESSAGE=SsbThermalTrip: SSB Thermal trip",
557 "PRIORITY=%i", LOG_INFO, "REDFISH_MESSAGE_ID=%s",
558 "OpenBMC.0.1.SsbThermalTrip", NULL);
559 }
560 }
561 pchThermtripEvent.async_wait(
562 boost::asio::posix::stream_descriptor::wait_read,
563 [](const boost::system::error_code ec) {
564 if (ec)
565 {
566 std::cerr << "PCH Thermal trip handler error: " << ec.message()
567 << "\n";
568 return;
569 }
570 pchThermtripHandler();
571 });
572}
573
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700574static std::bitset<MAX_CPUS> checkERR2CPUs()
575{
576 std::bitset<MAX_CPUS> err2CPUs = 0;
577 for (int cpu = 0, addr = MIN_CLIENT_ADDR; addr <= MAX_CLIENT_ADDR;
578 cpu++, addr++)
579 {
580 if (peci_Ping(addr) == PECI_CC_SUCCESS)
581 {
582 uint8_t cc = 0;
583 CPUModel model{};
584 uint8_t stepping = 0;
585 if (peci_GetCPUID(addr, &model, &stepping, &cc) != PECI_CC_SUCCESS)
586 {
587 std::cerr << "Cannot get CPUID!\n";
588 continue;
589 }
590
591 switch (model)
592 {
593 case skx:
594 {
595 // Check the ERRPINSTS to see if this is the CPU that caused
596 // the ERR2 (B(0) D8 F0 offset 210h)
597 uint32_t errpinsts = 0;
598 if (peci_RdPCIConfigLocal(
599 addr, 0, 8, 0, 0x210, sizeof(uint32_t),
600 (uint8_t*)&errpinsts, &cc) == PECI_CC_SUCCESS)
601 {
602 err2CPUs[cpu] = (errpinsts & err2Sts) != 0;
603 }
604 break;
605 }
606 case icx:
607 {
608 // Check the ERRPINSTS to see if this is the CPU that caused
609 // the ERR2 (B(30) D0 F3 offset 274h) (Note: Bus 30 is
610 // accessed on PECI as bus 13)
611 uint32_t errpinsts = 0;
612 if (peci_RdEndPointConfigPciLocal(
613 addr, 0, 13, 0, 3, 0x274, sizeof(uint32_t),
614 (uint8_t*)&errpinsts, &cc) == PECI_CC_SUCCESS)
615 {
616 err2CPUs[cpu] = (errpinsts & err2Sts) != 0;
617 }
618 break;
619 }
620 }
621 }
622 }
623 return err2CPUs;
624}
625
626static void err2AssertHandler()
627{
628 // ERR2 status is not guaranteed through the timeout, so save which
629 // CPUs have asserted ERR2 now
630 std::bitset<MAX_CPUS> err2CPUs = checkERR2CPUs();
631 err2AssertTimer.expires_after(std::chrono::milliseconds(err2TimeoutMs));
632 err2AssertTimer.async_wait([err2CPUs](const boost::system::error_code ec) {
633 if (ec)
634 {
635 // operation_aborted is expected if timer is canceled before
636 // completion.
637 if (ec != boost::asio::error::operation_aborted)
638 {
639 std::cerr << "err2 timeout async_wait failed: " << ec.message()
640 << "\n";
641 }
642 return;
643 }
644 std::cerr << "ERR2 asserted for " << std::to_string(err2TimeoutMs)
645 << " ms\n";
646 if (err2CPUs.count())
647 {
648 for (int i = 0; i < err2CPUs.size(); i++)
649 {
650 if (err2CPUs[i])
651 {
652 cpuERR2Log(i);
653 }
654 }
655 }
656 else
657 {
658 cpuERR2Log();
659 }
660 conn->async_method_call(
661 [](boost::system::error_code ec,
662 const std::variant<bool>& property) {
663 if (ec)
664 {
665 return;
666 }
667 const bool* reset = std::get_if<bool>(&property);
668 if (reset == nullptr)
669 {
670 std::cerr << "Unable to read reset on ERR2 value\n";
671 return;
672 }
673 startCrashdumpAndRecovery(*reset);
674 },
675 "xyz.openbmc_project.Settings",
676 "/xyz/openbmc_project/control/processor_error_config",
677 "org.freedesktop.DBus.Properties", "Get",
678 "xyz.openbmc_project.Control.Processor.ErrConfig", "ResetOnERR2");
679 });
680}
681
682static void err2Handler()
683{
684 if (!hostOff)
685 {
686 gpiod::line_event gpioLineEvent = err2Line.event_read();
687
688 bool err2 = gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
689 if (err2)
690 {
691 err2AssertHandler();
692 }
693 else
694 {
695 err2AssertTimer.cancel();
696 }
697 }
698 err2Event.async_wait(boost::asio::posix::stream_descriptor::wait_read,
699 [](const boost::system::error_code ec) {
700 if (ec)
701 {
702 std::cerr
703 << "err2 handler error: " << ec.message()
704 << "\n";
705 return;
706 }
707 err2Handler();
708 });
709}
710
Jason M. Billsa15c2522019-08-16 10:01:44 -0700711static void initializeErrorState()
712{
713 // Handle CPU_CATERR if it's asserted now
714 if (caterrLine.get_value() == 0)
715 {
716 caterrAssertHandler();
717 }
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700718
719 // Handle CPU_ERR2 if it's asserted now
720 if (err2Line.get_value() == 0)
721 {
722 err2AssertHandler();
723 }
Jason M. Billsa15c2522019-08-16 10:01:44 -0700724}
Jason M. Bills1490b142019-07-01 15:48:43 -0700725} // namespace host_error_monitor
726
727int main(int argc, char* argv[])
728{
729 // setup connection to dbus
730 host_error_monitor::conn =
731 std::make_shared<sdbusplus::asio::connection>(host_error_monitor::io);
732
733 // Host Error Monitor Object
734 host_error_monitor::conn->request_name(
735 "xyz.openbmc_project.HostErrorMonitor");
736 sdbusplus::asio::object_server server =
737 sdbusplus::asio::object_server(host_error_monitor::conn);
738
739 // Start tracking host state
740 std::shared_ptr<sdbusplus::bus::match::match> hostStateMonitor =
741 host_error_monitor::startHostStateMonitor();
742
743 // Initialize the host state
744 host_error_monitor::initializeHostState();
745
746 // Request CPU_CATERR GPIO events
747 if (!host_error_monitor::requestGPIOEvents(
748 "CPU_CATERR", host_error_monitor::caterrHandler,
749 host_error_monitor::caterrLine, host_error_monitor::caterrEvent))
750 {
751 return -1;
752 }
753
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700754 // Request CPU_ERR2 GPIO events
755 if (!host_error_monitor::requestGPIOEvents(
756 "CPU_ERR2", host_error_monitor::err2Handler,
757 host_error_monitor::err2Line, host_error_monitor::err2Event))
758 {
759 return -1;
760 }
761
Chen Yugange6c0f1c2019-08-02 20:36:42 +0800762 // Request PCH_BMC_THERMTRIP GPIO events
763 if (!host_error_monitor::requestGPIOEvents(
764 "PCH_BMC_THERMTRIP", host_error_monitor::pchThermtripHandler,
765 host_error_monitor::pchThermtripLine,
766 host_error_monitor::pchThermtripEvent))
767 {
768 return -1;
769 }
770
Jason M. Bills1490b142019-07-01 15:48:43 -0700771 host_error_monitor::io.run();
772
773 return 0;
774}