blob: bd2ac9c73c07eb2aa7562bd881fe5bf5b672fa49 [file] [log] [blame]
Jason M. Bills1490b142019-07-01 15:48:43 -07001/*
2// Copyright (c) 2019 Intel Corporation
3//
4// Licensed under the Apache License, Version 2.0 (the "License");
5// you may not use this file except in compliance with the License.
6// You may obtain a copy of the License at
7//
8// http://www.apache.org/licenses/LICENSE-2.0
9//
10// Unless required by applicable law or agreed to in writing, software
11// distributed under the License is distributed on an "AS IS" BASIS,
12// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13// See the License for the specific language governing permissions and
14// limitations under the License.
15*/
Jason M. Bills6a2cb692019-08-06 11:03:49 -070016#include <peci.h>
Chen Yugange6c0f1c2019-08-02 20:36:42 +080017#include <systemd/sd-journal.h>
18
Jason M. Bills6a2cb692019-08-06 11:03:49 -070019#include <bitset>
Jason M. Bills1490b142019-07-01 15:48:43 -070020#include <boost/asio/posix/stream_descriptor.hpp>
21#include <gpiod.hpp>
22#include <iostream>
23#include <sdbusplus/asio/object_server.hpp>
24
25namespace host_error_monitor
26{
27static boost::asio::io_service io;
28static std::shared_ptr<sdbusplus::asio::connection> conn;
29
30static bool hostOff = true;
31
32const static constexpr size_t caterrTimeoutMs = 2000;
Jason M. Bills6a2cb692019-08-06 11:03:49 -070033const static constexpr size_t err2TimeoutMs = 90000;
Jason M. Bills89922f82019-08-06 11:10:02 -070034const static constexpr size_t smiTimeoutMs = 90000;
Jason M. Bills1490b142019-07-01 15:48:43 -070035const static constexpr size_t crashdumpTimeoutS = 300;
36
37// Timers
38// Timer for CATERR asserted
39static boost::asio::steady_timer caterrAssertTimer(io);
Jason M. Bills6a2cb692019-08-06 11:03:49 -070040// Timer for ERR2 asserted
41static boost::asio::steady_timer err2AssertTimer(io);
Jason M. Bills89922f82019-08-06 11:10:02 -070042// Timer for SMI asserted
43static boost::asio::steady_timer smiAssertTimer(io);
Jason M. Bills1490b142019-07-01 15:48:43 -070044
45// GPIO Lines and Event Descriptors
46static gpiod::line caterrLine;
47static boost::asio::posix::stream_descriptor caterrEvent(io);
Jason M. Bills6a2cb692019-08-06 11:03:49 -070048static gpiod::line err2Line;
49static boost::asio::posix::stream_descriptor err2Event(io);
Jason M. Bills89922f82019-08-06 11:10:02 -070050static gpiod::line smiLine;
51static boost::asio::posix::stream_descriptor smiEvent(io);
Chen Yugange6c0f1c2019-08-02 20:36:42 +080052//----------------------------------
53// PCH_BMC_THERMTRIP function related definition
54//----------------------------------
55// GPIO Lines and Event Descriptors
56static gpiod::line pchThermtripLine;
57static boost::asio::posix::stream_descriptor pchThermtripEvent(io);
Jason M. Bills1490b142019-07-01 15:48:43 -070058
Jason M. Billsa3397932019-08-06 11:07:21 -070059static void cpuIERRLog()
60{
61 sd_journal_send("MESSAGE=HostError: IERR", "PRIORITY=%i", LOG_INFO,
62 "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError",
63 "REDFISH_MESSAGE_ARGS=%s", "IERR", NULL);
64}
65
66static void cpuIERRLog(const int cpuNum)
67{
68 std::string msg = "IERR on CPU " + std::to_string(cpuNum + 1);
69
70 sd_journal_send("MESSAGE=HostError: %s", msg.c_str(), "PRIORITY=%i",
71 LOG_INFO, "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError",
72 "REDFISH_MESSAGE_ARGS=%s", msg.c_str(), NULL);
73}
74
75static void cpuIERRLog(const int cpuNum, const std::string& type)
76{
77 std::string msg = type + " IERR on CPU " + std::to_string(cpuNum + 1);
78
79 sd_journal_send("MESSAGE=HostError: %s", msg.c_str(), "PRIORITY=%i",
80 LOG_INFO, "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError",
81 "REDFISH_MESSAGE_ARGS=%s", msg.c_str(), NULL);
82}
83
Jason M. Bills6a2cb692019-08-06 11:03:49 -070084static void cpuERR2Log()
85{
86 sd_journal_send("MESSAGE=HostError: ERR2 Timeout", "PRIORITY=%i", LOG_INFO,
87 "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError",
88 "REDFISH_MESSAGE_ARGS=%s", "ERR2 Timeout", NULL);
89}
90
91static void cpuERR2Log(const int cpuNum)
92{
93 std::string msg = "ERR2 Timeout on CPU " + std::to_string(cpuNum + 1);
94
95 sd_journal_send("MESSAGE=HostError: %s", msg.c_str(), "PRIORITY=%i",
96 LOG_INFO, "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError",
97 "REDFISH_MESSAGE_ARGS=%s", msg.c_str(), NULL);
98}
99
Jason M. Bills89922f82019-08-06 11:10:02 -0700100static void smiTimeoutLog()
101{
102 sd_journal_send("MESSAGE=HostError: SMI Timeout", "PRIORITY=%i", LOG_INFO,
103 "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError",
104 "REDFISH_MESSAGE_ARGS=%s", "SMI Timeout", NULL);
105}
106
Jason M. Billsa15c2522019-08-16 10:01:44 -0700107static void initializeErrorState();
Jason M. Bills1490b142019-07-01 15:48:43 -0700108static void initializeHostState()
109{
110 conn->async_method_call(
111 [](boost::system::error_code ec,
112 const std::variant<std::string>& property) {
113 if (ec)
114 {
115 return;
116 }
117 const std::string* state = std::get_if<std::string>(&property);
118 if (state == nullptr)
119 {
120 std::cerr << "Unable to read host state value\n";
121 return;
122 }
123 hostOff = *state == "xyz.openbmc_project.State.Host.HostState.Off";
Jason M. Billsa15c2522019-08-16 10:01:44 -0700124 // If the system is on, initialize the error state
125 if (!hostOff)
126 {
127 initializeErrorState();
128 }
Jason M. Bills1490b142019-07-01 15:48:43 -0700129 },
130 "xyz.openbmc_project.State.Host", "/xyz/openbmc_project/state/host0",
131 "org.freedesktop.DBus.Properties", "Get",
132 "xyz.openbmc_project.State.Host", "CurrentHostState");
133}
134
135static std::shared_ptr<sdbusplus::bus::match::match> startHostStateMonitor()
136{
137 return std::make_shared<sdbusplus::bus::match::match>(
138 *conn,
139 "type='signal',interface='org.freedesktop.DBus.Properties',"
140 "member='PropertiesChanged',arg0namespace='xyz.openbmc_project.State."
141 "Host'",
142 [](sdbusplus::message::message& msg) {
143 std::string interfaceName;
144 boost::container::flat_map<std::string, std::variant<std::string>>
145 propertiesChanged;
146 std::string state;
147 try
148 {
149 msg.read(interfaceName, propertiesChanged);
150 state =
151 std::get<std::string>(propertiesChanged.begin()->second);
152 }
153 catch (std::exception& e)
154 {
155 std::cerr << "Unable to read host state\n";
156 return;
157 }
158 hostOff = state == "xyz.openbmc_project.State.Host.HostState.Off";
159
160 // No host events should fire while off, so cancel any pending
161 // timers
162 if (hostOff)
163 {
164 caterrAssertTimer.cancel();
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700165 err2AssertTimer.cancel();
Jason M. Bills89922f82019-08-06 11:10:02 -0700166 smiAssertTimer.cancel();
Jason M. Bills1490b142019-07-01 15:48:43 -0700167 }
168 });
169}
170
171static bool requestGPIOEvents(
172 const std::string& name, const std::function<void()>& handler,
173 gpiod::line& gpioLine,
174 boost::asio::posix::stream_descriptor& gpioEventDescriptor)
175{
176 // Find the GPIO line
177 gpioLine = gpiod::find_line(name);
178 if (!gpioLine)
179 {
180 std::cerr << "Failed to find the " << name << " line\n";
181 return false;
182 }
183
184 try
185 {
186 gpioLine.request(
187 {"host-error-monitor", gpiod::line_request::EVENT_BOTH_EDGES});
188 }
189 catch (std::exception&)
190 {
191 std::cerr << "Failed to request events for " << name << "\n";
192 return false;
193 }
194
195 int gpioLineFd = gpioLine.event_get_fd();
196 if (gpioLineFd < 0)
197 {
198 std::cerr << "Failed to get " << name << " fd\n";
199 return false;
200 }
201
202 gpioEventDescriptor.assign(gpioLineFd);
203
204 gpioEventDescriptor.async_wait(
205 boost::asio::posix::stream_descriptor::wait_read,
206 [&name, handler](const boost::system::error_code ec) {
207 if (ec)
208 {
209 std::cerr << name << " fd handler error: " << ec.message()
210 << "\n";
211 return;
212 }
213 handler();
214 });
215 return true;
216}
217
218static void startPowerCycle()
219{
220 conn->async_method_call(
221 [](boost::system::error_code ec) {
222 if (ec)
223 {
224 std::cerr << "failed to set Chassis State\n";
225 }
226 },
227 "xyz.openbmc_project.State.Chassis",
228 "/xyz/openbmc_project/state/chassis0",
229 "org.freedesktop.DBus.Properties", "Set",
230 "xyz.openbmc_project.State.Chassis", "RequestedPowerTransition",
231 std::variant<std::string>{
232 "xyz.openbmc_project.State.Chassis.Transition.PowerCycle"});
233}
234
235static void startCrashdumpAndRecovery(bool recoverSystem)
236{
237 std::cout << "Starting crashdump\n";
238 static std::shared_ptr<sdbusplus::bus::match::match> crashdumpCompleteMatch;
239 static boost::asio::steady_timer crashdumpTimer(io);
240
241 crashdumpCompleteMatch = std::make_shared<sdbusplus::bus::match::match>(
242 *conn,
243 "type='signal',interface='org.freedesktop.DBus.Properties',"
244 "member='PropertiesChanged',arg0namespace='com.intel.crashdump'",
245 [recoverSystem](sdbusplus::message::message& msg) {
246 crashdumpTimer.cancel();
247 std::cout << "Crashdump completed\n";
248 if (recoverSystem)
249 {
250 std::cout << "Recovering the system\n";
251 startPowerCycle();
252 }
253 crashdumpCompleteMatch.reset();
254 });
255
256 crashdumpTimer.expires_after(std::chrono::seconds(crashdumpTimeoutS));
257 crashdumpTimer.async_wait([](const boost::system::error_code ec) {
258 if (ec)
259 {
260 // operation_aborted is expected if timer is canceled
261 if (ec != boost::asio::error::operation_aborted)
262 {
263 std::cerr << "Crashdump async_wait failed: " << ec.message()
264 << "\n";
265 }
266 std::cout << "Crashdump timer canceled\n";
267 return;
268 }
269 std::cerr << "Crashdump failed to complete before timeout\n";
270 crashdumpCompleteMatch.reset();
271 });
272
273 conn->async_method_call(
274 [](boost::system::error_code ec) {
275 if (ec)
276 {
277 std::cerr << "failed to start Crashdump\n";
278 crashdumpTimer.cancel();
279 crashdumpCompleteMatch.reset();
280 }
281 },
282 "com.intel.crashdump", "/com/intel/crashdump",
283 "com.intel.crashdump.Stored", "GenerateStoredLog");
284}
285
Jason M. Billsa3397932019-08-06 11:07:21 -0700286static bool checkIERRCPUs()
287{
288 bool cpuIERRFound = false;
289 for (int cpu = 0, addr = MIN_CLIENT_ADDR; addr <= MAX_CLIENT_ADDR;
290 cpu++, addr++)
291 {
292 uint8_t cc = 0;
293 CPUModel model{};
294 uint8_t stepping = 0;
295 if (peci_GetCPUID(addr, &model, &stepping, &cc) != PECI_CC_SUCCESS)
296 {
297 std::cerr << "Cannot get CPUID!\n";
298 continue;
299 }
300
301 switch (model)
302 {
303 case skx:
304 {
305 // First check the MCA_ERR_SRC_LOG to see if this is the CPU
306 // that caused the IERR
307 uint32_t mcaErrSrcLog = 0;
308 if (peci_RdPkgConfig(addr, 0, 5, 4, (uint8_t*)&mcaErrSrcLog,
309 &cc) != PECI_CC_SUCCESS)
310 {
311 continue;
312 }
313 // Check MSMI_INTERNAL (20) and IERR_INTERNAL (27)
314 if ((mcaErrSrcLog & (1 << 20)) || (mcaErrSrcLog & (1 << 27)))
315 {
316 // TODO: Light the CPU fault LED?
317 cpuIERRFound = true;
318 // Next check if it's a CPU/VR mismatch by reading the
319 // IA32_MC4_STATUS MSR (0x411)
320 uint64_t mc4Status = 0;
321 if (peci_RdIAMSR(addr, 0, 0x411, &mc4Status, &cc) !=
322 PECI_CC_SUCCESS)
323 {
324 continue;
325 }
326 // Check MSEC bits 31:24 for
327 // MCA_SVID_VCCIN_VR_ICC_MAX_FAILURE (0x40),
328 // MCA_SVID_VCCIN_VR_VOUT_FAILURE (0x42), or
329 // MCA_SVID_CPU_VR_CAPABILITY_ERROR (0x43)
330 if ((mc4Status & (0x40 << 24)) ||
331 (mc4Status & (0x42 << 24)) ||
332 (mc4Status & (0x43 << 24)))
333 {
334 cpuIERRLog(cpu, "CPU/VR Mismatch");
335 continue;
336 }
337
338 // Next check if it's a Core FIVR fault by looking for a
339 // non-zero value of CORE_FIVR_ERR_LOG (B(1) D30 F2 offset
340 // 80h)
341 uint32_t coreFIVRErrLog = 0;
342 if (peci_RdPCIConfigLocal(
343 addr, 1, 30, 2, 0x80, sizeof(uint32_t),
344 (uint8_t*)&coreFIVRErrLog, &cc) != PECI_CC_SUCCESS)
345 {
346 continue;
347 }
348 if (coreFIVRErrLog)
349 {
350 cpuIERRLog(cpu, "Core FIVR Fault");
351 continue;
352 }
353
354 // Next check if it's an Uncore FIVR fault by looking for a
355 // non-zero value of UNCORE_FIVR_ERR_LOG (B(1) D30 F2 offset
356 // 84h)
357 uint32_t uncoreFIVRErrLog = 0;
358 if (peci_RdPCIConfigLocal(addr, 1, 30, 2, 0x84,
359 sizeof(uint32_t),
360 (uint8_t*)&uncoreFIVRErrLog,
361 &cc) != PECI_CC_SUCCESS)
362 {
363 continue;
364 }
365 if (uncoreFIVRErrLog)
366 {
367 cpuIERRLog(cpu, "Uncore FIVR Fault");
368 continue;
369 }
370
371 // Last if CORE_FIVR_ERR_LOG and UNCORE_FIVR_ERR_LOG are
372 // both zero, but MSEC bits 31:24 have either
373 // MCA_FIVR_CATAS_OVERVOL_FAULT (0x51) or
374 // MCA_FIVR_CATAS_OVERCUR_FAULT (0x52), then log it as an
375 // uncore FIVR fault
376 if (!coreFIVRErrLog && !uncoreFIVRErrLog &&
377 ((mc4Status & (0x51 << 24)) ||
378 (mc4Status & (0x52 << 24))))
379 {
380 cpuIERRLog(cpu, "Uncore FIVR Fault");
381 continue;
382 }
383 cpuIERRLog(cpu);
384 }
385 break;
386 }
387 case icx:
388 {
389 // First check the MCA_ERR_SRC_LOG to see if this is the CPU
390 // that caused the IERR
391 uint32_t mcaErrSrcLog = 0;
392 if (peci_RdPkgConfig(addr, 0, 5, 4, (uint8_t*)&mcaErrSrcLog,
393 &cc) != PECI_CC_SUCCESS)
394 {
395 continue;
396 }
397 // Check MSMI_INTERNAL (20) and IERR_INTERNAL (27)
398 if ((mcaErrSrcLog & (1 << 20)) || (mcaErrSrcLog & (1 << 27)))
399 {
400 // TODO: Light the CPU fault LED?
401 cpuIERRFound = true;
402 // Next check if it's a CPU/VR mismatch by reading the
403 // IA32_MC4_STATUS MSR (0x411)
404 uint64_t mc4Status = 0;
405 if (peci_RdIAMSR(addr, 0, 0x411, &mc4Status, &cc) !=
406 PECI_CC_SUCCESS)
407 {
408 continue;
409 }
410 // TODO: Update MSEC/MSCOD_31_24 check
411 // Check MSEC bits 31:24 for
412 // MCA_SVID_VCCIN_VR_ICC_MAX_FAILURE (0x40),
413 // MCA_SVID_VCCIN_VR_VOUT_FAILURE (0x42), or
414 // MCA_SVID_CPU_VR_CAPABILITY_ERROR (0x43)
415 if ((mc4Status & (0x40 << 24)) ||
416 (mc4Status & (0x42 << 24)) ||
417 (mc4Status & (0x43 << 24)))
418 {
419 cpuIERRLog(cpu, "CPU/VR Mismatch");
420 continue;
421 }
422
423 // Next check if it's a Core FIVR fault by looking for a
424 // non-zero value of CORE_FIVR_ERR_LOG (B(31) D30 F2 offsets
425 // C0h and C4h) (Note: Bus 31 is accessed on PECI as bus 14)
426 uint32_t coreFIVRErrLog0 = 0;
427 uint32_t coreFIVRErrLog1 = 0;
428 if (peci_RdEndPointConfigPciLocal(
429 addr, 0, 14, 30, 2, 0xC0, sizeof(uint32_t),
430 (uint8_t*)&coreFIVRErrLog0, &cc) != PECI_CC_SUCCESS)
431 {
432 continue;
433 }
434 if (peci_RdEndPointConfigPciLocal(
435 addr, 0, 14, 30, 2, 0xC4, sizeof(uint32_t),
436 (uint8_t*)&coreFIVRErrLog1, &cc) != PECI_CC_SUCCESS)
437 {
438 continue;
439 }
440 if (coreFIVRErrLog0 || coreFIVRErrLog1)
441 {
442 cpuIERRLog(cpu, "Core FIVR Fault");
443 continue;
444 }
445
446 // Next check if it's an Uncore FIVR fault by looking for a
447 // non-zero value of UNCORE_FIVR_ERR_LOG (B(31) D30 F2
448 // offset 84h) (Note: Bus 31 is accessed on PECI as bus 14)
449 uint32_t uncoreFIVRErrLog = 0;
450 if (peci_RdEndPointConfigPciLocal(
451 addr, 0, 14, 30, 2, 0x84, sizeof(uint32_t),
452 (uint8_t*)&uncoreFIVRErrLog,
453 &cc) != PECI_CC_SUCCESS)
454 {
455 continue;
456 }
457 if (uncoreFIVRErrLog)
458 {
459 cpuIERRLog(cpu, "Uncore FIVR Fault");
460 continue;
461 }
462
463 // TODO: Update MSEC/MSCOD_31_24 check
464 // Last if CORE_FIVR_ERR_LOG and UNCORE_FIVR_ERR_LOG are
465 // both zero, but MSEC bits 31:24 have either
466 // MCA_FIVR_CATAS_OVERVOL_FAULT (0x51) or
467 // MCA_FIVR_CATAS_OVERCUR_FAULT (0x52), then log it as an
468 // uncore FIVR fault
469 if (!coreFIVRErrLog0 && !coreFIVRErrLog1 &&
470 !uncoreFIVRErrLog &&
471 ((mc4Status & (0x51 << 24)) ||
472 (mc4Status & (0x52 << 24))))
473 {
474 cpuIERRLog(cpu, "Uncore FIVR Fault");
475 continue;
476 }
477 cpuIERRLog(cpu);
478 }
479 break;
480 }
481 }
482 }
483 return cpuIERRFound;
484}
485
Jason M. Billsa15c2522019-08-16 10:01:44 -0700486static void caterrAssertHandler()
487{
Jason M. Billsa15c2522019-08-16 10:01:44 -0700488 caterrAssertTimer.expires_after(std::chrono::milliseconds(caterrTimeoutMs));
489 caterrAssertTimer.async_wait([](const boost::system::error_code ec) {
490 if (ec)
491 {
492 // operation_aborted is expected if timer is canceled
493 // before completion.
494 if (ec != boost::asio::error::operation_aborted)
495 {
496 std::cerr << "caterr timeout async_wait failed: "
497 << ec.message() << "\n";
498 }
Jason M. Billsa15c2522019-08-16 10:01:44 -0700499 return;
500 }
Jason M. Billsa3397932019-08-06 11:07:21 -0700501 std::cerr << "CATERR asserted for " << std::to_string(caterrTimeoutMs)
502 << " ms\n";
503 if (!checkIERRCPUs())
504 {
505 cpuIERRLog();
506 }
Jason M. Billsa15c2522019-08-16 10:01:44 -0700507 conn->async_method_call(
508 [](boost::system::error_code ec,
509 const std::variant<bool>& property) {
510 if (ec)
511 {
512 return;
513 }
514 const bool* reset = std::get_if<bool>(&property);
515 if (reset == nullptr)
516 {
517 std::cerr << "Unable to read reset on CATERR value\n";
518 return;
519 }
520 startCrashdumpAndRecovery(*reset);
521 },
522 "xyz.openbmc_project.Settings",
523 "/xyz/openbmc_project/control/processor_error_config",
524 "org.freedesktop.DBus.Properties", "Get",
525 "xyz.openbmc_project.Control.Processor.ErrConfig", "ResetOnCATERR");
526 });
527}
528
Jason M. Bills1490b142019-07-01 15:48:43 -0700529static void caterrHandler()
530{
531 if (!hostOff)
532 {
533 gpiod::line_event gpioLineEvent = caterrLine.event_read();
534
535 bool caterr =
536 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
537 if (caterr)
538 {
Jason M. Billsa15c2522019-08-16 10:01:44 -0700539 caterrAssertHandler();
Jason M. Bills1490b142019-07-01 15:48:43 -0700540 }
541 else
542 {
543 caterrAssertTimer.cancel();
544 }
545 }
546 caterrEvent.async_wait(boost::asio::posix::stream_descriptor::wait_read,
547 [](const boost::system::error_code ec) {
548 if (ec)
549 {
550 std::cerr << "caterr handler error: "
551 << ec.message() << "\n";
552 return;
553 }
554 caterrHandler();
555 });
556}
Chen Yugange6c0f1c2019-08-02 20:36:42 +0800557static void pchThermtripHandler()
558{
559 if (!hostOff)
560 {
561 gpiod::line_event gpioLineEvent = pchThermtripLine.event_read();
562
563 bool pchThermtrip =
564 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
565 if (pchThermtrip)
566 {
567 std::cout << "PCH Thermal trip detected \n";
568 // log to redfish, call API
569 sd_journal_send("MESSAGE=SsbThermalTrip: SSB Thermal trip",
570 "PRIORITY=%i", LOG_INFO, "REDFISH_MESSAGE_ID=%s",
571 "OpenBMC.0.1.SsbThermalTrip", NULL);
572 }
573 }
574 pchThermtripEvent.async_wait(
575 boost::asio::posix::stream_descriptor::wait_read,
576 [](const boost::system::error_code ec) {
577 if (ec)
578 {
579 std::cerr << "PCH Thermal trip handler error: " << ec.message()
580 << "\n";
581 return;
582 }
583 pchThermtripHandler();
584 });
585}
586
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700587static std::bitset<MAX_CPUS> checkERR2CPUs()
588{
589 std::bitset<MAX_CPUS> err2CPUs = 0;
590 for (int cpu = 0, addr = MIN_CLIENT_ADDR; addr <= MAX_CLIENT_ADDR;
591 cpu++, addr++)
592 {
593 if (peci_Ping(addr) == PECI_CC_SUCCESS)
594 {
595 uint8_t cc = 0;
596 CPUModel model{};
597 uint8_t stepping = 0;
598 if (peci_GetCPUID(addr, &model, &stepping, &cc) != PECI_CC_SUCCESS)
599 {
600 std::cerr << "Cannot get CPUID!\n";
601 continue;
602 }
603
604 switch (model)
605 {
606 case skx:
607 {
608 // Check the ERRPINSTS to see if this is the CPU that caused
609 // the ERR2 (B(0) D8 F0 offset 210h)
610 uint32_t errpinsts = 0;
611 if (peci_RdPCIConfigLocal(
612 addr, 0, 8, 0, 0x210, sizeof(uint32_t),
613 (uint8_t*)&errpinsts, &cc) == PECI_CC_SUCCESS)
614 {
615 err2CPUs[cpu] = (errpinsts & err2Sts) != 0;
616 }
617 break;
618 }
619 case icx:
620 {
621 // Check the ERRPINSTS to see if this is the CPU that caused
622 // the ERR2 (B(30) D0 F3 offset 274h) (Note: Bus 30 is
623 // accessed on PECI as bus 13)
624 uint32_t errpinsts = 0;
625 if (peci_RdEndPointConfigPciLocal(
626 addr, 0, 13, 0, 3, 0x274, sizeof(uint32_t),
627 (uint8_t*)&errpinsts, &cc) == PECI_CC_SUCCESS)
628 {
629 err2CPUs[cpu] = (errpinsts & err2Sts) != 0;
630 }
631 break;
632 }
633 }
634 }
635 }
636 return err2CPUs;
637}
638
639static void err2AssertHandler()
640{
641 // ERR2 status is not guaranteed through the timeout, so save which
642 // CPUs have asserted ERR2 now
643 std::bitset<MAX_CPUS> err2CPUs = checkERR2CPUs();
644 err2AssertTimer.expires_after(std::chrono::milliseconds(err2TimeoutMs));
645 err2AssertTimer.async_wait([err2CPUs](const boost::system::error_code ec) {
646 if (ec)
647 {
648 // operation_aborted is expected if timer is canceled before
649 // completion.
650 if (ec != boost::asio::error::operation_aborted)
651 {
652 std::cerr << "err2 timeout async_wait failed: " << ec.message()
653 << "\n";
654 }
655 return;
656 }
657 std::cerr << "ERR2 asserted for " << std::to_string(err2TimeoutMs)
658 << " ms\n";
659 if (err2CPUs.count())
660 {
661 for (int i = 0; i < err2CPUs.size(); i++)
662 {
663 if (err2CPUs[i])
664 {
665 cpuERR2Log(i);
666 }
667 }
668 }
669 else
670 {
671 cpuERR2Log();
672 }
673 conn->async_method_call(
674 [](boost::system::error_code ec,
675 const std::variant<bool>& property) {
676 if (ec)
677 {
678 return;
679 }
680 const bool* reset = std::get_if<bool>(&property);
681 if (reset == nullptr)
682 {
683 std::cerr << "Unable to read reset on ERR2 value\n";
684 return;
685 }
686 startCrashdumpAndRecovery(*reset);
687 },
688 "xyz.openbmc_project.Settings",
689 "/xyz/openbmc_project/control/processor_error_config",
690 "org.freedesktop.DBus.Properties", "Get",
691 "xyz.openbmc_project.Control.Processor.ErrConfig", "ResetOnERR2");
692 });
693}
694
695static void err2Handler()
696{
697 if (!hostOff)
698 {
699 gpiod::line_event gpioLineEvent = err2Line.event_read();
700
701 bool err2 = gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
702 if (err2)
703 {
704 err2AssertHandler();
705 }
706 else
707 {
708 err2AssertTimer.cancel();
709 }
710 }
711 err2Event.async_wait(boost::asio::posix::stream_descriptor::wait_read,
712 [](const boost::system::error_code ec) {
713 if (ec)
714 {
715 std::cerr
716 << "err2 handler error: " << ec.message()
717 << "\n";
718 return;
719 }
720 err2Handler();
721 });
722}
723
Jason M. Bills89922f82019-08-06 11:10:02 -0700724static void smiAssertHandler()
725{
726 smiAssertTimer.expires_after(std::chrono::milliseconds(smiTimeoutMs));
727 smiAssertTimer.async_wait([](const boost::system::error_code ec) {
728 if (ec)
729 {
730 // operation_aborted is expected if timer is canceled before
731 // completion.
732 if (ec != boost::asio::error::operation_aborted)
733 {
734 std::cerr << "smi timeout async_wait failed: " << ec.message()
735 << "\n";
736 }
737 return;
738 }
739 std::cerr << "SMI asserted for " << std::to_string(smiTimeoutMs)
740 << " ms\n";
741 smiTimeoutLog();
742 conn->async_method_call(
743 [](boost::system::error_code ec,
744 const std::variant<bool>& property) {
745 if (ec)
746 {
747 return;
748 }
749 const bool* reset = std::get_if<bool>(&property);
750 if (reset == nullptr)
751 {
752 std::cerr << "Unable to read reset on SMI value\n";
753 return;
754 }
755 startCrashdumpAndRecovery(*reset);
756 },
757 "xyz.openbmc_project.Settings",
758 "/xyz/openbmc_project/control/bmc_reset_disables",
759 "org.freedesktop.DBus.Properties", "Get",
760 "xyz.openbmc_project.Control.ResetDisables", "ResetOnSMI");
761 });
762}
763
764static void smiHandler()
765{
766 if (!hostOff)
767 {
768 gpiod::line_event gpioLineEvent = smiLine.event_read();
769
770 bool smi = gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
771 if (smi)
772 {
773 smiAssertHandler();
774 }
775 else
776 {
777 smiAssertTimer.cancel();
778 }
779 }
780 smiEvent.async_wait(boost::asio::posix::stream_descriptor::wait_read,
781 [](const boost::system::error_code ec) {
782 if (ec)
783 {
784 std::cerr
785 << "smi handler error: " << ec.message()
786 << "\n";
787 return;
788 }
789 smiHandler();
790 });
791}
792
Jason M. Billsa15c2522019-08-16 10:01:44 -0700793static void initializeErrorState()
794{
795 // Handle CPU_CATERR if it's asserted now
796 if (caterrLine.get_value() == 0)
797 {
798 caterrAssertHandler();
799 }
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700800
801 // Handle CPU_ERR2 if it's asserted now
802 if (err2Line.get_value() == 0)
803 {
804 err2AssertHandler();
805 }
Jason M. Bills89922f82019-08-06 11:10:02 -0700806
807 // Handle SMI if it's asserted now
808 if (smiLine.get_value() == 0)
809 {
810 smiAssertHandler();
811 }
Jason M. Billsa15c2522019-08-16 10:01:44 -0700812}
Jason M. Bills1490b142019-07-01 15:48:43 -0700813} // namespace host_error_monitor
814
815int main(int argc, char* argv[])
816{
817 // setup connection to dbus
818 host_error_monitor::conn =
819 std::make_shared<sdbusplus::asio::connection>(host_error_monitor::io);
820
821 // Host Error Monitor Object
822 host_error_monitor::conn->request_name(
823 "xyz.openbmc_project.HostErrorMonitor");
824 sdbusplus::asio::object_server server =
825 sdbusplus::asio::object_server(host_error_monitor::conn);
826
827 // Start tracking host state
828 std::shared_ptr<sdbusplus::bus::match::match> hostStateMonitor =
829 host_error_monitor::startHostStateMonitor();
830
831 // Initialize the host state
832 host_error_monitor::initializeHostState();
833
834 // Request CPU_CATERR GPIO events
835 if (!host_error_monitor::requestGPIOEvents(
836 "CPU_CATERR", host_error_monitor::caterrHandler,
837 host_error_monitor::caterrLine, host_error_monitor::caterrEvent))
838 {
839 return -1;
840 }
841
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700842 // Request CPU_ERR2 GPIO events
843 if (!host_error_monitor::requestGPIOEvents(
844 "CPU_ERR2", host_error_monitor::err2Handler,
845 host_error_monitor::err2Line, host_error_monitor::err2Event))
846 {
847 return -1;
848 }
849
Jason M. Bills89922f82019-08-06 11:10:02 -0700850 // Request SMI GPIO events
851 if (!host_error_monitor::requestGPIOEvents(
852 "SMI", host_error_monitor::smiHandler, host_error_monitor::smiLine,
853 host_error_monitor::smiEvent))
854 {
855 return -1;
856 }
857
Chen Yugange6c0f1c2019-08-02 20:36:42 +0800858 // Request PCH_BMC_THERMTRIP GPIO events
859 if (!host_error_monitor::requestGPIOEvents(
860 "PCH_BMC_THERMTRIP", host_error_monitor::pchThermtripHandler,
861 host_error_monitor::pchThermtripLine,
862 host_error_monitor::pchThermtripEvent))
863 {
864 return -1;
865 }
866
Jason M. Bills1490b142019-07-01 15:48:43 -0700867 host_error_monitor::io.run();
868
869 return 0;
870}