blob: 408acc90e363ca3cd34fdbe08382197c6d1f4a74 [file] [log] [blame]
Jason M. Bills1490b142019-07-01 15:48:43 -07001/*
2// Copyright (c) 2019 Intel Corporation
3//
4// Licensed under the Apache License, Version 2.0 (the "License");
5// you may not use this file except in compliance with the License.
6// You may obtain a copy of the License at
7//
8// http://www.apache.org/licenses/LICENSE-2.0
9//
10// Unless required by applicable law or agreed to in writing, software
11// distributed under the License is distributed on an "AS IS" BASIS,
12// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13// See the License for the specific language governing permissions and
14// limitations under the License.
15*/
Jason M. Bills6a2cb692019-08-06 11:03:49 -070016#include <peci.h>
Chen Yugange6c0f1c2019-08-02 20:36:42 +080017#include <systemd/sd-journal.h>
18
Jason M. Bills6a2cb692019-08-06 11:03:49 -070019#include <bitset>
Jason M. Bills1490b142019-07-01 15:48:43 -070020#include <boost/asio/posix/stream_descriptor.hpp>
21#include <gpiod.hpp>
22#include <iostream>
23#include <sdbusplus/asio/object_server.hpp>
Jason M. Billsd1a19f62019-08-06 11:52:58 -070024#include <variant>
Jason M. Bills1490b142019-07-01 15:48:43 -070025
26namespace host_error_monitor
27{
28static boost::asio::io_service io;
29static std::shared_ptr<sdbusplus::asio::connection> conn;
30
31static bool hostOff = true;
32
// How long a signal must stay asserted before it is reported. The Ms values
// are passed to std::chrono::milliseconds and the S value to
// std::chrono::seconds.
static constexpr size_t caterrTimeoutMs{2000};
static constexpr size_t errTimeoutMs{90000};
static constexpr size_t smiTimeoutMs{90000};
// Maximum time to wait for a crashdump to finish
static constexpr size_t crashdumpTimeoutS{300};
37
38// Timers
39// Timer for CATERR asserted
40static boost::asio::steady_timer caterrAssertTimer(io);
Jason M. Bills8c584392019-08-19 11:05:51 -070041// Timer for ERR0 asserted
42static boost::asio::steady_timer err0AssertTimer(io);
Jason M. Bills6a2cb692019-08-06 11:03:49 -070043// Timer for ERR2 asserted
44static boost::asio::steady_timer err2AssertTimer(io);
Jason M. Bills89922f82019-08-06 11:10:02 -070045// Timer for SMI asserted
46static boost::asio::steady_timer smiAssertTimer(io);
Jason M. Bills1490b142019-07-01 15:48:43 -070047
48// GPIO Lines and Event Descriptors
49static gpiod::line caterrLine;
50static boost::asio::posix::stream_descriptor caterrEvent(io);
Jason M. Bills8c584392019-08-19 11:05:51 -070051static gpiod::line err0Line;
52static boost::asio::posix::stream_descriptor err0Event(io);
Jason M. Bills6a2cb692019-08-06 11:03:49 -070053static gpiod::line err2Line;
54static boost::asio::posix::stream_descriptor err2Event(io);
Jason M. Bills89922f82019-08-06 11:10:02 -070055static gpiod::line smiLine;
56static boost::asio::posix::stream_descriptor smiEvent(io);
Chen Yugange6c0f1c2019-08-02 20:36:42 +080057//----------------------------------
58// PCH_BMC_THERMTRIP function related definition
59//----------------------------------
60// GPIO Lines and Event Descriptors
61static gpiod::line pchThermtripLine;
62static boost::asio::posix::stream_descriptor pchThermtripEvent(io);
Jason M. Bills1490b142019-07-01 15:48:43 -070063
Jason M. Billsa3397932019-08-06 11:07:21 -070064static void cpuIERRLog()
65{
66 sd_journal_send("MESSAGE=HostError: IERR", "PRIORITY=%i", LOG_INFO,
67 "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError",
68 "REDFISH_MESSAGE_ARGS=%s", "IERR", NULL);
69}
70
71static void cpuIERRLog(const int cpuNum)
72{
73 std::string msg = "IERR on CPU " + std::to_string(cpuNum + 1);
74
75 sd_journal_send("MESSAGE=HostError: %s", msg.c_str(), "PRIORITY=%i",
76 LOG_INFO, "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError",
77 "REDFISH_MESSAGE_ARGS=%s", msg.c_str(), NULL);
78}
79
80static void cpuIERRLog(const int cpuNum, const std::string& type)
81{
82 std::string msg = type + " IERR on CPU " + std::to_string(cpuNum + 1);
83
84 sd_journal_send("MESSAGE=HostError: %s", msg.c_str(), "PRIORITY=%i",
85 LOG_INFO, "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError",
86 "REDFISH_MESSAGE_ARGS=%s", msg.c_str(), NULL);
87}
88
Jason M. Billscbf78532019-08-16 15:32:11 -070089static void cpuERRXLog(const int errPin)
Jason M. Bills6a2cb692019-08-06 11:03:49 -070090{
Jason M. Billscbf78532019-08-16 15:32:11 -070091 std::string msg = "ERR" + std::to_string(errPin) + " Timeout";
92
93 sd_journal_send("MESSAGE=HostError: %s", msg.c_str(), "PRIORITY=%i",
94 LOG_INFO, "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError",
95 "REDFISH_MESSAGE_ARGS=%s", msg.c_str(), NULL);
Jason M. Bills6a2cb692019-08-06 11:03:49 -070096}
97
Jason M. Billscbf78532019-08-16 15:32:11 -070098static void cpuERRXLog(const int errPin, const int cpuNum)
Jason M. Bills6a2cb692019-08-06 11:03:49 -070099{
Jason M. Billscbf78532019-08-16 15:32:11 -0700100 std::string msg = "ERR" + std::to_string(errPin) + " Timeout on CPU " +
101 std::to_string(cpuNum + 1);
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700102
103 sd_journal_send("MESSAGE=HostError: %s", msg.c_str(), "PRIORITY=%i",
104 LOG_INFO, "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError",
105 "REDFISH_MESSAGE_ARGS=%s", msg.c_str(), NULL);
106}
107
Jason M. Bills89922f82019-08-06 11:10:02 -0700108static void smiTimeoutLog()
109{
110 sd_journal_send("MESSAGE=HostError: SMI Timeout", "PRIORITY=%i", LOG_INFO,
111 "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError",
112 "REDFISH_MESSAGE_ARGS=%s", "SMI Timeout", NULL);
113}
114
Jason M. Bills08866542019-08-16 12:04:19 -0700115static void ssbThermTripLog()
116{
117 sd_journal_send("MESSAGE=HostError: SSB thermal trip", "PRIORITY=%i",
118 LOG_INFO, "REDFISH_MESSAGE_ID=%s",
119 "OpenBMC.0.1.SsbThermalTrip", NULL);
120}
121
Jason M. Billsa15c2522019-08-16 10:01:44 -0700122static void initializeErrorState();
Jason M. Bills1490b142019-07-01 15:48:43 -0700123static void initializeHostState()
124{
125 conn->async_method_call(
126 [](boost::system::error_code ec,
127 const std::variant<std::string>& property) {
128 if (ec)
129 {
130 return;
131 }
132 const std::string* state = std::get_if<std::string>(&property);
133 if (state == nullptr)
134 {
135 std::cerr << "Unable to read host state value\n";
136 return;
137 }
138 hostOff = *state == "xyz.openbmc_project.State.Host.HostState.Off";
Jason M. Billsa15c2522019-08-16 10:01:44 -0700139 // If the system is on, initialize the error state
140 if (!hostOff)
141 {
142 initializeErrorState();
143 }
Jason M. Bills1490b142019-07-01 15:48:43 -0700144 },
145 "xyz.openbmc_project.State.Host", "/xyz/openbmc_project/state/host0",
146 "org.freedesktop.DBus.Properties", "Get",
147 "xyz.openbmc_project.State.Host", "CurrentHostState");
148}
149
150static std::shared_ptr<sdbusplus::bus::match::match> startHostStateMonitor()
151{
152 return std::make_shared<sdbusplus::bus::match::match>(
153 *conn,
154 "type='signal',interface='org.freedesktop.DBus.Properties',"
155 "member='PropertiesChanged',arg0namespace='xyz.openbmc_project.State."
156 "Host'",
157 [](sdbusplus::message::message& msg) {
158 std::string interfaceName;
159 boost::container::flat_map<std::string, std::variant<std::string>>
160 propertiesChanged;
161 std::string state;
162 try
163 {
164 msg.read(interfaceName, propertiesChanged);
165 state =
166 std::get<std::string>(propertiesChanged.begin()->second);
167 }
168 catch (std::exception& e)
169 {
170 std::cerr << "Unable to read host state\n";
171 return;
172 }
173 hostOff = state == "xyz.openbmc_project.State.Host.HostState.Off";
174
175 // No host events should fire while off, so cancel any pending
176 // timers
177 if (hostOff)
178 {
179 caterrAssertTimer.cancel();
Jason M. Bills8c584392019-08-19 11:05:51 -0700180 err0AssertTimer.cancel();
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700181 err2AssertTimer.cancel();
Jason M. Bills89922f82019-08-06 11:10:02 -0700182 smiAssertTimer.cancel();
Jason M. Bills1490b142019-07-01 15:48:43 -0700183 }
184 });
185}
186
187static bool requestGPIOEvents(
188 const std::string& name, const std::function<void()>& handler,
189 gpiod::line& gpioLine,
190 boost::asio::posix::stream_descriptor& gpioEventDescriptor)
191{
192 // Find the GPIO line
193 gpioLine = gpiod::find_line(name);
194 if (!gpioLine)
195 {
196 std::cerr << "Failed to find the " << name << " line\n";
197 return false;
198 }
199
200 try
201 {
202 gpioLine.request(
203 {"host-error-monitor", gpiod::line_request::EVENT_BOTH_EDGES});
204 }
205 catch (std::exception&)
206 {
207 std::cerr << "Failed to request events for " << name << "\n";
208 return false;
209 }
210
211 int gpioLineFd = gpioLine.event_get_fd();
212 if (gpioLineFd < 0)
213 {
214 std::cerr << "Failed to get " << name << " fd\n";
215 return false;
216 }
217
218 gpioEventDescriptor.assign(gpioLineFd);
219
220 gpioEventDescriptor.async_wait(
221 boost::asio::posix::stream_descriptor::wait_read,
222 [&name, handler](const boost::system::error_code ec) {
223 if (ec)
224 {
225 std::cerr << name << " fd handler error: " << ec.message()
226 << "\n";
227 return;
228 }
229 handler();
230 });
231 return true;
232}
233
234static void startPowerCycle()
235{
236 conn->async_method_call(
237 [](boost::system::error_code ec) {
238 if (ec)
239 {
240 std::cerr << "failed to set Chassis State\n";
241 }
242 },
243 "xyz.openbmc_project.State.Chassis",
244 "/xyz/openbmc_project/state/chassis0",
245 "org.freedesktop.DBus.Properties", "Set",
246 "xyz.openbmc_project.State.Chassis", "RequestedPowerTransition",
247 std::variant<std::string>{
248 "xyz.openbmc_project.State.Chassis.Transition.PowerCycle"});
249}
250
251static void startCrashdumpAndRecovery(bool recoverSystem)
252{
253 std::cout << "Starting crashdump\n";
254 static std::shared_ptr<sdbusplus::bus::match::match> crashdumpCompleteMatch;
255 static boost::asio::steady_timer crashdumpTimer(io);
256
257 crashdumpCompleteMatch = std::make_shared<sdbusplus::bus::match::match>(
258 *conn,
259 "type='signal',interface='org.freedesktop.DBus.Properties',"
260 "member='PropertiesChanged',arg0namespace='com.intel.crashdump'",
261 [recoverSystem](sdbusplus::message::message& msg) {
262 crashdumpTimer.cancel();
263 std::cout << "Crashdump completed\n";
264 if (recoverSystem)
265 {
266 std::cout << "Recovering the system\n";
267 startPowerCycle();
268 }
269 crashdumpCompleteMatch.reset();
270 });
271
272 crashdumpTimer.expires_after(std::chrono::seconds(crashdumpTimeoutS));
273 crashdumpTimer.async_wait([](const boost::system::error_code ec) {
274 if (ec)
275 {
276 // operation_aborted is expected if timer is canceled
277 if (ec != boost::asio::error::operation_aborted)
278 {
279 std::cerr << "Crashdump async_wait failed: " << ec.message()
280 << "\n";
281 }
282 std::cout << "Crashdump timer canceled\n";
283 return;
284 }
285 std::cerr << "Crashdump failed to complete before timeout\n";
286 crashdumpCompleteMatch.reset();
287 });
288
289 conn->async_method_call(
290 [](boost::system::error_code ec) {
291 if (ec)
292 {
293 std::cerr << "failed to start Crashdump\n";
294 crashdumpTimer.cancel();
295 crashdumpCompleteMatch.reset();
296 }
297 },
298 "com.intel.crashdump", "/com/intel/crashdump",
299 "com.intel.crashdump.Stored", "GenerateStoredLog");
300}
301
Jason M. Billsd1a19f62019-08-06 11:52:58 -0700302static void incrementCPUErrorCount(int cpuNum)
303{
304 std::string propertyName = "ErrorCountCPU" + std::to_string(cpuNum + 1);
305
306 // Get the current count
307 conn->async_method_call(
308 [propertyName](boost::system::error_code ec,
309 const std::variant<uint8_t>& property) {
310 if (ec)
311 {
312 std::cerr << "Failed to read " << propertyName << ": "
313 << ec.message() << "\n";
314 return;
315 }
316 const uint8_t* errorCountVariant = std::get_if<uint8_t>(&property);
317 if (errorCountVariant == nullptr)
318 {
319 std::cerr << propertyName << " invalid\n";
320 return;
321 }
322 uint8_t errorCount = *errorCountVariant;
323 if (errorCount == std::numeric_limits<uint8_t>::max())
324 {
325 std::cerr << "Maximum error count reached\n";
326 return;
327 }
328 // Increment the count
329 errorCount++;
330 conn->async_method_call(
331 [propertyName](boost::system::error_code ec) {
332 if (ec)
333 {
334 std::cerr << "Failed to set " << propertyName << ": "
335 << ec.message() << "\n";
336 }
337 },
338 "xyz.openbmc_project.Settings",
339 "/xyz/openbmc_project/control/processor_error_config",
340 "org.freedesktop.DBus.Properties", "Set",
341 "xyz.openbmc_project.Control.Processor.ErrConfig", propertyName,
342 std::variant<uint8_t>{errorCount});
343 },
344 "xyz.openbmc_project.Settings",
345 "/xyz/openbmc_project/control/processor_error_config",
346 "org.freedesktop.DBus.Properties", "Get",
347 "xyz.openbmc_project.Control.Processor.ErrConfig", propertyName);
348}
349
Jason M. Billsa3397932019-08-06 11:07:21 -0700350static bool checkIERRCPUs()
351{
352 bool cpuIERRFound = false;
353 for (int cpu = 0, addr = MIN_CLIENT_ADDR; addr <= MAX_CLIENT_ADDR;
354 cpu++, addr++)
355 {
356 uint8_t cc = 0;
357 CPUModel model{};
358 uint8_t stepping = 0;
359 if (peci_GetCPUID(addr, &model, &stepping, &cc) != PECI_CC_SUCCESS)
360 {
361 std::cerr << "Cannot get CPUID!\n";
362 continue;
363 }
364
365 switch (model)
366 {
367 case skx:
368 {
369 // First check the MCA_ERR_SRC_LOG to see if this is the CPU
370 // that caused the IERR
371 uint32_t mcaErrSrcLog = 0;
372 if (peci_RdPkgConfig(addr, 0, 5, 4, (uint8_t*)&mcaErrSrcLog,
373 &cc) != PECI_CC_SUCCESS)
374 {
375 continue;
376 }
377 // Check MSMI_INTERNAL (20) and IERR_INTERNAL (27)
378 if ((mcaErrSrcLog & (1 << 20)) || (mcaErrSrcLog & (1 << 27)))
379 {
380 // TODO: Light the CPU fault LED?
381 cpuIERRFound = true;
Jason M. Billsd1a19f62019-08-06 11:52:58 -0700382 incrementCPUErrorCount(cpu);
Jason M. Billsa3397932019-08-06 11:07:21 -0700383 // Next check if it's a CPU/VR mismatch by reading the
384 // IA32_MC4_STATUS MSR (0x411)
385 uint64_t mc4Status = 0;
386 if (peci_RdIAMSR(addr, 0, 0x411, &mc4Status, &cc) !=
387 PECI_CC_SUCCESS)
388 {
389 continue;
390 }
391 // Check MSEC bits 31:24 for
392 // MCA_SVID_VCCIN_VR_ICC_MAX_FAILURE (0x40),
393 // MCA_SVID_VCCIN_VR_VOUT_FAILURE (0x42), or
394 // MCA_SVID_CPU_VR_CAPABILITY_ERROR (0x43)
395 if ((mc4Status & (0x40 << 24)) ||
396 (mc4Status & (0x42 << 24)) ||
397 (mc4Status & (0x43 << 24)))
398 {
399 cpuIERRLog(cpu, "CPU/VR Mismatch");
400 continue;
401 }
402
403 // Next check if it's a Core FIVR fault by looking for a
404 // non-zero value of CORE_FIVR_ERR_LOG (B(1) D30 F2 offset
405 // 80h)
406 uint32_t coreFIVRErrLog = 0;
407 if (peci_RdPCIConfigLocal(
408 addr, 1, 30, 2, 0x80, sizeof(uint32_t),
409 (uint8_t*)&coreFIVRErrLog, &cc) != PECI_CC_SUCCESS)
410 {
411 continue;
412 }
413 if (coreFIVRErrLog)
414 {
415 cpuIERRLog(cpu, "Core FIVR Fault");
416 continue;
417 }
418
419 // Next check if it's an Uncore FIVR fault by looking for a
420 // non-zero value of UNCORE_FIVR_ERR_LOG (B(1) D30 F2 offset
421 // 84h)
422 uint32_t uncoreFIVRErrLog = 0;
423 if (peci_RdPCIConfigLocal(addr, 1, 30, 2, 0x84,
424 sizeof(uint32_t),
425 (uint8_t*)&uncoreFIVRErrLog,
426 &cc) != PECI_CC_SUCCESS)
427 {
428 continue;
429 }
430 if (uncoreFIVRErrLog)
431 {
432 cpuIERRLog(cpu, "Uncore FIVR Fault");
433 continue;
434 }
435
436 // Last if CORE_FIVR_ERR_LOG and UNCORE_FIVR_ERR_LOG are
437 // both zero, but MSEC bits 31:24 have either
438 // MCA_FIVR_CATAS_OVERVOL_FAULT (0x51) or
439 // MCA_FIVR_CATAS_OVERCUR_FAULT (0x52), then log it as an
440 // uncore FIVR fault
441 if (!coreFIVRErrLog && !uncoreFIVRErrLog &&
442 ((mc4Status & (0x51 << 24)) ||
443 (mc4Status & (0x52 << 24))))
444 {
445 cpuIERRLog(cpu, "Uncore FIVR Fault");
446 continue;
447 }
448 cpuIERRLog(cpu);
449 }
450 break;
451 }
452 case icx:
453 {
454 // First check the MCA_ERR_SRC_LOG to see if this is the CPU
455 // that caused the IERR
456 uint32_t mcaErrSrcLog = 0;
457 if (peci_RdPkgConfig(addr, 0, 5, 4, (uint8_t*)&mcaErrSrcLog,
458 &cc) != PECI_CC_SUCCESS)
459 {
460 continue;
461 }
462 // Check MSMI_INTERNAL (20) and IERR_INTERNAL (27)
463 if ((mcaErrSrcLog & (1 << 20)) || (mcaErrSrcLog & (1 << 27)))
464 {
465 // TODO: Light the CPU fault LED?
466 cpuIERRFound = true;
Jason M. Billsd1a19f62019-08-06 11:52:58 -0700467 incrementCPUErrorCount(cpu);
Jason M. Billsa3397932019-08-06 11:07:21 -0700468 // Next check if it's a CPU/VR mismatch by reading the
469 // IA32_MC4_STATUS MSR (0x411)
470 uint64_t mc4Status = 0;
471 if (peci_RdIAMSR(addr, 0, 0x411, &mc4Status, &cc) !=
472 PECI_CC_SUCCESS)
473 {
474 continue;
475 }
476 // TODO: Update MSEC/MSCOD_31_24 check
477 // Check MSEC bits 31:24 for
478 // MCA_SVID_VCCIN_VR_ICC_MAX_FAILURE (0x40),
479 // MCA_SVID_VCCIN_VR_VOUT_FAILURE (0x42), or
480 // MCA_SVID_CPU_VR_CAPABILITY_ERROR (0x43)
481 if ((mc4Status & (0x40 << 24)) ||
482 (mc4Status & (0x42 << 24)) ||
483 (mc4Status & (0x43 << 24)))
484 {
485 cpuIERRLog(cpu, "CPU/VR Mismatch");
486 continue;
487 }
488
489 // Next check if it's a Core FIVR fault by looking for a
490 // non-zero value of CORE_FIVR_ERR_LOG (B(31) D30 F2 offsets
491 // C0h and C4h) (Note: Bus 31 is accessed on PECI as bus 14)
492 uint32_t coreFIVRErrLog0 = 0;
493 uint32_t coreFIVRErrLog1 = 0;
494 if (peci_RdEndPointConfigPciLocal(
495 addr, 0, 14, 30, 2, 0xC0, sizeof(uint32_t),
496 (uint8_t*)&coreFIVRErrLog0, &cc) != PECI_CC_SUCCESS)
497 {
498 continue;
499 }
500 if (peci_RdEndPointConfigPciLocal(
501 addr, 0, 14, 30, 2, 0xC4, sizeof(uint32_t),
502 (uint8_t*)&coreFIVRErrLog1, &cc) != PECI_CC_SUCCESS)
503 {
504 continue;
505 }
506 if (coreFIVRErrLog0 || coreFIVRErrLog1)
507 {
508 cpuIERRLog(cpu, "Core FIVR Fault");
509 continue;
510 }
511
512 // Next check if it's an Uncore FIVR fault by looking for a
513 // non-zero value of UNCORE_FIVR_ERR_LOG (B(31) D30 F2
514 // offset 84h) (Note: Bus 31 is accessed on PECI as bus 14)
515 uint32_t uncoreFIVRErrLog = 0;
516 if (peci_RdEndPointConfigPciLocal(
517 addr, 0, 14, 30, 2, 0x84, sizeof(uint32_t),
518 (uint8_t*)&uncoreFIVRErrLog,
519 &cc) != PECI_CC_SUCCESS)
520 {
521 continue;
522 }
523 if (uncoreFIVRErrLog)
524 {
525 cpuIERRLog(cpu, "Uncore FIVR Fault");
526 continue;
527 }
528
529 // TODO: Update MSEC/MSCOD_31_24 check
530 // Last if CORE_FIVR_ERR_LOG and UNCORE_FIVR_ERR_LOG are
531 // both zero, but MSEC bits 31:24 have either
532 // MCA_FIVR_CATAS_OVERVOL_FAULT (0x51) or
533 // MCA_FIVR_CATAS_OVERCUR_FAULT (0x52), then log it as an
534 // uncore FIVR fault
535 if (!coreFIVRErrLog0 && !coreFIVRErrLog1 &&
536 !uncoreFIVRErrLog &&
537 ((mc4Status & (0x51 << 24)) ||
538 (mc4Status & (0x52 << 24))))
539 {
540 cpuIERRLog(cpu, "Uncore FIVR Fault");
541 continue;
542 }
543 cpuIERRLog(cpu);
544 }
545 break;
546 }
547 }
548 }
549 return cpuIERRFound;
550}
551
Jason M. Billsa15c2522019-08-16 10:01:44 -0700552static void caterrAssertHandler()
553{
Jason M. Billsa15c2522019-08-16 10:01:44 -0700554 caterrAssertTimer.expires_after(std::chrono::milliseconds(caterrTimeoutMs));
555 caterrAssertTimer.async_wait([](const boost::system::error_code ec) {
556 if (ec)
557 {
558 // operation_aborted is expected if timer is canceled
559 // before completion.
560 if (ec != boost::asio::error::operation_aborted)
561 {
562 std::cerr << "caterr timeout async_wait failed: "
563 << ec.message() << "\n";
564 }
Jason M. Billsa15c2522019-08-16 10:01:44 -0700565 return;
566 }
Jason M. Billsa3397932019-08-06 11:07:21 -0700567 std::cerr << "CATERR asserted for " << std::to_string(caterrTimeoutMs)
568 << " ms\n";
569 if (!checkIERRCPUs())
570 {
571 cpuIERRLog();
572 }
Jason M. Billsa15c2522019-08-16 10:01:44 -0700573 conn->async_method_call(
574 [](boost::system::error_code ec,
575 const std::variant<bool>& property) {
576 if (ec)
577 {
578 return;
579 }
580 const bool* reset = std::get_if<bool>(&property);
581 if (reset == nullptr)
582 {
583 std::cerr << "Unable to read reset on CATERR value\n";
584 return;
585 }
586 startCrashdumpAndRecovery(*reset);
587 },
588 "xyz.openbmc_project.Settings",
589 "/xyz/openbmc_project/control/processor_error_config",
590 "org.freedesktop.DBus.Properties", "Get",
591 "xyz.openbmc_project.Control.Processor.ErrConfig", "ResetOnCATERR");
592 });
593}
594
Jason M. Bills1490b142019-07-01 15:48:43 -0700595static void caterrHandler()
596{
597 if (!hostOff)
598 {
599 gpiod::line_event gpioLineEvent = caterrLine.event_read();
600
601 bool caterr =
602 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
603 if (caterr)
604 {
Jason M. Billsa15c2522019-08-16 10:01:44 -0700605 caterrAssertHandler();
Jason M. Bills1490b142019-07-01 15:48:43 -0700606 }
607 else
608 {
609 caterrAssertTimer.cancel();
610 }
611 }
612 caterrEvent.async_wait(boost::asio::posix::stream_descriptor::wait_read,
613 [](const boost::system::error_code ec) {
614 if (ec)
615 {
616 std::cerr << "caterr handler error: "
617 << ec.message() << "\n";
618 return;
619 }
620 caterrHandler();
621 });
622}
Chen Yugange6c0f1c2019-08-02 20:36:42 +0800623static void pchThermtripHandler()
624{
625 if (!hostOff)
626 {
627 gpiod::line_event gpioLineEvent = pchThermtripLine.event_read();
628
629 bool pchThermtrip =
630 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
631 if (pchThermtrip)
632 {
Jason M. Bills08866542019-08-16 12:04:19 -0700633 ssbThermTripLog();
Chen Yugange6c0f1c2019-08-02 20:36:42 +0800634 }
635 }
636 pchThermtripEvent.async_wait(
637 boost::asio::posix::stream_descriptor::wait_read,
638 [](const boost::system::error_code ec) {
639 if (ec)
640 {
641 std::cerr << "PCH Thermal trip handler error: " << ec.message()
642 << "\n";
643 return;
644 }
645 pchThermtripHandler();
646 });
647}
648
Jason M. Billscbf78532019-08-16 15:32:11 -0700649static std::bitset<MAX_CPUS> checkERRPinCPUs(const int errPin)
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700650{
Jason M. Billscbf78532019-08-16 15:32:11 -0700651 int errPinSts = (1 << errPin);
652 std::bitset<MAX_CPUS> errPinCPUs = 0;
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700653 for (int cpu = 0, addr = MIN_CLIENT_ADDR; addr <= MAX_CLIENT_ADDR;
654 cpu++, addr++)
655 {
656 if (peci_Ping(addr) == PECI_CC_SUCCESS)
657 {
658 uint8_t cc = 0;
659 CPUModel model{};
660 uint8_t stepping = 0;
661 if (peci_GetCPUID(addr, &model, &stepping, &cc) != PECI_CC_SUCCESS)
662 {
663 std::cerr << "Cannot get CPUID!\n";
664 continue;
665 }
666
667 switch (model)
668 {
669 case skx:
670 {
671 // Check the ERRPINSTS to see if this is the CPU that caused
Jason M. Billscbf78532019-08-16 15:32:11 -0700672 // the ERRx (B(0) D8 F0 offset 210h)
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700673 uint32_t errpinsts = 0;
674 if (peci_RdPCIConfigLocal(
675 addr, 0, 8, 0, 0x210, sizeof(uint32_t),
676 (uint8_t*)&errpinsts, &cc) == PECI_CC_SUCCESS)
677 {
Jason M. Billscbf78532019-08-16 15:32:11 -0700678 errPinCPUs[cpu] = (errpinsts & errPinSts) != 0;
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700679 }
680 break;
681 }
682 case icx:
683 {
684 // Check the ERRPINSTS to see if this is the CPU that caused
Jason M. Billscbf78532019-08-16 15:32:11 -0700685 // the ERRx (B(30) D0 F3 offset 274h) (Note: Bus 30 is
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700686 // accessed on PECI as bus 13)
687 uint32_t errpinsts = 0;
688 if (peci_RdEndPointConfigPciLocal(
689 addr, 0, 13, 0, 3, 0x274, sizeof(uint32_t),
690 (uint8_t*)&errpinsts, &cc) == PECI_CC_SUCCESS)
691 {
Jason M. Billscbf78532019-08-16 15:32:11 -0700692 errPinCPUs[cpu] = (errpinsts & errPinSts) != 0;
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700693 }
694 break;
695 }
696 }
697 }
698 }
Jason M. Billscbf78532019-08-16 15:32:11 -0700699 return errPinCPUs;
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700700}
701
Jason M. Billscbf78532019-08-16 15:32:11 -0700702static void errXAssertHandler(const int errPin,
703 boost::asio::steady_timer& errXAssertTimer)
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700704{
Jason M. Billscbf78532019-08-16 15:32:11 -0700705 // ERRx status is not guaranteed through the timeout, so save which
706 // CPUs have it asserted
707 std::bitset<MAX_CPUS> errPinCPUs = checkERRPinCPUs(errPin);
708 errXAssertTimer.expires_after(std::chrono::milliseconds(errTimeoutMs));
709 errXAssertTimer.async_wait([errPin, errPinCPUs](
710 const boost::system::error_code ec) {
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700711 if (ec)
712 {
713 // operation_aborted is expected if timer is canceled before
714 // completion.
715 if (ec != boost::asio::error::operation_aborted)
716 {
717 std::cerr << "err2 timeout async_wait failed: " << ec.message()
718 << "\n";
719 }
720 return;
721 }
Jason M. Billscbf78532019-08-16 15:32:11 -0700722 std::cerr << "ERR" << std::to_string(errPin) << " asserted for "
723 << std::to_string(errTimeoutMs) << " ms\n";
724 if (errPinCPUs.count())
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700725 {
Jason M. Billscbf78532019-08-16 15:32:11 -0700726 for (int i = 0; i < errPinCPUs.size(); i++)
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700727 {
Jason M. Billscbf78532019-08-16 15:32:11 -0700728 if (errPinCPUs[i])
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700729 {
Jason M. Billscbf78532019-08-16 15:32:11 -0700730 cpuERRXLog(errPin, i);
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700731 }
732 }
733 }
734 else
735 {
Jason M. Billscbf78532019-08-16 15:32:11 -0700736 cpuERRXLog(errPin);
737 }
738 });
739}
740
Jason M. Bills8c584392019-08-19 11:05:51 -0700741static void err0AssertHandler()
742{
743 // Handle the standard ERR0 detection and logging
744 const static constexpr int err0 = 0;
745 errXAssertHandler(err0, err0AssertTimer);
746}
747
748static void err0Handler()
749{
750 if (!hostOff)
751 {
752 gpiod::line_event gpioLineEvent = err0Line.event_read();
753
754 bool err0 = gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
755 if (err0)
756 {
757 err0AssertHandler();
758 }
759 else
760 {
761 err0AssertTimer.cancel();
762 }
763 }
764 err0Event.async_wait(boost::asio::posix::stream_descriptor::wait_read,
765 [](const boost::system::error_code ec) {
766 if (ec)
767 {
768 std::cerr
769 << "err0 handler error: " << ec.message()
770 << "\n";
771 return;
772 }
773 err0Handler();
774 });
775}
776
Jason M. Billscbf78532019-08-16 15:32:11 -0700777static void err2AssertHandler()
778{
779 // Handle the standard ERR2 detection and logging
780 const static constexpr int err2 = 2;
781 errXAssertHandler(err2, err2AssertTimer);
782 // Also handle reset for ERR2
783 err2AssertTimer.async_wait([](const boost::system::error_code ec) {
784 if (ec)
785 {
786 // operation_aborted is expected if timer is canceled before
787 // completion.
788 if (ec != boost::asio::error::operation_aborted)
789 {
790 std::cerr << "err2 timeout async_wait failed: " << ec.message()
791 << "\n";
792 }
793 return;
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700794 }
795 conn->async_method_call(
796 [](boost::system::error_code ec,
797 const std::variant<bool>& property) {
798 if (ec)
799 {
800 return;
801 }
802 const bool* reset = std::get_if<bool>(&property);
803 if (reset == nullptr)
804 {
805 std::cerr << "Unable to read reset on ERR2 value\n";
806 return;
807 }
808 startCrashdumpAndRecovery(*reset);
809 },
810 "xyz.openbmc_project.Settings",
811 "/xyz/openbmc_project/control/processor_error_config",
812 "org.freedesktop.DBus.Properties", "Get",
813 "xyz.openbmc_project.Control.Processor.ErrConfig", "ResetOnERR2");
814 });
815}
816
817static void err2Handler()
818{
819 if (!hostOff)
820 {
821 gpiod::line_event gpioLineEvent = err2Line.event_read();
822
823 bool err2 = gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
824 if (err2)
825 {
826 err2AssertHandler();
827 }
828 else
829 {
830 err2AssertTimer.cancel();
831 }
832 }
833 err2Event.async_wait(boost::asio::posix::stream_descriptor::wait_read,
834 [](const boost::system::error_code ec) {
835 if (ec)
836 {
837 std::cerr
838 << "err2 handler error: " << ec.message()
839 << "\n";
840 return;
841 }
842 err2Handler();
843 });
844}
845
Jason M. Bills89922f82019-08-06 11:10:02 -0700846static void smiAssertHandler()
847{
848 smiAssertTimer.expires_after(std::chrono::milliseconds(smiTimeoutMs));
849 smiAssertTimer.async_wait([](const boost::system::error_code ec) {
850 if (ec)
851 {
852 // operation_aborted is expected if timer is canceled before
853 // completion.
854 if (ec != boost::asio::error::operation_aborted)
855 {
856 std::cerr << "smi timeout async_wait failed: " << ec.message()
857 << "\n";
858 }
859 return;
860 }
861 std::cerr << "SMI asserted for " << std::to_string(smiTimeoutMs)
862 << " ms\n";
863 smiTimeoutLog();
864 conn->async_method_call(
865 [](boost::system::error_code ec,
866 const std::variant<bool>& property) {
867 if (ec)
868 {
869 return;
870 }
871 const bool* reset = std::get_if<bool>(&property);
872 if (reset == nullptr)
873 {
874 std::cerr << "Unable to read reset on SMI value\n";
875 return;
876 }
877 startCrashdumpAndRecovery(*reset);
878 },
879 "xyz.openbmc_project.Settings",
880 "/xyz/openbmc_project/control/bmc_reset_disables",
881 "org.freedesktop.DBus.Properties", "Get",
882 "xyz.openbmc_project.Control.ResetDisables", "ResetOnSMI");
883 });
884}
885
886static void smiHandler()
887{
888 if (!hostOff)
889 {
890 gpiod::line_event gpioLineEvent = smiLine.event_read();
891
892 bool smi = gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
893 if (smi)
894 {
895 smiAssertHandler();
896 }
897 else
898 {
899 smiAssertTimer.cancel();
900 }
901 }
902 smiEvent.async_wait(boost::asio::posix::stream_descriptor::wait_read,
903 [](const boost::system::error_code ec) {
904 if (ec)
905 {
906 std::cerr
907 << "smi handler error: " << ec.message()
908 << "\n";
909 return;
910 }
911 smiHandler();
912 });
913}
914
Jason M. Billsa15c2522019-08-16 10:01:44 -0700915static void initializeErrorState()
916{
917 // Handle CPU_CATERR if it's asserted now
918 if (caterrLine.get_value() == 0)
919 {
920 caterrAssertHandler();
921 }
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700922
Jason M. Bills8c584392019-08-19 11:05:51 -0700923 // Handle CPU_ERR0 if it's asserted now
924 if (err0Line.get_value() == 0)
925 {
926 err0AssertHandler();
927 }
928
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700929 // Handle CPU_ERR2 if it's asserted now
930 if (err2Line.get_value() == 0)
931 {
932 err2AssertHandler();
933 }
Jason M. Bills89922f82019-08-06 11:10:02 -0700934
935 // Handle SMI if it's asserted now
936 if (smiLine.get_value() == 0)
937 {
938 smiAssertHandler();
939 }
Jason M. Bills08866542019-08-16 12:04:19 -0700940
941 // Handle PCH_BMC_THERMTRIP if it's asserted now
942 if (pchThermtripLine.get_value() == 0)
943 {
944 ssbThermTripLog();
945 }
Jason M. Billsa15c2522019-08-16 10:01:44 -0700946}
Jason M. Bills1490b142019-07-01 15:48:43 -0700947} // namespace host_error_monitor
948
949int main(int argc, char* argv[])
950{
951 // setup connection to dbus
952 host_error_monitor::conn =
953 std::make_shared<sdbusplus::asio::connection>(host_error_monitor::io);
954
955 // Host Error Monitor Object
956 host_error_monitor::conn->request_name(
957 "xyz.openbmc_project.HostErrorMonitor");
958 sdbusplus::asio::object_server server =
959 sdbusplus::asio::object_server(host_error_monitor::conn);
960
961 // Start tracking host state
962 std::shared_ptr<sdbusplus::bus::match::match> hostStateMonitor =
963 host_error_monitor::startHostStateMonitor();
964
965 // Initialize the host state
966 host_error_monitor::initializeHostState();
967
968 // Request CPU_CATERR GPIO events
969 if (!host_error_monitor::requestGPIOEvents(
970 "CPU_CATERR", host_error_monitor::caterrHandler,
971 host_error_monitor::caterrLine, host_error_monitor::caterrEvent))
972 {
973 return -1;
974 }
975
Jason M. Bills8c584392019-08-19 11:05:51 -0700976 // Request CPU_ERR0 GPIO events
977 if (!host_error_monitor::requestGPIOEvents(
978 "CPU_ERR0", host_error_monitor::err0Handler,
979 host_error_monitor::err0Line, host_error_monitor::err0Event))
980 {
981 return -1;
982 }
983
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700984 // Request CPU_ERR2 GPIO events
985 if (!host_error_monitor::requestGPIOEvents(
986 "CPU_ERR2", host_error_monitor::err2Handler,
987 host_error_monitor::err2Line, host_error_monitor::err2Event))
988 {
989 return -1;
990 }
991
Jason M. Bills89922f82019-08-06 11:10:02 -0700992 // Request SMI GPIO events
993 if (!host_error_monitor::requestGPIOEvents(
994 "SMI", host_error_monitor::smiHandler, host_error_monitor::smiLine,
995 host_error_monitor::smiEvent))
996 {
997 return -1;
998 }
999
Chen Yugange6c0f1c2019-08-02 20:36:42 +08001000 // Request PCH_BMC_THERMTRIP GPIO events
1001 if (!host_error_monitor::requestGPIOEvents(
1002 "PCH_BMC_THERMTRIP", host_error_monitor::pchThermtripHandler,
1003 host_error_monitor::pchThermtripLine,
1004 host_error_monitor::pchThermtripEvent))
1005 {
1006 return -1;
1007 }
1008
Jason M. Bills1490b142019-07-01 15:48:43 -07001009 host_error_monitor::io.run();
1010
1011 return 0;
1012}