blob: 52c34b40111b2e1e8180f4e74b6117728dd89f1f [file] [log] [blame]
/*
// Copyright (c) 2019 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
*/
Jason M. Bills6a2cb692019-08-06 11:03:49 -070016#include <peci.h>
Chen Yugange6c0f1c2019-08-02 20:36:42 +080017#include <systemd/sd-journal.h>
18
Jason M. Bills6a2cb692019-08-06 11:03:49 -070019#include <bitset>
Jason M. Bills1490b142019-07-01 15:48:43 -070020#include <boost/asio/posix/stream_descriptor.hpp>
21#include <gpiod.hpp>
22#include <iostream>
23#include <sdbusplus/asio/object_server.hpp>
Jason M. Billsd1a19f62019-08-06 11:52:58 -070024#include <variant>
Jason M. Bills1490b142019-07-01 15:48:43 -070025
26namespace host_error_monitor
27{
28static boost::asio::io_service io;
29static std::shared_ptr<sdbusplus::asio::connection> conn;
30
31static bool hostOff = true;
32
33const static constexpr size_t caterrTimeoutMs = 2000;
Jason M. Billscbf78532019-08-16 15:32:11 -070034const static constexpr size_t errTimeoutMs = 90000;
Jason M. Bills89922f82019-08-06 11:10:02 -070035const static constexpr size_t smiTimeoutMs = 90000;
Jason M. Bills1490b142019-07-01 15:48:43 -070036const static constexpr size_t crashdumpTimeoutS = 300;
37
38// Timers
39// Timer for CATERR asserted
40static boost::asio::steady_timer caterrAssertTimer(io);
Jason M. Bills8c584392019-08-19 11:05:51 -070041// Timer for ERR0 asserted
42static boost::asio::steady_timer err0AssertTimer(io);
Jason M. Bills75af3962019-08-19 11:07:17 -070043// Timer for ERR1 asserted
44static boost::asio::steady_timer err1AssertTimer(io);
Jason M. Bills6a2cb692019-08-06 11:03:49 -070045// Timer for ERR2 asserted
46static boost::asio::steady_timer err2AssertTimer(io);
Jason M. Bills89922f82019-08-06 11:10:02 -070047// Timer for SMI asserted
48static boost::asio::steady_timer smiAssertTimer(io);
Jason M. Bills1490b142019-07-01 15:48:43 -070049
50// GPIO Lines and Event Descriptors
51static gpiod::line caterrLine;
52static boost::asio::posix::stream_descriptor caterrEvent(io);
Jason M. Bills8c584392019-08-19 11:05:51 -070053static gpiod::line err0Line;
54static boost::asio::posix::stream_descriptor err0Event(io);
Jason M. Bills75af3962019-08-19 11:07:17 -070055static gpiod::line err1Line;
56static boost::asio::posix::stream_descriptor err1Event(io);
Jason M. Bills6a2cb692019-08-06 11:03:49 -070057static gpiod::line err2Line;
58static boost::asio::posix::stream_descriptor err2Event(io);
Jason M. Bills89922f82019-08-06 11:10:02 -070059static gpiod::line smiLine;
60static boost::asio::posix::stream_descriptor smiEvent(io);
Chen Yugange6c0f1c2019-08-02 20:36:42 +080061//----------------------------------
62// PCH_BMC_THERMTRIP function related definition
63//----------------------------------
64// GPIO Lines and Event Descriptors
65static gpiod::line pchThermtripLine;
66static boost::asio::posix::stream_descriptor pchThermtripEvent(io);
Jason M. Bills1490b142019-07-01 15:48:43 -070067
Jason M. Billsa3397932019-08-06 11:07:21 -070068static void cpuIERRLog()
69{
70 sd_journal_send("MESSAGE=HostError: IERR", "PRIORITY=%i", LOG_INFO,
71 "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError",
72 "REDFISH_MESSAGE_ARGS=%s", "IERR", NULL);
73}
74
75static void cpuIERRLog(const int cpuNum)
76{
77 std::string msg = "IERR on CPU " + std::to_string(cpuNum + 1);
78
79 sd_journal_send("MESSAGE=HostError: %s", msg.c_str(), "PRIORITY=%i",
80 LOG_INFO, "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError",
81 "REDFISH_MESSAGE_ARGS=%s", msg.c_str(), NULL);
82}
83
84static void cpuIERRLog(const int cpuNum, const std::string& type)
85{
86 std::string msg = type + " IERR on CPU " + std::to_string(cpuNum + 1);
87
88 sd_journal_send("MESSAGE=HostError: %s", msg.c_str(), "PRIORITY=%i",
89 LOG_INFO, "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError",
90 "REDFISH_MESSAGE_ARGS=%s", msg.c_str(), NULL);
91}
92
Jason M. Billscbf78532019-08-16 15:32:11 -070093static void cpuERRXLog(const int errPin)
Jason M. Bills6a2cb692019-08-06 11:03:49 -070094{
Jason M. Billscbf78532019-08-16 15:32:11 -070095 std::string msg = "ERR" + std::to_string(errPin) + " Timeout";
96
97 sd_journal_send("MESSAGE=HostError: %s", msg.c_str(), "PRIORITY=%i",
98 LOG_INFO, "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError",
99 "REDFISH_MESSAGE_ARGS=%s", msg.c_str(), NULL);
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700100}
101
Jason M. Billscbf78532019-08-16 15:32:11 -0700102static void cpuERRXLog(const int errPin, const int cpuNum)
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700103{
Jason M. Billscbf78532019-08-16 15:32:11 -0700104 std::string msg = "ERR" + std::to_string(errPin) + " Timeout on CPU " +
105 std::to_string(cpuNum + 1);
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700106
107 sd_journal_send("MESSAGE=HostError: %s", msg.c_str(), "PRIORITY=%i",
108 LOG_INFO, "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError",
109 "REDFISH_MESSAGE_ARGS=%s", msg.c_str(), NULL);
110}
111
Jason M. Bills89922f82019-08-06 11:10:02 -0700112static void smiTimeoutLog()
113{
114 sd_journal_send("MESSAGE=HostError: SMI Timeout", "PRIORITY=%i", LOG_INFO,
115 "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError",
116 "REDFISH_MESSAGE_ARGS=%s", "SMI Timeout", NULL);
117}
118
Jason M. Bills08866542019-08-16 12:04:19 -0700119static void ssbThermTripLog()
120{
121 sd_journal_send("MESSAGE=HostError: SSB thermal trip", "PRIORITY=%i",
122 LOG_INFO, "REDFISH_MESSAGE_ID=%s",
123 "OpenBMC.0.1.SsbThermalTrip", NULL);
124}
125
Jason M. Billsa15c2522019-08-16 10:01:44 -0700126static void initializeErrorState();
Jason M. Bills1490b142019-07-01 15:48:43 -0700127static void initializeHostState()
128{
129 conn->async_method_call(
130 [](boost::system::error_code ec,
131 const std::variant<std::string>& property) {
132 if (ec)
133 {
134 return;
135 }
136 const std::string* state = std::get_if<std::string>(&property);
137 if (state == nullptr)
138 {
139 std::cerr << "Unable to read host state value\n";
140 return;
141 }
142 hostOff = *state == "xyz.openbmc_project.State.Host.HostState.Off";
Jason M. Billsa15c2522019-08-16 10:01:44 -0700143 // If the system is on, initialize the error state
144 if (!hostOff)
145 {
146 initializeErrorState();
147 }
Jason M. Bills1490b142019-07-01 15:48:43 -0700148 },
149 "xyz.openbmc_project.State.Host", "/xyz/openbmc_project/state/host0",
150 "org.freedesktop.DBus.Properties", "Get",
151 "xyz.openbmc_project.State.Host", "CurrentHostState");
152}
153
154static std::shared_ptr<sdbusplus::bus::match::match> startHostStateMonitor()
155{
156 return std::make_shared<sdbusplus::bus::match::match>(
157 *conn,
158 "type='signal',interface='org.freedesktop.DBus.Properties',"
159 "member='PropertiesChanged',arg0namespace='xyz.openbmc_project.State."
160 "Host'",
161 [](sdbusplus::message::message& msg) {
162 std::string interfaceName;
163 boost::container::flat_map<std::string, std::variant<std::string>>
164 propertiesChanged;
165 std::string state;
166 try
167 {
168 msg.read(interfaceName, propertiesChanged);
169 state =
170 std::get<std::string>(propertiesChanged.begin()->second);
171 }
172 catch (std::exception& e)
173 {
174 std::cerr << "Unable to read host state\n";
175 return;
176 }
177 hostOff = state == "xyz.openbmc_project.State.Host.HostState.Off";
178
179 // No host events should fire while off, so cancel any pending
180 // timers
181 if (hostOff)
182 {
183 caterrAssertTimer.cancel();
Jason M. Bills8c584392019-08-19 11:05:51 -0700184 err0AssertTimer.cancel();
Jason M. Bills75af3962019-08-19 11:07:17 -0700185 err1AssertTimer.cancel();
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700186 err2AssertTimer.cancel();
Jason M. Bills89922f82019-08-06 11:10:02 -0700187 smiAssertTimer.cancel();
Jason M. Bills1490b142019-07-01 15:48:43 -0700188 }
189 });
190}
191
192static bool requestGPIOEvents(
193 const std::string& name, const std::function<void()>& handler,
194 gpiod::line& gpioLine,
195 boost::asio::posix::stream_descriptor& gpioEventDescriptor)
196{
197 // Find the GPIO line
198 gpioLine = gpiod::find_line(name);
199 if (!gpioLine)
200 {
201 std::cerr << "Failed to find the " << name << " line\n";
202 return false;
203 }
204
205 try
206 {
207 gpioLine.request(
208 {"host-error-monitor", gpiod::line_request::EVENT_BOTH_EDGES});
209 }
210 catch (std::exception&)
211 {
212 std::cerr << "Failed to request events for " << name << "\n";
213 return false;
214 }
215
216 int gpioLineFd = gpioLine.event_get_fd();
217 if (gpioLineFd < 0)
218 {
219 std::cerr << "Failed to get " << name << " fd\n";
220 return false;
221 }
222
223 gpioEventDescriptor.assign(gpioLineFd);
224
225 gpioEventDescriptor.async_wait(
226 boost::asio::posix::stream_descriptor::wait_read,
227 [&name, handler](const boost::system::error_code ec) {
228 if (ec)
229 {
230 std::cerr << name << " fd handler error: " << ec.message()
231 << "\n";
232 return;
233 }
234 handler();
235 });
236 return true;
237}
238
239static void startPowerCycle()
240{
241 conn->async_method_call(
242 [](boost::system::error_code ec) {
243 if (ec)
244 {
245 std::cerr << "failed to set Chassis State\n";
246 }
247 },
248 "xyz.openbmc_project.State.Chassis",
249 "/xyz/openbmc_project/state/chassis0",
250 "org.freedesktop.DBus.Properties", "Set",
251 "xyz.openbmc_project.State.Chassis", "RequestedPowerTransition",
252 std::variant<std::string>{
253 "xyz.openbmc_project.State.Chassis.Transition.PowerCycle"});
254}
255
256static void startCrashdumpAndRecovery(bool recoverSystem)
257{
258 std::cout << "Starting crashdump\n";
259 static std::shared_ptr<sdbusplus::bus::match::match> crashdumpCompleteMatch;
260 static boost::asio::steady_timer crashdumpTimer(io);
261
262 crashdumpCompleteMatch = std::make_shared<sdbusplus::bus::match::match>(
263 *conn,
264 "type='signal',interface='org.freedesktop.DBus.Properties',"
265 "member='PropertiesChanged',arg0namespace='com.intel.crashdump'",
266 [recoverSystem](sdbusplus::message::message& msg) {
267 crashdumpTimer.cancel();
268 std::cout << "Crashdump completed\n";
269 if (recoverSystem)
270 {
271 std::cout << "Recovering the system\n";
272 startPowerCycle();
273 }
274 crashdumpCompleteMatch.reset();
275 });
276
277 crashdumpTimer.expires_after(std::chrono::seconds(crashdumpTimeoutS));
278 crashdumpTimer.async_wait([](const boost::system::error_code ec) {
279 if (ec)
280 {
281 // operation_aborted is expected if timer is canceled
282 if (ec != boost::asio::error::operation_aborted)
283 {
284 std::cerr << "Crashdump async_wait failed: " << ec.message()
285 << "\n";
286 }
287 std::cout << "Crashdump timer canceled\n";
288 return;
289 }
290 std::cerr << "Crashdump failed to complete before timeout\n";
291 crashdumpCompleteMatch.reset();
292 });
293
294 conn->async_method_call(
295 [](boost::system::error_code ec) {
296 if (ec)
297 {
298 std::cerr << "failed to start Crashdump\n";
299 crashdumpTimer.cancel();
300 crashdumpCompleteMatch.reset();
301 }
302 },
303 "com.intel.crashdump", "/com/intel/crashdump",
304 "com.intel.crashdump.Stored", "GenerateStoredLog");
305}
306
Jason M. Billsd1a19f62019-08-06 11:52:58 -0700307static void incrementCPUErrorCount(int cpuNum)
308{
309 std::string propertyName = "ErrorCountCPU" + std::to_string(cpuNum + 1);
310
311 // Get the current count
312 conn->async_method_call(
313 [propertyName](boost::system::error_code ec,
314 const std::variant<uint8_t>& property) {
315 if (ec)
316 {
317 std::cerr << "Failed to read " << propertyName << ": "
318 << ec.message() << "\n";
319 return;
320 }
321 const uint8_t* errorCountVariant = std::get_if<uint8_t>(&property);
322 if (errorCountVariant == nullptr)
323 {
324 std::cerr << propertyName << " invalid\n";
325 return;
326 }
327 uint8_t errorCount = *errorCountVariant;
328 if (errorCount == std::numeric_limits<uint8_t>::max())
329 {
330 std::cerr << "Maximum error count reached\n";
331 return;
332 }
333 // Increment the count
334 errorCount++;
335 conn->async_method_call(
336 [propertyName](boost::system::error_code ec) {
337 if (ec)
338 {
339 std::cerr << "Failed to set " << propertyName << ": "
340 << ec.message() << "\n";
341 }
342 },
343 "xyz.openbmc_project.Settings",
344 "/xyz/openbmc_project/control/processor_error_config",
345 "org.freedesktop.DBus.Properties", "Set",
346 "xyz.openbmc_project.Control.Processor.ErrConfig", propertyName,
347 std::variant<uint8_t>{errorCount});
348 },
349 "xyz.openbmc_project.Settings",
350 "/xyz/openbmc_project/control/processor_error_config",
351 "org.freedesktop.DBus.Properties", "Get",
352 "xyz.openbmc_project.Control.Processor.ErrConfig", propertyName);
353}
354
Jason M. Billsa3397932019-08-06 11:07:21 -0700355static bool checkIERRCPUs()
356{
357 bool cpuIERRFound = false;
358 for (int cpu = 0, addr = MIN_CLIENT_ADDR; addr <= MAX_CLIENT_ADDR;
359 cpu++, addr++)
360 {
361 uint8_t cc = 0;
362 CPUModel model{};
363 uint8_t stepping = 0;
364 if (peci_GetCPUID(addr, &model, &stepping, &cc) != PECI_CC_SUCCESS)
365 {
366 std::cerr << "Cannot get CPUID!\n";
367 continue;
368 }
369
370 switch (model)
371 {
372 case skx:
373 {
374 // First check the MCA_ERR_SRC_LOG to see if this is the CPU
375 // that caused the IERR
376 uint32_t mcaErrSrcLog = 0;
377 if (peci_RdPkgConfig(addr, 0, 5, 4, (uint8_t*)&mcaErrSrcLog,
378 &cc) != PECI_CC_SUCCESS)
379 {
380 continue;
381 }
382 // Check MSMI_INTERNAL (20) and IERR_INTERNAL (27)
383 if ((mcaErrSrcLog & (1 << 20)) || (mcaErrSrcLog & (1 << 27)))
384 {
385 // TODO: Light the CPU fault LED?
386 cpuIERRFound = true;
Jason M. Billsd1a19f62019-08-06 11:52:58 -0700387 incrementCPUErrorCount(cpu);
Jason M. Billsa3397932019-08-06 11:07:21 -0700388 // Next check if it's a CPU/VR mismatch by reading the
389 // IA32_MC4_STATUS MSR (0x411)
390 uint64_t mc4Status = 0;
391 if (peci_RdIAMSR(addr, 0, 0x411, &mc4Status, &cc) !=
392 PECI_CC_SUCCESS)
393 {
394 continue;
395 }
396 // Check MSEC bits 31:24 for
397 // MCA_SVID_VCCIN_VR_ICC_MAX_FAILURE (0x40),
398 // MCA_SVID_VCCIN_VR_VOUT_FAILURE (0x42), or
399 // MCA_SVID_CPU_VR_CAPABILITY_ERROR (0x43)
400 if ((mc4Status & (0x40 << 24)) ||
401 (mc4Status & (0x42 << 24)) ||
402 (mc4Status & (0x43 << 24)))
403 {
404 cpuIERRLog(cpu, "CPU/VR Mismatch");
405 continue;
406 }
407
408 // Next check if it's a Core FIVR fault by looking for a
409 // non-zero value of CORE_FIVR_ERR_LOG (B(1) D30 F2 offset
410 // 80h)
411 uint32_t coreFIVRErrLog = 0;
412 if (peci_RdPCIConfigLocal(
413 addr, 1, 30, 2, 0x80, sizeof(uint32_t),
414 (uint8_t*)&coreFIVRErrLog, &cc) != PECI_CC_SUCCESS)
415 {
416 continue;
417 }
418 if (coreFIVRErrLog)
419 {
420 cpuIERRLog(cpu, "Core FIVR Fault");
421 continue;
422 }
423
424 // Next check if it's an Uncore FIVR fault by looking for a
425 // non-zero value of UNCORE_FIVR_ERR_LOG (B(1) D30 F2 offset
426 // 84h)
427 uint32_t uncoreFIVRErrLog = 0;
428 if (peci_RdPCIConfigLocal(addr, 1, 30, 2, 0x84,
429 sizeof(uint32_t),
430 (uint8_t*)&uncoreFIVRErrLog,
431 &cc) != PECI_CC_SUCCESS)
432 {
433 continue;
434 }
435 if (uncoreFIVRErrLog)
436 {
437 cpuIERRLog(cpu, "Uncore FIVR Fault");
438 continue;
439 }
440
441 // Last if CORE_FIVR_ERR_LOG and UNCORE_FIVR_ERR_LOG are
442 // both zero, but MSEC bits 31:24 have either
443 // MCA_FIVR_CATAS_OVERVOL_FAULT (0x51) or
444 // MCA_FIVR_CATAS_OVERCUR_FAULT (0x52), then log it as an
445 // uncore FIVR fault
446 if (!coreFIVRErrLog && !uncoreFIVRErrLog &&
447 ((mc4Status & (0x51 << 24)) ||
448 (mc4Status & (0x52 << 24))))
449 {
450 cpuIERRLog(cpu, "Uncore FIVR Fault");
451 continue;
452 }
453 cpuIERRLog(cpu);
454 }
455 break;
456 }
457 case icx:
458 {
459 // First check the MCA_ERR_SRC_LOG to see if this is the CPU
460 // that caused the IERR
461 uint32_t mcaErrSrcLog = 0;
462 if (peci_RdPkgConfig(addr, 0, 5, 4, (uint8_t*)&mcaErrSrcLog,
463 &cc) != PECI_CC_SUCCESS)
464 {
465 continue;
466 }
467 // Check MSMI_INTERNAL (20) and IERR_INTERNAL (27)
468 if ((mcaErrSrcLog & (1 << 20)) || (mcaErrSrcLog & (1 << 27)))
469 {
470 // TODO: Light the CPU fault LED?
471 cpuIERRFound = true;
Jason M. Billsd1a19f62019-08-06 11:52:58 -0700472 incrementCPUErrorCount(cpu);
Jason M. Billsa3397932019-08-06 11:07:21 -0700473 // Next check if it's a CPU/VR mismatch by reading the
474 // IA32_MC4_STATUS MSR (0x411)
475 uint64_t mc4Status = 0;
476 if (peci_RdIAMSR(addr, 0, 0x411, &mc4Status, &cc) !=
477 PECI_CC_SUCCESS)
478 {
479 continue;
480 }
481 // TODO: Update MSEC/MSCOD_31_24 check
482 // Check MSEC bits 31:24 for
483 // MCA_SVID_VCCIN_VR_ICC_MAX_FAILURE (0x40),
484 // MCA_SVID_VCCIN_VR_VOUT_FAILURE (0x42), or
485 // MCA_SVID_CPU_VR_CAPABILITY_ERROR (0x43)
486 if ((mc4Status & (0x40 << 24)) ||
487 (mc4Status & (0x42 << 24)) ||
488 (mc4Status & (0x43 << 24)))
489 {
490 cpuIERRLog(cpu, "CPU/VR Mismatch");
491 continue;
492 }
493
494 // Next check if it's a Core FIVR fault by looking for a
495 // non-zero value of CORE_FIVR_ERR_LOG (B(31) D30 F2 offsets
496 // C0h and C4h) (Note: Bus 31 is accessed on PECI as bus 14)
497 uint32_t coreFIVRErrLog0 = 0;
498 uint32_t coreFIVRErrLog1 = 0;
499 if (peci_RdEndPointConfigPciLocal(
500 addr, 0, 14, 30, 2, 0xC0, sizeof(uint32_t),
501 (uint8_t*)&coreFIVRErrLog0, &cc) != PECI_CC_SUCCESS)
502 {
503 continue;
504 }
505 if (peci_RdEndPointConfigPciLocal(
506 addr, 0, 14, 30, 2, 0xC4, sizeof(uint32_t),
507 (uint8_t*)&coreFIVRErrLog1, &cc) != PECI_CC_SUCCESS)
508 {
509 continue;
510 }
511 if (coreFIVRErrLog0 || coreFIVRErrLog1)
512 {
513 cpuIERRLog(cpu, "Core FIVR Fault");
514 continue;
515 }
516
517 // Next check if it's an Uncore FIVR fault by looking for a
518 // non-zero value of UNCORE_FIVR_ERR_LOG (B(31) D30 F2
519 // offset 84h) (Note: Bus 31 is accessed on PECI as bus 14)
520 uint32_t uncoreFIVRErrLog = 0;
521 if (peci_RdEndPointConfigPciLocal(
522 addr, 0, 14, 30, 2, 0x84, sizeof(uint32_t),
523 (uint8_t*)&uncoreFIVRErrLog,
524 &cc) != PECI_CC_SUCCESS)
525 {
526 continue;
527 }
528 if (uncoreFIVRErrLog)
529 {
530 cpuIERRLog(cpu, "Uncore FIVR Fault");
531 continue;
532 }
533
534 // TODO: Update MSEC/MSCOD_31_24 check
535 // Last if CORE_FIVR_ERR_LOG and UNCORE_FIVR_ERR_LOG are
536 // both zero, but MSEC bits 31:24 have either
537 // MCA_FIVR_CATAS_OVERVOL_FAULT (0x51) or
538 // MCA_FIVR_CATAS_OVERCUR_FAULT (0x52), then log it as an
539 // uncore FIVR fault
540 if (!coreFIVRErrLog0 && !coreFIVRErrLog1 &&
541 !uncoreFIVRErrLog &&
542 ((mc4Status & (0x51 << 24)) ||
543 (mc4Status & (0x52 << 24))))
544 {
545 cpuIERRLog(cpu, "Uncore FIVR Fault");
546 continue;
547 }
548 cpuIERRLog(cpu);
549 }
550 break;
551 }
552 }
553 }
554 return cpuIERRFound;
555}
556
Jason M. Billsa15c2522019-08-16 10:01:44 -0700557static void caterrAssertHandler()
558{
Jason M. Billsa15c2522019-08-16 10:01:44 -0700559 caterrAssertTimer.expires_after(std::chrono::milliseconds(caterrTimeoutMs));
560 caterrAssertTimer.async_wait([](const boost::system::error_code ec) {
561 if (ec)
562 {
563 // operation_aborted is expected if timer is canceled
564 // before completion.
565 if (ec != boost::asio::error::operation_aborted)
566 {
567 std::cerr << "caterr timeout async_wait failed: "
568 << ec.message() << "\n";
569 }
Jason M. Billsa15c2522019-08-16 10:01:44 -0700570 return;
571 }
Jason M. Billsa3397932019-08-06 11:07:21 -0700572 std::cerr << "CATERR asserted for " << std::to_string(caterrTimeoutMs)
573 << " ms\n";
574 if (!checkIERRCPUs())
575 {
576 cpuIERRLog();
577 }
Jason M. Billsa15c2522019-08-16 10:01:44 -0700578 conn->async_method_call(
579 [](boost::system::error_code ec,
580 const std::variant<bool>& property) {
581 if (ec)
582 {
583 return;
584 }
585 const bool* reset = std::get_if<bool>(&property);
586 if (reset == nullptr)
587 {
588 std::cerr << "Unable to read reset on CATERR value\n";
589 return;
590 }
591 startCrashdumpAndRecovery(*reset);
592 },
593 "xyz.openbmc_project.Settings",
594 "/xyz/openbmc_project/control/processor_error_config",
595 "org.freedesktop.DBus.Properties", "Get",
596 "xyz.openbmc_project.Control.Processor.ErrConfig", "ResetOnCATERR");
597 });
598}
599
Jason M. Bills1490b142019-07-01 15:48:43 -0700600static void caterrHandler()
601{
602 if (!hostOff)
603 {
604 gpiod::line_event gpioLineEvent = caterrLine.event_read();
605
606 bool caterr =
607 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
608 if (caterr)
609 {
Jason M. Billsa15c2522019-08-16 10:01:44 -0700610 caterrAssertHandler();
Jason M. Bills1490b142019-07-01 15:48:43 -0700611 }
612 else
613 {
614 caterrAssertTimer.cancel();
615 }
616 }
617 caterrEvent.async_wait(boost::asio::posix::stream_descriptor::wait_read,
618 [](const boost::system::error_code ec) {
619 if (ec)
620 {
621 std::cerr << "caterr handler error: "
622 << ec.message() << "\n";
623 return;
624 }
625 caterrHandler();
626 });
627}
Chen Yugange6c0f1c2019-08-02 20:36:42 +0800628static void pchThermtripHandler()
629{
630 if (!hostOff)
631 {
632 gpiod::line_event gpioLineEvent = pchThermtripLine.event_read();
633
634 bool pchThermtrip =
635 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
636 if (pchThermtrip)
637 {
Jason M. Bills08866542019-08-16 12:04:19 -0700638 ssbThermTripLog();
Chen Yugange6c0f1c2019-08-02 20:36:42 +0800639 }
640 }
641 pchThermtripEvent.async_wait(
642 boost::asio::posix::stream_descriptor::wait_read,
643 [](const boost::system::error_code ec) {
644 if (ec)
645 {
646 std::cerr << "PCH Thermal trip handler error: " << ec.message()
647 << "\n";
648 return;
649 }
650 pchThermtripHandler();
651 });
652}
653
Jason M. Billscbf78532019-08-16 15:32:11 -0700654static std::bitset<MAX_CPUS> checkERRPinCPUs(const int errPin)
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700655{
Jason M. Billscbf78532019-08-16 15:32:11 -0700656 int errPinSts = (1 << errPin);
657 std::bitset<MAX_CPUS> errPinCPUs = 0;
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700658 for (int cpu = 0, addr = MIN_CLIENT_ADDR; addr <= MAX_CLIENT_ADDR;
659 cpu++, addr++)
660 {
661 if (peci_Ping(addr) == PECI_CC_SUCCESS)
662 {
663 uint8_t cc = 0;
664 CPUModel model{};
665 uint8_t stepping = 0;
666 if (peci_GetCPUID(addr, &model, &stepping, &cc) != PECI_CC_SUCCESS)
667 {
668 std::cerr << "Cannot get CPUID!\n";
669 continue;
670 }
671
672 switch (model)
673 {
674 case skx:
675 {
676 // Check the ERRPINSTS to see if this is the CPU that caused
Jason M. Billscbf78532019-08-16 15:32:11 -0700677 // the ERRx (B(0) D8 F0 offset 210h)
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700678 uint32_t errpinsts = 0;
679 if (peci_RdPCIConfigLocal(
680 addr, 0, 8, 0, 0x210, sizeof(uint32_t),
681 (uint8_t*)&errpinsts, &cc) == PECI_CC_SUCCESS)
682 {
Jason M. Billscbf78532019-08-16 15:32:11 -0700683 errPinCPUs[cpu] = (errpinsts & errPinSts) != 0;
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700684 }
685 break;
686 }
687 case icx:
688 {
689 // Check the ERRPINSTS to see if this is the CPU that caused
Jason M. Billscbf78532019-08-16 15:32:11 -0700690 // the ERRx (B(30) D0 F3 offset 274h) (Note: Bus 30 is
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700691 // accessed on PECI as bus 13)
692 uint32_t errpinsts = 0;
693 if (peci_RdEndPointConfigPciLocal(
694 addr, 0, 13, 0, 3, 0x274, sizeof(uint32_t),
695 (uint8_t*)&errpinsts, &cc) == PECI_CC_SUCCESS)
696 {
Jason M. Billscbf78532019-08-16 15:32:11 -0700697 errPinCPUs[cpu] = (errpinsts & errPinSts) != 0;
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700698 }
699 break;
700 }
701 }
702 }
703 }
Jason M. Billscbf78532019-08-16 15:32:11 -0700704 return errPinCPUs;
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700705}
706
Jason M. Billscbf78532019-08-16 15:32:11 -0700707static void errXAssertHandler(const int errPin,
708 boost::asio::steady_timer& errXAssertTimer)
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700709{
Jason M. Billscbf78532019-08-16 15:32:11 -0700710 // ERRx status is not guaranteed through the timeout, so save which
711 // CPUs have it asserted
712 std::bitset<MAX_CPUS> errPinCPUs = checkERRPinCPUs(errPin);
713 errXAssertTimer.expires_after(std::chrono::milliseconds(errTimeoutMs));
714 errXAssertTimer.async_wait([errPin, errPinCPUs](
715 const boost::system::error_code ec) {
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700716 if (ec)
717 {
718 // operation_aborted is expected if timer is canceled before
719 // completion.
720 if (ec != boost::asio::error::operation_aborted)
721 {
722 std::cerr << "err2 timeout async_wait failed: " << ec.message()
723 << "\n";
724 }
725 return;
726 }
Jason M. Billscbf78532019-08-16 15:32:11 -0700727 std::cerr << "ERR" << std::to_string(errPin) << " asserted for "
728 << std::to_string(errTimeoutMs) << " ms\n";
729 if (errPinCPUs.count())
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700730 {
Jason M. Billscbf78532019-08-16 15:32:11 -0700731 for (int i = 0; i < errPinCPUs.size(); i++)
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700732 {
Jason M. Billscbf78532019-08-16 15:32:11 -0700733 if (errPinCPUs[i])
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700734 {
Jason M. Billscbf78532019-08-16 15:32:11 -0700735 cpuERRXLog(errPin, i);
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700736 }
737 }
738 }
739 else
740 {
Jason M. Billscbf78532019-08-16 15:32:11 -0700741 cpuERRXLog(errPin);
742 }
743 });
744}
745
Jason M. Bills8c584392019-08-19 11:05:51 -0700746static void err0AssertHandler()
747{
748 // Handle the standard ERR0 detection and logging
749 const static constexpr int err0 = 0;
750 errXAssertHandler(err0, err0AssertTimer);
751}
752
753static void err0Handler()
754{
755 if (!hostOff)
756 {
757 gpiod::line_event gpioLineEvent = err0Line.event_read();
758
759 bool err0 = gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
760 if (err0)
761 {
762 err0AssertHandler();
763 }
764 else
765 {
766 err0AssertTimer.cancel();
767 }
768 }
769 err0Event.async_wait(boost::asio::posix::stream_descriptor::wait_read,
770 [](const boost::system::error_code ec) {
771 if (ec)
772 {
773 std::cerr
774 << "err0 handler error: " << ec.message()
775 << "\n";
776 return;
777 }
778 err0Handler();
779 });
780}
781
Jason M. Bills75af3962019-08-19 11:07:17 -0700782static void err1AssertHandler()
783{
784 // Handle the standard ERR1 detection and logging
785 const static constexpr int err1 = 1;
786 errXAssertHandler(err1, err1AssertTimer);
787}
788
789static void err1Handler()
790{
791 if (!hostOff)
792 {
793 gpiod::line_event gpioLineEvent = err1Line.event_read();
794
795 bool err1 = gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
796 if (err1)
797 {
798 err1AssertHandler();
799 }
800 else
801 {
802 err1AssertTimer.cancel();
803 }
804 }
805 err1Event.async_wait(boost::asio::posix::stream_descriptor::wait_read,
806 [](const boost::system::error_code ec) {
807 if (ec)
808 {
809 std::cerr
810 << "err1 handler error: " << ec.message()
811 << "\n";
812 return;
813 }
814 err1Handler();
815 });
816}
817
Jason M. Billscbf78532019-08-16 15:32:11 -0700818static void err2AssertHandler()
819{
820 // Handle the standard ERR2 detection and logging
821 const static constexpr int err2 = 2;
822 errXAssertHandler(err2, err2AssertTimer);
823 // Also handle reset for ERR2
824 err2AssertTimer.async_wait([](const boost::system::error_code ec) {
825 if (ec)
826 {
827 // operation_aborted is expected if timer is canceled before
828 // completion.
829 if (ec != boost::asio::error::operation_aborted)
830 {
831 std::cerr << "err2 timeout async_wait failed: " << ec.message()
832 << "\n";
833 }
834 return;
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700835 }
836 conn->async_method_call(
837 [](boost::system::error_code ec,
838 const std::variant<bool>& property) {
839 if (ec)
840 {
841 return;
842 }
843 const bool* reset = std::get_if<bool>(&property);
844 if (reset == nullptr)
845 {
846 std::cerr << "Unable to read reset on ERR2 value\n";
847 return;
848 }
849 startCrashdumpAndRecovery(*reset);
850 },
851 "xyz.openbmc_project.Settings",
852 "/xyz/openbmc_project/control/processor_error_config",
853 "org.freedesktop.DBus.Properties", "Get",
854 "xyz.openbmc_project.Control.Processor.ErrConfig", "ResetOnERR2");
855 });
856}
857
858static void err2Handler()
859{
860 if (!hostOff)
861 {
862 gpiod::line_event gpioLineEvent = err2Line.event_read();
863
864 bool err2 = gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
865 if (err2)
866 {
867 err2AssertHandler();
868 }
869 else
870 {
871 err2AssertTimer.cancel();
872 }
873 }
874 err2Event.async_wait(boost::asio::posix::stream_descriptor::wait_read,
875 [](const boost::system::error_code ec) {
876 if (ec)
877 {
878 std::cerr
879 << "err2 handler error: " << ec.message()
880 << "\n";
881 return;
882 }
883 err2Handler();
884 });
885}
886
Jason M. Bills89922f82019-08-06 11:10:02 -0700887static void smiAssertHandler()
888{
889 smiAssertTimer.expires_after(std::chrono::milliseconds(smiTimeoutMs));
890 smiAssertTimer.async_wait([](const boost::system::error_code ec) {
891 if (ec)
892 {
893 // operation_aborted is expected if timer is canceled before
894 // completion.
895 if (ec != boost::asio::error::operation_aborted)
896 {
897 std::cerr << "smi timeout async_wait failed: " << ec.message()
898 << "\n";
899 }
900 return;
901 }
902 std::cerr << "SMI asserted for " << std::to_string(smiTimeoutMs)
903 << " ms\n";
904 smiTimeoutLog();
905 conn->async_method_call(
906 [](boost::system::error_code ec,
907 const std::variant<bool>& property) {
908 if (ec)
909 {
910 return;
911 }
912 const bool* reset = std::get_if<bool>(&property);
913 if (reset == nullptr)
914 {
915 std::cerr << "Unable to read reset on SMI value\n";
916 return;
917 }
918 startCrashdumpAndRecovery(*reset);
919 },
920 "xyz.openbmc_project.Settings",
921 "/xyz/openbmc_project/control/bmc_reset_disables",
922 "org.freedesktop.DBus.Properties", "Get",
923 "xyz.openbmc_project.Control.ResetDisables", "ResetOnSMI");
924 });
925}
926
927static void smiHandler()
928{
929 if (!hostOff)
930 {
931 gpiod::line_event gpioLineEvent = smiLine.event_read();
932
933 bool smi = gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
934 if (smi)
935 {
936 smiAssertHandler();
937 }
938 else
939 {
940 smiAssertTimer.cancel();
941 }
942 }
943 smiEvent.async_wait(boost::asio::posix::stream_descriptor::wait_read,
944 [](const boost::system::error_code ec) {
945 if (ec)
946 {
947 std::cerr
948 << "smi handler error: " << ec.message()
949 << "\n";
950 return;
951 }
952 smiHandler();
953 });
954}
955
Jason M. Billsa15c2522019-08-16 10:01:44 -0700956static void initializeErrorState()
957{
958 // Handle CPU_CATERR if it's asserted now
959 if (caterrLine.get_value() == 0)
960 {
961 caterrAssertHandler();
962 }
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700963
Jason M. Bills8c584392019-08-19 11:05:51 -0700964 // Handle CPU_ERR0 if it's asserted now
965 if (err0Line.get_value() == 0)
966 {
967 err0AssertHandler();
968 }
969
Jason M. Bills75af3962019-08-19 11:07:17 -0700970 // Handle CPU_ERR1 if it's asserted now
971 if (err1Line.get_value() == 0)
972 {
973 err1AssertHandler();
974 }
975
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700976 // Handle CPU_ERR2 if it's asserted now
977 if (err2Line.get_value() == 0)
978 {
979 err2AssertHandler();
980 }
Jason M. Bills89922f82019-08-06 11:10:02 -0700981
982 // Handle SMI if it's asserted now
983 if (smiLine.get_value() == 0)
984 {
985 smiAssertHandler();
986 }
Jason M. Bills08866542019-08-16 12:04:19 -0700987
988 // Handle PCH_BMC_THERMTRIP if it's asserted now
989 if (pchThermtripLine.get_value() == 0)
990 {
991 ssbThermTripLog();
992 }
Jason M. Billsa15c2522019-08-16 10:01:44 -0700993}
Jason M. Bills1490b142019-07-01 15:48:43 -0700994} // namespace host_error_monitor
995
996int main(int argc, char* argv[])
997{
998 // setup connection to dbus
999 host_error_monitor::conn =
1000 std::make_shared<sdbusplus::asio::connection>(host_error_monitor::io);
1001
1002 // Host Error Monitor Object
1003 host_error_monitor::conn->request_name(
1004 "xyz.openbmc_project.HostErrorMonitor");
1005 sdbusplus::asio::object_server server =
1006 sdbusplus::asio::object_server(host_error_monitor::conn);
1007
1008 // Start tracking host state
1009 std::shared_ptr<sdbusplus::bus::match::match> hostStateMonitor =
1010 host_error_monitor::startHostStateMonitor();
1011
1012 // Initialize the host state
1013 host_error_monitor::initializeHostState();
1014
1015 // Request CPU_CATERR GPIO events
1016 if (!host_error_monitor::requestGPIOEvents(
1017 "CPU_CATERR", host_error_monitor::caterrHandler,
1018 host_error_monitor::caterrLine, host_error_monitor::caterrEvent))
1019 {
1020 return -1;
1021 }
1022
Jason M. Bills8c584392019-08-19 11:05:51 -07001023 // Request CPU_ERR0 GPIO events
1024 if (!host_error_monitor::requestGPIOEvents(
1025 "CPU_ERR0", host_error_monitor::err0Handler,
1026 host_error_monitor::err0Line, host_error_monitor::err0Event))
1027 {
1028 return -1;
1029 }
1030
Jason M. Bills75af3962019-08-19 11:07:17 -07001031 // Request CPU_ERR1 GPIO events
1032 if (!host_error_monitor::requestGPIOEvents(
1033 "CPU_ERR1", host_error_monitor::err1Handler,
1034 host_error_monitor::err1Line, host_error_monitor::err1Event))
1035 {
1036 return -1;
1037 }
1038
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001039 // Request CPU_ERR2 GPIO events
1040 if (!host_error_monitor::requestGPIOEvents(
1041 "CPU_ERR2", host_error_monitor::err2Handler,
1042 host_error_monitor::err2Line, host_error_monitor::err2Event))
1043 {
1044 return -1;
1045 }
1046
Jason M. Bills89922f82019-08-06 11:10:02 -07001047 // Request SMI GPIO events
1048 if (!host_error_monitor::requestGPIOEvents(
1049 "SMI", host_error_monitor::smiHandler, host_error_monitor::smiLine,
1050 host_error_monitor::smiEvent))
1051 {
1052 return -1;
1053 }
1054
Chen Yugange6c0f1c2019-08-02 20:36:42 +08001055 // Request PCH_BMC_THERMTRIP GPIO events
1056 if (!host_error_monitor::requestGPIOEvents(
1057 "PCH_BMC_THERMTRIP", host_error_monitor::pchThermtripHandler,
1058 host_error_monitor::pchThermtripLine,
1059 host_error_monitor::pchThermtripEvent))
1060 {
1061 return -1;
1062 }
1063
Jason M. Bills1490b142019-07-01 15:48:43 -07001064 host_error_monitor::io.run();
1065
1066 return 0;
1067}