blob: 0ef43be2f834c6e4a14c6a1f05128ba51101fa9c [file] [log] [blame]
Jason M. Bills1490b142019-07-01 15:48:43 -07001/*
2// Copyright (c) 2019 Intel Corporation
3//
4// Licensed under the Apache License, Version 2.0 (the "License");
5// you may not use this file except in compliance with the License.
6// You may obtain a copy of the License at
7//
8// http://www.apache.org/licenses/LICENSE-2.0
9//
10// Unless required by applicable law or agreed to in writing, software
11// distributed under the License is distributed on an "AS IS" BASIS,
12// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13// See the License for the specific language governing permissions and
14// limitations under the License.
15*/
Jason M. Bills6a2cb692019-08-06 11:03:49 -070016#include <peci.h>
Chen Yugange6c0f1c2019-08-02 20:36:42 +080017#include <systemd/sd-journal.h>
18
Jason M. Bills6a2cb692019-08-06 11:03:49 -070019#include <bitset>
Jason M. Bills1490b142019-07-01 15:48:43 -070020#include <boost/asio/posix/stream_descriptor.hpp>
21#include <gpiod.hpp>
22#include <iostream>
23#include <sdbusplus/asio/object_server.hpp>
Jason M. Billsd1a19f62019-08-06 11:52:58 -070024#include <variant>
Jason M. Bills1490b142019-07-01 15:48:43 -070025
26namespace host_error_monitor
27{
28static boost::asio::io_service io;
29static std::shared_ptr<sdbusplus::asio::connection> conn;
30
31static bool hostOff = true;
32
33const static constexpr size_t caterrTimeoutMs = 2000;
Jason M. Billscbf78532019-08-16 15:32:11 -070034const static constexpr size_t errTimeoutMs = 90000;
Jason M. Bills89922f82019-08-06 11:10:02 -070035const static constexpr size_t smiTimeoutMs = 90000;
Jason M. Bills1490b142019-07-01 15:48:43 -070036const static constexpr size_t crashdumpTimeoutS = 300;
37
38// Timers
39// Timer for CATERR asserted
40static boost::asio::steady_timer caterrAssertTimer(io);
Jason M. Bills6a2cb692019-08-06 11:03:49 -070041// Timer for ERR2 asserted
42static boost::asio::steady_timer err2AssertTimer(io);
Jason M. Bills89922f82019-08-06 11:10:02 -070043// Timer for SMI asserted
44static boost::asio::steady_timer smiAssertTimer(io);
Jason M. Bills1490b142019-07-01 15:48:43 -070045
46// GPIO Lines and Event Descriptors
47static gpiod::line caterrLine;
48static boost::asio::posix::stream_descriptor caterrEvent(io);
Jason M. Bills6a2cb692019-08-06 11:03:49 -070049static gpiod::line err2Line;
50static boost::asio::posix::stream_descriptor err2Event(io);
Jason M. Bills89922f82019-08-06 11:10:02 -070051static gpiod::line smiLine;
52static boost::asio::posix::stream_descriptor smiEvent(io);
Chen Yugange6c0f1c2019-08-02 20:36:42 +080053//----------------------------------
54// PCH_BMC_THERMTRIP function related definition
55//----------------------------------
56// GPIO Lines and Event Descriptors
57static gpiod::line pchThermtripLine;
58static boost::asio::posix::stream_descriptor pchThermtripEvent(io);
Jason M. Bills1490b142019-07-01 15:48:43 -070059
Jason M. Billsa3397932019-08-06 11:07:21 -070060static void cpuIERRLog()
61{
62 sd_journal_send("MESSAGE=HostError: IERR", "PRIORITY=%i", LOG_INFO,
63 "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError",
64 "REDFISH_MESSAGE_ARGS=%s", "IERR", NULL);
65}
66
67static void cpuIERRLog(const int cpuNum)
68{
69 std::string msg = "IERR on CPU " + std::to_string(cpuNum + 1);
70
71 sd_journal_send("MESSAGE=HostError: %s", msg.c_str(), "PRIORITY=%i",
72 LOG_INFO, "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError",
73 "REDFISH_MESSAGE_ARGS=%s", msg.c_str(), NULL);
74}
75
76static void cpuIERRLog(const int cpuNum, const std::string& type)
77{
78 std::string msg = type + " IERR on CPU " + std::to_string(cpuNum + 1);
79
80 sd_journal_send("MESSAGE=HostError: %s", msg.c_str(), "PRIORITY=%i",
81 LOG_INFO, "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError",
82 "REDFISH_MESSAGE_ARGS=%s", msg.c_str(), NULL);
83}
84
Jason M. Billscbf78532019-08-16 15:32:11 -070085static void cpuERRXLog(const int errPin)
Jason M. Bills6a2cb692019-08-06 11:03:49 -070086{
Jason M. Billscbf78532019-08-16 15:32:11 -070087 std::string msg = "ERR" + std::to_string(errPin) + " Timeout";
88
89 sd_journal_send("MESSAGE=HostError: %s", msg.c_str(), "PRIORITY=%i",
90 LOG_INFO, "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError",
91 "REDFISH_MESSAGE_ARGS=%s", msg.c_str(), NULL);
Jason M. Bills6a2cb692019-08-06 11:03:49 -070092}
93
Jason M. Billscbf78532019-08-16 15:32:11 -070094static void cpuERRXLog(const int errPin, const int cpuNum)
Jason M. Bills6a2cb692019-08-06 11:03:49 -070095{
Jason M. Billscbf78532019-08-16 15:32:11 -070096 std::string msg = "ERR" + std::to_string(errPin) + " Timeout on CPU " +
97 std::to_string(cpuNum + 1);
Jason M. Bills6a2cb692019-08-06 11:03:49 -070098
99 sd_journal_send("MESSAGE=HostError: %s", msg.c_str(), "PRIORITY=%i",
100 LOG_INFO, "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError",
101 "REDFISH_MESSAGE_ARGS=%s", msg.c_str(), NULL);
102}
103
Jason M. Bills89922f82019-08-06 11:10:02 -0700104static void smiTimeoutLog()
105{
106 sd_journal_send("MESSAGE=HostError: SMI Timeout", "PRIORITY=%i", LOG_INFO,
107 "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError",
108 "REDFISH_MESSAGE_ARGS=%s", "SMI Timeout", NULL);
109}
110
Jason M. Bills08866542019-08-16 12:04:19 -0700111static void ssbThermTripLog()
112{
113 sd_journal_send("MESSAGE=HostError: SSB thermal trip", "PRIORITY=%i",
114 LOG_INFO, "REDFISH_MESSAGE_ID=%s",
115 "OpenBMC.0.1.SsbThermalTrip", NULL);
116}
117
Jason M. Billsa15c2522019-08-16 10:01:44 -0700118static void initializeErrorState();
Jason M. Bills1490b142019-07-01 15:48:43 -0700119static void initializeHostState()
120{
121 conn->async_method_call(
122 [](boost::system::error_code ec,
123 const std::variant<std::string>& property) {
124 if (ec)
125 {
126 return;
127 }
128 const std::string* state = std::get_if<std::string>(&property);
129 if (state == nullptr)
130 {
131 std::cerr << "Unable to read host state value\n";
132 return;
133 }
134 hostOff = *state == "xyz.openbmc_project.State.Host.HostState.Off";
Jason M. Billsa15c2522019-08-16 10:01:44 -0700135 // If the system is on, initialize the error state
136 if (!hostOff)
137 {
138 initializeErrorState();
139 }
Jason M. Bills1490b142019-07-01 15:48:43 -0700140 },
141 "xyz.openbmc_project.State.Host", "/xyz/openbmc_project/state/host0",
142 "org.freedesktop.DBus.Properties", "Get",
143 "xyz.openbmc_project.State.Host", "CurrentHostState");
144}
145
146static std::shared_ptr<sdbusplus::bus::match::match> startHostStateMonitor()
147{
148 return std::make_shared<sdbusplus::bus::match::match>(
149 *conn,
150 "type='signal',interface='org.freedesktop.DBus.Properties',"
151 "member='PropertiesChanged',arg0namespace='xyz.openbmc_project.State."
152 "Host'",
153 [](sdbusplus::message::message& msg) {
154 std::string interfaceName;
155 boost::container::flat_map<std::string, std::variant<std::string>>
156 propertiesChanged;
157 std::string state;
158 try
159 {
160 msg.read(interfaceName, propertiesChanged);
161 state =
162 std::get<std::string>(propertiesChanged.begin()->second);
163 }
164 catch (std::exception& e)
165 {
166 std::cerr << "Unable to read host state\n";
167 return;
168 }
169 hostOff = state == "xyz.openbmc_project.State.Host.HostState.Off";
170
171 // No host events should fire while off, so cancel any pending
172 // timers
173 if (hostOff)
174 {
175 caterrAssertTimer.cancel();
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700176 err2AssertTimer.cancel();
Jason M. Bills89922f82019-08-06 11:10:02 -0700177 smiAssertTimer.cancel();
Jason M. Bills1490b142019-07-01 15:48:43 -0700178 }
179 });
180}
181
182static bool requestGPIOEvents(
183 const std::string& name, const std::function<void()>& handler,
184 gpiod::line& gpioLine,
185 boost::asio::posix::stream_descriptor& gpioEventDescriptor)
186{
187 // Find the GPIO line
188 gpioLine = gpiod::find_line(name);
189 if (!gpioLine)
190 {
191 std::cerr << "Failed to find the " << name << " line\n";
192 return false;
193 }
194
195 try
196 {
197 gpioLine.request(
198 {"host-error-monitor", gpiod::line_request::EVENT_BOTH_EDGES});
199 }
200 catch (std::exception&)
201 {
202 std::cerr << "Failed to request events for " << name << "\n";
203 return false;
204 }
205
206 int gpioLineFd = gpioLine.event_get_fd();
207 if (gpioLineFd < 0)
208 {
209 std::cerr << "Failed to get " << name << " fd\n";
210 return false;
211 }
212
213 gpioEventDescriptor.assign(gpioLineFd);
214
215 gpioEventDescriptor.async_wait(
216 boost::asio::posix::stream_descriptor::wait_read,
217 [&name, handler](const boost::system::error_code ec) {
218 if (ec)
219 {
220 std::cerr << name << " fd handler error: " << ec.message()
221 << "\n";
222 return;
223 }
224 handler();
225 });
226 return true;
227}
228
229static void startPowerCycle()
230{
231 conn->async_method_call(
232 [](boost::system::error_code ec) {
233 if (ec)
234 {
235 std::cerr << "failed to set Chassis State\n";
236 }
237 },
238 "xyz.openbmc_project.State.Chassis",
239 "/xyz/openbmc_project/state/chassis0",
240 "org.freedesktop.DBus.Properties", "Set",
241 "xyz.openbmc_project.State.Chassis", "RequestedPowerTransition",
242 std::variant<std::string>{
243 "xyz.openbmc_project.State.Chassis.Transition.PowerCycle"});
244}
245
246static void startCrashdumpAndRecovery(bool recoverSystem)
247{
248 std::cout << "Starting crashdump\n";
249 static std::shared_ptr<sdbusplus::bus::match::match> crashdumpCompleteMatch;
250 static boost::asio::steady_timer crashdumpTimer(io);
251
252 crashdumpCompleteMatch = std::make_shared<sdbusplus::bus::match::match>(
253 *conn,
254 "type='signal',interface='org.freedesktop.DBus.Properties',"
255 "member='PropertiesChanged',arg0namespace='com.intel.crashdump'",
256 [recoverSystem](sdbusplus::message::message& msg) {
257 crashdumpTimer.cancel();
258 std::cout << "Crashdump completed\n";
259 if (recoverSystem)
260 {
261 std::cout << "Recovering the system\n";
262 startPowerCycle();
263 }
264 crashdumpCompleteMatch.reset();
265 });
266
267 crashdumpTimer.expires_after(std::chrono::seconds(crashdumpTimeoutS));
268 crashdumpTimer.async_wait([](const boost::system::error_code ec) {
269 if (ec)
270 {
271 // operation_aborted is expected if timer is canceled
272 if (ec != boost::asio::error::operation_aborted)
273 {
274 std::cerr << "Crashdump async_wait failed: " << ec.message()
275 << "\n";
276 }
277 std::cout << "Crashdump timer canceled\n";
278 return;
279 }
280 std::cerr << "Crashdump failed to complete before timeout\n";
281 crashdumpCompleteMatch.reset();
282 });
283
284 conn->async_method_call(
285 [](boost::system::error_code ec) {
286 if (ec)
287 {
288 std::cerr << "failed to start Crashdump\n";
289 crashdumpTimer.cancel();
290 crashdumpCompleteMatch.reset();
291 }
292 },
293 "com.intel.crashdump", "/com/intel/crashdump",
294 "com.intel.crashdump.Stored", "GenerateStoredLog");
295}
296
Jason M. Billsd1a19f62019-08-06 11:52:58 -0700297static void incrementCPUErrorCount(int cpuNum)
298{
299 std::string propertyName = "ErrorCountCPU" + std::to_string(cpuNum + 1);
300
301 // Get the current count
302 conn->async_method_call(
303 [propertyName](boost::system::error_code ec,
304 const std::variant<uint8_t>& property) {
305 if (ec)
306 {
307 std::cerr << "Failed to read " << propertyName << ": "
308 << ec.message() << "\n";
309 return;
310 }
311 const uint8_t* errorCountVariant = std::get_if<uint8_t>(&property);
312 if (errorCountVariant == nullptr)
313 {
314 std::cerr << propertyName << " invalid\n";
315 return;
316 }
317 uint8_t errorCount = *errorCountVariant;
318 if (errorCount == std::numeric_limits<uint8_t>::max())
319 {
320 std::cerr << "Maximum error count reached\n";
321 return;
322 }
323 // Increment the count
324 errorCount++;
325 conn->async_method_call(
326 [propertyName](boost::system::error_code ec) {
327 if (ec)
328 {
329 std::cerr << "Failed to set " << propertyName << ": "
330 << ec.message() << "\n";
331 }
332 },
333 "xyz.openbmc_project.Settings",
334 "/xyz/openbmc_project/control/processor_error_config",
335 "org.freedesktop.DBus.Properties", "Set",
336 "xyz.openbmc_project.Control.Processor.ErrConfig", propertyName,
337 std::variant<uint8_t>{errorCount});
338 },
339 "xyz.openbmc_project.Settings",
340 "/xyz/openbmc_project/control/processor_error_config",
341 "org.freedesktop.DBus.Properties", "Get",
342 "xyz.openbmc_project.Control.Processor.ErrConfig", propertyName);
343}
344
Jason M. Billsa3397932019-08-06 11:07:21 -0700345static bool checkIERRCPUs()
346{
347 bool cpuIERRFound = false;
348 for (int cpu = 0, addr = MIN_CLIENT_ADDR; addr <= MAX_CLIENT_ADDR;
349 cpu++, addr++)
350 {
351 uint8_t cc = 0;
352 CPUModel model{};
353 uint8_t stepping = 0;
354 if (peci_GetCPUID(addr, &model, &stepping, &cc) != PECI_CC_SUCCESS)
355 {
356 std::cerr << "Cannot get CPUID!\n";
357 continue;
358 }
359
360 switch (model)
361 {
362 case skx:
363 {
364 // First check the MCA_ERR_SRC_LOG to see if this is the CPU
365 // that caused the IERR
366 uint32_t mcaErrSrcLog = 0;
367 if (peci_RdPkgConfig(addr, 0, 5, 4, (uint8_t*)&mcaErrSrcLog,
368 &cc) != PECI_CC_SUCCESS)
369 {
370 continue;
371 }
372 // Check MSMI_INTERNAL (20) and IERR_INTERNAL (27)
373 if ((mcaErrSrcLog & (1 << 20)) || (mcaErrSrcLog & (1 << 27)))
374 {
375 // TODO: Light the CPU fault LED?
376 cpuIERRFound = true;
Jason M. Billsd1a19f62019-08-06 11:52:58 -0700377 incrementCPUErrorCount(cpu);
Jason M. Billsa3397932019-08-06 11:07:21 -0700378 // Next check if it's a CPU/VR mismatch by reading the
379 // IA32_MC4_STATUS MSR (0x411)
380 uint64_t mc4Status = 0;
381 if (peci_RdIAMSR(addr, 0, 0x411, &mc4Status, &cc) !=
382 PECI_CC_SUCCESS)
383 {
384 continue;
385 }
386 // Check MSEC bits 31:24 for
387 // MCA_SVID_VCCIN_VR_ICC_MAX_FAILURE (0x40),
388 // MCA_SVID_VCCIN_VR_VOUT_FAILURE (0x42), or
389 // MCA_SVID_CPU_VR_CAPABILITY_ERROR (0x43)
390 if ((mc4Status & (0x40 << 24)) ||
391 (mc4Status & (0x42 << 24)) ||
392 (mc4Status & (0x43 << 24)))
393 {
394 cpuIERRLog(cpu, "CPU/VR Mismatch");
395 continue;
396 }
397
398 // Next check if it's a Core FIVR fault by looking for a
399 // non-zero value of CORE_FIVR_ERR_LOG (B(1) D30 F2 offset
400 // 80h)
401 uint32_t coreFIVRErrLog = 0;
402 if (peci_RdPCIConfigLocal(
403 addr, 1, 30, 2, 0x80, sizeof(uint32_t),
404 (uint8_t*)&coreFIVRErrLog, &cc) != PECI_CC_SUCCESS)
405 {
406 continue;
407 }
408 if (coreFIVRErrLog)
409 {
410 cpuIERRLog(cpu, "Core FIVR Fault");
411 continue;
412 }
413
414 // Next check if it's an Uncore FIVR fault by looking for a
415 // non-zero value of UNCORE_FIVR_ERR_LOG (B(1) D30 F2 offset
416 // 84h)
417 uint32_t uncoreFIVRErrLog = 0;
418 if (peci_RdPCIConfigLocal(addr, 1, 30, 2, 0x84,
419 sizeof(uint32_t),
420 (uint8_t*)&uncoreFIVRErrLog,
421 &cc) != PECI_CC_SUCCESS)
422 {
423 continue;
424 }
425 if (uncoreFIVRErrLog)
426 {
427 cpuIERRLog(cpu, "Uncore FIVR Fault");
428 continue;
429 }
430
431 // Last if CORE_FIVR_ERR_LOG and UNCORE_FIVR_ERR_LOG are
432 // both zero, but MSEC bits 31:24 have either
433 // MCA_FIVR_CATAS_OVERVOL_FAULT (0x51) or
434 // MCA_FIVR_CATAS_OVERCUR_FAULT (0x52), then log it as an
435 // uncore FIVR fault
436 if (!coreFIVRErrLog && !uncoreFIVRErrLog &&
437 ((mc4Status & (0x51 << 24)) ||
438 (mc4Status & (0x52 << 24))))
439 {
440 cpuIERRLog(cpu, "Uncore FIVR Fault");
441 continue;
442 }
443 cpuIERRLog(cpu);
444 }
445 break;
446 }
447 case icx:
448 {
449 // First check the MCA_ERR_SRC_LOG to see if this is the CPU
450 // that caused the IERR
451 uint32_t mcaErrSrcLog = 0;
452 if (peci_RdPkgConfig(addr, 0, 5, 4, (uint8_t*)&mcaErrSrcLog,
453 &cc) != PECI_CC_SUCCESS)
454 {
455 continue;
456 }
457 // Check MSMI_INTERNAL (20) and IERR_INTERNAL (27)
458 if ((mcaErrSrcLog & (1 << 20)) || (mcaErrSrcLog & (1 << 27)))
459 {
460 // TODO: Light the CPU fault LED?
461 cpuIERRFound = true;
Jason M. Billsd1a19f62019-08-06 11:52:58 -0700462 incrementCPUErrorCount(cpu);
Jason M. Billsa3397932019-08-06 11:07:21 -0700463 // Next check if it's a CPU/VR mismatch by reading the
464 // IA32_MC4_STATUS MSR (0x411)
465 uint64_t mc4Status = 0;
466 if (peci_RdIAMSR(addr, 0, 0x411, &mc4Status, &cc) !=
467 PECI_CC_SUCCESS)
468 {
469 continue;
470 }
471 // TODO: Update MSEC/MSCOD_31_24 check
472 // Check MSEC bits 31:24 for
473 // MCA_SVID_VCCIN_VR_ICC_MAX_FAILURE (0x40),
474 // MCA_SVID_VCCIN_VR_VOUT_FAILURE (0x42), or
475 // MCA_SVID_CPU_VR_CAPABILITY_ERROR (0x43)
476 if ((mc4Status & (0x40 << 24)) ||
477 (mc4Status & (0x42 << 24)) ||
478 (mc4Status & (0x43 << 24)))
479 {
480 cpuIERRLog(cpu, "CPU/VR Mismatch");
481 continue;
482 }
483
484 // Next check if it's a Core FIVR fault by looking for a
485 // non-zero value of CORE_FIVR_ERR_LOG (B(31) D30 F2 offsets
486 // C0h and C4h) (Note: Bus 31 is accessed on PECI as bus 14)
487 uint32_t coreFIVRErrLog0 = 0;
488 uint32_t coreFIVRErrLog1 = 0;
489 if (peci_RdEndPointConfigPciLocal(
490 addr, 0, 14, 30, 2, 0xC0, sizeof(uint32_t),
491 (uint8_t*)&coreFIVRErrLog0, &cc) != PECI_CC_SUCCESS)
492 {
493 continue;
494 }
495 if (peci_RdEndPointConfigPciLocal(
496 addr, 0, 14, 30, 2, 0xC4, sizeof(uint32_t),
497 (uint8_t*)&coreFIVRErrLog1, &cc) != PECI_CC_SUCCESS)
498 {
499 continue;
500 }
501 if (coreFIVRErrLog0 || coreFIVRErrLog1)
502 {
503 cpuIERRLog(cpu, "Core FIVR Fault");
504 continue;
505 }
506
507 // Next check if it's an Uncore FIVR fault by looking for a
508 // non-zero value of UNCORE_FIVR_ERR_LOG (B(31) D30 F2
509 // offset 84h) (Note: Bus 31 is accessed on PECI as bus 14)
510 uint32_t uncoreFIVRErrLog = 0;
511 if (peci_RdEndPointConfigPciLocal(
512 addr, 0, 14, 30, 2, 0x84, sizeof(uint32_t),
513 (uint8_t*)&uncoreFIVRErrLog,
514 &cc) != PECI_CC_SUCCESS)
515 {
516 continue;
517 }
518 if (uncoreFIVRErrLog)
519 {
520 cpuIERRLog(cpu, "Uncore FIVR Fault");
521 continue;
522 }
523
524 // TODO: Update MSEC/MSCOD_31_24 check
525 // Last if CORE_FIVR_ERR_LOG and UNCORE_FIVR_ERR_LOG are
526 // both zero, but MSEC bits 31:24 have either
527 // MCA_FIVR_CATAS_OVERVOL_FAULT (0x51) or
528 // MCA_FIVR_CATAS_OVERCUR_FAULT (0x52), then log it as an
529 // uncore FIVR fault
530 if (!coreFIVRErrLog0 && !coreFIVRErrLog1 &&
531 !uncoreFIVRErrLog &&
532 ((mc4Status & (0x51 << 24)) ||
533 (mc4Status & (0x52 << 24))))
534 {
535 cpuIERRLog(cpu, "Uncore FIVR Fault");
536 continue;
537 }
538 cpuIERRLog(cpu);
539 }
540 break;
541 }
542 }
543 }
544 return cpuIERRFound;
545}
546
Jason M. Billsa15c2522019-08-16 10:01:44 -0700547static void caterrAssertHandler()
548{
Jason M. Billsa15c2522019-08-16 10:01:44 -0700549 caterrAssertTimer.expires_after(std::chrono::milliseconds(caterrTimeoutMs));
550 caterrAssertTimer.async_wait([](const boost::system::error_code ec) {
551 if (ec)
552 {
553 // operation_aborted is expected if timer is canceled
554 // before completion.
555 if (ec != boost::asio::error::operation_aborted)
556 {
557 std::cerr << "caterr timeout async_wait failed: "
558 << ec.message() << "\n";
559 }
Jason M. Billsa15c2522019-08-16 10:01:44 -0700560 return;
561 }
Jason M. Billsa3397932019-08-06 11:07:21 -0700562 std::cerr << "CATERR asserted for " << std::to_string(caterrTimeoutMs)
563 << " ms\n";
564 if (!checkIERRCPUs())
565 {
566 cpuIERRLog();
567 }
Jason M. Billsa15c2522019-08-16 10:01:44 -0700568 conn->async_method_call(
569 [](boost::system::error_code ec,
570 const std::variant<bool>& property) {
571 if (ec)
572 {
573 return;
574 }
575 const bool* reset = std::get_if<bool>(&property);
576 if (reset == nullptr)
577 {
578 std::cerr << "Unable to read reset on CATERR value\n";
579 return;
580 }
581 startCrashdumpAndRecovery(*reset);
582 },
583 "xyz.openbmc_project.Settings",
584 "/xyz/openbmc_project/control/processor_error_config",
585 "org.freedesktop.DBus.Properties", "Get",
586 "xyz.openbmc_project.Control.Processor.ErrConfig", "ResetOnCATERR");
587 });
588}
589
Jason M. Bills1490b142019-07-01 15:48:43 -0700590static void caterrHandler()
591{
592 if (!hostOff)
593 {
594 gpiod::line_event gpioLineEvent = caterrLine.event_read();
595
596 bool caterr =
597 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
598 if (caterr)
599 {
Jason M. Billsa15c2522019-08-16 10:01:44 -0700600 caterrAssertHandler();
Jason M. Bills1490b142019-07-01 15:48:43 -0700601 }
602 else
603 {
604 caterrAssertTimer.cancel();
605 }
606 }
607 caterrEvent.async_wait(boost::asio::posix::stream_descriptor::wait_read,
608 [](const boost::system::error_code ec) {
609 if (ec)
610 {
611 std::cerr << "caterr handler error: "
612 << ec.message() << "\n";
613 return;
614 }
615 caterrHandler();
616 });
617}
Chen Yugange6c0f1c2019-08-02 20:36:42 +0800618static void pchThermtripHandler()
619{
620 if (!hostOff)
621 {
622 gpiod::line_event gpioLineEvent = pchThermtripLine.event_read();
623
624 bool pchThermtrip =
625 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
626 if (pchThermtrip)
627 {
Jason M. Bills08866542019-08-16 12:04:19 -0700628 ssbThermTripLog();
Chen Yugange6c0f1c2019-08-02 20:36:42 +0800629 }
630 }
631 pchThermtripEvent.async_wait(
632 boost::asio::posix::stream_descriptor::wait_read,
633 [](const boost::system::error_code ec) {
634 if (ec)
635 {
636 std::cerr << "PCH Thermal trip handler error: " << ec.message()
637 << "\n";
638 return;
639 }
640 pchThermtripHandler();
641 });
642}
643
Jason M. Billscbf78532019-08-16 15:32:11 -0700644static std::bitset<MAX_CPUS> checkERRPinCPUs(const int errPin)
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700645{
Jason M. Billscbf78532019-08-16 15:32:11 -0700646 int errPinSts = (1 << errPin);
647 std::bitset<MAX_CPUS> errPinCPUs = 0;
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700648 for (int cpu = 0, addr = MIN_CLIENT_ADDR; addr <= MAX_CLIENT_ADDR;
649 cpu++, addr++)
650 {
651 if (peci_Ping(addr) == PECI_CC_SUCCESS)
652 {
653 uint8_t cc = 0;
654 CPUModel model{};
655 uint8_t stepping = 0;
656 if (peci_GetCPUID(addr, &model, &stepping, &cc) != PECI_CC_SUCCESS)
657 {
658 std::cerr << "Cannot get CPUID!\n";
659 continue;
660 }
661
662 switch (model)
663 {
664 case skx:
665 {
666 // Check the ERRPINSTS to see if this is the CPU that caused
Jason M. Billscbf78532019-08-16 15:32:11 -0700667 // the ERRx (B(0) D8 F0 offset 210h)
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700668 uint32_t errpinsts = 0;
669 if (peci_RdPCIConfigLocal(
670 addr, 0, 8, 0, 0x210, sizeof(uint32_t),
671 (uint8_t*)&errpinsts, &cc) == PECI_CC_SUCCESS)
672 {
Jason M. Billscbf78532019-08-16 15:32:11 -0700673 errPinCPUs[cpu] = (errpinsts & errPinSts) != 0;
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700674 }
675 break;
676 }
677 case icx:
678 {
679 // Check the ERRPINSTS to see if this is the CPU that caused
Jason M. Billscbf78532019-08-16 15:32:11 -0700680 // the ERRx (B(30) D0 F3 offset 274h) (Note: Bus 30 is
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700681 // accessed on PECI as bus 13)
682 uint32_t errpinsts = 0;
683 if (peci_RdEndPointConfigPciLocal(
684 addr, 0, 13, 0, 3, 0x274, sizeof(uint32_t),
685 (uint8_t*)&errpinsts, &cc) == PECI_CC_SUCCESS)
686 {
Jason M. Billscbf78532019-08-16 15:32:11 -0700687 errPinCPUs[cpu] = (errpinsts & errPinSts) != 0;
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700688 }
689 break;
690 }
691 }
692 }
693 }
Jason M. Billscbf78532019-08-16 15:32:11 -0700694 return errPinCPUs;
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700695}
696
Jason M. Billscbf78532019-08-16 15:32:11 -0700697static void errXAssertHandler(const int errPin,
698 boost::asio::steady_timer& errXAssertTimer)
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700699{
Jason M. Billscbf78532019-08-16 15:32:11 -0700700 // ERRx status is not guaranteed through the timeout, so save which
701 // CPUs have it asserted
702 std::bitset<MAX_CPUS> errPinCPUs = checkERRPinCPUs(errPin);
703 errXAssertTimer.expires_after(std::chrono::milliseconds(errTimeoutMs));
704 errXAssertTimer.async_wait([errPin, errPinCPUs](
705 const boost::system::error_code ec) {
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700706 if (ec)
707 {
708 // operation_aborted is expected if timer is canceled before
709 // completion.
710 if (ec != boost::asio::error::operation_aborted)
711 {
712 std::cerr << "err2 timeout async_wait failed: " << ec.message()
713 << "\n";
714 }
715 return;
716 }
Jason M. Billscbf78532019-08-16 15:32:11 -0700717 std::cerr << "ERR" << std::to_string(errPin) << " asserted for "
718 << std::to_string(errTimeoutMs) << " ms\n";
719 if (errPinCPUs.count())
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700720 {
Jason M. Billscbf78532019-08-16 15:32:11 -0700721 for (int i = 0; i < errPinCPUs.size(); i++)
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700722 {
Jason M. Billscbf78532019-08-16 15:32:11 -0700723 if (errPinCPUs[i])
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700724 {
Jason M. Billscbf78532019-08-16 15:32:11 -0700725 cpuERRXLog(errPin, i);
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700726 }
727 }
728 }
729 else
730 {
Jason M. Billscbf78532019-08-16 15:32:11 -0700731 cpuERRXLog(errPin);
732 }
733 });
734}
735
736static void err2AssertHandler()
737{
738 // Handle the standard ERR2 detection and logging
739 const static constexpr int err2 = 2;
740 errXAssertHandler(err2, err2AssertTimer);
741 // Also handle reset for ERR2
742 err2AssertTimer.async_wait([](const boost::system::error_code ec) {
743 if (ec)
744 {
745 // operation_aborted is expected if timer is canceled before
746 // completion.
747 if (ec != boost::asio::error::operation_aborted)
748 {
749 std::cerr << "err2 timeout async_wait failed: " << ec.message()
750 << "\n";
751 }
752 return;
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700753 }
754 conn->async_method_call(
755 [](boost::system::error_code ec,
756 const std::variant<bool>& property) {
757 if (ec)
758 {
759 return;
760 }
761 const bool* reset = std::get_if<bool>(&property);
762 if (reset == nullptr)
763 {
764 std::cerr << "Unable to read reset on ERR2 value\n";
765 return;
766 }
767 startCrashdumpAndRecovery(*reset);
768 },
769 "xyz.openbmc_project.Settings",
770 "/xyz/openbmc_project/control/processor_error_config",
771 "org.freedesktop.DBus.Properties", "Get",
772 "xyz.openbmc_project.Control.Processor.ErrConfig", "ResetOnERR2");
773 });
774}
775
776static void err2Handler()
777{
778 if (!hostOff)
779 {
780 gpiod::line_event gpioLineEvent = err2Line.event_read();
781
782 bool err2 = gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
783 if (err2)
784 {
785 err2AssertHandler();
786 }
787 else
788 {
789 err2AssertTimer.cancel();
790 }
791 }
792 err2Event.async_wait(boost::asio::posix::stream_descriptor::wait_read,
793 [](const boost::system::error_code ec) {
794 if (ec)
795 {
796 std::cerr
797 << "err2 handler error: " << ec.message()
798 << "\n";
799 return;
800 }
801 err2Handler();
802 });
803}
804
Jason M. Bills89922f82019-08-06 11:10:02 -0700805static void smiAssertHandler()
806{
807 smiAssertTimer.expires_after(std::chrono::milliseconds(smiTimeoutMs));
808 smiAssertTimer.async_wait([](const boost::system::error_code ec) {
809 if (ec)
810 {
811 // operation_aborted is expected if timer is canceled before
812 // completion.
813 if (ec != boost::asio::error::operation_aborted)
814 {
815 std::cerr << "smi timeout async_wait failed: " << ec.message()
816 << "\n";
817 }
818 return;
819 }
820 std::cerr << "SMI asserted for " << std::to_string(smiTimeoutMs)
821 << " ms\n";
822 smiTimeoutLog();
823 conn->async_method_call(
824 [](boost::system::error_code ec,
825 const std::variant<bool>& property) {
826 if (ec)
827 {
828 return;
829 }
830 const bool* reset = std::get_if<bool>(&property);
831 if (reset == nullptr)
832 {
833 std::cerr << "Unable to read reset on SMI value\n";
834 return;
835 }
836 startCrashdumpAndRecovery(*reset);
837 },
838 "xyz.openbmc_project.Settings",
839 "/xyz/openbmc_project/control/bmc_reset_disables",
840 "org.freedesktop.DBus.Properties", "Get",
841 "xyz.openbmc_project.Control.ResetDisables", "ResetOnSMI");
842 });
843}
844
845static void smiHandler()
846{
847 if (!hostOff)
848 {
849 gpiod::line_event gpioLineEvent = smiLine.event_read();
850
851 bool smi = gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
852 if (smi)
853 {
854 smiAssertHandler();
855 }
856 else
857 {
858 smiAssertTimer.cancel();
859 }
860 }
861 smiEvent.async_wait(boost::asio::posix::stream_descriptor::wait_read,
862 [](const boost::system::error_code ec) {
863 if (ec)
864 {
865 std::cerr
866 << "smi handler error: " << ec.message()
867 << "\n";
868 return;
869 }
870 smiHandler();
871 });
872}
873
Jason M. Billsa15c2522019-08-16 10:01:44 -0700874static void initializeErrorState()
875{
876 // Handle CPU_CATERR if it's asserted now
877 if (caterrLine.get_value() == 0)
878 {
879 caterrAssertHandler();
880 }
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700881
882 // Handle CPU_ERR2 if it's asserted now
883 if (err2Line.get_value() == 0)
884 {
885 err2AssertHandler();
886 }
Jason M. Bills89922f82019-08-06 11:10:02 -0700887
888 // Handle SMI if it's asserted now
889 if (smiLine.get_value() == 0)
890 {
891 smiAssertHandler();
892 }
Jason M. Bills08866542019-08-16 12:04:19 -0700893
894 // Handle PCH_BMC_THERMTRIP if it's asserted now
895 if (pchThermtripLine.get_value() == 0)
896 {
897 ssbThermTripLog();
898 }
Jason M. Billsa15c2522019-08-16 10:01:44 -0700899}
Jason M. Bills1490b142019-07-01 15:48:43 -0700900} // namespace host_error_monitor
901
902int main(int argc, char* argv[])
903{
904 // setup connection to dbus
905 host_error_monitor::conn =
906 std::make_shared<sdbusplus::asio::connection>(host_error_monitor::io);
907
908 // Host Error Monitor Object
909 host_error_monitor::conn->request_name(
910 "xyz.openbmc_project.HostErrorMonitor");
911 sdbusplus::asio::object_server server =
912 sdbusplus::asio::object_server(host_error_monitor::conn);
913
914 // Start tracking host state
915 std::shared_ptr<sdbusplus::bus::match::match> hostStateMonitor =
916 host_error_monitor::startHostStateMonitor();
917
918 // Initialize the host state
919 host_error_monitor::initializeHostState();
920
921 // Request CPU_CATERR GPIO events
922 if (!host_error_monitor::requestGPIOEvents(
923 "CPU_CATERR", host_error_monitor::caterrHandler,
924 host_error_monitor::caterrLine, host_error_monitor::caterrEvent))
925 {
926 return -1;
927 }
928
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700929 // Request CPU_ERR2 GPIO events
930 if (!host_error_monitor::requestGPIOEvents(
931 "CPU_ERR2", host_error_monitor::err2Handler,
932 host_error_monitor::err2Line, host_error_monitor::err2Event))
933 {
934 return -1;
935 }
936
Jason M. Bills89922f82019-08-06 11:10:02 -0700937 // Request SMI GPIO events
938 if (!host_error_monitor::requestGPIOEvents(
939 "SMI", host_error_monitor::smiHandler, host_error_monitor::smiLine,
940 host_error_monitor::smiEvent))
941 {
942 return -1;
943 }
944
Chen Yugange6c0f1c2019-08-02 20:36:42 +0800945 // Request PCH_BMC_THERMTRIP GPIO events
946 if (!host_error_monitor::requestGPIOEvents(
947 "PCH_BMC_THERMTRIP", host_error_monitor::pchThermtripHandler,
948 host_error_monitor::pchThermtripLine,
949 host_error_monitor::pchThermtripEvent))
950 {
951 return -1;
952 }
953
Jason M. Bills1490b142019-07-01 15:48:43 -0700954 host_error_monitor::io.run();
955
956 return 0;
957}