blob: 6f9ef0e77b4110982763d9402060cd72d30a6f11 [file] [log] [blame]
Jason M. Bills1490b142019-07-01 15:48:43 -07001/*
2// Copyright (c) 2019 Intel Corporation
3//
4// Licensed under the Apache License, Version 2.0 (the "License");
5// you may not use this file except in compliance with the License.
6// You may obtain a copy of the License at
7//
8// http://www.apache.org/licenses/LICENSE-2.0
9//
10// Unless required by applicable law or agreed to in writing, software
11// distributed under the License is distributed on an "AS IS" BASIS,
12// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13// See the License for the specific language governing permissions and
14// limitations under the License.
15*/
Jason M. Bills6a2cb692019-08-06 11:03:49 -070016#include <peci.h>
Chen Yugange6c0f1c2019-08-02 20:36:42 +080017#include <systemd/sd-journal.h>
18
Jason M. Bills6a2cb692019-08-06 11:03:49 -070019#include <bitset>
Jason M. Bills1490b142019-07-01 15:48:43 -070020#include <boost/asio/posix/stream_descriptor.hpp>
21#include <gpiod.hpp>
22#include <iostream>
23#include <sdbusplus/asio/object_server.hpp>
Jason M. Billsd1a19f62019-08-06 11:52:58 -070024#include <variant>
Jason M. Bills1490b142019-07-01 15:48:43 -070025
26namespace host_error_monitor
27{
28static boost::asio::io_service io;
29static std::shared_ptr<sdbusplus::asio::connection> conn;
30
31static bool hostOff = true;
32
33const static constexpr size_t caterrTimeoutMs = 2000;
Jason M. Bills6a2cb692019-08-06 11:03:49 -070034const static constexpr size_t err2TimeoutMs = 90000;
Jason M. Bills89922f82019-08-06 11:10:02 -070035const static constexpr size_t smiTimeoutMs = 90000;
Jason M. Bills1490b142019-07-01 15:48:43 -070036const static constexpr size_t crashdumpTimeoutS = 300;
37
38// Timers
39// Timer for CATERR asserted
40static boost::asio::steady_timer caterrAssertTimer(io);
Jason M. Bills6a2cb692019-08-06 11:03:49 -070041// Timer for ERR2 asserted
42static boost::asio::steady_timer err2AssertTimer(io);
Jason M. Bills89922f82019-08-06 11:10:02 -070043// Timer for SMI asserted
44static boost::asio::steady_timer smiAssertTimer(io);
Jason M. Bills1490b142019-07-01 15:48:43 -070045
46// GPIO Lines and Event Descriptors
47static gpiod::line caterrLine;
48static boost::asio::posix::stream_descriptor caterrEvent(io);
Jason M. Bills6a2cb692019-08-06 11:03:49 -070049static gpiod::line err2Line;
50static boost::asio::posix::stream_descriptor err2Event(io);
Jason M. Bills89922f82019-08-06 11:10:02 -070051static gpiod::line smiLine;
52static boost::asio::posix::stream_descriptor smiEvent(io);
Chen Yugange6c0f1c2019-08-02 20:36:42 +080053//----------------------------------
54// PCH_BMC_THERMTRIP function related definition
55//----------------------------------
56// GPIO Lines and Event Descriptors
57static gpiod::line pchThermtripLine;
58static boost::asio::posix::stream_descriptor pchThermtripEvent(io);
Jason M. Bills1490b142019-07-01 15:48:43 -070059
Jason M. Billsa3397932019-08-06 11:07:21 -070060static void cpuIERRLog()
61{
62 sd_journal_send("MESSAGE=HostError: IERR", "PRIORITY=%i", LOG_INFO,
63 "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError",
64 "REDFISH_MESSAGE_ARGS=%s", "IERR", NULL);
65}
66
67static void cpuIERRLog(const int cpuNum)
68{
69 std::string msg = "IERR on CPU " + std::to_string(cpuNum + 1);
70
71 sd_journal_send("MESSAGE=HostError: %s", msg.c_str(), "PRIORITY=%i",
72 LOG_INFO, "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError",
73 "REDFISH_MESSAGE_ARGS=%s", msg.c_str(), NULL);
74}
75
76static void cpuIERRLog(const int cpuNum, const std::string& type)
77{
78 std::string msg = type + " IERR on CPU " + std::to_string(cpuNum + 1);
79
80 sd_journal_send("MESSAGE=HostError: %s", msg.c_str(), "PRIORITY=%i",
81 LOG_INFO, "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError",
82 "REDFISH_MESSAGE_ARGS=%s", msg.c_str(), NULL);
83}
84
Jason M. Bills6a2cb692019-08-06 11:03:49 -070085static void cpuERR2Log()
86{
87 sd_journal_send("MESSAGE=HostError: ERR2 Timeout", "PRIORITY=%i", LOG_INFO,
88 "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError",
89 "REDFISH_MESSAGE_ARGS=%s", "ERR2 Timeout", NULL);
90}
91
92static void cpuERR2Log(const int cpuNum)
93{
94 std::string msg = "ERR2 Timeout on CPU " + std::to_string(cpuNum + 1);
95
96 sd_journal_send("MESSAGE=HostError: %s", msg.c_str(), "PRIORITY=%i",
97 LOG_INFO, "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError",
98 "REDFISH_MESSAGE_ARGS=%s", msg.c_str(), NULL);
99}
100
Jason M. Bills89922f82019-08-06 11:10:02 -0700101static void smiTimeoutLog()
102{
103 sd_journal_send("MESSAGE=HostError: SMI Timeout", "PRIORITY=%i", LOG_INFO,
104 "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError",
105 "REDFISH_MESSAGE_ARGS=%s", "SMI Timeout", NULL);
106}
107
Jason M. Bills08866542019-08-16 12:04:19 -0700108static void ssbThermTripLog()
109{
110 sd_journal_send("MESSAGE=HostError: SSB thermal trip", "PRIORITY=%i",
111 LOG_INFO, "REDFISH_MESSAGE_ID=%s",
112 "OpenBMC.0.1.SsbThermalTrip", NULL);
113}
114
Jason M. Billsa15c2522019-08-16 10:01:44 -0700115static void initializeErrorState();
Jason M. Bills1490b142019-07-01 15:48:43 -0700116static void initializeHostState()
117{
118 conn->async_method_call(
119 [](boost::system::error_code ec,
120 const std::variant<std::string>& property) {
121 if (ec)
122 {
123 return;
124 }
125 const std::string* state = std::get_if<std::string>(&property);
126 if (state == nullptr)
127 {
128 std::cerr << "Unable to read host state value\n";
129 return;
130 }
131 hostOff = *state == "xyz.openbmc_project.State.Host.HostState.Off";
Jason M. Billsa15c2522019-08-16 10:01:44 -0700132 // If the system is on, initialize the error state
133 if (!hostOff)
134 {
135 initializeErrorState();
136 }
Jason M. Bills1490b142019-07-01 15:48:43 -0700137 },
138 "xyz.openbmc_project.State.Host", "/xyz/openbmc_project/state/host0",
139 "org.freedesktop.DBus.Properties", "Get",
140 "xyz.openbmc_project.State.Host", "CurrentHostState");
141}
142
143static std::shared_ptr<sdbusplus::bus::match::match> startHostStateMonitor()
144{
145 return std::make_shared<sdbusplus::bus::match::match>(
146 *conn,
147 "type='signal',interface='org.freedesktop.DBus.Properties',"
148 "member='PropertiesChanged',arg0namespace='xyz.openbmc_project.State."
149 "Host'",
150 [](sdbusplus::message::message& msg) {
151 std::string interfaceName;
152 boost::container::flat_map<std::string, std::variant<std::string>>
153 propertiesChanged;
154 std::string state;
155 try
156 {
157 msg.read(interfaceName, propertiesChanged);
158 state =
159 std::get<std::string>(propertiesChanged.begin()->second);
160 }
161 catch (std::exception& e)
162 {
163 std::cerr << "Unable to read host state\n";
164 return;
165 }
166 hostOff = state == "xyz.openbmc_project.State.Host.HostState.Off";
167
168 // No host events should fire while off, so cancel any pending
169 // timers
170 if (hostOff)
171 {
172 caterrAssertTimer.cancel();
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700173 err2AssertTimer.cancel();
Jason M. Bills89922f82019-08-06 11:10:02 -0700174 smiAssertTimer.cancel();
Jason M. Bills1490b142019-07-01 15:48:43 -0700175 }
176 });
177}
178
179static bool requestGPIOEvents(
180 const std::string& name, const std::function<void()>& handler,
181 gpiod::line& gpioLine,
182 boost::asio::posix::stream_descriptor& gpioEventDescriptor)
183{
184 // Find the GPIO line
185 gpioLine = gpiod::find_line(name);
186 if (!gpioLine)
187 {
188 std::cerr << "Failed to find the " << name << " line\n";
189 return false;
190 }
191
192 try
193 {
194 gpioLine.request(
195 {"host-error-monitor", gpiod::line_request::EVENT_BOTH_EDGES});
196 }
197 catch (std::exception&)
198 {
199 std::cerr << "Failed to request events for " << name << "\n";
200 return false;
201 }
202
203 int gpioLineFd = gpioLine.event_get_fd();
204 if (gpioLineFd < 0)
205 {
206 std::cerr << "Failed to get " << name << " fd\n";
207 return false;
208 }
209
210 gpioEventDescriptor.assign(gpioLineFd);
211
212 gpioEventDescriptor.async_wait(
213 boost::asio::posix::stream_descriptor::wait_read,
214 [&name, handler](const boost::system::error_code ec) {
215 if (ec)
216 {
217 std::cerr << name << " fd handler error: " << ec.message()
218 << "\n";
219 return;
220 }
221 handler();
222 });
223 return true;
224}
225
226static void startPowerCycle()
227{
228 conn->async_method_call(
229 [](boost::system::error_code ec) {
230 if (ec)
231 {
232 std::cerr << "failed to set Chassis State\n";
233 }
234 },
235 "xyz.openbmc_project.State.Chassis",
236 "/xyz/openbmc_project/state/chassis0",
237 "org.freedesktop.DBus.Properties", "Set",
238 "xyz.openbmc_project.State.Chassis", "RequestedPowerTransition",
239 std::variant<std::string>{
240 "xyz.openbmc_project.State.Chassis.Transition.PowerCycle"});
241}
242
243static void startCrashdumpAndRecovery(bool recoverSystem)
244{
245 std::cout << "Starting crashdump\n";
246 static std::shared_ptr<sdbusplus::bus::match::match> crashdumpCompleteMatch;
247 static boost::asio::steady_timer crashdumpTimer(io);
248
249 crashdumpCompleteMatch = std::make_shared<sdbusplus::bus::match::match>(
250 *conn,
251 "type='signal',interface='org.freedesktop.DBus.Properties',"
252 "member='PropertiesChanged',arg0namespace='com.intel.crashdump'",
253 [recoverSystem](sdbusplus::message::message& msg) {
254 crashdumpTimer.cancel();
255 std::cout << "Crashdump completed\n";
256 if (recoverSystem)
257 {
258 std::cout << "Recovering the system\n";
259 startPowerCycle();
260 }
261 crashdumpCompleteMatch.reset();
262 });
263
264 crashdumpTimer.expires_after(std::chrono::seconds(crashdumpTimeoutS));
265 crashdumpTimer.async_wait([](const boost::system::error_code ec) {
266 if (ec)
267 {
268 // operation_aborted is expected if timer is canceled
269 if (ec != boost::asio::error::operation_aborted)
270 {
271 std::cerr << "Crashdump async_wait failed: " << ec.message()
272 << "\n";
273 }
274 std::cout << "Crashdump timer canceled\n";
275 return;
276 }
277 std::cerr << "Crashdump failed to complete before timeout\n";
278 crashdumpCompleteMatch.reset();
279 });
280
281 conn->async_method_call(
282 [](boost::system::error_code ec) {
283 if (ec)
284 {
285 std::cerr << "failed to start Crashdump\n";
286 crashdumpTimer.cancel();
287 crashdumpCompleteMatch.reset();
288 }
289 },
290 "com.intel.crashdump", "/com/intel/crashdump",
291 "com.intel.crashdump.Stored", "GenerateStoredLog");
292}
293
Jason M. Billsd1a19f62019-08-06 11:52:58 -0700294static void incrementCPUErrorCount(int cpuNum)
295{
296 std::string propertyName = "ErrorCountCPU" + std::to_string(cpuNum + 1);
297
298 // Get the current count
299 conn->async_method_call(
300 [propertyName](boost::system::error_code ec,
301 const std::variant<uint8_t>& property) {
302 if (ec)
303 {
304 std::cerr << "Failed to read " << propertyName << ": "
305 << ec.message() << "\n";
306 return;
307 }
308 const uint8_t* errorCountVariant = std::get_if<uint8_t>(&property);
309 if (errorCountVariant == nullptr)
310 {
311 std::cerr << propertyName << " invalid\n";
312 return;
313 }
314 uint8_t errorCount = *errorCountVariant;
315 if (errorCount == std::numeric_limits<uint8_t>::max())
316 {
317 std::cerr << "Maximum error count reached\n";
318 return;
319 }
320 // Increment the count
321 errorCount++;
322 conn->async_method_call(
323 [propertyName](boost::system::error_code ec) {
324 if (ec)
325 {
326 std::cerr << "Failed to set " << propertyName << ": "
327 << ec.message() << "\n";
328 }
329 },
330 "xyz.openbmc_project.Settings",
331 "/xyz/openbmc_project/control/processor_error_config",
332 "org.freedesktop.DBus.Properties", "Set",
333 "xyz.openbmc_project.Control.Processor.ErrConfig", propertyName,
334 std::variant<uint8_t>{errorCount});
335 },
336 "xyz.openbmc_project.Settings",
337 "/xyz/openbmc_project/control/processor_error_config",
338 "org.freedesktop.DBus.Properties", "Get",
339 "xyz.openbmc_project.Control.Processor.ErrConfig", propertyName);
340}
341
Jason M. Billsa3397932019-08-06 11:07:21 -0700342static bool checkIERRCPUs()
343{
344 bool cpuIERRFound = false;
345 for (int cpu = 0, addr = MIN_CLIENT_ADDR; addr <= MAX_CLIENT_ADDR;
346 cpu++, addr++)
347 {
348 uint8_t cc = 0;
349 CPUModel model{};
350 uint8_t stepping = 0;
351 if (peci_GetCPUID(addr, &model, &stepping, &cc) != PECI_CC_SUCCESS)
352 {
353 std::cerr << "Cannot get CPUID!\n";
354 continue;
355 }
356
357 switch (model)
358 {
359 case skx:
360 {
361 // First check the MCA_ERR_SRC_LOG to see if this is the CPU
362 // that caused the IERR
363 uint32_t mcaErrSrcLog = 0;
364 if (peci_RdPkgConfig(addr, 0, 5, 4, (uint8_t*)&mcaErrSrcLog,
365 &cc) != PECI_CC_SUCCESS)
366 {
367 continue;
368 }
369 // Check MSMI_INTERNAL (20) and IERR_INTERNAL (27)
370 if ((mcaErrSrcLog & (1 << 20)) || (mcaErrSrcLog & (1 << 27)))
371 {
372 // TODO: Light the CPU fault LED?
373 cpuIERRFound = true;
Jason M. Billsd1a19f62019-08-06 11:52:58 -0700374 incrementCPUErrorCount(cpu);
Jason M. Billsa3397932019-08-06 11:07:21 -0700375 // Next check if it's a CPU/VR mismatch by reading the
376 // IA32_MC4_STATUS MSR (0x411)
377 uint64_t mc4Status = 0;
378 if (peci_RdIAMSR(addr, 0, 0x411, &mc4Status, &cc) !=
379 PECI_CC_SUCCESS)
380 {
381 continue;
382 }
383 // Check MSEC bits 31:24 for
384 // MCA_SVID_VCCIN_VR_ICC_MAX_FAILURE (0x40),
385 // MCA_SVID_VCCIN_VR_VOUT_FAILURE (0x42), or
386 // MCA_SVID_CPU_VR_CAPABILITY_ERROR (0x43)
387 if ((mc4Status & (0x40 << 24)) ||
388 (mc4Status & (0x42 << 24)) ||
389 (mc4Status & (0x43 << 24)))
390 {
391 cpuIERRLog(cpu, "CPU/VR Mismatch");
392 continue;
393 }
394
395 // Next check if it's a Core FIVR fault by looking for a
396 // non-zero value of CORE_FIVR_ERR_LOG (B(1) D30 F2 offset
397 // 80h)
398 uint32_t coreFIVRErrLog = 0;
399 if (peci_RdPCIConfigLocal(
400 addr, 1, 30, 2, 0x80, sizeof(uint32_t),
401 (uint8_t*)&coreFIVRErrLog, &cc) != PECI_CC_SUCCESS)
402 {
403 continue;
404 }
405 if (coreFIVRErrLog)
406 {
407 cpuIERRLog(cpu, "Core FIVR Fault");
408 continue;
409 }
410
411 // Next check if it's an Uncore FIVR fault by looking for a
412 // non-zero value of UNCORE_FIVR_ERR_LOG (B(1) D30 F2 offset
413 // 84h)
414 uint32_t uncoreFIVRErrLog = 0;
415 if (peci_RdPCIConfigLocal(addr, 1, 30, 2, 0x84,
416 sizeof(uint32_t),
417 (uint8_t*)&uncoreFIVRErrLog,
418 &cc) != PECI_CC_SUCCESS)
419 {
420 continue;
421 }
422 if (uncoreFIVRErrLog)
423 {
424 cpuIERRLog(cpu, "Uncore FIVR Fault");
425 continue;
426 }
427
428 // Last if CORE_FIVR_ERR_LOG and UNCORE_FIVR_ERR_LOG are
429 // both zero, but MSEC bits 31:24 have either
430 // MCA_FIVR_CATAS_OVERVOL_FAULT (0x51) or
431 // MCA_FIVR_CATAS_OVERCUR_FAULT (0x52), then log it as an
432 // uncore FIVR fault
433 if (!coreFIVRErrLog && !uncoreFIVRErrLog &&
434 ((mc4Status & (0x51 << 24)) ||
435 (mc4Status & (0x52 << 24))))
436 {
437 cpuIERRLog(cpu, "Uncore FIVR Fault");
438 continue;
439 }
440 cpuIERRLog(cpu);
441 }
442 break;
443 }
444 case icx:
445 {
446 // First check the MCA_ERR_SRC_LOG to see if this is the CPU
447 // that caused the IERR
448 uint32_t mcaErrSrcLog = 0;
449 if (peci_RdPkgConfig(addr, 0, 5, 4, (uint8_t*)&mcaErrSrcLog,
450 &cc) != PECI_CC_SUCCESS)
451 {
452 continue;
453 }
454 // Check MSMI_INTERNAL (20) and IERR_INTERNAL (27)
455 if ((mcaErrSrcLog & (1 << 20)) || (mcaErrSrcLog & (1 << 27)))
456 {
457 // TODO: Light the CPU fault LED?
458 cpuIERRFound = true;
Jason M. Billsd1a19f62019-08-06 11:52:58 -0700459 incrementCPUErrorCount(cpu);
Jason M. Billsa3397932019-08-06 11:07:21 -0700460 // Next check if it's a CPU/VR mismatch by reading the
461 // IA32_MC4_STATUS MSR (0x411)
462 uint64_t mc4Status = 0;
463 if (peci_RdIAMSR(addr, 0, 0x411, &mc4Status, &cc) !=
464 PECI_CC_SUCCESS)
465 {
466 continue;
467 }
468 // TODO: Update MSEC/MSCOD_31_24 check
469 // Check MSEC bits 31:24 for
470 // MCA_SVID_VCCIN_VR_ICC_MAX_FAILURE (0x40),
471 // MCA_SVID_VCCIN_VR_VOUT_FAILURE (0x42), or
472 // MCA_SVID_CPU_VR_CAPABILITY_ERROR (0x43)
473 if ((mc4Status & (0x40 << 24)) ||
474 (mc4Status & (0x42 << 24)) ||
475 (mc4Status & (0x43 << 24)))
476 {
477 cpuIERRLog(cpu, "CPU/VR Mismatch");
478 continue;
479 }
480
481 // Next check if it's a Core FIVR fault by looking for a
482 // non-zero value of CORE_FIVR_ERR_LOG (B(31) D30 F2 offsets
483 // C0h and C4h) (Note: Bus 31 is accessed on PECI as bus 14)
484 uint32_t coreFIVRErrLog0 = 0;
485 uint32_t coreFIVRErrLog1 = 0;
486 if (peci_RdEndPointConfigPciLocal(
487 addr, 0, 14, 30, 2, 0xC0, sizeof(uint32_t),
488 (uint8_t*)&coreFIVRErrLog0, &cc) != PECI_CC_SUCCESS)
489 {
490 continue;
491 }
492 if (peci_RdEndPointConfigPciLocal(
493 addr, 0, 14, 30, 2, 0xC4, sizeof(uint32_t),
494 (uint8_t*)&coreFIVRErrLog1, &cc) != PECI_CC_SUCCESS)
495 {
496 continue;
497 }
498 if (coreFIVRErrLog0 || coreFIVRErrLog1)
499 {
500 cpuIERRLog(cpu, "Core FIVR Fault");
501 continue;
502 }
503
504 // Next check if it's an Uncore FIVR fault by looking for a
505 // non-zero value of UNCORE_FIVR_ERR_LOG (B(31) D30 F2
506 // offset 84h) (Note: Bus 31 is accessed on PECI as bus 14)
507 uint32_t uncoreFIVRErrLog = 0;
508 if (peci_RdEndPointConfigPciLocal(
509 addr, 0, 14, 30, 2, 0x84, sizeof(uint32_t),
510 (uint8_t*)&uncoreFIVRErrLog,
511 &cc) != PECI_CC_SUCCESS)
512 {
513 continue;
514 }
515 if (uncoreFIVRErrLog)
516 {
517 cpuIERRLog(cpu, "Uncore FIVR Fault");
518 continue;
519 }
520
521 // TODO: Update MSEC/MSCOD_31_24 check
522 // Last if CORE_FIVR_ERR_LOG and UNCORE_FIVR_ERR_LOG are
523 // both zero, but MSEC bits 31:24 have either
524 // MCA_FIVR_CATAS_OVERVOL_FAULT (0x51) or
525 // MCA_FIVR_CATAS_OVERCUR_FAULT (0x52), then log it as an
526 // uncore FIVR fault
527 if (!coreFIVRErrLog0 && !coreFIVRErrLog1 &&
528 !uncoreFIVRErrLog &&
529 ((mc4Status & (0x51 << 24)) ||
530 (mc4Status & (0x52 << 24))))
531 {
532 cpuIERRLog(cpu, "Uncore FIVR Fault");
533 continue;
534 }
535 cpuIERRLog(cpu);
536 }
537 break;
538 }
539 }
540 }
541 return cpuIERRFound;
542}
543
Jason M. Billsa15c2522019-08-16 10:01:44 -0700544static void caterrAssertHandler()
545{
Jason M. Billsa15c2522019-08-16 10:01:44 -0700546 caterrAssertTimer.expires_after(std::chrono::milliseconds(caterrTimeoutMs));
547 caterrAssertTimer.async_wait([](const boost::system::error_code ec) {
548 if (ec)
549 {
550 // operation_aborted is expected if timer is canceled
551 // before completion.
552 if (ec != boost::asio::error::operation_aborted)
553 {
554 std::cerr << "caterr timeout async_wait failed: "
555 << ec.message() << "\n";
556 }
Jason M. Billsa15c2522019-08-16 10:01:44 -0700557 return;
558 }
Jason M. Billsa3397932019-08-06 11:07:21 -0700559 std::cerr << "CATERR asserted for " << std::to_string(caterrTimeoutMs)
560 << " ms\n";
561 if (!checkIERRCPUs())
562 {
563 cpuIERRLog();
564 }
Jason M. Billsa15c2522019-08-16 10:01:44 -0700565 conn->async_method_call(
566 [](boost::system::error_code ec,
567 const std::variant<bool>& property) {
568 if (ec)
569 {
570 return;
571 }
572 const bool* reset = std::get_if<bool>(&property);
573 if (reset == nullptr)
574 {
575 std::cerr << "Unable to read reset on CATERR value\n";
576 return;
577 }
578 startCrashdumpAndRecovery(*reset);
579 },
580 "xyz.openbmc_project.Settings",
581 "/xyz/openbmc_project/control/processor_error_config",
582 "org.freedesktop.DBus.Properties", "Get",
583 "xyz.openbmc_project.Control.Processor.ErrConfig", "ResetOnCATERR");
584 });
585}
586
Jason M. Bills1490b142019-07-01 15:48:43 -0700587static void caterrHandler()
588{
589 if (!hostOff)
590 {
591 gpiod::line_event gpioLineEvent = caterrLine.event_read();
592
593 bool caterr =
594 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
595 if (caterr)
596 {
Jason M. Billsa15c2522019-08-16 10:01:44 -0700597 caterrAssertHandler();
Jason M. Bills1490b142019-07-01 15:48:43 -0700598 }
599 else
600 {
601 caterrAssertTimer.cancel();
602 }
603 }
604 caterrEvent.async_wait(boost::asio::posix::stream_descriptor::wait_read,
605 [](const boost::system::error_code ec) {
606 if (ec)
607 {
608 std::cerr << "caterr handler error: "
609 << ec.message() << "\n";
610 return;
611 }
612 caterrHandler();
613 });
614}
Chen Yugange6c0f1c2019-08-02 20:36:42 +0800615static void pchThermtripHandler()
616{
617 if (!hostOff)
618 {
619 gpiod::line_event gpioLineEvent = pchThermtripLine.event_read();
620
621 bool pchThermtrip =
622 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
623 if (pchThermtrip)
624 {
Jason M. Bills08866542019-08-16 12:04:19 -0700625 ssbThermTripLog();
Chen Yugange6c0f1c2019-08-02 20:36:42 +0800626 }
627 }
628 pchThermtripEvent.async_wait(
629 boost::asio::posix::stream_descriptor::wait_read,
630 [](const boost::system::error_code ec) {
631 if (ec)
632 {
633 std::cerr << "PCH Thermal trip handler error: " << ec.message()
634 << "\n";
635 return;
636 }
637 pchThermtripHandler();
638 });
639}
640
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700641static std::bitset<MAX_CPUS> checkERR2CPUs()
642{
643 std::bitset<MAX_CPUS> err2CPUs = 0;
644 for (int cpu = 0, addr = MIN_CLIENT_ADDR; addr <= MAX_CLIENT_ADDR;
645 cpu++, addr++)
646 {
647 if (peci_Ping(addr) == PECI_CC_SUCCESS)
648 {
649 uint8_t cc = 0;
650 CPUModel model{};
651 uint8_t stepping = 0;
652 if (peci_GetCPUID(addr, &model, &stepping, &cc) != PECI_CC_SUCCESS)
653 {
654 std::cerr << "Cannot get CPUID!\n";
655 continue;
656 }
657
658 switch (model)
659 {
660 case skx:
661 {
662 // Check the ERRPINSTS to see if this is the CPU that caused
663 // the ERR2 (B(0) D8 F0 offset 210h)
664 uint32_t errpinsts = 0;
665 if (peci_RdPCIConfigLocal(
666 addr, 0, 8, 0, 0x210, sizeof(uint32_t),
667 (uint8_t*)&errpinsts, &cc) == PECI_CC_SUCCESS)
668 {
669 err2CPUs[cpu] = (errpinsts & err2Sts) != 0;
670 }
671 break;
672 }
673 case icx:
674 {
675 // Check the ERRPINSTS to see if this is the CPU that caused
676 // the ERR2 (B(30) D0 F3 offset 274h) (Note: Bus 30 is
677 // accessed on PECI as bus 13)
678 uint32_t errpinsts = 0;
679 if (peci_RdEndPointConfigPciLocal(
680 addr, 0, 13, 0, 3, 0x274, sizeof(uint32_t),
681 (uint8_t*)&errpinsts, &cc) == PECI_CC_SUCCESS)
682 {
683 err2CPUs[cpu] = (errpinsts & err2Sts) != 0;
684 }
685 break;
686 }
687 }
688 }
689 }
690 return err2CPUs;
691}
692
693static void err2AssertHandler()
694{
695 // ERR2 status is not guaranteed through the timeout, so save which
696 // CPUs have asserted ERR2 now
697 std::bitset<MAX_CPUS> err2CPUs = checkERR2CPUs();
698 err2AssertTimer.expires_after(std::chrono::milliseconds(err2TimeoutMs));
699 err2AssertTimer.async_wait([err2CPUs](const boost::system::error_code ec) {
700 if (ec)
701 {
702 // operation_aborted is expected if timer is canceled before
703 // completion.
704 if (ec != boost::asio::error::operation_aborted)
705 {
706 std::cerr << "err2 timeout async_wait failed: " << ec.message()
707 << "\n";
708 }
709 return;
710 }
711 std::cerr << "ERR2 asserted for " << std::to_string(err2TimeoutMs)
712 << " ms\n";
713 if (err2CPUs.count())
714 {
715 for (int i = 0; i < err2CPUs.size(); i++)
716 {
717 if (err2CPUs[i])
718 {
719 cpuERR2Log(i);
720 }
721 }
722 }
723 else
724 {
725 cpuERR2Log();
726 }
727 conn->async_method_call(
728 [](boost::system::error_code ec,
729 const std::variant<bool>& property) {
730 if (ec)
731 {
732 return;
733 }
734 const bool* reset = std::get_if<bool>(&property);
735 if (reset == nullptr)
736 {
737 std::cerr << "Unable to read reset on ERR2 value\n";
738 return;
739 }
740 startCrashdumpAndRecovery(*reset);
741 },
742 "xyz.openbmc_project.Settings",
743 "/xyz/openbmc_project/control/processor_error_config",
744 "org.freedesktop.DBus.Properties", "Get",
745 "xyz.openbmc_project.Control.Processor.ErrConfig", "ResetOnERR2");
746 });
747}
748
749static void err2Handler()
750{
751 if (!hostOff)
752 {
753 gpiod::line_event gpioLineEvent = err2Line.event_read();
754
755 bool err2 = gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
756 if (err2)
757 {
758 err2AssertHandler();
759 }
760 else
761 {
762 err2AssertTimer.cancel();
763 }
764 }
765 err2Event.async_wait(boost::asio::posix::stream_descriptor::wait_read,
766 [](const boost::system::error_code ec) {
767 if (ec)
768 {
769 std::cerr
770 << "err2 handler error: " << ec.message()
771 << "\n";
772 return;
773 }
774 err2Handler();
775 });
776}
777
Jason M. Bills89922f82019-08-06 11:10:02 -0700778static void smiAssertHandler()
779{
780 smiAssertTimer.expires_after(std::chrono::milliseconds(smiTimeoutMs));
781 smiAssertTimer.async_wait([](const boost::system::error_code ec) {
782 if (ec)
783 {
784 // operation_aborted is expected if timer is canceled before
785 // completion.
786 if (ec != boost::asio::error::operation_aborted)
787 {
788 std::cerr << "smi timeout async_wait failed: " << ec.message()
789 << "\n";
790 }
791 return;
792 }
793 std::cerr << "SMI asserted for " << std::to_string(smiTimeoutMs)
794 << " ms\n";
795 smiTimeoutLog();
796 conn->async_method_call(
797 [](boost::system::error_code ec,
798 const std::variant<bool>& property) {
799 if (ec)
800 {
801 return;
802 }
803 const bool* reset = std::get_if<bool>(&property);
804 if (reset == nullptr)
805 {
806 std::cerr << "Unable to read reset on SMI value\n";
807 return;
808 }
809 startCrashdumpAndRecovery(*reset);
810 },
811 "xyz.openbmc_project.Settings",
812 "/xyz/openbmc_project/control/bmc_reset_disables",
813 "org.freedesktop.DBus.Properties", "Get",
814 "xyz.openbmc_project.Control.ResetDisables", "ResetOnSMI");
815 });
816}
817
818static void smiHandler()
819{
820 if (!hostOff)
821 {
822 gpiod::line_event gpioLineEvent = smiLine.event_read();
823
824 bool smi = gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
825 if (smi)
826 {
827 smiAssertHandler();
828 }
829 else
830 {
831 smiAssertTimer.cancel();
832 }
833 }
834 smiEvent.async_wait(boost::asio::posix::stream_descriptor::wait_read,
835 [](const boost::system::error_code ec) {
836 if (ec)
837 {
838 std::cerr
839 << "smi handler error: " << ec.message()
840 << "\n";
841 return;
842 }
843 smiHandler();
844 });
845}
846
Jason M. Billsa15c2522019-08-16 10:01:44 -0700847static void initializeErrorState()
848{
849 // Handle CPU_CATERR if it's asserted now
850 if (caterrLine.get_value() == 0)
851 {
852 caterrAssertHandler();
853 }
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700854
855 // Handle CPU_ERR2 if it's asserted now
856 if (err2Line.get_value() == 0)
857 {
858 err2AssertHandler();
859 }
Jason M. Bills89922f82019-08-06 11:10:02 -0700860
861 // Handle SMI if it's asserted now
862 if (smiLine.get_value() == 0)
863 {
864 smiAssertHandler();
865 }
Jason M. Bills08866542019-08-16 12:04:19 -0700866
867 // Handle PCH_BMC_THERMTRIP if it's asserted now
868 if (pchThermtripLine.get_value() == 0)
869 {
870 ssbThermTripLog();
871 }
Jason M. Billsa15c2522019-08-16 10:01:44 -0700872}
Jason M. Bills1490b142019-07-01 15:48:43 -0700873} // namespace host_error_monitor
874
875int main(int argc, char* argv[])
876{
877 // setup connection to dbus
878 host_error_monitor::conn =
879 std::make_shared<sdbusplus::asio::connection>(host_error_monitor::io);
880
881 // Host Error Monitor Object
882 host_error_monitor::conn->request_name(
883 "xyz.openbmc_project.HostErrorMonitor");
884 sdbusplus::asio::object_server server =
885 sdbusplus::asio::object_server(host_error_monitor::conn);
886
887 // Start tracking host state
888 std::shared_ptr<sdbusplus::bus::match::match> hostStateMonitor =
889 host_error_monitor::startHostStateMonitor();
890
891 // Initialize the host state
892 host_error_monitor::initializeHostState();
893
894 // Request CPU_CATERR GPIO events
895 if (!host_error_monitor::requestGPIOEvents(
896 "CPU_CATERR", host_error_monitor::caterrHandler,
897 host_error_monitor::caterrLine, host_error_monitor::caterrEvent))
898 {
899 return -1;
900 }
901
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700902 // Request CPU_ERR2 GPIO events
903 if (!host_error_monitor::requestGPIOEvents(
904 "CPU_ERR2", host_error_monitor::err2Handler,
905 host_error_monitor::err2Line, host_error_monitor::err2Event))
906 {
907 return -1;
908 }
909
Jason M. Bills89922f82019-08-06 11:10:02 -0700910 // Request SMI GPIO events
911 if (!host_error_monitor::requestGPIOEvents(
912 "SMI", host_error_monitor::smiHandler, host_error_monitor::smiLine,
913 host_error_monitor::smiEvent))
914 {
915 return -1;
916 }
917
Chen Yugange6c0f1c2019-08-02 20:36:42 +0800918 // Request PCH_BMC_THERMTRIP GPIO events
919 if (!host_error_monitor::requestGPIOEvents(
920 "PCH_BMC_THERMTRIP", host_error_monitor::pchThermtripHandler,
921 host_error_monitor::pchThermtripLine,
922 host_error_monitor::pchThermtripEvent))
923 {
924 return -1;
925 }
926
Jason M. Bills1490b142019-07-01 15:48:43 -0700927 host_error_monitor::io.run();
928
929 return 0;
930}