blob: 1062fca25b5b6d8b07f5173bfce53103d8a24f1b [file] [log] [blame]
/*
// Copyright (c) 2019 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
*/
Jason M. Bills6a2cb692019-08-06 11:03:49 -070016#include <peci.h>
Chen Yugange6c0f1c2019-08-02 20:36:42 +080017#include <systemd/sd-journal.h>
18
Jason M. Bills6a2cb692019-08-06 11:03:49 -070019#include <bitset>
Jason M. Bills1490b142019-07-01 15:48:43 -070020#include <boost/asio/posix/stream_descriptor.hpp>
21#include <gpiod.hpp>
22#include <iostream>
23#include <sdbusplus/asio/object_server.hpp>
Jason M. Billsd1a19f62019-08-06 11:52:58 -070024#include <variant>
Jason M. Bills1490b142019-07-01 15:48:43 -070025
26namespace host_error_monitor
27{
28static boost::asio::io_service io;
29static std::shared_ptr<sdbusplus::asio::connection> conn;
30
31static bool hostOff = true;
32
33const static constexpr size_t caterrTimeoutMs = 2000;
Jason M. Bills6a2cb692019-08-06 11:03:49 -070034const static constexpr size_t err2TimeoutMs = 90000;
Jason M. Bills89922f82019-08-06 11:10:02 -070035const static constexpr size_t smiTimeoutMs = 90000;
Jason M. Bills1490b142019-07-01 15:48:43 -070036const static constexpr size_t crashdumpTimeoutS = 300;
37
38// Timers
39// Timer for CATERR asserted
40static boost::asio::steady_timer caterrAssertTimer(io);
Jason M. Bills6a2cb692019-08-06 11:03:49 -070041// Timer for ERR2 asserted
42static boost::asio::steady_timer err2AssertTimer(io);
Jason M. Bills89922f82019-08-06 11:10:02 -070043// Timer for SMI asserted
44static boost::asio::steady_timer smiAssertTimer(io);
Jason M. Bills1490b142019-07-01 15:48:43 -070045
46// GPIO Lines and Event Descriptors
47static gpiod::line caterrLine;
48static boost::asio::posix::stream_descriptor caterrEvent(io);
Jason M. Bills6a2cb692019-08-06 11:03:49 -070049static gpiod::line err2Line;
50static boost::asio::posix::stream_descriptor err2Event(io);
Jason M. Bills89922f82019-08-06 11:10:02 -070051static gpiod::line smiLine;
52static boost::asio::posix::stream_descriptor smiEvent(io);
Chen Yugange6c0f1c2019-08-02 20:36:42 +080053//----------------------------------
54// PCH_BMC_THERMTRIP function related definition
55//----------------------------------
56// GPIO Lines and Event Descriptors
57static gpiod::line pchThermtripLine;
58static boost::asio::posix::stream_descriptor pchThermtripEvent(io);
Jason M. Bills1490b142019-07-01 15:48:43 -070059
Jason M. Billsa3397932019-08-06 11:07:21 -070060static void cpuIERRLog()
61{
62 sd_journal_send("MESSAGE=HostError: IERR", "PRIORITY=%i", LOG_INFO,
63 "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError",
64 "REDFISH_MESSAGE_ARGS=%s", "IERR", NULL);
65}
66
67static void cpuIERRLog(const int cpuNum)
68{
69 std::string msg = "IERR on CPU " + std::to_string(cpuNum + 1);
70
71 sd_journal_send("MESSAGE=HostError: %s", msg.c_str(), "PRIORITY=%i",
72 LOG_INFO, "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError",
73 "REDFISH_MESSAGE_ARGS=%s", msg.c_str(), NULL);
74}
75
76static void cpuIERRLog(const int cpuNum, const std::string& type)
77{
78 std::string msg = type + " IERR on CPU " + std::to_string(cpuNum + 1);
79
80 sd_journal_send("MESSAGE=HostError: %s", msg.c_str(), "PRIORITY=%i",
81 LOG_INFO, "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError",
82 "REDFISH_MESSAGE_ARGS=%s", msg.c_str(), NULL);
83}
84
Jason M. Bills6a2cb692019-08-06 11:03:49 -070085static void cpuERR2Log()
86{
87 sd_journal_send("MESSAGE=HostError: ERR2 Timeout", "PRIORITY=%i", LOG_INFO,
88 "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError",
89 "REDFISH_MESSAGE_ARGS=%s", "ERR2 Timeout", NULL);
90}
91
92static void cpuERR2Log(const int cpuNum)
93{
94 std::string msg = "ERR2 Timeout on CPU " + std::to_string(cpuNum + 1);
95
96 sd_journal_send("MESSAGE=HostError: %s", msg.c_str(), "PRIORITY=%i",
97 LOG_INFO, "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError",
98 "REDFISH_MESSAGE_ARGS=%s", msg.c_str(), NULL);
99}
100
Jason M. Bills89922f82019-08-06 11:10:02 -0700101static void smiTimeoutLog()
102{
103 sd_journal_send("MESSAGE=HostError: SMI Timeout", "PRIORITY=%i", LOG_INFO,
104 "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError",
105 "REDFISH_MESSAGE_ARGS=%s", "SMI Timeout", NULL);
106}
107
Jason M. Billsa15c2522019-08-16 10:01:44 -0700108static void initializeErrorState();
Jason M. Bills1490b142019-07-01 15:48:43 -0700109static void initializeHostState()
110{
111 conn->async_method_call(
112 [](boost::system::error_code ec,
113 const std::variant<std::string>& property) {
114 if (ec)
115 {
116 return;
117 }
118 const std::string* state = std::get_if<std::string>(&property);
119 if (state == nullptr)
120 {
121 std::cerr << "Unable to read host state value\n";
122 return;
123 }
124 hostOff = *state == "xyz.openbmc_project.State.Host.HostState.Off";
Jason M. Billsa15c2522019-08-16 10:01:44 -0700125 // If the system is on, initialize the error state
126 if (!hostOff)
127 {
128 initializeErrorState();
129 }
Jason M. Bills1490b142019-07-01 15:48:43 -0700130 },
131 "xyz.openbmc_project.State.Host", "/xyz/openbmc_project/state/host0",
132 "org.freedesktop.DBus.Properties", "Get",
133 "xyz.openbmc_project.State.Host", "CurrentHostState");
134}
135
136static std::shared_ptr<sdbusplus::bus::match::match> startHostStateMonitor()
137{
138 return std::make_shared<sdbusplus::bus::match::match>(
139 *conn,
140 "type='signal',interface='org.freedesktop.DBus.Properties',"
141 "member='PropertiesChanged',arg0namespace='xyz.openbmc_project.State."
142 "Host'",
143 [](sdbusplus::message::message& msg) {
144 std::string interfaceName;
145 boost::container::flat_map<std::string, std::variant<std::string>>
146 propertiesChanged;
147 std::string state;
148 try
149 {
150 msg.read(interfaceName, propertiesChanged);
151 state =
152 std::get<std::string>(propertiesChanged.begin()->second);
153 }
154 catch (std::exception& e)
155 {
156 std::cerr << "Unable to read host state\n";
157 return;
158 }
159 hostOff = state == "xyz.openbmc_project.State.Host.HostState.Off";
160
161 // No host events should fire while off, so cancel any pending
162 // timers
163 if (hostOff)
164 {
165 caterrAssertTimer.cancel();
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700166 err2AssertTimer.cancel();
Jason M. Bills89922f82019-08-06 11:10:02 -0700167 smiAssertTimer.cancel();
Jason M. Bills1490b142019-07-01 15:48:43 -0700168 }
169 });
170}
171
172static bool requestGPIOEvents(
173 const std::string& name, const std::function<void()>& handler,
174 gpiod::line& gpioLine,
175 boost::asio::posix::stream_descriptor& gpioEventDescriptor)
176{
177 // Find the GPIO line
178 gpioLine = gpiod::find_line(name);
179 if (!gpioLine)
180 {
181 std::cerr << "Failed to find the " << name << " line\n";
182 return false;
183 }
184
185 try
186 {
187 gpioLine.request(
188 {"host-error-monitor", gpiod::line_request::EVENT_BOTH_EDGES});
189 }
190 catch (std::exception&)
191 {
192 std::cerr << "Failed to request events for " << name << "\n";
193 return false;
194 }
195
196 int gpioLineFd = gpioLine.event_get_fd();
197 if (gpioLineFd < 0)
198 {
199 std::cerr << "Failed to get " << name << " fd\n";
200 return false;
201 }
202
203 gpioEventDescriptor.assign(gpioLineFd);
204
205 gpioEventDescriptor.async_wait(
206 boost::asio::posix::stream_descriptor::wait_read,
207 [&name, handler](const boost::system::error_code ec) {
208 if (ec)
209 {
210 std::cerr << name << " fd handler error: " << ec.message()
211 << "\n";
212 return;
213 }
214 handler();
215 });
216 return true;
217}
218
219static void startPowerCycle()
220{
221 conn->async_method_call(
222 [](boost::system::error_code ec) {
223 if (ec)
224 {
225 std::cerr << "failed to set Chassis State\n";
226 }
227 },
228 "xyz.openbmc_project.State.Chassis",
229 "/xyz/openbmc_project/state/chassis0",
230 "org.freedesktop.DBus.Properties", "Set",
231 "xyz.openbmc_project.State.Chassis", "RequestedPowerTransition",
232 std::variant<std::string>{
233 "xyz.openbmc_project.State.Chassis.Transition.PowerCycle"});
234}
235
236static void startCrashdumpAndRecovery(bool recoverSystem)
237{
238 std::cout << "Starting crashdump\n";
239 static std::shared_ptr<sdbusplus::bus::match::match> crashdumpCompleteMatch;
240 static boost::asio::steady_timer crashdumpTimer(io);
241
242 crashdumpCompleteMatch = std::make_shared<sdbusplus::bus::match::match>(
243 *conn,
244 "type='signal',interface='org.freedesktop.DBus.Properties',"
245 "member='PropertiesChanged',arg0namespace='com.intel.crashdump'",
246 [recoverSystem](sdbusplus::message::message& msg) {
247 crashdumpTimer.cancel();
248 std::cout << "Crashdump completed\n";
249 if (recoverSystem)
250 {
251 std::cout << "Recovering the system\n";
252 startPowerCycle();
253 }
254 crashdumpCompleteMatch.reset();
255 });
256
257 crashdumpTimer.expires_after(std::chrono::seconds(crashdumpTimeoutS));
258 crashdumpTimer.async_wait([](const boost::system::error_code ec) {
259 if (ec)
260 {
261 // operation_aborted is expected if timer is canceled
262 if (ec != boost::asio::error::operation_aborted)
263 {
264 std::cerr << "Crashdump async_wait failed: " << ec.message()
265 << "\n";
266 }
267 std::cout << "Crashdump timer canceled\n";
268 return;
269 }
270 std::cerr << "Crashdump failed to complete before timeout\n";
271 crashdumpCompleteMatch.reset();
272 });
273
274 conn->async_method_call(
275 [](boost::system::error_code ec) {
276 if (ec)
277 {
278 std::cerr << "failed to start Crashdump\n";
279 crashdumpTimer.cancel();
280 crashdumpCompleteMatch.reset();
281 }
282 },
283 "com.intel.crashdump", "/com/intel/crashdump",
284 "com.intel.crashdump.Stored", "GenerateStoredLog");
285}
286
Jason M. Billsd1a19f62019-08-06 11:52:58 -0700287static void incrementCPUErrorCount(int cpuNum)
288{
289 std::string propertyName = "ErrorCountCPU" + std::to_string(cpuNum + 1);
290
291 // Get the current count
292 conn->async_method_call(
293 [propertyName](boost::system::error_code ec,
294 const std::variant<uint8_t>& property) {
295 if (ec)
296 {
297 std::cerr << "Failed to read " << propertyName << ": "
298 << ec.message() << "\n";
299 return;
300 }
301 const uint8_t* errorCountVariant = std::get_if<uint8_t>(&property);
302 if (errorCountVariant == nullptr)
303 {
304 std::cerr << propertyName << " invalid\n";
305 return;
306 }
307 uint8_t errorCount = *errorCountVariant;
308 if (errorCount == std::numeric_limits<uint8_t>::max())
309 {
310 std::cerr << "Maximum error count reached\n";
311 return;
312 }
313 // Increment the count
314 errorCount++;
315 conn->async_method_call(
316 [propertyName](boost::system::error_code ec) {
317 if (ec)
318 {
319 std::cerr << "Failed to set " << propertyName << ": "
320 << ec.message() << "\n";
321 }
322 },
323 "xyz.openbmc_project.Settings",
324 "/xyz/openbmc_project/control/processor_error_config",
325 "org.freedesktop.DBus.Properties", "Set",
326 "xyz.openbmc_project.Control.Processor.ErrConfig", propertyName,
327 std::variant<uint8_t>{errorCount});
328 },
329 "xyz.openbmc_project.Settings",
330 "/xyz/openbmc_project/control/processor_error_config",
331 "org.freedesktop.DBus.Properties", "Get",
332 "xyz.openbmc_project.Control.Processor.ErrConfig", propertyName);
333}
334
Jason M. Billsa3397932019-08-06 11:07:21 -0700335static bool checkIERRCPUs()
336{
337 bool cpuIERRFound = false;
338 for (int cpu = 0, addr = MIN_CLIENT_ADDR; addr <= MAX_CLIENT_ADDR;
339 cpu++, addr++)
340 {
341 uint8_t cc = 0;
342 CPUModel model{};
343 uint8_t stepping = 0;
344 if (peci_GetCPUID(addr, &model, &stepping, &cc) != PECI_CC_SUCCESS)
345 {
346 std::cerr << "Cannot get CPUID!\n";
347 continue;
348 }
349
350 switch (model)
351 {
352 case skx:
353 {
354 // First check the MCA_ERR_SRC_LOG to see if this is the CPU
355 // that caused the IERR
356 uint32_t mcaErrSrcLog = 0;
357 if (peci_RdPkgConfig(addr, 0, 5, 4, (uint8_t*)&mcaErrSrcLog,
358 &cc) != PECI_CC_SUCCESS)
359 {
360 continue;
361 }
362 // Check MSMI_INTERNAL (20) and IERR_INTERNAL (27)
363 if ((mcaErrSrcLog & (1 << 20)) || (mcaErrSrcLog & (1 << 27)))
364 {
365 // TODO: Light the CPU fault LED?
366 cpuIERRFound = true;
Jason M. Billsd1a19f62019-08-06 11:52:58 -0700367 incrementCPUErrorCount(cpu);
Jason M. Billsa3397932019-08-06 11:07:21 -0700368 // Next check if it's a CPU/VR mismatch by reading the
369 // IA32_MC4_STATUS MSR (0x411)
370 uint64_t mc4Status = 0;
371 if (peci_RdIAMSR(addr, 0, 0x411, &mc4Status, &cc) !=
372 PECI_CC_SUCCESS)
373 {
374 continue;
375 }
376 // Check MSEC bits 31:24 for
377 // MCA_SVID_VCCIN_VR_ICC_MAX_FAILURE (0x40),
378 // MCA_SVID_VCCIN_VR_VOUT_FAILURE (0x42), or
379 // MCA_SVID_CPU_VR_CAPABILITY_ERROR (0x43)
380 if ((mc4Status & (0x40 << 24)) ||
381 (mc4Status & (0x42 << 24)) ||
382 (mc4Status & (0x43 << 24)))
383 {
384 cpuIERRLog(cpu, "CPU/VR Mismatch");
385 continue;
386 }
387
388 // Next check if it's a Core FIVR fault by looking for a
389 // non-zero value of CORE_FIVR_ERR_LOG (B(1) D30 F2 offset
390 // 80h)
391 uint32_t coreFIVRErrLog = 0;
392 if (peci_RdPCIConfigLocal(
393 addr, 1, 30, 2, 0x80, sizeof(uint32_t),
394 (uint8_t*)&coreFIVRErrLog, &cc) != PECI_CC_SUCCESS)
395 {
396 continue;
397 }
398 if (coreFIVRErrLog)
399 {
400 cpuIERRLog(cpu, "Core FIVR Fault");
401 continue;
402 }
403
404 // Next check if it's an Uncore FIVR fault by looking for a
405 // non-zero value of UNCORE_FIVR_ERR_LOG (B(1) D30 F2 offset
406 // 84h)
407 uint32_t uncoreFIVRErrLog = 0;
408 if (peci_RdPCIConfigLocal(addr, 1, 30, 2, 0x84,
409 sizeof(uint32_t),
410 (uint8_t*)&uncoreFIVRErrLog,
411 &cc) != PECI_CC_SUCCESS)
412 {
413 continue;
414 }
415 if (uncoreFIVRErrLog)
416 {
417 cpuIERRLog(cpu, "Uncore FIVR Fault");
418 continue;
419 }
420
421 // Last if CORE_FIVR_ERR_LOG and UNCORE_FIVR_ERR_LOG are
422 // both zero, but MSEC bits 31:24 have either
423 // MCA_FIVR_CATAS_OVERVOL_FAULT (0x51) or
424 // MCA_FIVR_CATAS_OVERCUR_FAULT (0x52), then log it as an
425 // uncore FIVR fault
426 if (!coreFIVRErrLog && !uncoreFIVRErrLog &&
427 ((mc4Status & (0x51 << 24)) ||
428 (mc4Status & (0x52 << 24))))
429 {
430 cpuIERRLog(cpu, "Uncore FIVR Fault");
431 continue;
432 }
433 cpuIERRLog(cpu);
434 }
435 break;
436 }
437 case icx:
438 {
439 // First check the MCA_ERR_SRC_LOG to see if this is the CPU
440 // that caused the IERR
441 uint32_t mcaErrSrcLog = 0;
442 if (peci_RdPkgConfig(addr, 0, 5, 4, (uint8_t*)&mcaErrSrcLog,
443 &cc) != PECI_CC_SUCCESS)
444 {
445 continue;
446 }
447 // Check MSMI_INTERNAL (20) and IERR_INTERNAL (27)
448 if ((mcaErrSrcLog & (1 << 20)) || (mcaErrSrcLog & (1 << 27)))
449 {
450 // TODO: Light the CPU fault LED?
451 cpuIERRFound = true;
Jason M. Billsd1a19f62019-08-06 11:52:58 -0700452 incrementCPUErrorCount(cpu);
Jason M. Billsa3397932019-08-06 11:07:21 -0700453 // Next check if it's a CPU/VR mismatch by reading the
454 // IA32_MC4_STATUS MSR (0x411)
455 uint64_t mc4Status = 0;
456 if (peci_RdIAMSR(addr, 0, 0x411, &mc4Status, &cc) !=
457 PECI_CC_SUCCESS)
458 {
459 continue;
460 }
461 // TODO: Update MSEC/MSCOD_31_24 check
462 // Check MSEC bits 31:24 for
463 // MCA_SVID_VCCIN_VR_ICC_MAX_FAILURE (0x40),
464 // MCA_SVID_VCCIN_VR_VOUT_FAILURE (0x42), or
465 // MCA_SVID_CPU_VR_CAPABILITY_ERROR (0x43)
466 if ((mc4Status & (0x40 << 24)) ||
467 (mc4Status & (0x42 << 24)) ||
468 (mc4Status & (0x43 << 24)))
469 {
470 cpuIERRLog(cpu, "CPU/VR Mismatch");
471 continue;
472 }
473
474 // Next check if it's a Core FIVR fault by looking for a
475 // non-zero value of CORE_FIVR_ERR_LOG (B(31) D30 F2 offsets
476 // C0h and C4h) (Note: Bus 31 is accessed on PECI as bus 14)
477 uint32_t coreFIVRErrLog0 = 0;
478 uint32_t coreFIVRErrLog1 = 0;
479 if (peci_RdEndPointConfigPciLocal(
480 addr, 0, 14, 30, 2, 0xC0, sizeof(uint32_t),
481 (uint8_t*)&coreFIVRErrLog0, &cc) != PECI_CC_SUCCESS)
482 {
483 continue;
484 }
485 if (peci_RdEndPointConfigPciLocal(
486 addr, 0, 14, 30, 2, 0xC4, sizeof(uint32_t),
487 (uint8_t*)&coreFIVRErrLog1, &cc) != PECI_CC_SUCCESS)
488 {
489 continue;
490 }
491 if (coreFIVRErrLog0 || coreFIVRErrLog1)
492 {
493 cpuIERRLog(cpu, "Core FIVR Fault");
494 continue;
495 }
496
497 // Next check if it's an Uncore FIVR fault by looking for a
498 // non-zero value of UNCORE_FIVR_ERR_LOG (B(31) D30 F2
499 // offset 84h) (Note: Bus 31 is accessed on PECI as bus 14)
500 uint32_t uncoreFIVRErrLog = 0;
501 if (peci_RdEndPointConfigPciLocal(
502 addr, 0, 14, 30, 2, 0x84, sizeof(uint32_t),
503 (uint8_t*)&uncoreFIVRErrLog,
504 &cc) != PECI_CC_SUCCESS)
505 {
506 continue;
507 }
508 if (uncoreFIVRErrLog)
509 {
510 cpuIERRLog(cpu, "Uncore FIVR Fault");
511 continue;
512 }
513
514 // TODO: Update MSEC/MSCOD_31_24 check
515 // Last if CORE_FIVR_ERR_LOG and UNCORE_FIVR_ERR_LOG are
516 // both zero, but MSEC bits 31:24 have either
517 // MCA_FIVR_CATAS_OVERVOL_FAULT (0x51) or
518 // MCA_FIVR_CATAS_OVERCUR_FAULT (0x52), then log it as an
519 // uncore FIVR fault
520 if (!coreFIVRErrLog0 && !coreFIVRErrLog1 &&
521 !uncoreFIVRErrLog &&
522 ((mc4Status & (0x51 << 24)) ||
523 (mc4Status & (0x52 << 24))))
524 {
525 cpuIERRLog(cpu, "Uncore FIVR Fault");
526 continue;
527 }
528 cpuIERRLog(cpu);
529 }
530 break;
531 }
532 }
533 }
534 return cpuIERRFound;
535}
536
Jason M. Billsa15c2522019-08-16 10:01:44 -0700537static void caterrAssertHandler()
538{
Jason M. Billsa15c2522019-08-16 10:01:44 -0700539 caterrAssertTimer.expires_after(std::chrono::milliseconds(caterrTimeoutMs));
540 caterrAssertTimer.async_wait([](const boost::system::error_code ec) {
541 if (ec)
542 {
543 // operation_aborted is expected if timer is canceled
544 // before completion.
545 if (ec != boost::asio::error::operation_aborted)
546 {
547 std::cerr << "caterr timeout async_wait failed: "
548 << ec.message() << "\n";
549 }
Jason M. Billsa15c2522019-08-16 10:01:44 -0700550 return;
551 }
Jason M. Billsa3397932019-08-06 11:07:21 -0700552 std::cerr << "CATERR asserted for " << std::to_string(caterrTimeoutMs)
553 << " ms\n";
554 if (!checkIERRCPUs())
555 {
556 cpuIERRLog();
557 }
Jason M. Billsa15c2522019-08-16 10:01:44 -0700558 conn->async_method_call(
559 [](boost::system::error_code ec,
560 const std::variant<bool>& property) {
561 if (ec)
562 {
563 return;
564 }
565 const bool* reset = std::get_if<bool>(&property);
566 if (reset == nullptr)
567 {
568 std::cerr << "Unable to read reset on CATERR value\n";
569 return;
570 }
571 startCrashdumpAndRecovery(*reset);
572 },
573 "xyz.openbmc_project.Settings",
574 "/xyz/openbmc_project/control/processor_error_config",
575 "org.freedesktop.DBus.Properties", "Get",
576 "xyz.openbmc_project.Control.Processor.ErrConfig", "ResetOnCATERR");
577 });
578}
579
Jason M. Bills1490b142019-07-01 15:48:43 -0700580static void caterrHandler()
581{
582 if (!hostOff)
583 {
584 gpiod::line_event gpioLineEvent = caterrLine.event_read();
585
586 bool caterr =
587 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
588 if (caterr)
589 {
Jason M. Billsa15c2522019-08-16 10:01:44 -0700590 caterrAssertHandler();
Jason M. Bills1490b142019-07-01 15:48:43 -0700591 }
592 else
593 {
594 caterrAssertTimer.cancel();
595 }
596 }
597 caterrEvent.async_wait(boost::asio::posix::stream_descriptor::wait_read,
598 [](const boost::system::error_code ec) {
599 if (ec)
600 {
601 std::cerr << "caterr handler error: "
602 << ec.message() << "\n";
603 return;
604 }
605 caterrHandler();
606 });
607}
Chen Yugange6c0f1c2019-08-02 20:36:42 +0800608static void pchThermtripHandler()
609{
610 if (!hostOff)
611 {
612 gpiod::line_event gpioLineEvent = pchThermtripLine.event_read();
613
614 bool pchThermtrip =
615 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
616 if (pchThermtrip)
617 {
618 std::cout << "PCH Thermal trip detected \n";
619 // log to redfish, call API
620 sd_journal_send("MESSAGE=SsbThermalTrip: SSB Thermal trip",
621 "PRIORITY=%i", LOG_INFO, "REDFISH_MESSAGE_ID=%s",
622 "OpenBMC.0.1.SsbThermalTrip", NULL);
623 }
624 }
625 pchThermtripEvent.async_wait(
626 boost::asio::posix::stream_descriptor::wait_read,
627 [](const boost::system::error_code ec) {
628 if (ec)
629 {
630 std::cerr << "PCH Thermal trip handler error: " << ec.message()
631 << "\n";
632 return;
633 }
634 pchThermtripHandler();
635 });
636}
637
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700638static std::bitset<MAX_CPUS> checkERR2CPUs()
639{
640 std::bitset<MAX_CPUS> err2CPUs = 0;
641 for (int cpu = 0, addr = MIN_CLIENT_ADDR; addr <= MAX_CLIENT_ADDR;
642 cpu++, addr++)
643 {
644 if (peci_Ping(addr) == PECI_CC_SUCCESS)
645 {
646 uint8_t cc = 0;
647 CPUModel model{};
648 uint8_t stepping = 0;
649 if (peci_GetCPUID(addr, &model, &stepping, &cc) != PECI_CC_SUCCESS)
650 {
651 std::cerr << "Cannot get CPUID!\n";
652 continue;
653 }
654
655 switch (model)
656 {
657 case skx:
658 {
659 // Check the ERRPINSTS to see if this is the CPU that caused
660 // the ERR2 (B(0) D8 F0 offset 210h)
661 uint32_t errpinsts = 0;
662 if (peci_RdPCIConfigLocal(
663 addr, 0, 8, 0, 0x210, sizeof(uint32_t),
664 (uint8_t*)&errpinsts, &cc) == PECI_CC_SUCCESS)
665 {
666 err2CPUs[cpu] = (errpinsts & err2Sts) != 0;
667 }
668 break;
669 }
670 case icx:
671 {
672 // Check the ERRPINSTS to see if this is the CPU that caused
673 // the ERR2 (B(30) D0 F3 offset 274h) (Note: Bus 30 is
674 // accessed on PECI as bus 13)
675 uint32_t errpinsts = 0;
676 if (peci_RdEndPointConfigPciLocal(
677 addr, 0, 13, 0, 3, 0x274, sizeof(uint32_t),
678 (uint8_t*)&errpinsts, &cc) == PECI_CC_SUCCESS)
679 {
680 err2CPUs[cpu] = (errpinsts & err2Sts) != 0;
681 }
682 break;
683 }
684 }
685 }
686 }
687 return err2CPUs;
688}
689
690static void err2AssertHandler()
691{
692 // ERR2 status is not guaranteed through the timeout, so save which
693 // CPUs have asserted ERR2 now
694 std::bitset<MAX_CPUS> err2CPUs = checkERR2CPUs();
695 err2AssertTimer.expires_after(std::chrono::milliseconds(err2TimeoutMs));
696 err2AssertTimer.async_wait([err2CPUs](const boost::system::error_code ec) {
697 if (ec)
698 {
699 // operation_aborted is expected if timer is canceled before
700 // completion.
701 if (ec != boost::asio::error::operation_aborted)
702 {
703 std::cerr << "err2 timeout async_wait failed: " << ec.message()
704 << "\n";
705 }
706 return;
707 }
708 std::cerr << "ERR2 asserted for " << std::to_string(err2TimeoutMs)
709 << " ms\n";
710 if (err2CPUs.count())
711 {
712 for (int i = 0; i < err2CPUs.size(); i++)
713 {
714 if (err2CPUs[i])
715 {
716 cpuERR2Log(i);
717 }
718 }
719 }
720 else
721 {
722 cpuERR2Log();
723 }
724 conn->async_method_call(
725 [](boost::system::error_code ec,
726 const std::variant<bool>& property) {
727 if (ec)
728 {
729 return;
730 }
731 const bool* reset = std::get_if<bool>(&property);
732 if (reset == nullptr)
733 {
734 std::cerr << "Unable to read reset on ERR2 value\n";
735 return;
736 }
737 startCrashdumpAndRecovery(*reset);
738 },
739 "xyz.openbmc_project.Settings",
740 "/xyz/openbmc_project/control/processor_error_config",
741 "org.freedesktop.DBus.Properties", "Get",
742 "xyz.openbmc_project.Control.Processor.ErrConfig", "ResetOnERR2");
743 });
744}
745
746static void err2Handler()
747{
748 if (!hostOff)
749 {
750 gpiod::line_event gpioLineEvent = err2Line.event_read();
751
752 bool err2 = gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
753 if (err2)
754 {
755 err2AssertHandler();
756 }
757 else
758 {
759 err2AssertTimer.cancel();
760 }
761 }
762 err2Event.async_wait(boost::asio::posix::stream_descriptor::wait_read,
763 [](const boost::system::error_code ec) {
764 if (ec)
765 {
766 std::cerr
767 << "err2 handler error: " << ec.message()
768 << "\n";
769 return;
770 }
771 err2Handler();
772 });
773}
774
Jason M. Bills89922f82019-08-06 11:10:02 -0700775static void smiAssertHandler()
776{
777 smiAssertTimer.expires_after(std::chrono::milliseconds(smiTimeoutMs));
778 smiAssertTimer.async_wait([](const boost::system::error_code ec) {
779 if (ec)
780 {
781 // operation_aborted is expected if timer is canceled before
782 // completion.
783 if (ec != boost::asio::error::operation_aborted)
784 {
785 std::cerr << "smi timeout async_wait failed: " << ec.message()
786 << "\n";
787 }
788 return;
789 }
790 std::cerr << "SMI asserted for " << std::to_string(smiTimeoutMs)
791 << " ms\n";
792 smiTimeoutLog();
793 conn->async_method_call(
794 [](boost::system::error_code ec,
795 const std::variant<bool>& property) {
796 if (ec)
797 {
798 return;
799 }
800 const bool* reset = std::get_if<bool>(&property);
801 if (reset == nullptr)
802 {
803 std::cerr << "Unable to read reset on SMI value\n";
804 return;
805 }
806 startCrashdumpAndRecovery(*reset);
807 },
808 "xyz.openbmc_project.Settings",
809 "/xyz/openbmc_project/control/bmc_reset_disables",
810 "org.freedesktop.DBus.Properties", "Get",
811 "xyz.openbmc_project.Control.ResetDisables", "ResetOnSMI");
812 });
813}
814
815static void smiHandler()
816{
817 if (!hostOff)
818 {
819 gpiod::line_event gpioLineEvent = smiLine.event_read();
820
821 bool smi = gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
822 if (smi)
823 {
824 smiAssertHandler();
825 }
826 else
827 {
828 smiAssertTimer.cancel();
829 }
830 }
831 smiEvent.async_wait(boost::asio::posix::stream_descriptor::wait_read,
832 [](const boost::system::error_code ec) {
833 if (ec)
834 {
835 std::cerr
836 << "smi handler error: " << ec.message()
837 << "\n";
838 return;
839 }
840 smiHandler();
841 });
842}
843
Jason M. Billsa15c2522019-08-16 10:01:44 -0700844static void initializeErrorState()
845{
846 // Handle CPU_CATERR if it's asserted now
847 if (caterrLine.get_value() == 0)
848 {
849 caterrAssertHandler();
850 }
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700851
852 // Handle CPU_ERR2 if it's asserted now
853 if (err2Line.get_value() == 0)
854 {
855 err2AssertHandler();
856 }
Jason M. Bills89922f82019-08-06 11:10:02 -0700857
858 // Handle SMI if it's asserted now
859 if (smiLine.get_value() == 0)
860 {
861 smiAssertHandler();
862 }
Jason M. Billsa15c2522019-08-16 10:01:44 -0700863}
Jason M. Bills1490b142019-07-01 15:48:43 -0700864} // namespace host_error_monitor
865
866int main(int argc, char* argv[])
867{
868 // setup connection to dbus
869 host_error_monitor::conn =
870 std::make_shared<sdbusplus::asio::connection>(host_error_monitor::io);
871
872 // Host Error Monitor Object
873 host_error_monitor::conn->request_name(
874 "xyz.openbmc_project.HostErrorMonitor");
875 sdbusplus::asio::object_server server =
876 sdbusplus::asio::object_server(host_error_monitor::conn);
877
878 // Start tracking host state
879 std::shared_ptr<sdbusplus::bus::match::match> hostStateMonitor =
880 host_error_monitor::startHostStateMonitor();
881
882 // Initialize the host state
883 host_error_monitor::initializeHostState();
884
885 // Request CPU_CATERR GPIO events
886 if (!host_error_monitor::requestGPIOEvents(
887 "CPU_CATERR", host_error_monitor::caterrHandler,
888 host_error_monitor::caterrLine, host_error_monitor::caterrEvent))
889 {
890 return -1;
891 }
892
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700893 // Request CPU_ERR2 GPIO events
894 if (!host_error_monitor::requestGPIOEvents(
895 "CPU_ERR2", host_error_monitor::err2Handler,
896 host_error_monitor::err2Line, host_error_monitor::err2Event))
897 {
898 return -1;
899 }
900
Jason M. Bills89922f82019-08-06 11:10:02 -0700901 // Request SMI GPIO events
902 if (!host_error_monitor::requestGPIOEvents(
903 "SMI", host_error_monitor::smiHandler, host_error_monitor::smiLine,
904 host_error_monitor::smiEvent))
905 {
906 return -1;
907 }
908
Chen Yugange6c0f1c2019-08-02 20:36:42 +0800909 // Request PCH_BMC_THERMTRIP GPIO events
910 if (!host_error_monitor::requestGPIOEvents(
911 "PCH_BMC_THERMTRIP", host_error_monitor::pchThermtripHandler,
912 host_error_monitor::pchThermtripLine,
913 host_error_monitor::pchThermtripEvent))
914 {
915 return -1;
916 }
917
Jason M. Bills1490b142019-07-01 15:48:43 -0700918 host_error_monitor::io.run();
919
920 return 0;
921}