blob: 58231db6af613b3d093eda02c320ae4068158c8f [file] [log] [blame]
/*
// Copyright (c) 2019 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
*/
Jason M. Bills6a2cb692019-08-06 11:03:49 -070016#include <peci.h>
Chen Yugange6c0f1c2019-08-02 20:36:42 +080017#include <systemd/sd-journal.h>
18
Jason M. Bills6a2cb692019-08-06 11:03:49 -070019#include <bitset>
Jason M. Bills1490b142019-07-01 15:48:43 -070020#include <boost/asio/posix/stream_descriptor.hpp>
21#include <gpiod.hpp>
22#include <iostream>
23#include <sdbusplus/asio/object_server.hpp>
Jason M. Billsd1a19f62019-08-06 11:52:58 -070024#include <variant>
Jason M. Bills1490b142019-07-01 15:48:43 -070025
26namespace host_error_monitor
27{
28static boost::asio::io_service io;
29static std::shared_ptr<sdbusplus::asio::connection> conn;
30
31static bool hostOff = true;
32
33const static constexpr size_t caterrTimeoutMs = 2000;
Jason M. Billscbf78532019-08-16 15:32:11 -070034const static constexpr size_t errTimeoutMs = 90000;
Jason M. Bills89922f82019-08-06 11:10:02 -070035const static constexpr size_t smiTimeoutMs = 90000;
Jason M. Bills1490b142019-07-01 15:48:43 -070036const static constexpr size_t crashdumpTimeoutS = 300;
37
38// Timers
39// Timer for CATERR asserted
40static boost::asio::steady_timer caterrAssertTimer(io);
Jason M. Bills8c584392019-08-19 11:05:51 -070041// Timer for ERR0 asserted
42static boost::asio::steady_timer err0AssertTimer(io);
Jason M. Bills75af3962019-08-19 11:07:17 -070043// Timer for ERR1 asserted
44static boost::asio::steady_timer err1AssertTimer(io);
Jason M. Bills6a2cb692019-08-06 11:03:49 -070045// Timer for ERR2 asserted
46static boost::asio::steady_timer err2AssertTimer(io);
Jason M. Bills89922f82019-08-06 11:10:02 -070047// Timer for SMI asserted
48static boost::asio::steady_timer smiAssertTimer(io);
Jason M. Bills1490b142019-07-01 15:48:43 -070049
50// GPIO Lines and Event Descriptors
51static gpiod::line caterrLine;
52static boost::asio::posix::stream_descriptor caterrEvent(io);
Jason M. Bills8c584392019-08-19 11:05:51 -070053static gpiod::line err0Line;
54static boost::asio::posix::stream_descriptor err0Event(io);
Jason M. Bills75af3962019-08-19 11:07:17 -070055static gpiod::line err1Line;
56static boost::asio::posix::stream_descriptor err1Event(io);
Jason M. Bills6a2cb692019-08-06 11:03:49 -070057static gpiod::line err2Line;
58static boost::asio::posix::stream_descriptor err2Event(io);
Jason M. Bills89922f82019-08-06 11:10:02 -070059static gpiod::line smiLine;
60static boost::asio::posix::stream_descriptor smiEvent(io);
Jason M. Bills78c5eed2019-08-28 14:00:40 -070061static gpiod::line cpu1ThermtripLine;
62static boost::asio::posix::stream_descriptor cpu1ThermtripEvent(io);
63static gpiod::line cpu2ThermtripLine;
64static boost::asio::posix::stream_descriptor cpu2ThermtripEvent(io);
Chen Yugange6c0f1c2019-08-02 20:36:42 +080065//----------------------------------
66// PCH_BMC_THERMTRIP function related definition
67//----------------------------------
Chen Yugange6c0f1c2019-08-02 20:36:42 +080068static gpiod::line pchThermtripLine;
69static boost::asio::posix::stream_descriptor pchThermtripEvent(io);
Jason M. Bills1490b142019-07-01 15:48:43 -070070
Jason M. Billsa3397932019-08-06 11:07:21 -070071static void cpuIERRLog()
72{
73 sd_journal_send("MESSAGE=HostError: IERR", "PRIORITY=%i", LOG_INFO,
74 "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError",
75 "REDFISH_MESSAGE_ARGS=%s", "IERR", NULL);
76}
77
78static void cpuIERRLog(const int cpuNum)
79{
80 std::string msg = "IERR on CPU " + std::to_string(cpuNum + 1);
81
82 sd_journal_send("MESSAGE=HostError: %s", msg.c_str(), "PRIORITY=%i",
83 LOG_INFO, "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError",
84 "REDFISH_MESSAGE_ARGS=%s", msg.c_str(), NULL);
85}
86
87static void cpuIERRLog(const int cpuNum, const std::string& type)
88{
89 std::string msg = type + " IERR on CPU " + std::to_string(cpuNum + 1);
90
91 sd_journal_send("MESSAGE=HostError: %s", msg.c_str(), "PRIORITY=%i",
92 LOG_INFO, "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError",
93 "REDFISH_MESSAGE_ARGS=%s", msg.c_str(), NULL);
94}
95
Jason M. Billscbf78532019-08-16 15:32:11 -070096static void cpuERRXLog(const int errPin)
Jason M. Bills6a2cb692019-08-06 11:03:49 -070097{
Jason M. Billscbf78532019-08-16 15:32:11 -070098 std::string msg = "ERR" + std::to_string(errPin) + " Timeout";
99
100 sd_journal_send("MESSAGE=HostError: %s", msg.c_str(), "PRIORITY=%i",
101 LOG_INFO, "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError",
102 "REDFISH_MESSAGE_ARGS=%s", msg.c_str(), NULL);
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700103}
104
Jason M. Billscbf78532019-08-16 15:32:11 -0700105static void cpuERRXLog(const int errPin, const int cpuNum)
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700106{
Jason M. Billscbf78532019-08-16 15:32:11 -0700107 std::string msg = "ERR" + std::to_string(errPin) + " Timeout on CPU " +
108 std::to_string(cpuNum + 1);
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700109
110 sd_journal_send("MESSAGE=HostError: %s", msg.c_str(), "PRIORITY=%i",
111 LOG_INFO, "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError",
112 "REDFISH_MESSAGE_ARGS=%s", msg.c_str(), NULL);
113}
114
Jason M. Bills89922f82019-08-06 11:10:02 -0700115static void smiTimeoutLog()
116{
117 sd_journal_send("MESSAGE=HostError: SMI Timeout", "PRIORITY=%i", LOG_INFO,
118 "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError",
119 "REDFISH_MESSAGE_ARGS=%s", "SMI Timeout", NULL);
120}
121
Jason M. Bills78c5eed2019-08-28 14:00:40 -0700122static void cpuThermTripLog(const int cpuNum)
123{
124 std::string msg = "CPU " + std::to_string(cpuNum) + " thermal trip";
125
126 sd_journal_send("MESSAGE=HostError: %s", msg.c_str(), "PRIORITY=%i",
127 LOG_INFO, "REDFISH_MESSAGE_ID=%s",
128 "OpenBMC.0.1.CPUThermalTrip", "REDFISH_MESSAGE_ARGS=%d",
129 cpuNum, NULL);
130}
131
Jason M. Bills08866542019-08-16 12:04:19 -0700132static void ssbThermTripLog()
133{
134 sd_journal_send("MESSAGE=HostError: SSB thermal trip", "PRIORITY=%i",
135 LOG_INFO, "REDFISH_MESSAGE_ID=%s",
136 "OpenBMC.0.1.SsbThermalTrip", NULL);
137}
138
Jason M. Billsa15c2522019-08-16 10:01:44 -0700139static void initializeErrorState();
Jason M. Bills1490b142019-07-01 15:48:43 -0700140static void initializeHostState()
141{
142 conn->async_method_call(
143 [](boost::system::error_code ec,
144 const std::variant<std::string>& property) {
145 if (ec)
146 {
147 return;
148 }
149 const std::string* state = std::get_if<std::string>(&property);
150 if (state == nullptr)
151 {
152 std::cerr << "Unable to read host state value\n";
153 return;
154 }
155 hostOff = *state == "xyz.openbmc_project.State.Host.HostState.Off";
Jason M. Billsa15c2522019-08-16 10:01:44 -0700156 // If the system is on, initialize the error state
157 if (!hostOff)
158 {
159 initializeErrorState();
160 }
Jason M. Bills1490b142019-07-01 15:48:43 -0700161 },
162 "xyz.openbmc_project.State.Host", "/xyz/openbmc_project/state/host0",
163 "org.freedesktop.DBus.Properties", "Get",
164 "xyz.openbmc_project.State.Host", "CurrentHostState");
165}
166
167static std::shared_ptr<sdbusplus::bus::match::match> startHostStateMonitor()
168{
169 return std::make_shared<sdbusplus::bus::match::match>(
170 *conn,
171 "type='signal',interface='org.freedesktop.DBus.Properties',"
172 "member='PropertiesChanged',arg0namespace='xyz.openbmc_project.State."
173 "Host'",
174 [](sdbusplus::message::message& msg) {
175 std::string interfaceName;
176 boost::container::flat_map<std::string, std::variant<std::string>>
177 propertiesChanged;
178 std::string state;
179 try
180 {
181 msg.read(interfaceName, propertiesChanged);
182 state =
183 std::get<std::string>(propertiesChanged.begin()->second);
184 }
185 catch (std::exception& e)
186 {
187 std::cerr << "Unable to read host state\n";
188 return;
189 }
190 hostOff = state == "xyz.openbmc_project.State.Host.HostState.Off";
191
192 // No host events should fire while off, so cancel any pending
193 // timers
194 if (hostOff)
195 {
196 caterrAssertTimer.cancel();
Jason M. Bills8c584392019-08-19 11:05:51 -0700197 err0AssertTimer.cancel();
Jason M. Bills75af3962019-08-19 11:07:17 -0700198 err1AssertTimer.cancel();
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700199 err2AssertTimer.cancel();
Jason M. Bills89922f82019-08-06 11:10:02 -0700200 smiAssertTimer.cancel();
Jason M. Bills1490b142019-07-01 15:48:43 -0700201 }
202 });
203}
204
205static bool requestGPIOEvents(
206 const std::string& name, const std::function<void()>& handler,
207 gpiod::line& gpioLine,
208 boost::asio::posix::stream_descriptor& gpioEventDescriptor)
209{
210 // Find the GPIO line
211 gpioLine = gpiod::find_line(name);
212 if (!gpioLine)
213 {
214 std::cerr << "Failed to find the " << name << " line\n";
215 return false;
216 }
217
218 try
219 {
220 gpioLine.request(
221 {"host-error-monitor", gpiod::line_request::EVENT_BOTH_EDGES});
222 }
223 catch (std::exception&)
224 {
225 std::cerr << "Failed to request events for " << name << "\n";
226 return false;
227 }
228
229 int gpioLineFd = gpioLine.event_get_fd();
230 if (gpioLineFd < 0)
231 {
232 std::cerr << "Failed to get " << name << " fd\n";
233 return false;
234 }
235
236 gpioEventDescriptor.assign(gpioLineFd);
237
238 gpioEventDescriptor.async_wait(
239 boost::asio::posix::stream_descriptor::wait_read,
240 [&name, handler](const boost::system::error_code ec) {
241 if (ec)
242 {
243 std::cerr << name << " fd handler error: " << ec.message()
244 << "\n";
245 return;
246 }
247 handler();
248 });
249 return true;
250}
251
252static void startPowerCycle()
253{
254 conn->async_method_call(
255 [](boost::system::error_code ec) {
256 if (ec)
257 {
258 std::cerr << "failed to set Chassis State\n";
259 }
260 },
261 "xyz.openbmc_project.State.Chassis",
262 "/xyz/openbmc_project/state/chassis0",
263 "org.freedesktop.DBus.Properties", "Set",
264 "xyz.openbmc_project.State.Chassis", "RequestedPowerTransition",
265 std::variant<std::string>{
266 "xyz.openbmc_project.State.Chassis.Transition.PowerCycle"});
267}
268
269static void startCrashdumpAndRecovery(bool recoverSystem)
270{
271 std::cout << "Starting crashdump\n";
272 static std::shared_ptr<sdbusplus::bus::match::match> crashdumpCompleteMatch;
273 static boost::asio::steady_timer crashdumpTimer(io);
274
275 crashdumpCompleteMatch = std::make_shared<sdbusplus::bus::match::match>(
276 *conn,
277 "type='signal',interface='org.freedesktop.DBus.Properties',"
278 "member='PropertiesChanged',arg0namespace='com.intel.crashdump'",
279 [recoverSystem](sdbusplus::message::message& msg) {
280 crashdumpTimer.cancel();
281 std::cout << "Crashdump completed\n";
282 if (recoverSystem)
283 {
284 std::cout << "Recovering the system\n";
285 startPowerCycle();
286 }
287 crashdumpCompleteMatch.reset();
288 });
289
290 crashdumpTimer.expires_after(std::chrono::seconds(crashdumpTimeoutS));
291 crashdumpTimer.async_wait([](const boost::system::error_code ec) {
292 if (ec)
293 {
294 // operation_aborted is expected if timer is canceled
295 if (ec != boost::asio::error::operation_aborted)
296 {
297 std::cerr << "Crashdump async_wait failed: " << ec.message()
298 << "\n";
299 }
300 std::cout << "Crashdump timer canceled\n";
301 return;
302 }
303 std::cerr << "Crashdump failed to complete before timeout\n";
304 crashdumpCompleteMatch.reset();
305 });
306
307 conn->async_method_call(
308 [](boost::system::error_code ec) {
309 if (ec)
310 {
311 std::cerr << "failed to start Crashdump\n";
312 crashdumpTimer.cancel();
313 crashdumpCompleteMatch.reset();
314 }
315 },
316 "com.intel.crashdump", "/com/intel/crashdump",
317 "com.intel.crashdump.Stored", "GenerateStoredLog");
318}
319
Jason M. Billsd1a19f62019-08-06 11:52:58 -0700320static void incrementCPUErrorCount(int cpuNum)
321{
322 std::string propertyName = "ErrorCountCPU" + std::to_string(cpuNum + 1);
323
324 // Get the current count
325 conn->async_method_call(
326 [propertyName](boost::system::error_code ec,
327 const std::variant<uint8_t>& property) {
328 if (ec)
329 {
330 std::cerr << "Failed to read " << propertyName << ": "
331 << ec.message() << "\n";
332 return;
333 }
334 const uint8_t* errorCountVariant = std::get_if<uint8_t>(&property);
335 if (errorCountVariant == nullptr)
336 {
337 std::cerr << propertyName << " invalid\n";
338 return;
339 }
340 uint8_t errorCount = *errorCountVariant;
341 if (errorCount == std::numeric_limits<uint8_t>::max())
342 {
343 std::cerr << "Maximum error count reached\n";
344 return;
345 }
346 // Increment the count
347 errorCount++;
348 conn->async_method_call(
349 [propertyName](boost::system::error_code ec) {
350 if (ec)
351 {
352 std::cerr << "Failed to set " << propertyName << ": "
353 << ec.message() << "\n";
354 }
355 },
356 "xyz.openbmc_project.Settings",
357 "/xyz/openbmc_project/control/processor_error_config",
358 "org.freedesktop.DBus.Properties", "Set",
359 "xyz.openbmc_project.Control.Processor.ErrConfig", propertyName,
360 std::variant<uint8_t>{errorCount});
361 },
362 "xyz.openbmc_project.Settings",
363 "/xyz/openbmc_project/control/processor_error_config",
364 "org.freedesktop.DBus.Properties", "Get",
365 "xyz.openbmc_project.Control.Processor.ErrConfig", propertyName);
366}
367
Jason M. Billsa3397932019-08-06 11:07:21 -0700368static bool checkIERRCPUs()
369{
370 bool cpuIERRFound = false;
371 for (int cpu = 0, addr = MIN_CLIENT_ADDR; addr <= MAX_CLIENT_ADDR;
372 cpu++, addr++)
373 {
374 uint8_t cc = 0;
375 CPUModel model{};
376 uint8_t stepping = 0;
377 if (peci_GetCPUID(addr, &model, &stepping, &cc) != PECI_CC_SUCCESS)
378 {
379 std::cerr << "Cannot get CPUID!\n";
380 continue;
381 }
382
383 switch (model)
384 {
385 case skx:
386 {
387 // First check the MCA_ERR_SRC_LOG to see if this is the CPU
388 // that caused the IERR
389 uint32_t mcaErrSrcLog = 0;
390 if (peci_RdPkgConfig(addr, 0, 5, 4, (uint8_t*)&mcaErrSrcLog,
391 &cc) != PECI_CC_SUCCESS)
392 {
393 continue;
394 }
395 // Check MSMI_INTERNAL (20) and IERR_INTERNAL (27)
396 if ((mcaErrSrcLog & (1 << 20)) || (mcaErrSrcLog & (1 << 27)))
397 {
398 // TODO: Light the CPU fault LED?
399 cpuIERRFound = true;
Jason M. Billsd1a19f62019-08-06 11:52:58 -0700400 incrementCPUErrorCount(cpu);
Jason M. Billsa3397932019-08-06 11:07:21 -0700401 // Next check if it's a CPU/VR mismatch by reading the
402 // IA32_MC4_STATUS MSR (0x411)
403 uint64_t mc4Status = 0;
404 if (peci_RdIAMSR(addr, 0, 0x411, &mc4Status, &cc) !=
405 PECI_CC_SUCCESS)
406 {
407 continue;
408 }
409 // Check MSEC bits 31:24 for
410 // MCA_SVID_VCCIN_VR_ICC_MAX_FAILURE (0x40),
411 // MCA_SVID_VCCIN_VR_VOUT_FAILURE (0x42), or
412 // MCA_SVID_CPU_VR_CAPABILITY_ERROR (0x43)
413 if ((mc4Status & (0x40 << 24)) ||
414 (mc4Status & (0x42 << 24)) ||
415 (mc4Status & (0x43 << 24)))
416 {
417 cpuIERRLog(cpu, "CPU/VR Mismatch");
418 continue;
419 }
420
421 // Next check if it's a Core FIVR fault by looking for a
422 // non-zero value of CORE_FIVR_ERR_LOG (B(1) D30 F2 offset
423 // 80h)
424 uint32_t coreFIVRErrLog = 0;
425 if (peci_RdPCIConfigLocal(
426 addr, 1, 30, 2, 0x80, sizeof(uint32_t),
427 (uint8_t*)&coreFIVRErrLog, &cc) != PECI_CC_SUCCESS)
428 {
429 continue;
430 }
431 if (coreFIVRErrLog)
432 {
433 cpuIERRLog(cpu, "Core FIVR Fault");
434 continue;
435 }
436
437 // Next check if it's an Uncore FIVR fault by looking for a
438 // non-zero value of UNCORE_FIVR_ERR_LOG (B(1) D30 F2 offset
439 // 84h)
440 uint32_t uncoreFIVRErrLog = 0;
441 if (peci_RdPCIConfigLocal(addr, 1, 30, 2, 0x84,
442 sizeof(uint32_t),
443 (uint8_t*)&uncoreFIVRErrLog,
444 &cc) != PECI_CC_SUCCESS)
445 {
446 continue;
447 }
448 if (uncoreFIVRErrLog)
449 {
450 cpuIERRLog(cpu, "Uncore FIVR Fault");
451 continue;
452 }
453
454 // Last if CORE_FIVR_ERR_LOG and UNCORE_FIVR_ERR_LOG are
455 // both zero, but MSEC bits 31:24 have either
456 // MCA_FIVR_CATAS_OVERVOL_FAULT (0x51) or
457 // MCA_FIVR_CATAS_OVERCUR_FAULT (0x52), then log it as an
458 // uncore FIVR fault
459 if (!coreFIVRErrLog && !uncoreFIVRErrLog &&
460 ((mc4Status & (0x51 << 24)) ||
461 (mc4Status & (0x52 << 24))))
462 {
463 cpuIERRLog(cpu, "Uncore FIVR Fault");
464 continue;
465 }
466 cpuIERRLog(cpu);
467 }
468 break;
469 }
470 case icx:
471 {
472 // First check the MCA_ERR_SRC_LOG to see if this is the CPU
473 // that caused the IERR
474 uint32_t mcaErrSrcLog = 0;
475 if (peci_RdPkgConfig(addr, 0, 5, 4, (uint8_t*)&mcaErrSrcLog,
476 &cc) != PECI_CC_SUCCESS)
477 {
478 continue;
479 }
480 // Check MSMI_INTERNAL (20) and IERR_INTERNAL (27)
481 if ((mcaErrSrcLog & (1 << 20)) || (mcaErrSrcLog & (1 << 27)))
482 {
483 // TODO: Light the CPU fault LED?
484 cpuIERRFound = true;
Jason M. Billsd1a19f62019-08-06 11:52:58 -0700485 incrementCPUErrorCount(cpu);
Jason M. Billsa3397932019-08-06 11:07:21 -0700486 // Next check if it's a CPU/VR mismatch by reading the
487 // IA32_MC4_STATUS MSR (0x411)
488 uint64_t mc4Status = 0;
489 if (peci_RdIAMSR(addr, 0, 0x411, &mc4Status, &cc) !=
490 PECI_CC_SUCCESS)
491 {
492 continue;
493 }
494 // TODO: Update MSEC/MSCOD_31_24 check
495 // Check MSEC bits 31:24 for
496 // MCA_SVID_VCCIN_VR_ICC_MAX_FAILURE (0x40),
497 // MCA_SVID_VCCIN_VR_VOUT_FAILURE (0x42), or
498 // MCA_SVID_CPU_VR_CAPABILITY_ERROR (0x43)
499 if ((mc4Status & (0x40 << 24)) ||
500 (mc4Status & (0x42 << 24)) ||
501 (mc4Status & (0x43 << 24)))
502 {
503 cpuIERRLog(cpu, "CPU/VR Mismatch");
504 continue;
505 }
506
507 // Next check if it's a Core FIVR fault by looking for a
508 // non-zero value of CORE_FIVR_ERR_LOG (B(31) D30 F2 offsets
509 // C0h and C4h) (Note: Bus 31 is accessed on PECI as bus 14)
510 uint32_t coreFIVRErrLog0 = 0;
511 uint32_t coreFIVRErrLog1 = 0;
512 if (peci_RdEndPointConfigPciLocal(
513 addr, 0, 14, 30, 2, 0xC0, sizeof(uint32_t),
514 (uint8_t*)&coreFIVRErrLog0, &cc) != PECI_CC_SUCCESS)
515 {
516 continue;
517 }
518 if (peci_RdEndPointConfigPciLocal(
519 addr, 0, 14, 30, 2, 0xC4, sizeof(uint32_t),
520 (uint8_t*)&coreFIVRErrLog1, &cc) != PECI_CC_SUCCESS)
521 {
522 continue;
523 }
524 if (coreFIVRErrLog0 || coreFIVRErrLog1)
525 {
526 cpuIERRLog(cpu, "Core FIVR Fault");
527 continue;
528 }
529
530 // Next check if it's an Uncore FIVR fault by looking for a
531 // non-zero value of UNCORE_FIVR_ERR_LOG (B(31) D30 F2
532 // offset 84h) (Note: Bus 31 is accessed on PECI as bus 14)
533 uint32_t uncoreFIVRErrLog = 0;
534 if (peci_RdEndPointConfigPciLocal(
535 addr, 0, 14, 30, 2, 0x84, sizeof(uint32_t),
536 (uint8_t*)&uncoreFIVRErrLog,
537 &cc) != PECI_CC_SUCCESS)
538 {
539 continue;
540 }
541 if (uncoreFIVRErrLog)
542 {
543 cpuIERRLog(cpu, "Uncore FIVR Fault");
544 continue;
545 }
546
547 // TODO: Update MSEC/MSCOD_31_24 check
548 // Last if CORE_FIVR_ERR_LOG and UNCORE_FIVR_ERR_LOG are
549 // both zero, but MSEC bits 31:24 have either
550 // MCA_FIVR_CATAS_OVERVOL_FAULT (0x51) or
551 // MCA_FIVR_CATAS_OVERCUR_FAULT (0x52), then log it as an
552 // uncore FIVR fault
553 if (!coreFIVRErrLog0 && !coreFIVRErrLog1 &&
554 !uncoreFIVRErrLog &&
555 ((mc4Status & (0x51 << 24)) ||
556 (mc4Status & (0x52 << 24))))
557 {
558 cpuIERRLog(cpu, "Uncore FIVR Fault");
559 continue;
560 }
561 cpuIERRLog(cpu);
562 }
563 break;
564 }
565 }
566 }
567 return cpuIERRFound;
568}
569
Jason M. Billsa15c2522019-08-16 10:01:44 -0700570static void caterrAssertHandler()
571{
Jason M. Billsa15c2522019-08-16 10:01:44 -0700572 caterrAssertTimer.expires_after(std::chrono::milliseconds(caterrTimeoutMs));
573 caterrAssertTimer.async_wait([](const boost::system::error_code ec) {
574 if (ec)
575 {
576 // operation_aborted is expected if timer is canceled
577 // before completion.
578 if (ec != boost::asio::error::operation_aborted)
579 {
580 std::cerr << "caterr timeout async_wait failed: "
581 << ec.message() << "\n";
582 }
Jason M. Billsa15c2522019-08-16 10:01:44 -0700583 return;
584 }
Jason M. Billsa3397932019-08-06 11:07:21 -0700585 std::cerr << "CATERR asserted for " << std::to_string(caterrTimeoutMs)
586 << " ms\n";
587 if (!checkIERRCPUs())
588 {
589 cpuIERRLog();
590 }
Jason M. Billsa15c2522019-08-16 10:01:44 -0700591 conn->async_method_call(
592 [](boost::system::error_code ec,
593 const std::variant<bool>& property) {
594 if (ec)
595 {
596 return;
597 }
598 const bool* reset = std::get_if<bool>(&property);
599 if (reset == nullptr)
600 {
601 std::cerr << "Unable to read reset on CATERR value\n";
602 return;
603 }
604 startCrashdumpAndRecovery(*reset);
605 },
606 "xyz.openbmc_project.Settings",
607 "/xyz/openbmc_project/control/processor_error_config",
608 "org.freedesktop.DBus.Properties", "Get",
609 "xyz.openbmc_project.Control.Processor.ErrConfig", "ResetOnCATERR");
610 });
611}
612
Jason M. Bills1490b142019-07-01 15:48:43 -0700613static void caterrHandler()
614{
615 if (!hostOff)
616 {
617 gpiod::line_event gpioLineEvent = caterrLine.event_read();
618
619 bool caterr =
620 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
621 if (caterr)
622 {
Jason M. Billsa15c2522019-08-16 10:01:44 -0700623 caterrAssertHandler();
Jason M. Bills1490b142019-07-01 15:48:43 -0700624 }
625 else
626 {
627 caterrAssertTimer.cancel();
628 }
629 }
630 caterrEvent.async_wait(boost::asio::posix::stream_descriptor::wait_read,
631 [](const boost::system::error_code ec) {
632 if (ec)
633 {
634 std::cerr << "caterr handler error: "
635 << ec.message() << "\n";
636 return;
637 }
638 caterrHandler();
639 });
640}
Jason M. Bills78c5eed2019-08-28 14:00:40 -0700641
642static void cpu1ThermtripHandler()
643{
644 if (!hostOff)
645 {
646 gpiod::line_event gpioLineEvent = cpu1ThermtripLine.event_read();
647
648 bool cpu1Thermtrip =
649 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
650 if (cpu1Thermtrip)
651 {
652 cpuThermTripLog(1);
653 }
654 }
655 cpu1ThermtripEvent.async_wait(
656 boost::asio::posix::stream_descriptor::wait_read,
657 [](const boost::system::error_code ec) {
658 if (ec)
659 {
660 std::cerr << "CPU 1 Thermtrip handler error: " << ec.message()
661 << "\n";
662 return;
663 }
664 cpu1ThermtripHandler();
665 });
666}
667
668static void cpu2ThermtripHandler()
669{
670 if (!hostOff)
671 {
672 gpiod::line_event gpioLineEvent = cpu2ThermtripLine.event_read();
673
674 bool cpu2Thermtrip =
675 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
676 if (cpu2Thermtrip)
677 {
678 cpuThermTripLog(2);
679 }
680 }
681 cpu2ThermtripEvent.async_wait(
682 boost::asio::posix::stream_descriptor::wait_read,
683 [](const boost::system::error_code ec) {
684 if (ec)
685 {
686 std::cerr << "CPU 2 Thermtrip handler error: " << ec.message()
687 << "\n";
688 return;
689 }
690 cpu2ThermtripHandler();
691 });
692}
693
Chen Yugange6c0f1c2019-08-02 20:36:42 +0800694static void pchThermtripHandler()
695{
696 if (!hostOff)
697 {
698 gpiod::line_event gpioLineEvent = pchThermtripLine.event_read();
699
700 bool pchThermtrip =
701 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
702 if (pchThermtrip)
703 {
Jason M. Bills08866542019-08-16 12:04:19 -0700704 ssbThermTripLog();
Chen Yugange6c0f1c2019-08-02 20:36:42 +0800705 }
706 }
707 pchThermtripEvent.async_wait(
708 boost::asio::posix::stream_descriptor::wait_read,
709 [](const boost::system::error_code ec) {
710 if (ec)
711 {
712 std::cerr << "PCH Thermal trip handler error: " << ec.message()
713 << "\n";
714 return;
715 }
716 pchThermtripHandler();
717 });
718}
719
Jason M. Billscbf78532019-08-16 15:32:11 -0700720static std::bitset<MAX_CPUS> checkERRPinCPUs(const int errPin)
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700721{
Jason M. Billscbf78532019-08-16 15:32:11 -0700722 int errPinSts = (1 << errPin);
723 std::bitset<MAX_CPUS> errPinCPUs = 0;
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700724 for (int cpu = 0, addr = MIN_CLIENT_ADDR; addr <= MAX_CLIENT_ADDR;
725 cpu++, addr++)
726 {
727 if (peci_Ping(addr) == PECI_CC_SUCCESS)
728 {
729 uint8_t cc = 0;
730 CPUModel model{};
731 uint8_t stepping = 0;
732 if (peci_GetCPUID(addr, &model, &stepping, &cc) != PECI_CC_SUCCESS)
733 {
734 std::cerr << "Cannot get CPUID!\n";
735 continue;
736 }
737
738 switch (model)
739 {
740 case skx:
741 {
742 // Check the ERRPINSTS to see if this is the CPU that caused
Jason M. Billscbf78532019-08-16 15:32:11 -0700743 // the ERRx (B(0) D8 F0 offset 210h)
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700744 uint32_t errpinsts = 0;
745 if (peci_RdPCIConfigLocal(
746 addr, 0, 8, 0, 0x210, sizeof(uint32_t),
747 (uint8_t*)&errpinsts, &cc) == PECI_CC_SUCCESS)
748 {
Jason M. Billscbf78532019-08-16 15:32:11 -0700749 errPinCPUs[cpu] = (errpinsts & errPinSts) != 0;
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700750 }
751 break;
752 }
753 case icx:
754 {
755 // Check the ERRPINSTS to see if this is the CPU that caused
Jason M. Billscbf78532019-08-16 15:32:11 -0700756 // the ERRx (B(30) D0 F3 offset 274h) (Note: Bus 30 is
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700757 // accessed on PECI as bus 13)
758 uint32_t errpinsts = 0;
759 if (peci_RdEndPointConfigPciLocal(
760 addr, 0, 13, 0, 3, 0x274, sizeof(uint32_t),
761 (uint8_t*)&errpinsts, &cc) == PECI_CC_SUCCESS)
762 {
Jason M. Billscbf78532019-08-16 15:32:11 -0700763 errPinCPUs[cpu] = (errpinsts & errPinSts) != 0;
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700764 }
765 break;
766 }
767 }
768 }
769 }
Jason M. Billscbf78532019-08-16 15:32:11 -0700770 return errPinCPUs;
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700771}
772
Jason M. Billscbf78532019-08-16 15:32:11 -0700773static void errXAssertHandler(const int errPin,
774 boost::asio::steady_timer& errXAssertTimer)
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700775{
Jason M. Billscbf78532019-08-16 15:32:11 -0700776 // ERRx status is not guaranteed through the timeout, so save which
777 // CPUs have it asserted
778 std::bitset<MAX_CPUS> errPinCPUs = checkERRPinCPUs(errPin);
779 errXAssertTimer.expires_after(std::chrono::milliseconds(errTimeoutMs));
780 errXAssertTimer.async_wait([errPin, errPinCPUs](
781 const boost::system::error_code ec) {
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700782 if (ec)
783 {
784 // operation_aborted is expected if timer is canceled before
785 // completion.
786 if (ec != boost::asio::error::operation_aborted)
787 {
788 std::cerr << "err2 timeout async_wait failed: " << ec.message()
789 << "\n";
790 }
791 return;
792 }
Jason M. Billscbf78532019-08-16 15:32:11 -0700793 std::cerr << "ERR" << std::to_string(errPin) << " asserted for "
794 << std::to_string(errTimeoutMs) << " ms\n";
795 if (errPinCPUs.count())
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700796 {
Jason M. Billscbf78532019-08-16 15:32:11 -0700797 for (int i = 0; i < errPinCPUs.size(); i++)
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700798 {
Jason M. Billscbf78532019-08-16 15:32:11 -0700799 if (errPinCPUs[i])
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700800 {
Jason M. Billscbf78532019-08-16 15:32:11 -0700801 cpuERRXLog(errPin, i);
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700802 }
803 }
804 }
805 else
806 {
Jason M. Billscbf78532019-08-16 15:32:11 -0700807 cpuERRXLog(errPin);
808 }
809 });
810}
811
Jason M. Bills8c584392019-08-19 11:05:51 -0700812static void err0AssertHandler()
813{
814 // Handle the standard ERR0 detection and logging
815 const static constexpr int err0 = 0;
816 errXAssertHandler(err0, err0AssertTimer);
817}
818
819static void err0Handler()
820{
821 if (!hostOff)
822 {
823 gpiod::line_event gpioLineEvent = err0Line.event_read();
824
825 bool err0 = gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
826 if (err0)
827 {
828 err0AssertHandler();
829 }
830 else
831 {
832 err0AssertTimer.cancel();
833 }
834 }
835 err0Event.async_wait(boost::asio::posix::stream_descriptor::wait_read,
836 [](const boost::system::error_code ec) {
837 if (ec)
838 {
839 std::cerr
840 << "err0 handler error: " << ec.message()
841 << "\n";
842 return;
843 }
844 err0Handler();
845 });
846}
847
Jason M. Bills75af3962019-08-19 11:07:17 -0700848static void err1AssertHandler()
849{
850 // Handle the standard ERR1 detection and logging
851 const static constexpr int err1 = 1;
852 errXAssertHandler(err1, err1AssertTimer);
853}
854
855static void err1Handler()
856{
857 if (!hostOff)
858 {
859 gpiod::line_event gpioLineEvent = err1Line.event_read();
860
861 bool err1 = gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
862 if (err1)
863 {
864 err1AssertHandler();
865 }
866 else
867 {
868 err1AssertTimer.cancel();
869 }
870 }
871 err1Event.async_wait(boost::asio::posix::stream_descriptor::wait_read,
872 [](const boost::system::error_code ec) {
873 if (ec)
874 {
875 std::cerr
876 << "err1 handler error: " << ec.message()
877 << "\n";
878 return;
879 }
880 err1Handler();
881 });
882}
883
Jason M. Billscbf78532019-08-16 15:32:11 -0700884static void err2AssertHandler()
885{
886 // Handle the standard ERR2 detection and logging
887 const static constexpr int err2 = 2;
888 errXAssertHandler(err2, err2AssertTimer);
889 // Also handle reset for ERR2
890 err2AssertTimer.async_wait([](const boost::system::error_code ec) {
891 if (ec)
892 {
893 // operation_aborted is expected if timer is canceled before
894 // completion.
895 if (ec != boost::asio::error::operation_aborted)
896 {
897 std::cerr << "err2 timeout async_wait failed: " << ec.message()
898 << "\n";
899 }
900 return;
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700901 }
902 conn->async_method_call(
903 [](boost::system::error_code ec,
904 const std::variant<bool>& property) {
905 if (ec)
906 {
907 return;
908 }
909 const bool* reset = std::get_if<bool>(&property);
910 if (reset == nullptr)
911 {
912 std::cerr << "Unable to read reset on ERR2 value\n";
913 return;
914 }
915 startCrashdumpAndRecovery(*reset);
916 },
917 "xyz.openbmc_project.Settings",
918 "/xyz/openbmc_project/control/processor_error_config",
919 "org.freedesktop.DBus.Properties", "Get",
920 "xyz.openbmc_project.Control.Processor.ErrConfig", "ResetOnERR2");
921 });
922}
923
924static void err2Handler()
925{
926 if (!hostOff)
927 {
928 gpiod::line_event gpioLineEvent = err2Line.event_read();
929
930 bool err2 = gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
931 if (err2)
932 {
933 err2AssertHandler();
934 }
935 else
936 {
937 err2AssertTimer.cancel();
938 }
939 }
940 err2Event.async_wait(boost::asio::posix::stream_descriptor::wait_read,
941 [](const boost::system::error_code ec) {
942 if (ec)
943 {
944 std::cerr
945 << "err2 handler error: " << ec.message()
946 << "\n";
947 return;
948 }
949 err2Handler();
950 });
951}
952
Jason M. Bills89922f82019-08-06 11:10:02 -0700953static void smiAssertHandler()
954{
955 smiAssertTimer.expires_after(std::chrono::milliseconds(smiTimeoutMs));
956 smiAssertTimer.async_wait([](const boost::system::error_code ec) {
957 if (ec)
958 {
959 // operation_aborted is expected if timer is canceled before
960 // completion.
961 if (ec != boost::asio::error::operation_aborted)
962 {
963 std::cerr << "smi timeout async_wait failed: " << ec.message()
964 << "\n";
965 }
966 return;
967 }
968 std::cerr << "SMI asserted for " << std::to_string(smiTimeoutMs)
969 << " ms\n";
970 smiTimeoutLog();
971 conn->async_method_call(
972 [](boost::system::error_code ec,
973 const std::variant<bool>& property) {
974 if (ec)
975 {
976 return;
977 }
978 const bool* reset = std::get_if<bool>(&property);
979 if (reset == nullptr)
980 {
981 std::cerr << "Unable to read reset on SMI value\n";
982 return;
983 }
984 startCrashdumpAndRecovery(*reset);
985 },
986 "xyz.openbmc_project.Settings",
987 "/xyz/openbmc_project/control/bmc_reset_disables",
988 "org.freedesktop.DBus.Properties", "Get",
989 "xyz.openbmc_project.Control.ResetDisables", "ResetOnSMI");
990 });
991}
992
993static void smiHandler()
994{
995 if (!hostOff)
996 {
997 gpiod::line_event gpioLineEvent = smiLine.event_read();
998
999 bool smi = gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
1000 if (smi)
1001 {
1002 smiAssertHandler();
1003 }
1004 else
1005 {
1006 smiAssertTimer.cancel();
1007 }
1008 }
1009 smiEvent.async_wait(boost::asio::posix::stream_descriptor::wait_read,
1010 [](const boost::system::error_code ec) {
1011 if (ec)
1012 {
1013 std::cerr
1014 << "smi handler error: " << ec.message()
1015 << "\n";
1016 return;
1017 }
1018 smiHandler();
1019 });
1020}
1021
Jason M. Billsa15c2522019-08-16 10:01:44 -07001022static void initializeErrorState()
1023{
1024 // Handle CPU_CATERR if it's asserted now
1025 if (caterrLine.get_value() == 0)
1026 {
1027 caterrAssertHandler();
1028 }
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001029
Jason M. Bills8c584392019-08-19 11:05:51 -07001030 // Handle CPU_ERR0 if it's asserted now
1031 if (err0Line.get_value() == 0)
1032 {
1033 err0AssertHandler();
1034 }
1035
Jason M. Bills75af3962019-08-19 11:07:17 -07001036 // Handle CPU_ERR1 if it's asserted now
1037 if (err1Line.get_value() == 0)
1038 {
1039 err1AssertHandler();
1040 }
1041
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001042 // Handle CPU_ERR2 if it's asserted now
1043 if (err2Line.get_value() == 0)
1044 {
1045 err2AssertHandler();
1046 }
Jason M. Bills89922f82019-08-06 11:10:02 -07001047
1048 // Handle SMI if it's asserted now
1049 if (smiLine.get_value() == 0)
1050 {
1051 smiAssertHandler();
1052 }
Jason M. Bills08866542019-08-16 12:04:19 -07001053
1054 // Handle PCH_BMC_THERMTRIP if it's asserted now
1055 if (pchThermtripLine.get_value() == 0)
1056 {
1057 ssbThermTripLog();
1058 }
Jason M. Billsa15c2522019-08-16 10:01:44 -07001059}
Jason M. Bills1490b142019-07-01 15:48:43 -07001060} // namespace host_error_monitor
1061
1062int main(int argc, char* argv[])
1063{
1064 // setup connection to dbus
1065 host_error_monitor::conn =
1066 std::make_shared<sdbusplus::asio::connection>(host_error_monitor::io);
1067
1068 // Host Error Monitor Object
1069 host_error_monitor::conn->request_name(
1070 "xyz.openbmc_project.HostErrorMonitor");
1071 sdbusplus::asio::object_server server =
1072 sdbusplus::asio::object_server(host_error_monitor::conn);
1073
1074 // Start tracking host state
1075 std::shared_ptr<sdbusplus::bus::match::match> hostStateMonitor =
1076 host_error_monitor::startHostStateMonitor();
1077
1078 // Initialize the host state
1079 host_error_monitor::initializeHostState();
1080
1081 // Request CPU_CATERR GPIO events
1082 if (!host_error_monitor::requestGPIOEvents(
1083 "CPU_CATERR", host_error_monitor::caterrHandler,
1084 host_error_monitor::caterrLine, host_error_monitor::caterrEvent))
1085 {
1086 return -1;
1087 }
1088
Jason M. Bills8c584392019-08-19 11:05:51 -07001089 // Request CPU_ERR0 GPIO events
1090 if (!host_error_monitor::requestGPIOEvents(
1091 "CPU_ERR0", host_error_monitor::err0Handler,
1092 host_error_monitor::err0Line, host_error_monitor::err0Event))
1093 {
1094 return -1;
1095 }
1096
Jason M. Bills75af3962019-08-19 11:07:17 -07001097 // Request CPU_ERR1 GPIO events
1098 if (!host_error_monitor::requestGPIOEvents(
1099 "CPU_ERR1", host_error_monitor::err1Handler,
1100 host_error_monitor::err1Line, host_error_monitor::err1Event))
1101 {
1102 return -1;
1103 }
1104
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001105 // Request CPU_ERR2 GPIO events
1106 if (!host_error_monitor::requestGPIOEvents(
1107 "CPU_ERR2", host_error_monitor::err2Handler,
1108 host_error_monitor::err2Line, host_error_monitor::err2Event))
1109 {
1110 return -1;
1111 }
1112
Jason M. Bills89922f82019-08-06 11:10:02 -07001113 // Request SMI GPIO events
1114 if (!host_error_monitor::requestGPIOEvents(
1115 "SMI", host_error_monitor::smiHandler, host_error_monitor::smiLine,
1116 host_error_monitor::smiEvent))
1117 {
1118 return -1;
1119 }
1120
Jason M. Bills78c5eed2019-08-28 14:00:40 -07001121 // Request CPU1_THERMTRIP GPIO events
1122 if (!host_error_monitor::requestGPIOEvents(
1123 "CPU1_THERMTRIP", host_error_monitor::cpu1ThermtripHandler,
1124 host_error_monitor::cpu1ThermtripLine,
1125 host_error_monitor::cpu1ThermtripEvent))
1126 {
1127 return -1;
1128 }
1129
1130 // Request CPU2_THERMTRIP GPIO events
1131 if (!host_error_monitor::requestGPIOEvents(
1132 "CPU2_THERMTRIP", host_error_monitor::cpu2ThermtripHandler,
1133 host_error_monitor::cpu2ThermtripLine,
1134 host_error_monitor::cpu2ThermtripEvent))
1135 {
1136 return -1;
1137 }
1138
Chen Yugange6c0f1c2019-08-02 20:36:42 +08001139 // Request PCH_BMC_THERMTRIP GPIO events
1140 if (!host_error_monitor::requestGPIOEvents(
1141 "PCH_BMC_THERMTRIP", host_error_monitor::pchThermtripHandler,
1142 host_error_monitor::pchThermtripLine,
1143 host_error_monitor::pchThermtripEvent))
1144 {
1145 return -1;
1146 }
1147
Jason M. Bills1490b142019-07-01 15:48:43 -07001148 host_error_monitor::io.run();
1149
1150 return 0;
1151}