blob: 27acf5e5130b464577e2cb9967763629c276637c [file] [log] [blame]
/*
// Copyright (c) 2019 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
*/
Jason M. Bills6a2cb692019-08-06 11:03:49 -070016#include <peci.h>
Chen Yugange6c0f1c2019-08-02 20:36:42 +080017#include <systemd/sd-journal.h>
18
Jason M. Bills6a2cb692019-08-06 11:03:49 -070019#include <bitset>
Jason M. Bills1490b142019-07-01 15:48:43 -070020#include <boost/asio/posix/stream_descriptor.hpp>
21#include <gpiod.hpp>
22#include <iostream>
23#include <sdbusplus/asio/object_server.hpp>
Jason M. Billsd1a19f62019-08-06 11:52:58 -070024#include <variant>
Jason M. Bills1490b142019-07-01 15:48:43 -070025
26namespace host_error_monitor
27{
28static boost::asio::io_service io;
29static std::shared_ptr<sdbusplus::asio::connection> conn;
30
31static bool hostOff = true;
32
33const static constexpr size_t caterrTimeoutMs = 2000;
Jason M. Billscbf78532019-08-16 15:32:11 -070034const static constexpr size_t errTimeoutMs = 90000;
Jason M. Bills89922f82019-08-06 11:10:02 -070035const static constexpr size_t smiTimeoutMs = 90000;
Jason M. Bills1490b142019-07-01 15:48:43 -070036const static constexpr size_t crashdumpTimeoutS = 300;
37
38// Timers
39// Timer for CATERR asserted
40static boost::asio::steady_timer caterrAssertTimer(io);
Jason M. Bills8c584392019-08-19 11:05:51 -070041// Timer for ERR0 asserted
42static boost::asio::steady_timer err0AssertTimer(io);
Jason M. Bills75af3962019-08-19 11:07:17 -070043// Timer for ERR1 asserted
44static boost::asio::steady_timer err1AssertTimer(io);
Jason M. Bills6a2cb692019-08-06 11:03:49 -070045// Timer for ERR2 asserted
46static boost::asio::steady_timer err2AssertTimer(io);
Jason M. Bills89922f82019-08-06 11:10:02 -070047// Timer for SMI asserted
48static boost::asio::steady_timer smiAssertTimer(io);
Jason M. Bills1490b142019-07-01 15:48:43 -070049
50// GPIO Lines and Event Descriptors
51static gpiod::line caterrLine;
52static boost::asio::posix::stream_descriptor caterrEvent(io);
Jason M. Bills8c584392019-08-19 11:05:51 -070053static gpiod::line err0Line;
54static boost::asio::posix::stream_descriptor err0Event(io);
Jason M. Bills75af3962019-08-19 11:07:17 -070055static gpiod::line err1Line;
56static boost::asio::posix::stream_descriptor err1Event(io);
Jason M. Bills6a2cb692019-08-06 11:03:49 -070057static gpiod::line err2Line;
58static boost::asio::posix::stream_descriptor err2Event(io);
Jason M. Bills89922f82019-08-06 11:10:02 -070059static gpiod::line smiLine;
60static boost::asio::posix::stream_descriptor smiEvent(io);
Jason M. Bills45e87e02019-09-09 14:45:38 -070061static gpiod::line cpu1FIVRFaultLine;
Jason M. Bills78c5eed2019-08-28 14:00:40 -070062static gpiod::line cpu1ThermtripLine;
63static boost::asio::posix::stream_descriptor cpu1ThermtripEvent(io);
Jason M. Bills45e87e02019-09-09 14:45:38 -070064static gpiod::line cpu2FIVRFaultLine;
Jason M. Bills78c5eed2019-08-28 14:00:40 -070065static gpiod::line cpu2ThermtripLine;
66static boost::asio::posix::stream_descriptor cpu2ThermtripEvent(io);
Jason M. Bills250fa632019-08-28 15:58:25 -070067static gpiod::line cpu1VRHotLine;
68static boost::asio::posix::stream_descriptor cpu1VRHotEvent(io);
69static gpiod::line cpu2VRHotLine;
Jason M. Bills9647ba72019-08-29 14:19:19 -070070static boost::asio::posix::stream_descriptor cpu1MemABCDVRHotEvent(io);
71static gpiod::line cpu1MemEFGHVRHotLine;
72static boost::asio::posix::stream_descriptor cpu1MemEFGHVRHotEvent(io);
73static gpiod::line cpu2MemABCDVRHotLine;
Jason M. Bills250fa632019-08-28 15:58:25 -070074static boost::asio::posix::stream_descriptor cpu2VRHotEvent(io);
Jason M. Bills9647ba72019-08-29 14:19:19 -070075static gpiod::line cpu1MemABCDVRHotLine;
76static boost::asio::posix::stream_descriptor cpu2MemABCDVRHotEvent(io);
77static gpiod::line cpu2MemEFGHVRHotLine;
78static boost::asio::posix::stream_descriptor cpu2MemEFGHVRHotEvent(io);
Chen Yugange6c0f1c2019-08-02 20:36:42 +080079//----------------------------------
80// PCH_BMC_THERMTRIP function related definition
81//----------------------------------
Chen Yugange6c0f1c2019-08-02 20:36:42 +080082static gpiod::line pchThermtripLine;
83static boost::asio::posix::stream_descriptor pchThermtripEvent(io);
Jason M. Bills1490b142019-07-01 15:48:43 -070084
Jason M. Billsa3397932019-08-06 11:07:21 -070085static void cpuIERRLog()
86{
87 sd_journal_send("MESSAGE=HostError: IERR", "PRIORITY=%i", LOG_INFO,
88 "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError",
89 "REDFISH_MESSAGE_ARGS=%s", "IERR", NULL);
90}
91
92static void cpuIERRLog(const int cpuNum)
93{
94 std::string msg = "IERR on CPU " + std::to_string(cpuNum + 1);
95
96 sd_journal_send("MESSAGE=HostError: %s", msg.c_str(), "PRIORITY=%i",
97 LOG_INFO, "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError",
98 "REDFISH_MESSAGE_ARGS=%s", msg.c_str(), NULL);
99}
100
101static void cpuIERRLog(const int cpuNum, const std::string& type)
102{
103 std::string msg = type + " IERR on CPU " + std::to_string(cpuNum + 1);
104
105 sd_journal_send("MESSAGE=HostError: %s", msg.c_str(), "PRIORITY=%i",
106 LOG_INFO, "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError",
107 "REDFISH_MESSAGE_ARGS=%s", msg.c_str(), NULL);
108}
109
Jason M. Billscbf78532019-08-16 15:32:11 -0700110static void cpuERRXLog(const int errPin)
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700111{
Jason M. Billscbf78532019-08-16 15:32:11 -0700112 std::string msg = "ERR" + std::to_string(errPin) + " Timeout";
113
114 sd_journal_send("MESSAGE=HostError: %s", msg.c_str(), "PRIORITY=%i",
115 LOG_INFO, "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError",
116 "REDFISH_MESSAGE_ARGS=%s", msg.c_str(), NULL);
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700117}
118
Jason M. Billscbf78532019-08-16 15:32:11 -0700119static void cpuERRXLog(const int errPin, const int cpuNum)
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700120{
Jason M. Billscbf78532019-08-16 15:32:11 -0700121 std::string msg = "ERR" + std::to_string(errPin) + " Timeout on CPU " +
122 std::to_string(cpuNum + 1);
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700123
124 sd_journal_send("MESSAGE=HostError: %s", msg.c_str(), "PRIORITY=%i",
125 LOG_INFO, "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError",
126 "REDFISH_MESSAGE_ARGS=%s", msg.c_str(), NULL);
127}
128
Jason M. Bills89922f82019-08-06 11:10:02 -0700129static void smiTimeoutLog()
130{
131 sd_journal_send("MESSAGE=HostError: SMI Timeout", "PRIORITY=%i", LOG_INFO,
132 "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError",
133 "REDFISH_MESSAGE_ARGS=%s", "SMI Timeout", NULL);
134}
135
Jason M. Bills45e87e02019-09-09 14:45:38 -0700136static void cpuBootFIVRFaultLog(const int cpuNum)
137{
138 std::string msg = "Boot FIVR Fault on CPU " + std::to_string(cpuNum);
139
140 sd_journal_send("MESSAGE=HostError: %s", msg.c_str(), "PRIORITY=%i",
141 LOG_INFO, "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError",
142 "REDFISH_MESSAGE_ARGS=%s", msg.c_str(), NULL);
143}
144
Jason M. Bills78c5eed2019-08-28 14:00:40 -0700145static void cpuThermTripLog(const int cpuNum)
146{
147 std::string msg = "CPU " + std::to_string(cpuNum) + " thermal trip";
148
149 sd_journal_send("MESSAGE=HostError: %s", msg.c_str(), "PRIORITY=%i",
150 LOG_INFO, "REDFISH_MESSAGE_ID=%s",
151 "OpenBMC.0.1.CPUThermalTrip", "REDFISH_MESSAGE_ARGS=%d",
152 cpuNum, NULL);
153}
154
Jason M. Bills250fa632019-08-28 15:58:25 -0700155static void cpuVRHotLog(const std::string& vr)
156{
157 std::string msg = vr + " Voltage Regulator Overheated.";
158
159 sd_journal_send("MESSAGE=HostError: %s", msg.c_str(), "PRIORITY=%i",
160 LOG_INFO, "REDFISH_MESSAGE_ID=%s",
161 "OpenBMC.0.1.VoltageRegulatorOverheated",
162 "REDFISH_MESSAGE_ARGS=%s", vr.c_str(), NULL);
163}
164
Jason M. Bills08866542019-08-16 12:04:19 -0700165static void ssbThermTripLog()
166{
167 sd_journal_send("MESSAGE=HostError: SSB thermal trip", "PRIORITY=%i",
168 LOG_INFO, "REDFISH_MESSAGE_ID=%s",
169 "OpenBMC.0.1.SsbThermalTrip", NULL);
170}
171
Jason M. Billsa15c2522019-08-16 10:01:44 -0700172static void initializeErrorState();
Jason M. Bills1490b142019-07-01 15:48:43 -0700173static void initializeHostState()
174{
175 conn->async_method_call(
176 [](boost::system::error_code ec,
177 const std::variant<std::string>& property) {
178 if (ec)
179 {
180 return;
181 }
182 const std::string* state = std::get_if<std::string>(&property);
183 if (state == nullptr)
184 {
185 std::cerr << "Unable to read host state value\n";
186 return;
187 }
188 hostOff = *state == "xyz.openbmc_project.State.Host.HostState.Off";
Jason M. Billsa15c2522019-08-16 10:01:44 -0700189 // If the system is on, initialize the error state
190 if (!hostOff)
191 {
192 initializeErrorState();
193 }
Jason M. Bills1490b142019-07-01 15:48:43 -0700194 },
195 "xyz.openbmc_project.State.Host", "/xyz/openbmc_project/state/host0",
196 "org.freedesktop.DBus.Properties", "Get",
197 "xyz.openbmc_project.State.Host", "CurrentHostState");
198}
199
200static std::shared_ptr<sdbusplus::bus::match::match> startHostStateMonitor()
201{
202 return std::make_shared<sdbusplus::bus::match::match>(
203 *conn,
204 "type='signal',interface='org.freedesktop.DBus.Properties',"
205 "member='PropertiesChanged',arg0namespace='xyz.openbmc_project.State."
206 "Host'",
207 [](sdbusplus::message::message& msg) {
208 std::string interfaceName;
209 boost::container::flat_map<std::string, std::variant<std::string>>
210 propertiesChanged;
211 std::string state;
212 try
213 {
214 msg.read(interfaceName, propertiesChanged);
215 state =
216 std::get<std::string>(propertiesChanged.begin()->second);
217 }
218 catch (std::exception& e)
219 {
220 std::cerr << "Unable to read host state\n";
221 return;
222 }
223 hostOff = state == "xyz.openbmc_project.State.Host.HostState.Off";
224
Jason M. Bills1490b142019-07-01 15:48:43 -0700225 if (hostOff)
226 {
Jason M. Billse94f5e12019-09-13 11:11:34 -0700227 // No host events should fire while off, so cancel any pending
228 // timers
Jason M. Bills1490b142019-07-01 15:48:43 -0700229 caterrAssertTimer.cancel();
Jason M. Bills8c584392019-08-19 11:05:51 -0700230 err0AssertTimer.cancel();
Jason M. Bills75af3962019-08-19 11:07:17 -0700231 err1AssertTimer.cancel();
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700232 err2AssertTimer.cancel();
Jason M. Bills89922f82019-08-06 11:10:02 -0700233 smiAssertTimer.cancel();
Jason M. Bills1490b142019-07-01 15:48:43 -0700234 }
Jason M. Billse94f5e12019-09-13 11:11:34 -0700235 else
236 {
237 // Handle any initial errors when the host turns on
238 initializeErrorState();
239 }
Jason M. Bills1490b142019-07-01 15:48:43 -0700240 });
241}
242
243static bool requestGPIOEvents(
244 const std::string& name, const std::function<void()>& handler,
245 gpiod::line& gpioLine,
246 boost::asio::posix::stream_descriptor& gpioEventDescriptor)
247{
248 // Find the GPIO line
249 gpioLine = gpiod::find_line(name);
250 if (!gpioLine)
251 {
252 std::cerr << "Failed to find the " << name << " line\n";
253 return false;
254 }
255
256 try
257 {
258 gpioLine.request(
259 {"host-error-monitor", gpiod::line_request::EVENT_BOTH_EDGES});
260 }
261 catch (std::exception&)
262 {
263 std::cerr << "Failed to request events for " << name << "\n";
264 return false;
265 }
266
267 int gpioLineFd = gpioLine.event_get_fd();
268 if (gpioLineFd < 0)
269 {
270 std::cerr << "Failed to get " << name << " fd\n";
271 return false;
272 }
273
274 gpioEventDescriptor.assign(gpioLineFd);
275
276 gpioEventDescriptor.async_wait(
277 boost::asio::posix::stream_descriptor::wait_read,
278 [&name, handler](const boost::system::error_code ec) {
279 if (ec)
280 {
281 std::cerr << name << " fd handler error: " << ec.message()
282 << "\n";
283 return;
284 }
285 handler();
286 });
287 return true;
288}
289
Jason M. Bills45e87e02019-09-09 14:45:38 -0700290static bool requestGPIOInput(const std::string& name, gpiod::line& gpioLine)
291{
292 // Find the GPIO line
293 gpioLine = gpiod::find_line(name);
294 if (!gpioLine)
295 {
296 std::cerr << "Failed to find the " << name << " line.\n";
297 return false;
298 }
299
300 // Request GPIO input
301 try
302 {
303 gpioLine.request({__FUNCTION__, gpiod::line_request::DIRECTION_INPUT});
304 }
305 catch (std::exception&)
306 {
307 std::cerr << "Failed to request " << name << " input\n";
308 return false;
309 }
310
311 return true;
312}
313
Jason M. Bills1490b142019-07-01 15:48:43 -0700314static void startPowerCycle()
315{
316 conn->async_method_call(
317 [](boost::system::error_code ec) {
318 if (ec)
319 {
320 std::cerr << "failed to set Chassis State\n";
321 }
322 },
323 "xyz.openbmc_project.State.Chassis",
324 "/xyz/openbmc_project/state/chassis0",
325 "org.freedesktop.DBus.Properties", "Set",
326 "xyz.openbmc_project.State.Chassis", "RequestedPowerTransition",
327 std::variant<std::string>{
328 "xyz.openbmc_project.State.Chassis.Transition.PowerCycle"});
329}
330
331static void startCrashdumpAndRecovery(bool recoverSystem)
332{
333 std::cout << "Starting crashdump\n";
334 static std::shared_ptr<sdbusplus::bus::match::match> crashdumpCompleteMatch;
335 static boost::asio::steady_timer crashdumpTimer(io);
336
337 crashdumpCompleteMatch = std::make_shared<sdbusplus::bus::match::match>(
338 *conn,
339 "type='signal',interface='org.freedesktop.DBus.Properties',"
340 "member='PropertiesChanged',arg0namespace='com.intel.crashdump'",
341 [recoverSystem](sdbusplus::message::message& msg) {
342 crashdumpTimer.cancel();
343 std::cout << "Crashdump completed\n";
344 if (recoverSystem)
345 {
346 std::cout << "Recovering the system\n";
347 startPowerCycle();
348 }
349 crashdumpCompleteMatch.reset();
350 });
351
352 crashdumpTimer.expires_after(std::chrono::seconds(crashdumpTimeoutS));
353 crashdumpTimer.async_wait([](const boost::system::error_code ec) {
354 if (ec)
355 {
356 // operation_aborted is expected if timer is canceled
357 if (ec != boost::asio::error::operation_aborted)
358 {
359 std::cerr << "Crashdump async_wait failed: " << ec.message()
360 << "\n";
361 }
362 std::cout << "Crashdump timer canceled\n";
363 return;
364 }
365 std::cerr << "Crashdump failed to complete before timeout\n";
366 crashdumpCompleteMatch.reset();
367 });
368
369 conn->async_method_call(
370 [](boost::system::error_code ec) {
371 if (ec)
372 {
373 std::cerr << "failed to start Crashdump\n";
374 crashdumpTimer.cancel();
375 crashdumpCompleteMatch.reset();
376 }
377 },
378 "com.intel.crashdump", "/com/intel/crashdump",
379 "com.intel.crashdump.Stored", "GenerateStoredLog");
380}
381
Jason M. Billsd1a19f62019-08-06 11:52:58 -0700382static void incrementCPUErrorCount(int cpuNum)
383{
384 std::string propertyName = "ErrorCountCPU" + std::to_string(cpuNum + 1);
385
386 // Get the current count
387 conn->async_method_call(
388 [propertyName](boost::system::error_code ec,
389 const std::variant<uint8_t>& property) {
390 if (ec)
391 {
392 std::cerr << "Failed to read " << propertyName << ": "
393 << ec.message() << "\n";
394 return;
395 }
396 const uint8_t* errorCountVariant = std::get_if<uint8_t>(&property);
397 if (errorCountVariant == nullptr)
398 {
399 std::cerr << propertyName << " invalid\n";
400 return;
401 }
402 uint8_t errorCount = *errorCountVariant;
403 if (errorCount == std::numeric_limits<uint8_t>::max())
404 {
405 std::cerr << "Maximum error count reached\n";
406 return;
407 }
408 // Increment the count
409 errorCount++;
410 conn->async_method_call(
411 [propertyName](boost::system::error_code ec) {
412 if (ec)
413 {
414 std::cerr << "Failed to set " << propertyName << ": "
415 << ec.message() << "\n";
416 }
417 },
418 "xyz.openbmc_project.Settings",
419 "/xyz/openbmc_project/control/processor_error_config",
420 "org.freedesktop.DBus.Properties", "Set",
421 "xyz.openbmc_project.Control.Processor.ErrConfig", propertyName,
422 std::variant<uint8_t>{errorCount});
423 },
424 "xyz.openbmc_project.Settings",
425 "/xyz/openbmc_project/control/processor_error_config",
426 "org.freedesktop.DBus.Properties", "Get",
427 "xyz.openbmc_project.Control.Processor.ErrConfig", propertyName);
428}
429
Jason M. Billsa3397932019-08-06 11:07:21 -0700430static bool checkIERRCPUs()
431{
432 bool cpuIERRFound = false;
433 for (int cpu = 0, addr = MIN_CLIENT_ADDR; addr <= MAX_CLIENT_ADDR;
434 cpu++, addr++)
435 {
436 uint8_t cc = 0;
437 CPUModel model{};
438 uint8_t stepping = 0;
439 if (peci_GetCPUID(addr, &model, &stepping, &cc) != PECI_CC_SUCCESS)
440 {
441 std::cerr << "Cannot get CPUID!\n";
442 continue;
443 }
444
445 switch (model)
446 {
447 case skx:
448 {
449 // First check the MCA_ERR_SRC_LOG to see if this is the CPU
450 // that caused the IERR
451 uint32_t mcaErrSrcLog = 0;
452 if (peci_RdPkgConfig(addr, 0, 5, 4, (uint8_t*)&mcaErrSrcLog,
453 &cc) != PECI_CC_SUCCESS)
454 {
455 continue;
456 }
457 // Check MSMI_INTERNAL (20) and IERR_INTERNAL (27)
458 if ((mcaErrSrcLog & (1 << 20)) || (mcaErrSrcLog & (1 << 27)))
459 {
460 // TODO: Light the CPU fault LED?
461 cpuIERRFound = true;
Jason M. Billsd1a19f62019-08-06 11:52:58 -0700462 incrementCPUErrorCount(cpu);
Jason M. Billsa3397932019-08-06 11:07:21 -0700463 // Next check if it's a CPU/VR mismatch by reading the
464 // IA32_MC4_STATUS MSR (0x411)
465 uint64_t mc4Status = 0;
466 if (peci_RdIAMSR(addr, 0, 0x411, &mc4Status, &cc) !=
467 PECI_CC_SUCCESS)
468 {
469 continue;
470 }
471 // Check MSEC bits 31:24 for
472 // MCA_SVID_VCCIN_VR_ICC_MAX_FAILURE (0x40),
473 // MCA_SVID_VCCIN_VR_VOUT_FAILURE (0x42), or
474 // MCA_SVID_CPU_VR_CAPABILITY_ERROR (0x43)
475 if ((mc4Status & (0x40 << 24)) ||
476 (mc4Status & (0x42 << 24)) ||
477 (mc4Status & (0x43 << 24)))
478 {
479 cpuIERRLog(cpu, "CPU/VR Mismatch");
480 continue;
481 }
482
483 // Next check if it's a Core FIVR fault by looking for a
484 // non-zero value of CORE_FIVR_ERR_LOG (B(1) D30 F2 offset
485 // 80h)
486 uint32_t coreFIVRErrLog = 0;
487 if (peci_RdPCIConfigLocal(
488 addr, 1, 30, 2, 0x80, sizeof(uint32_t),
489 (uint8_t*)&coreFIVRErrLog, &cc) != PECI_CC_SUCCESS)
490 {
491 continue;
492 }
493 if (coreFIVRErrLog)
494 {
495 cpuIERRLog(cpu, "Core FIVR Fault");
496 continue;
497 }
498
499 // Next check if it's an Uncore FIVR fault by looking for a
500 // non-zero value of UNCORE_FIVR_ERR_LOG (B(1) D30 F2 offset
501 // 84h)
502 uint32_t uncoreFIVRErrLog = 0;
503 if (peci_RdPCIConfigLocal(addr, 1, 30, 2, 0x84,
504 sizeof(uint32_t),
505 (uint8_t*)&uncoreFIVRErrLog,
506 &cc) != PECI_CC_SUCCESS)
507 {
508 continue;
509 }
510 if (uncoreFIVRErrLog)
511 {
512 cpuIERRLog(cpu, "Uncore FIVR Fault");
513 continue;
514 }
515
516 // Last if CORE_FIVR_ERR_LOG and UNCORE_FIVR_ERR_LOG are
517 // both zero, but MSEC bits 31:24 have either
518 // MCA_FIVR_CATAS_OVERVOL_FAULT (0x51) or
519 // MCA_FIVR_CATAS_OVERCUR_FAULT (0x52), then log it as an
520 // uncore FIVR fault
521 if (!coreFIVRErrLog && !uncoreFIVRErrLog &&
522 ((mc4Status & (0x51 << 24)) ||
523 (mc4Status & (0x52 << 24))))
524 {
525 cpuIERRLog(cpu, "Uncore FIVR Fault");
526 continue;
527 }
528 cpuIERRLog(cpu);
529 }
530 break;
531 }
532 case icx:
533 {
534 // First check the MCA_ERR_SRC_LOG to see if this is the CPU
535 // that caused the IERR
536 uint32_t mcaErrSrcLog = 0;
537 if (peci_RdPkgConfig(addr, 0, 5, 4, (uint8_t*)&mcaErrSrcLog,
538 &cc) != PECI_CC_SUCCESS)
539 {
540 continue;
541 }
542 // Check MSMI_INTERNAL (20) and IERR_INTERNAL (27)
543 if ((mcaErrSrcLog & (1 << 20)) || (mcaErrSrcLog & (1 << 27)))
544 {
545 // TODO: Light the CPU fault LED?
546 cpuIERRFound = true;
Jason M. Billsd1a19f62019-08-06 11:52:58 -0700547 incrementCPUErrorCount(cpu);
Jason M. Billsa3397932019-08-06 11:07:21 -0700548 // Next check if it's a CPU/VR mismatch by reading the
549 // IA32_MC4_STATUS MSR (0x411)
550 uint64_t mc4Status = 0;
551 if (peci_RdIAMSR(addr, 0, 0x411, &mc4Status, &cc) !=
552 PECI_CC_SUCCESS)
553 {
554 continue;
555 }
556 // TODO: Update MSEC/MSCOD_31_24 check
557 // Check MSEC bits 31:24 for
558 // MCA_SVID_VCCIN_VR_ICC_MAX_FAILURE (0x40),
559 // MCA_SVID_VCCIN_VR_VOUT_FAILURE (0x42), or
560 // MCA_SVID_CPU_VR_CAPABILITY_ERROR (0x43)
561 if ((mc4Status & (0x40 << 24)) ||
562 (mc4Status & (0x42 << 24)) ||
563 (mc4Status & (0x43 << 24)))
564 {
565 cpuIERRLog(cpu, "CPU/VR Mismatch");
566 continue;
567 }
568
569 // Next check if it's a Core FIVR fault by looking for a
570 // non-zero value of CORE_FIVR_ERR_LOG (B(31) D30 F2 offsets
571 // C0h and C4h) (Note: Bus 31 is accessed on PECI as bus 14)
572 uint32_t coreFIVRErrLog0 = 0;
573 uint32_t coreFIVRErrLog1 = 0;
574 if (peci_RdEndPointConfigPciLocal(
575 addr, 0, 14, 30, 2, 0xC0, sizeof(uint32_t),
576 (uint8_t*)&coreFIVRErrLog0, &cc) != PECI_CC_SUCCESS)
577 {
578 continue;
579 }
580 if (peci_RdEndPointConfigPciLocal(
581 addr, 0, 14, 30, 2, 0xC4, sizeof(uint32_t),
582 (uint8_t*)&coreFIVRErrLog1, &cc) != PECI_CC_SUCCESS)
583 {
584 continue;
585 }
586 if (coreFIVRErrLog0 || coreFIVRErrLog1)
587 {
588 cpuIERRLog(cpu, "Core FIVR Fault");
589 continue;
590 }
591
592 // Next check if it's an Uncore FIVR fault by looking for a
593 // non-zero value of UNCORE_FIVR_ERR_LOG (B(31) D30 F2
594 // offset 84h) (Note: Bus 31 is accessed on PECI as bus 14)
595 uint32_t uncoreFIVRErrLog = 0;
596 if (peci_RdEndPointConfigPciLocal(
597 addr, 0, 14, 30, 2, 0x84, sizeof(uint32_t),
598 (uint8_t*)&uncoreFIVRErrLog,
599 &cc) != PECI_CC_SUCCESS)
600 {
601 continue;
602 }
603 if (uncoreFIVRErrLog)
604 {
605 cpuIERRLog(cpu, "Uncore FIVR Fault");
606 continue;
607 }
608
609 // TODO: Update MSEC/MSCOD_31_24 check
610 // Last if CORE_FIVR_ERR_LOG and UNCORE_FIVR_ERR_LOG are
611 // both zero, but MSEC bits 31:24 have either
612 // MCA_FIVR_CATAS_OVERVOL_FAULT (0x51) or
613 // MCA_FIVR_CATAS_OVERCUR_FAULT (0x52), then log it as an
614 // uncore FIVR fault
615 if (!coreFIVRErrLog0 && !coreFIVRErrLog1 &&
616 !uncoreFIVRErrLog &&
617 ((mc4Status & (0x51 << 24)) ||
618 (mc4Status & (0x52 << 24))))
619 {
620 cpuIERRLog(cpu, "Uncore FIVR Fault");
621 continue;
622 }
623 cpuIERRLog(cpu);
624 }
625 break;
626 }
627 }
628 }
629 return cpuIERRFound;
630}
631
Jason M. Billsa15c2522019-08-16 10:01:44 -0700632static void caterrAssertHandler()
633{
Jason M. Billsa15c2522019-08-16 10:01:44 -0700634 caterrAssertTimer.expires_after(std::chrono::milliseconds(caterrTimeoutMs));
635 caterrAssertTimer.async_wait([](const boost::system::error_code ec) {
636 if (ec)
637 {
638 // operation_aborted is expected if timer is canceled
639 // before completion.
640 if (ec != boost::asio::error::operation_aborted)
641 {
642 std::cerr << "caterr timeout async_wait failed: "
643 << ec.message() << "\n";
644 }
Jason M. Billsa15c2522019-08-16 10:01:44 -0700645 return;
646 }
Jason M. Billsa3397932019-08-06 11:07:21 -0700647 std::cerr << "CATERR asserted for " << std::to_string(caterrTimeoutMs)
648 << " ms\n";
649 if (!checkIERRCPUs())
650 {
651 cpuIERRLog();
652 }
Jason M. Billsa15c2522019-08-16 10:01:44 -0700653 conn->async_method_call(
654 [](boost::system::error_code ec,
655 const std::variant<bool>& property) {
656 if (ec)
657 {
658 return;
659 }
660 const bool* reset = std::get_if<bool>(&property);
661 if (reset == nullptr)
662 {
663 std::cerr << "Unable to read reset on CATERR value\n";
664 return;
665 }
666 startCrashdumpAndRecovery(*reset);
667 },
668 "xyz.openbmc_project.Settings",
669 "/xyz/openbmc_project/control/processor_error_config",
670 "org.freedesktop.DBus.Properties", "Get",
671 "xyz.openbmc_project.Control.Processor.ErrConfig", "ResetOnCATERR");
672 });
673}
674
Jason M. Bills1490b142019-07-01 15:48:43 -0700675static void caterrHandler()
676{
677 if (!hostOff)
678 {
679 gpiod::line_event gpioLineEvent = caterrLine.event_read();
680
681 bool caterr =
682 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
683 if (caterr)
684 {
Jason M. Billsa15c2522019-08-16 10:01:44 -0700685 caterrAssertHandler();
Jason M. Bills1490b142019-07-01 15:48:43 -0700686 }
687 else
688 {
689 caterrAssertTimer.cancel();
690 }
691 }
692 caterrEvent.async_wait(boost::asio::posix::stream_descriptor::wait_read,
693 [](const boost::system::error_code ec) {
694 if (ec)
695 {
696 std::cerr << "caterr handler error: "
697 << ec.message() << "\n";
698 return;
699 }
700 caterrHandler();
701 });
702}
Jason M. Bills78c5eed2019-08-28 14:00:40 -0700703
Jason M. Billse94f5e12019-09-13 11:11:34 -0700704static void cpu1ThermtripAssertHandler()
705{
Jason M. Bills45e87e02019-09-09 14:45:38 -0700706 if (cpu1FIVRFaultLine.get_value() == 0)
707 {
708 cpuBootFIVRFaultLog(1);
709 }
710 else
711 {
712 cpuThermTripLog(1);
713 }
Jason M. Billse94f5e12019-09-13 11:11:34 -0700714}
715
Jason M. Bills78c5eed2019-08-28 14:00:40 -0700716static void cpu1ThermtripHandler()
717{
718 if (!hostOff)
719 {
720 gpiod::line_event gpioLineEvent = cpu1ThermtripLine.event_read();
721
722 bool cpu1Thermtrip =
723 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
724 if (cpu1Thermtrip)
725 {
Jason M. Billse94f5e12019-09-13 11:11:34 -0700726 cpu1ThermtripAssertHandler();
Jason M. Bills78c5eed2019-08-28 14:00:40 -0700727 }
728 }
729 cpu1ThermtripEvent.async_wait(
730 boost::asio::posix::stream_descriptor::wait_read,
731 [](const boost::system::error_code ec) {
732 if (ec)
733 {
734 std::cerr << "CPU 1 Thermtrip handler error: " << ec.message()
735 << "\n";
736 return;
737 }
738 cpu1ThermtripHandler();
739 });
740}
741
Jason M. Billse94f5e12019-09-13 11:11:34 -0700742static void cpu2ThermtripAssertHandler()
743{
Jason M. Bills45e87e02019-09-09 14:45:38 -0700744 if (cpu2FIVRFaultLine.get_value() == 0)
745 {
746 cpuBootFIVRFaultLog(2);
747 }
748 else
749 {
750 cpuThermTripLog(2);
751 }
Jason M. Billse94f5e12019-09-13 11:11:34 -0700752}
753
Jason M. Bills78c5eed2019-08-28 14:00:40 -0700754static void cpu2ThermtripHandler()
755{
756 if (!hostOff)
757 {
758 gpiod::line_event gpioLineEvent = cpu2ThermtripLine.event_read();
759
760 bool cpu2Thermtrip =
761 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
762 if (cpu2Thermtrip)
763 {
Jason M. Billse94f5e12019-09-13 11:11:34 -0700764 cpu2ThermtripAssertHandler();
Jason M. Bills78c5eed2019-08-28 14:00:40 -0700765 }
766 }
767 cpu2ThermtripEvent.async_wait(
768 boost::asio::posix::stream_descriptor::wait_read,
769 [](const boost::system::error_code ec) {
770 if (ec)
771 {
772 std::cerr << "CPU 2 Thermtrip handler error: " << ec.message()
773 << "\n";
774 return;
775 }
776 cpu2ThermtripHandler();
777 });
778}
779
Jason M. Billse94f5e12019-09-13 11:11:34 -0700780static void cpu1VRHotAssertHandler()
781{
782 cpuVRHotLog("CPU 1");
783}
784
Jason M. Bills250fa632019-08-28 15:58:25 -0700785static void cpu1VRHotHandler()
786{
787 if (!hostOff)
788 {
789 gpiod::line_event gpioLineEvent = cpu1VRHotLine.event_read();
790
791 bool cpu1VRHot =
792 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
793 if (cpu1VRHot)
794 {
Jason M. Billse94f5e12019-09-13 11:11:34 -0700795 cpu1VRHotAssertHandler();
Jason M. Bills250fa632019-08-28 15:58:25 -0700796 }
797 }
798 cpu1VRHotEvent.async_wait(boost::asio::posix::stream_descriptor::wait_read,
799 [](const boost::system::error_code ec) {
800 if (ec)
801 {
802 std::cerr << "CPU 1 VRHot handler error: "
803 << ec.message() << "\n";
804 return;
805 }
806 cpu1VRHotHandler();
807 });
808}
809
Jason M. Billse94f5e12019-09-13 11:11:34 -0700810static void cpu1MemABCDVRHotAssertHandler()
811{
812 cpuVRHotLog("CPU 1 Memory ABCD");
813}
814
Jason M. Bills9647ba72019-08-29 14:19:19 -0700815static void cpu1MemABCDVRHotHandler()
816{
817 if (!hostOff)
818 {
819 gpiod::line_event gpioLineEvent = cpu1MemABCDVRHotLine.event_read();
820
821 bool cpu1MemABCDVRHot =
822 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
823 if (cpu1MemABCDVRHot)
824 {
Jason M. Billse94f5e12019-09-13 11:11:34 -0700825 cpu1MemABCDVRHotAssertHandler();
Jason M. Bills9647ba72019-08-29 14:19:19 -0700826 }
827 }
828 cpu1MemABCDVRHotEvent.async_wait(
829 boost::asio::posix::stream_descriptor::wait_read,
830 [](const boost::system::error_code ec) {
831 if (ec)
832 {
833 std::cerr << "CPU 1 Memory ABCD VRHot handler error: "
834 << ec.message() << "\n";
835 return;
836 }
837 cpu1MemABCDVRHotHandler();
838 });
839}
840
Jason M. Billse94f5e12019-09-13 11:11:34 -0700841static void cpu1MemEFGHVRHotAssertHandler()
842{
843 cpuVRHotLog("CPU 1 Memory EFGH");
844}
845
Jason M. Bills9647ba72019-08-29 14:19:19 -0700846static void cpu1MemEFGHVRHotHandler()
847{
848 if (!hostOff)
849 {
850 gpiod::line_event gpioLineEvent = cpu1MemEFGHVRHotLine.event_read();
851
852 bool cpu1MemEFGHVRHot =
853 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
854 if (cpu1MemEFGHVRHot)
855 {
Jason M. Billse94f5e12019-09-13 11:11:34 -0700856 cpu1MemEFGHVRHotAssertHandler();
Jason M. Bills9647ba72019-08-29 14:19:19 -0700857 }
858 }
859 cpu1MemEFGHVRHotEvent.async_wait(
860 boost::asio::posix::stream_descriptor::wait_read,
861 [](const boost::system::error_code ec) {
862 if (ec)
863 {
864 std::cerr << "CPU 1 Memory EFGH VRHot handler error: "
865 << ec.message() << "\n";
866 return;
867 }
868 cpu1MemEFGHVRHotHandler();
869 });
870}
871
Jason M. Billse94f5e12019-09-13 11:11:34 -0700872static void cpu2VRHotAssertHandler()
873{
874 cpuVRHotLog("CPU 2");
875}
876
Jason M. Bills250fa632019-08-28 15:58:25 -0700877static void cpu2VRHotHandler()
878{
879 if (!hostOff)
880 {
881 gpiod::line_event gpioLineEvent = cpu2VRHotLine.event_read();
882
883 bool cpu2VRHot =
884 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
885 if (cpu2VRHot)
886 {
Jason M. Billse94f5e12019-09-13 11:11:34 -0700887 cpu2VRHotAssertHandler();
Jason M. Bills250fa632019-08-28 15:58:25 -0700888 }
889 }
890 cpu2VRHotEvent.async_wait(boost::asio::posix::stream_descriptor::wait_read,
891 [](const boost::system::error_code ec) {
892 if (ec)
893 {
894 std::cerr << "CPU 2 VRHot handler error: "
895 << ec.message() << "\n";
896 return;
897 }
898 cpu2VRHotHandler();
899 });
900}
901
Jason M. Billse94f5e12019-09-13 11:11:34 -0700902static void cpu2MemABCDVRHotAssertHandler()
903{
904 cpuVRHotLog("CPU 2 Memory ABCD");
905}
906
Jason M. Bills9647ba72019-08-29 14:19:19 -0700907static void cpu2MemABCDVRHotHandler()
908{
909 if (!hostOff)
910 {
911 gpiod::line_event gpioLineEvent = cpu2MemABCDVRHotLine.event_read();
912
913 bool cpu2MemABCDVRHot =
914 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
915 if (cpu2MemABCDVRHot)
916 {
Jason M. Billse94f5e12019-09-13 11:11:34 -0700917 cpu2MemABCDVRHotAssertHandler();
Jason M. Bills9647ba72019-08-29 14:19:19 -0700918 }
919 }
920 cpu2MemABCDVRHotEvent.async_wait(
921 boost::asio::posix::stream_descriptor::wait_read,
922 [](const boost::system::error_code ec) {
923 if (ec)
924 {
925 std::cerr << "CPU 2 Memory ABCD VRHot handler error: "
926 << ec.message() << "\n";
927 return;
928 }
929 cpu2MemABCDVRHotHandler();
930 });
931}
932
Jason M. Billse94f5e12019-09-13 11:11:34 -0700933static void cpu2MemEFGHVRHotAssertHandler()
934{
935 cpuVRHotLog("CPU 2 Memory EFGH");
936}
937
Jason M. Bills9647ba72019-08-29 14:19:19 -0700938static void cpu2MemEFGHVRHotHandler()
939{
940 if (!hostOff)
941 {
942 gpiod::line_event gpioLineEvent = cpu2MemEFGHVRHotLine.event_read();
943
944 bool cpu2MemEFGHVRHot =
945 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
946 if (cpu2MemEFGHVRHot)
947 {
Jason M. Billse94f5e12019-09-13 11:11:34 -0700948 cpu2MemEFGHVRHotAssertHandler();
Jason M. Bills9647ba72019-08-29 14:19:19 -0700949 }
950 }
951 cpu2MemEFGHVRHotEvent.async_wait(
952 boost::asio::posix::stream_descriptor::wait_read,
953 [](const boost::system::error_code ec) {
954 if (ec)
955 {
956 std::cerr << "CPU 2 Memory EFGH VRHot handler error: "
957 << ec.message() << "\n";
958 return;
959 }
960 cpu2MemEFGHVRHotHandler();
961 });
962}
963
Chen Yugange6c0f1c2019-08-02 20:36:42 +0800964static void pchThermtripHandler()
965{
966 if (!hostOff)
967 {
968 gpiod::line_event gpioLineEvent = pchThermtripLine.event_read();
969
970 bool pchThermtrip =
971 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
972 if (pchThermtrip)
973 {
Jason M. Bills08866542019-08-16 12:04:19 -0700974 ssbThermTripLog();
Chen Yugange6c0f1c2019-08-02 20:36:42 +0800975 }
976 }
977 pchThermtripEvent.async_wait(
978 boost::asio::posix::stream_descriptor::wait_read,
979 [](const boost::system::error_code ec) {
980 if (ec)
981 {
982 std::cerr << "PCH Thermal trip handler error: " << ec.message()
983 << "\n";
984 return;
985 }
986 pchThermtripHandler();
987 });
988}
989
Jason M. Billscbf78532019-08-16 15:32:11 -0700990static std::bitset<MAX_CPUS> checkERRPinCPUs(const int errPin)
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700991{
Jason M. Billscbf78532019-08-16 15:32:11 -0700992 int errPinSts = (1 << errPin);
993 std::bitset<MAX_CPUS> errPinCPUs = 0;
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700994 for (int cpu = 0, addr = MIN_CLIENT_ADDR; addr <= MAX_CLIENT_ADDR;
995 cpu++, addr++)
996 {
997 if (peci_Ping(addr) == PECI_CC_SUCCESS)
998 {
999 uint8_t cc = 0;
1000 CPUModel model{};
1001 uint8_t stepping = 0;
1002 if (peci_GetCPUID(addr, &model, &stepping, &cc) != PECI_CC_SUCCESS)
1003 {
1004 std::cerr << "Cannot get CPUID!\n";
1005 continue;
1006 }
1007
1008 switch (model)
1009 {
1010 case skx:
1011 {
1012 // Check the ERRPINSTS to see if this is the CPU that caused
Jason M. Billscbf78532019-08-16 15:32:11 -07001013 // the ERRx (B(0) D8 F0 offset 210h)
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001014 uint32_t errpinsts = 0;
1015 if (peci_RdPCIConfigLocal(
1016 addr, 0, 8, 0, 0x210, sizeof(uint32_t),
1017 (uint8_t*)&errpinsts, &cc) == PECI_CC_SUCCESS)
1018 {
Jason M. Billscbf78532019-08-16 15:32:11 -07001019 errPinCPUs[cpu] = (errpinsts & errPinSts) != 0;
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001020 }
1021 break;
1022 }
1023 case icx:
1024 {
1025 // Check the ERRPINSTS to see if this is the CPU that caused
Jason M. Billscbf78532019-08-16 15:32:11 -07001026 // the ERRx (B(30) D0 F3 offset 274h) (Note: Bus 30 is
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001027 // accessed on PECI as bus 13)
1028 uint32_t errpinsts = 0;
1029 if (peci_RdEndPointConfigPciLocal(
1030 addr, 0, 13, 0, 3, 0x274, sizeof(uint32_t),
1031 (uint8_t*)&errpinsts, &cc) == PECI_CC_SUCCESS)
1032 {
Jason M. Billscbf78532019-08-16 15:32:11 -07001033 errPinCPUs[cpu] = (errpinsts & errPinSts) != 0;
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001034 }
1035 break;
1036 }
1037 }
1038 }
1039 }
Jason M. Billscbf78532019-08-16 15:32:11 -07001040 return errPinCPUs;
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001041}
1042
Jason M. Billscbf78532019-08-16 15:32:11 -07001043static void errXAssertHandler(const int errPin,
1044 boost::asio::steady_timer& errXAssertTimer)
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001045{
Jason M. Billscbf78532019-08-16 15:32:11 -07001046 // ERRx status is not guaranteed through the timeout, so save which
1047 // CPUs have it asserted
1048 std::bitset<MAX_CPUS> errPinCPUs = checkERRPinCPUs(errPin);
1049 errXAssertTimer.expires_after(std::chrono::milliseconds(errTimeoutMs));
1050 errXAssertTimer.async_wait([errPin, errPinCPUs](
1051 const boost::system::error_code ec) {
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001052 if (ec)
1053 {
1054 // operation_aborted is expected if timer is canceled before
1055 // completion.
1056 if (ec != boost::asio::error::operation_aborted)
1057 {
1058 std::cerr << "err2 timeout async_wait failed: " << ec.message()
1059 << "\n";
1060 }
1061 return;
1062 }
Jason M. Billscbf78532019-08-16 15:32:11 -07001063 std::cerr << "ERR" << std::to_string(errPin) << " asserted for "
1064 << std::to_string(errTimeoutMs) << " ms\n";
1065 if (errPinCPUs.count())
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001066 {
Jason M. Billscbf78532019-08-16 15:32:11 -07001067 for (int i = 0; i < errPinCPUs.size(); i++)
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001068 {
Jason M. Billscbf78532019-08-16 15:32:11 -07001069 if (errPinCPUs[i])
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001070 {
Jason M. Billscbf78532019-08-16 15:32:11 -07001071 cpuERRXLog(errPin, i);
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001072 }
1073 }
1074 }
1075 else
1076 {
Jason M. Billscbf78532019-08-16 15:32:11 -07001077 cpuERRXLog(errPin);
1078 }
1079 });
1080}
1081
Jason M. Bills8c584392019-08-19 11:05:51 -07001082static void err0AssertHandler()
1083{
1084 // Handle the standard ERR0 detection and logging
1085 const static constexpr int err0 = 0;
1086 errXAssertHandler(err0, err0AssertTimer);
1087}
1088
1089static void err0Handler()
1090{
1091 if (!hostOff)
1092 {
1093 gpiod::line_event gpioLineEvent = err0Line.event_read();
1094
1095 bool err0 = gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
1096 if (err0)
1097 {
1098 err0AssertHandler();
1099 }
1100 else
1101 {
1102 err0AssertTimer.cancel();
1103 }
1104 }
1105 err0Event.async_wait(boost::asio::posix::stream_descriptor::wait_read,
1106 [](const boost::system::error_code ec) {
1107 if (ec)
1108 {
1109 std::cerr
1110 << "err0 handler error: " << ec.message()
1111 << "\n";
1112 return;
1113 }
1114 err0Handler();
1115 });
1116}
1117
Jason M. Bills75af3962019-08-19 11:07:17 -07001118static void err1AssertHandler()
1119{
1120 // Handle the standard ERR1 detection and logging
1121 const static constexpr int err1 = 1;
1122 errXAssertHandler(err1, err1AssertTimer);
1123}
1124
1125static void err1Handler()
1126{
1127 if (!hostOff)
1128 {
1129 gpiod::line_event gpioLineEvent = err1Line.event_read();
1130
1131 bool err1 = gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
1132 if (err1)
1133 {
1134 err1AssertHandler();
1135 }
1136 else
1137 {
1138 err1AssertTimer.cancel();
1139 }
1140 }
1141 err1Event.async_wait(boost::asio::posix::stream_descriptor::wait_read,
1142 [](const boost::system::error_code ec) {
1143 if (ec)
1144 {
1145 std::cerr
1146 << "err1 handler error: " << ec.message()
1147 << "\n";
1148 return;
1149 }
1150 err1Handler();
1151 });
1152}
1153
Jason M. Billscbf78532019-08-16 15:32:11 -07001154static void err2AssertHandler()
1155{
1156 // Handle the standard ERR2 detection and logging
1157 const static constexpr int err2 = 2;
1158 errXAssertHandler(err2, err2AssertTimer);
1159 // Also handle reset for ERR2
1160 err2AssertTimer.async_wait([](const boost::system::error_code ec) {
1161 if (ec)
1162 {
1163 // operation_aborted is expected if timer is canceled before
1164 // completion.
1165 if (ec != boost::asio::error::operation_aborted)
1166 {
1167 std::cerr << "err2 timeout async_wait failed: " << ec.message()
1168 << "\n";
1169 }
1170 return;
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001171 }
1172 conn->async_method_call(
1173 [](boost::system::error_code ec,
1174 const std::variant<bool>& property) {
1175 if (ec)
1176 {
1177 return;
1178 }
1179 const bool* reset = std::get_if<bool>(&property);
1180 if (reset == nullptr)
1181 {
1182 std::cerr << "Unable to read reset on ERR2 value\n";
1183 return;
1184 }
1185 startCrashdumpAndRecovery(*reset);
1186 },
1187 "xyz.openbmc_project.Settings",
1188 "/xyz/openbmc_project/control/processor_error_config",
1189 "org.freedesktop.DBus.Properties", "Get",
1190 "xyz.openbmc_project.Control.Processor.ErrConfig", "ResetOnERR2");
1191 });
1192}
1193
1194static void err2Handler()
1195{
1196 if (!hostOff)
1197 {
1198 gpiod::line_event gpioLineEvent = err2Line.event_read();
1199
1200 bool err2 = gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
1201 if (err2)
1202 {
1203 err2AssertHandler();
1204 }
1205 else
1206 {
1207 err2AssertTimer.cancel();
1208 }
1209 }
1210 err2Event.async_wait(boost::asio::posix::stream_descriptor::wait_read,
1211 [](const boost::system::error_code ec) {
1212 if (ec)
1213 {
1214 std::cerr
1215 << "err2 handler error: " << ec.message()
1216 << "\n";
1217 return;
1218 }
1219 err2Handler();
1220 });
1221}
1222
Jason M. Bills89922f82019-08-06 11:10:02 -07001223static void smiAssertHandler()
1224{
1225 smiAssertTimer.expires_after(std::chrono::milliseconds(smiTimeoutMs));
1226 smiAssertTimer.async_wait([](const boost::system::error_code ec) {
1227 if (ec)
1228 {
1229 // operation_aborted is expected if timer is canceled before
1230 // completion.
1231 if (ec != boost::asio::error::operation_aborted)
1232 {
1233 std::cerr << "smi timeout async_wait failed: " << ec.message()
1234 << "\n";
1235 }
1236 return;
1237 }
1238 std::cerr << "SMI asserted for " << std::to_string(smiTimeoutMs)
1239 << " ms\n";
1240 smiTimeoutLog();
1241 conn->async_method_call(
1242 [](boost::system::error_code ec,
1243 const std::variant<bool>& property) {
1244 if (ec)
1245 {
1246 return;
1247 }
1248 const bool* reset = std::get_if<bool>(&property);
1249 if (reset == nullptr)
1250 {
1251 std::cerr << "Unable to read reset on SMI value\n";
1252 return;
1253 }
1254 startCrashdumpAndRecovery(*reset);
1255 },
1256 "xyz.openbmc_project.Settings",
1257 "/xyz/openbmc_project/control/bmc_reset_disables",
1258 "org.freedesktop.DBus.Properties", "Get",
1259 "xyz.openbmc_project.Control.ResetDisables", "ResetOnSMI");
1260 });
1261}
1262
1263static void smiHandler()
1264{
1265 if (!hostOff)
1266 {
1267 gpiod::line_event gpioLineEvent = smiLine.event_read();
1268
1269 bool smi = gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
1270 if (smi)
1271 {
1272 smiAssertHandler();
1273 }
1274 else
1275 {
1276 smiAssertTimer.cancel();
1277 }
1278 }
1279 smiEvent.async_wait(boost::asio::posix::stream_descriptor::wait_read,
1280 [](const boost::system::error_code ec) {
1281 if (ec)
1282 {
1283 std::cerr
1284 << "smi handler error: " << ec.message()
1285 << "\n";
1286 return;
1287 }
1288 smiHandler();
1289 });
1290}
1291
Jason M. Billsa15c2522019-08-16 10:01:44 -07001292static void initializeErrorState()
1293{
1294 // Handle CPU_CATERR if it's asserted now
1295 if (caterrLine.get_value() == 0)
1296 {
1297 caterrAssertHandler();
1298 }
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001299
Jason M. Bills8c584392019-08-19 11:05:51 -07001300 // Handle CPU_ERR0 if it's asserted now
1301 if (err0Line.get_value() == 0)
1302 {
1303 err0AssertHandler();
1304 }
1305
Jason M. Bills75af3962019-08-19 11:07:17 -07001306 // Handle CPU_ERR1 if it's asserted now
1307 if (err1Line.get_value() == 0)
1308 {
1309 err1AssertHandler();
1310 }
1311
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001312 // Handle CPU_ERR2 if it's asserted now
1313 if (err2Line.get_value() == 0)
1314 {
1315 err2AssertHandler();
1316 }
Jason M. Bills89922f82019-08-06 11:10:02 -07001317
1318 // Handle SMI if it's asserted now
1319 if (smiLine.get_value() == 0)
1320 {
1321 smiAssertHandler();
1322 }
Jason M. Bills08866542019-08-16 12:04:19 -07001323
Jason M. Billse94f5e12019-09-13 11:11:34 -07001324 // Handle CPU1_THERMTRIP if it's asserted now
1325 if (cpu1ThermtripLine.get_value() == 0)
1326 {
1327 cpu1ThermtripAssertHandler();
1328 }
1329
1330 // Handle CPU2_THERMTRIP if it's asserted now
1331 if (cpu2ThermtripLine.get_value() == 0)
1332 {
1333 cpu2ThermtripAssertHandler();
1334 }
1335
1336 // Handle CPU1_VRHOT if it's asserted now
1337 if (cpu1VRHotLine.get_value() == 0)
1338 {
1339 cpu1VRHotAssertHandler();
1340 }
1341
1342 // Handle CPU1_MEM_ABCD_VRHOT if it's asserted now
1343 if (cpu1MemABCDVRHotLine.get_value() == 0)
1344 {
1345 cpu1MemABCDVRHotAssertHandler();
1346 }
1347
1348 // Handle CPU1_MEM_EFGH_VRHOT if it's asserted now
1349 if (cpu1MemEFGHVRHotLine.get_value() == 0)
1350 {
1351 cpu1MemEFGHVRHotAssertHandler();
1352 }
1353
1354 // Handle CPU2_VRHOT if it's asserted now
1355 if (cpu2VRHotLine.get_value() == 0)
1356 {
1357 cpu2VRHotAssertHandler();
1358 }
1359
1360 // Handle CPU2_MEM_ABCD_VRHOT if it's asserted now
1361 if (cpu2MemABCDVRHotLine.get_value() == 0)
1362 {
1363 cpu2MemABCDVRHotAssertHandler();
1364 }
1365
1366 // Handle CPU2_MEM_EFGH_VRHOT if it's asserted now
1367 if (cpu2MemEFGHVRHotLine.get_value() == 0)
1368 {
1369 cpu2MemEFGHVRHotAssertHandler();
1370 }
1371
Jason M. Bills08866542019-08-16 12:04:19 -07001372 // Handle PCH_BMC_THERMTRIP if it's asserted now
1373 if (pchThermtripLine.get_value() == 0)
1374 {
1375 ssbThermTripLog();
1376 }
Jason M. Billsa15c2522019-08-16 10:01:44 -07001377}
Jason M. Bills1490b142019-07-01 15:48:43 -07001378} // namespace host_error_monitor
1379
1380int main(int argc, char* argv[])
1381{
1382 // setup connection to dbus
1383 host_error_monitor::conn =
1384 std::make_shared<sdbusplus::asio::connection>(host_error_monitor::io);
1385
1386 // Host Error Monitor Object
1387 host_error_monitor::conn->request_name(
1388 "xyz.openbmc_project.HostErrorMonitor");
1389 sdbusplus::asio::object_server server =
1390 sdbusplus::asio::object_server(host_error_monitor::conn);
1391
1392 // Start tracking host state
1393 std::shared_ptr<sdbusplus::bus::match::match> hostStateMonitor =
1394 host_error_monitor::startHostStateMonitor();
1395
1396 // Initialize the host state
1397 host_error_monitor::initializeHostState();
1398
1399 // Request CPU_CATERR GPIO events
1400 if (!host_error_monitor::requestGPIOEvents(
1401 "CPU_CATERR", host_error_monitor::caterrHandler,
1402 host_error_monitor::caterrLine, host_error_monitor::caterrEvent))
1403 {
1404 return -1;
1405 }
1406
Jason M. Bills8c584392019-08-19 11:05:51 -07001407 // Request CPU_ERR0 GPIO events
1408 if (!host_error_monitor::requestGPIOEvents(
1409 "CPU_ERR0", host_error_monitor::err0Handler,
1410 host_error_monitor::err0Line, host_error_monitor::err0Event))
1411 {
1412 return -1;
1413 }
1414
Jason M. Bills75af3962019-08-19 11:07:17 -07001415 // Request CPU_ERR1 GPIO events
1416 if (!host_error_monitor::requestGPIOEvents(
1417 "CPU_ERR1", host_error_monitor::err1Handler,
1418 host_error_monitor::err1Line, host_error_monitor::err1Event))
1419 {
1420 return -1;
1421 }
1422
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001423 // Request CPU_ERR2 GPIO events
1424 if (!host_error_monitor::requestGPIOEvents(
1425 "CPU_ERR2", host_error_monitor::err2Handler,
1426 host_error_monitor::err2Line, host_error_monitor::err2Event))
1427 {
1428 return -1;
1429 }
1430
Jason M. Bills89922f82019-08-06 11:10:02 -07001431 // Request SMI GPIO events
1432 if (!host_error_monitor::requestGPIOEvents(
1433 "SMI", host_error_monitor::smiHandler, host_error_monitor::smiLine,
1434 host_error_monitor::smiEvent))
1435 {
1436 return -1;
1437 }
1438
Jason M. Bills45e87e02019-09-09 14:45:38 -07001439 // Request CPU1_FIVR_FAULT GPIO input
1440 if (!host_error_monitor::requestGPIOInput(
1441 "CPU1_FIVR_FAULT", host_error_monitor::cpu1FIVRFaultLine))
1442 {
1443 return -1;
1444 }
1445
Jason M. Bills78c5eed2019-08-28 14:00:40 -07001446 // Request CPU1_THERMTRIP GPIO events
1447 if (!host_error_monitor::requestGPIOEvents(
1448 "CPU1_THERMTRIP", host_error_monitor::cpu1ThermtripHandler,
1449 host_error_monitor::cpu1ThermtripLine,
1450 host_error_monitor::cpu1ThermtripEvent))
1451 {
1452 return -1;
1453 }
1454
Jason M. Bills45e87e02019-09-09 14:45:38 -07001455 // Request CPU2_FIVR_FAULT GPIO input
1456 if (!host_error_monitor::requestGPIOInput(
1457 "CPU2_FIVR_FAULT", host_error_monitor::cpu2FIVRFaultLine))
1458 {
1459 return -1;
1460 }
1461
Jason M. Bills78c5eed2019-08-28 14:00:40 -07001462 // Request CPU2_THERMTRIP GPIO events
1463 if (!host_error_monitor::requestGPIOEvents(
1464 "CPU2_THERMTRIP", host_error_monitor::cpu2ThermtripHandler,
1465 host_error_monitor::cpu2ThermtripLine,
1466 host_error_monitor::cpu2ThermtripEvent))
1467 {
1468 return -1;
1469 }
1470
Jason M. Bills250fa632019-08-28 15:58:25 -07001471 // Request CPU1_VRHOT GPIO events
1472 if (!host_error_monitor::requestGPIOEvents(
1473 "CPU1_VRHOT", host_error_monitor::cpu1VRHotHandler,
1474 host_error_monitor::cpu1VRHotLine,
1475 host_error_monitor::cpu1VRHotEvent))
1476 {
1477 return -1;
1478 }
1479
Jason M. Bills9647ba72019-08-29 14:19:19 -07001480 // Request CPU1_MEM_ABCD_VRHOT GPIO events
1481 if (!host_error_monitor::requestGPIOEvents(
1482 "CPU1_MEM_ABCD_VRHOT", host_error_monitor::cpu1MemABCDVRHotHandler,
1483 host_error_monitor::cpu1MemABCDVRHotLine,
1484 host_error_monitor::cpu1MemABCDVRHotEvent))
1485 {
1486 return -1;
1487 }
1488
1489 // Request CPU1_MEM_EFGH_VRHOT GPIO events
1490 if (!host_error_monitor::requestGPIOEvents(
1491 "CPU1_MEM_EFGH_VRHOT", host_error_monitor::cpu1MemEFGHVRHotHandler,
1492 host_error_monitor::cpu1MemEFGHVRHotLine,
1493 host_error_monitor::cpu1MemEFGHVRHotEvent))
1494 {
1495 return -1;
1496 }
1497
Jason M. Bills250fa632019-08-28 15:58:25 -07001498 // Request CPU2_VRHOT GPIO events
1499 if (!host_error_monitor::requestGPIOEvents(
1500 "CPU2_VRHOT", host_error_monitor::cpu2VRHotHandler,
1501 host_error_monitor::cpu2VRHotLine,
1502 host_error_monitor::cpu2VRHotEvent))
1503 {
1504 return -1;
1505 }
1506
Jason M. Bills9647ba72019-08-29 14:19:19 -07001507 // Request CPU2_MEM_ABCD_VRHOT GPIO events
1508 if (!host_error_monitor::requestGPIOEvents(
1509 "CPU2_MEM_ABCD_VRHOT", host_error_monitor::cpu2MemABCDVRHotHandler,
1510 host_error_monitor::cpu2MemABCDVRHotLine,
1511 host_error_monitor::cpu2MemABCDVRHotEvent))
1512 {
1513 return -1;
1514 }
1515
1516 // Request CPU2_MEM_EFGH_VRHOT GPIO events
1517 if (!host_error_monitor::requestGPIOEvents(
1518 "CPU2_MEM_EFGH_VRHOT", host_error_monitor::cpu2MemEFGHVRHotHandler,
1519 host_error_monitor::cpu2MemEFGHVRHotLine,
1520 host_error_monitor::cpu2MemEFGHVRHotEvent))
1521 {
1522 return -1;
1523 }
1524
Chen Yugange6c0f1c2019-08-02 20:36:42 +08001525 // Request PCH_BMC_THERMTRIP GPIO events
1526 if (!host_error_monitor::requestGPIOEvents(
1527 "PCH_BMC_THERMTRIP", host_error_monitor::pchThermtripHandler,
1528 host_error_monitor::pchThermtripLine,
1529 host_error_monitor::pchThermtripEvent))
1530 {
1531 return -1;
1532 }
1533
Jason M. Bills1490b142019-07-01 15:48:43 -07001534 host_error_monitor::io.run();
1535
1536 return 0;
1537}