blob: 36b6384358e5871a9ab70adb478f4bfa6e0192ec [file] [log] [blame]
Jason M. Bills1490b142019-07-01 15:48:43 -07001/*
2// Copyright (c) 2019 Intel Corporation
3//
4// Licensed under the Apache License, Version 2.0 (the "License");
5// you may not use this file except in compliance with the License.
6// You may obtain a copy of the License at
7//
8// http://www.apache.org/licenses/LICENSE-2.0
9//
10// Unless required by applicable law or agreed to in writing, software
11// distributed under the License is distributed on an "AS IS" BASIS,
12// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13// See the License for the specific language governing permissions and
14// limitations under the License.
15*/
Jason M. Bills6a2cb692019-08-06 11:03:49 -070016#include <peci.h>
Chen Yugange6c0f1c2019-08-02 20:36:42 +080017#include <systemd/sd-journal.h>
18
Jason M. Bills6a2cb692019-08-06 11:03:49 -070019#include <bitset>
Jason M. Bills1490b142019-07-01 15:48:43 -070020#include <boost/asio/posix/stream_descriptor.hpp>
21#include <gpiod.hpp>
22#include <iostream>
23#include <sdbusplus/asio/object_server.hpp>
Jason M. Billsd1a19f62019-08-06 11:52:58 -070024#include <variant>
Jason M. Bills1490b142019-07-01 15:48:43 -070025
26namespace host_error_monitor
27{
28static boost::asio::io_service io;
29static std::shared_ptr<sdbusplus::asio::connection> conn;
30
31static bool hostOff = true;
32
33const static constexpr size_t caterrTimeoutMs = 2000;
Jason M. Billscbf78532019-08-16 15:32:11 -070034const static constexpr size_t errTimeoutMs = 90000;
Jason M. Bills89922f82019-08-06 11:10:02 -070035const static constexpr size_t smiTimeoutMs = 90000;
Jason M. Bills1490b142019-07-01 15:48:43 -070036const static constexpr size_t crashdumpTimeoutS = 300;
37
38// Timers
39// Timer for CATERR asserted
40static boost::asio::steady_timer caterrAssertTimer(io);
Jason M. Bills8c584392019-08-19 11:05:51 -070041// Timer for ERR0 asserted
42static boost::asio::steady_timer err0AssertTimer(io);
Jason M. Bills75af3962019-08-19 11:07:17 -070043// Timer for ERR1 asserted
44static boost::asio::steady_timer err1AssertTimer(io);
Jason M. Bills6a2cb692019-08-06 11:03:49 -070045// Timer for ERR2 asserted
46static boost::asio::steady_timer err2AssertTimer(io);
Jason M. Bills89922f82019-08-06 11:10:02 -070047// Timer for SMI asserted
48static boost::asio::steady_timer smiAssertTimer(io);
Jason M. Bills1490b142019-07-01 15:48:43 -070049
50// GPIO Lines and Event Descriptors
51static gpiod::line caterrLine;
52static boost::asio::posix::stream_descriptor caterrEvent(io);
Jason M. Bills8c584392019-08-19 11:05:51 -070053static gpiod::line err0Line;
54static boost::asio::posix::stream_descriptor err0Event(io);
Jason M. Bills75af3962019-08-19 11:07:17 -070055static gpiod::line err1Line;
56static boost::asio::posix::stream_descriptor err1Event(io);
Jason M. Bills6a2cb692019-08-06 11:03:49 -070057static gpiod::line err2Line;
58static boost::asio::posix::stream_descriptor err2Event(io);
Jason M. Bills89922f82019-08-06 11:10:02 -070059static gpiod::line smiLine;
60static boost::asio::posix::stream_descriptor smiEvent(io);
Jason M. Bills78c5eed2019-08-28 14:00:40 -070061static gpiod::line cpu1ThermtripLine;
62static boost::asio::posix::stream_descriptor cpu1ThermtripEvent(io);
63static gpiod::line cpu2ThermtripLine;
64static boost::asio::posix::stream_descriptor cpu2ThermtripEvent(io);
Jason M. Bills250fa632019-08-28 15:58:25 -070065static gpiod::line cpu1VRHotLine;
66static boost::asio::posix::stream_descriptor cpu1VRHotEvent(io);
67static gpiod::line cpu2VRHotLine;
Jason M. Bills9647ba72019-08-29 14:19:19 -070068static boost::asio::posix::stream_descriptor cpu1MemABCDVRHotEvent(io);
69static gpiod::line cpu1MemEFGHVRHotLine;
70static boost::asio::posix::stream_descriptor cpu1MemEFGHVRHotEvent(io);
71static gpiod::line cpu2MemABCDVRHotLine;
Jason M. Bills250fa632019-08-28 15:58:25 -070072static boost::asio::posix::stream_descriptor cpu2VRHotEvent(io);
Jason M. Bills9647ba72019-08-29 14:19:19 -070073static gpiod::line cpu1MemABCDVRHotLine;
74static boost::asio::posix::stream_descriptor cpu2MemABCDVRHotEvent(io);
75static gpiod::line cpu2MemEFGHVRHotLine;
76static boost::asio::posix::stream_descriptor cpu2MemEFGHVRHotEvent(io);
Chen Yugange6c0f1c2019-08-02 20:36:42 +080077//----------------------------------
78// PCH_BMC_THERMTRIP function related definition
79//----------------------------------
Chen Yugange6c0f1c2019-08-02 20:36:42 +080080static gpiod::line pchThermtripLine;
81static boost::asio::posix::stream_descriptor pchThermtripEvent(io);
Jason M. Bills1490b142019-07-01 15:48:43 -070082
Jason M. Billsa3397932019-08-06 11:07:21 -070083static void cpuIERRLog()
84{
85 sd_journal_send("MESSAGE=HostError: IERR", "PRIORITY=%i", LOG_INFO,
86 "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError",
87 "REDFISH_MESSAGE_ARGS=%s", "IERR", NULL);
88}
89
90static void cpuIERRLog(const int cpuNum)
91{
92 std::string msg = "IERR on CPU " + std::to_string(cpuNum + 1);
93
94 sd_journal_send("MESSAGE=HostError: %s", msg.c_str(), "PRIORITY=%i",
95 LOG_INFO, "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError",
96 "REDFISH_MESSAGE_ARGS=%s", msg.c_str(), NULL);
97}
98
99static void cpuIERRLog(const int cpuNum, const std::string& type)
100{
101 std::string msg = type + " IERR on CPU " + std::to_string(cpuNum + 1);
102
103 sd_journal_send("MESSAGE=HostError: %s", msg.c_str(), "PRIORITY=%i",
104 LOG_INFO, "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError",
105 "REDFISH_MESSAGE_ARGS=%s", msg.c_str(), NULL);
106}
107
Jason M. Billscbf78532019-08-16 15:32:11 -0700108static void cpuERRXLog(const int errPin)
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700109{
Jason M. Billscbf78532019-08-16 15:32:11 -0700110 std::string msg = "ERR" + std::to_string(errPin) + " Timeout";
111
112 sd_journal_send("MESSAGE=HostError: %s", msg.c_str(), "PRIORITY=%i",
113 LOG_INFO, "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError",
114 "REDFISH_MESSAGE_ARGS=%s", msg.c_str(), NULL);
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700115}
116
Jason M. Billscbf78532019-08-16 15:32:11 -0700117static void cpuERRXLog(const int errPin, const int cpuNum)
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700118{
Jason M. Billscbf78532019-08-16 15:32:11 -0700119 std::string msg = "ERR" + std::to_string(errPin) + " Timeout on CPU " +
120 std::to_string(cpuNum + 1);
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700121
122 sd_journal_send("MESSAGE=HostError: %s", msg.c_str(), "PRIORITY=%i",
123 LOG_INFO, "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError",
124 "REDFISH_MESSAGE_ARGS=%s", msg.c_str(), NULL);
125}
126
Jason M. Bills89922f82019-08-06 11:10:02 -0700127static void smiTimeoutLog()
128{
129 sd_journal_send("MESSAGE=HostError: SMI Timeout", "PRIORITY=%i", LOG_INFO,
130 "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError",
131 "REDFISH_MESSAGE_ARGS=%s", "SMI Timeout", NULL);
132}
133
Jason M. Bills78c5eed2019-08-28 14:00:40 -0700134static void cpuThermTripLog(const int cpuNum)
135{
136 std::string msg = "CPU " + std::to_string(cpuNum) + " thermal trip";
137
138 sd_journal_send("MESSAGE=HostError: %s", msg.c_str(), "PRIORITY=%i",
139 LOG_INFO, "REDFISH_MESSAGE_ID=%s",
140 "OpenBMC.0.1.CPUThermalTrip", "REDFISH_MESSAGE_ARGS=%d",
141 cpuNum, NULL);
142}
143
Jason M. Bills250fa632019-08-28 15:58:25 -0700144static void cpuVRHotLog(const std::string& vr)
145{
146 std::string msg = vr + " Voltage Regulator Overheated.";
147
148 sd_journal_send("MESSAGE=HostError: %s", msg.c_str(), "PRIORITY=%i",
149 LOG_INFO, "REDFISH_MESSAGE_ID=%s",
150 "OpenBMC.0.1.VoltageRegulatorOverheated",
151 "REDFISH_MESSAGE_ARGS=%s", vr.c_str(), NULL);
152}
153
Jason M. Bills08866542019-08-16 12:04:19 -0700154static void ssbThermTripLog()
155{
156 sd_journal_send("MESSAGE=HostError: SSB thermal trip", "PRIORITY=%i",
157 LOG_INFO, "REDFISH_MESSAGE_ID=%s",
158 "OpenBMC.0.1.SsbThermalTrip", NULL);
159}
160
Jason M. Billsa15c2522019-08-16 10:01:44 -0700161static void initializeErrorState();
Jason M. Bills1490b142019-07-01 15:48:43 -0700162static void initializeHostState()
163{
164 conn->async_method_call(
165 [](boost::system::error_code ec,
166 const std::variant<std::string>& property) {
167 if (ec)
168 {
169 return;
170 }
171 const std::string* state = std::get_if<std::string>(&property);
172 if (state == nullptr)
173 {
174 std::cerr << "Unable to read host state value\n";
175 return;
176 }
177 hostOff = *state == "xyz.openbmc_project.State.Host.HostState.Off";
Jason M. Billsa15c2522019-08-16 10:01:44 -0700178 // If the system is on, initialize the error state
179 if (!hostOff)
180 {
181 initializeErrorState();
182 }
Jason M. Bills1490b142019-07-01 15:48:43 -0700183 },
184 "xyz.openbmc_project.State.Host", "/xyz/openbmc_project/state/host0",
185 "org.freedesktop.DBus.Properties", "Get",
186 "xyz.openbmc_project.State.Host", "CurrentHostState");
187}
188
189static std::shared_ptr<sdbusplus::bus::match::match> startHostStateMonitor()
190{
191 return std::make_shared<sdbusplus::bus::match::match>(
192 *conn,
193 "type='signal',interface='org.freedesktop.DBus.Properties',"
194 "member='PropertiesChanged',arg0namespace='xyz.openbmc_project.State."
195 "Host'",
196 [](sdbusplus::message::message& msg) {
197 std::string interfaceName;
198 boost::container::flat_map<std::string, std::variant<std::string>>
199 propertiesChanged;
200 std::string state;
201 try
202 {
203 msg.read(interfaceName, propertiesChanged);
204 state =
205 std::get<std::string>(propertiesChanged.begin()->second);
206 }
207 catch (std::exception& e)
208 {
209 std::cerr << "Unable to read host state\n";
210 return;
211 }
212 hostOff = state == "xyz.openbmc_project.State.Host.HostState.Off";
213
Jason M. Bills1490b142019-07-01 15:48:43 -0700214 if (hostOff)
215 {
Jason M. Billse94f5e12019-09-13 11:11:34 -0700216 // No host events should fire while off, so cancel any pending
217 // timers
Jason M. Bills1490b142019-07-01 15:48:43 -0700218 caterrAssertTimer.cancel();
Jason M. Bills8c584392019-08-19 11:05:51 -0700219 err0AssertTimer.cancel();
Jason M. Bills75af3962019-08-19 11:07:17 -0700220 err1AssertTimer.cancel();
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700221 err2AssertTimer.cancel();
Jason M. Bills89922f82019-08-06 11:10:02 -0700222 smiAssertTimer.cancel();
Jason M. Bills1490b142019-07-01 15:48:43 -0700223 }
Jason M. Billse94f5e12019-09-13 11:11:34 -0700224 else
225 {
226 // Handle any initial errors when the host turns on
227 initializeErrorState();
228 }
Jason M. Bills1490b142019-07-01 15:48:43 -0700229 });
230}
231
232static bool requestGPIOEvents(
233 const std::string& name, const std::function<void()>& handler,
234 gpiod::line& gpioLine,
235 boost::asio::posix::stream_descriptor& gpioEventDescriptor)
236{
237 // Find the GPIO line
238 gpioLine = gpiod::find_line(name);
239 if (!gpioLine)
240 {
241 std::cerr << "Failed to find the " << name << " line\n";
242 return false;
243 }
244
245 try
246 {
247 gpioLine.request(
248 {"host-error-monitor", gpiod::line_request::EVENT_BOTH_EDGES});
249 }
250 catch (std::exception&)
251 {
252 std::cerr << "Failed to request events for " << name << "\n";
253 return false;
254 }
255
256 int gpioLineFd = gpioLine.event_get_fd();
257 if (gpioLineFd < 0)
258 {
259 std::cerr << "Failed to get " << name << " fd\n";
260 return false;
261 }
262
263 gpioEventDescriptor.assign(gpioLineFd);
264
265 gpioEventDescriptor.async_wait(
266 boost::asio::posix::stream_descriptor::wait_read,
267 [&name, handler](const boost::system::error_code ec) {
268 if (ec)
269 {
270 std::cerr << name << " fd handler error: " << ec.message()
271 << "\n";
272 return;
273 }
274 handler();
275 });
276 return true;
277}
278
279static void startPowerCycle()
280{
281 conn->async_method_call(
282 [](boost::system::error_code ec) {
283 if (ec)
284 {
285 std::cerr << "failed to set Chassis State\n";
286 }
287 },
288 "xyz.openbmc_project.State.Chassis",
289 "/xyz/openbmc_project/state/chassis0",
290 "org.freedesktop.DBus.Properties", "Set",
291 "xyz.openbmc_project.State.Chassis", "RequestedPowerTransition",
292 std::variant<std::string>{
293 "xyz.openbmc_project.State.Chassis.Transition.PowerCycle"});
294}
295
296static void startCrashdumpAndRecovery(bool recoverSystem)
297{
298 std::cout << "Starting crashdump\n";
299 static std::shared_ptr<sdbusplus::bus::match::match> crashdumpCompleteMatch;
300 static boost::asio::steady_timer crashdumpTimer(io);
301
302 crashdumpCompleteMatch = std::make_shared<sdbusplus::bus::match::match>(
303 *conn,
304 "type='signal',interface='org.freedesktop.DBus.Properties',"
305 "member='PropertiesChanged',arg0namespace='com.intel.crashdump'",
306 [recoverSystem](sdbusplus::message::message& msg) {
307 crashdumpTimer.cancel();
308 std::cout << "Crashdump completed\n";
309 if (recoverSystem)
310 {
311 std::cout << "Recovering the system\n";
312 startPowerCycle();
313 }
314 crashdumpCompleteMatch.reset();
315 });
316
317 crashdumpTimer.expires_after(std::chrono::seconds(crashdumpTimeoutS));
318 crashdumpTimer.async_wait([](const boost::system::error_code ec) {
319 if (ec)
320 {
321 // operation_aborted is expected if timer is canceled
322 if (ec != boost::asio::error::operation_aborted)
323 {
324 std::cerr << "Crashdump async_wait failed: " << ec.message()
325 << "\n";
326 }
327 std::cout << "Crashdump timer canceled\n";
328 return;
329 }
330 std::cerr << "Crashdump failed to complete before timeout\n";
331 crashdumpCompleteMatch.reset();
332 });
333
334 conn->async_method_call(
335 [](boost::system::error_code ec) {
336 if (ec)
337 {
338 std::cerr << "failed to start Crashdump\n";
339 crashdumpTimer.cancel();
340 crashdumpCompleteMatch.reset();
341 }
342 },
343 "com.intel.crashdump", "/com/intel/crashdump",
344 "com.intel.crashdump.Stored", "GenerateStoredLog");
345}
346
Jason M. Billsd1a19f62019-08-06 11:52:58 -0700347static void incrementCPUErrorCount(int cpuNum)
348{
349 std::string propertyName = "ErrorCountCPU" + std::to_string(cpuNum + 1);
350
351 // Get the current count
352 conn->async_method_call(
353 [propertyName](boost::system::error_code ec,
354 const std::variant<uint8_t>& property) {
355 if (ec)
356 {
357 std::cerr << "Failed to read " << propertyName << ": "
358 << ec.message() << "\n";
359 return;
360 }
361 const uint8_t* errorCountVariant = std::get_if<uint8_t>(&property);
362 if (errorCountVariant == nullptr)
363 {
364 std::cerr << propertyName << " invalid\n";
365 return;
366 }
367 uint8_t errorCount = *errorCountVariant;
368 if (errorCount == std::numeric_limits<uint8_t>::max())
369 {
370 std::cerr << "Maximum error count reached\n";
371 return;
372 }
373 // Increment the count
374 errorCount++;
375 conn->async_method_call(
376 [propertyName](boost::system::error_code ec) {
377 if (ec)
378 {
379 std::cerr << "Failed to set " << propertyName << ": "
380 << ec.message() << "\n";
381 }
382 },
383 "xyz.openbmc_project.Settings",
384 "/xyz/openbmc_project/control/processor_error_config",
385 "org.freedesktop.DBus.Properties", "Set",
386 "xyz.openbmc_project.Control.Processor.ErrConfig", propertyName,
387 std::variant<uint8_t>{errorCount});
388 },
389 "xyz.openbmc_project.Settings",
390 "/xyz/openbmc_project/control/processor_error_config",
391 "org.freedesktop.DBus.Properties", "Get",
392 "xyz.openbmc_project.Control.Processor.ErrConfig", propertyName);
393}
394
Jason M. Billsa3397932019-08-06 11:07:21 -0700395static bool checkIERRCPUs()
396{
397 bool cpuIERRFound = false;
398 for (int cpu = 0, addr = MIN_CLIENT_ADDR; addr <= MAX_CLIENT_ADDR;
399 cpu++, addr++)
400 {
401 uint8_t cc = 0;
402 CPUModel model{};
403 uint8_t stepping = 0;
404 if (peci_GetCPUID(addr, &model, &stepping, &cc) != PECI_CC_SUCCESS)
405 {
406 std::cerr << "Cannot get CPUID!\n";
407 continue;
408 }
409
410 switch (model)
411 {
412 case skx:
413 {
414 // First check the MCA_ERR_SRC_LOG to see if this is the CPU
415 // that caused the IERR
416 uint32_t mcaErrSrcLog = 0;
417 if (peci_RdPkgConfig(addr, 0, 5, 4, (uint8_t*)&mcaErrSrcLog,
418 &cc) != PECI_CC_SUCCESS)
419 {
420 continue;
421 }
422 // Check MSMI_INTERNAL (20) and IERR_INTERNAL (27)
423 if ((mcaErrSrcLog & (1 << 20)) || (mcaErrSrcLog & (1 << 27)))
424 {
425 // TODO: Light the CPU fault LED?
426 cpuIERRFound = true;
Jason M. Billsd1a19f62019-08-06 11:52:58 -0700427 incrementCPUErrorCount(cpu);
Jason M. Billsa3397932019-08-06 11:07:21 -0700428 // Next check if it's a CPU/VR mismatch by reading the
429 // IA32_MC4_STATUS MSR (0x411)
430 uint64_t mc4Status = 0;
431 if (peci_RdIAMSR(addr, 0, 0x411, &mc4Status, &cc) !=
432 PECI_CC_SUCCESS)
433 {
434 continue;
435 }
436 // Check MSEC bits 31:24 for
437 // MCA_SVID_VCCIN_VR_ICC_MAX_FAILURE (0x40),
438 // MCA_SVID_VCCIN_VR_VOUT_FAILURE (0x42), or
439 // MCA_SVID_CPU_VR_CAPABILITY_ERROR (0x43)
440 if ((mc4Status & (0x40 << 24)) ||
441 (mc4Status & (0x42 << 24)) ||
442 (mc4Status & (0x43 << 24)))
443 {
444 cpuIERRLog(cpu, "CPU/VR Mismatch");
445 continue;
446 }
447
448 // Next check if it's a Core FIVR fault by looking for a
449 // non-zero value of CORE_FIVR_ERR_LOG (B(1) D30 F2 offset
450 // 80h)
451 uint32_t coreFIVRErrLog = 0;
452 if (peci_RdPCIConfigLocal(
453 addr, 1, 30, 2, 0x80, sizeof(uint32_t),
454 (uint8_t*)&coreFIVRErrLog, &cc) != PECI_CC_SUCCESS)
455 {
456 continue;
457 }
458 if (coreFIVRErrLog)
459 {
460 cpuIERRLog(cpu, "Core FIVR Fault");
461 continue;
462 }
463
464 // Next check if it's an Uncore FIVR fault by looking for a
465 // non-zero value of UNCORE_FIVR_ERR_LOG (B(1) D30 F2 offset
466 // 84h)
467 uint32_t uncoreFIVRErrLog = 0;
468 if (peci_RdPCIConfigLocal(addr, 1, 30, 2, 0x84,
469 sizeof(uint32_t),
470 (uint8_t*)&uncoreFIVRErrLog,
471 &cc) != PECI_CC_SUCCESS)
472 {
473 continue;
474 }
475 if (uncoreFIVRErrLog)
476 {
477 cpuIERRLog(cpu, "Uncore FIVR Fault");
478 continue;
479 }
480
481 // Last if CORE_FIVR_ERR_LOG and UNCORE_FIVR_ERR_LOG are
482 // both zero, but MSEC bits 31:24 have either
483 // MCA_FIVR_CATAS_OVERVOL_FAULT (0x51) or
484 // MCA_FIVR_CATAS_OVERCUR_FAULT (0x52), then log it as an
485 // uncore FIVR fault
486 if (!coreFIVRErrLog && !uncoreFIVRErrLog &&
487 ((mc4Status & (0x51 << 24)) ||
488 (mc4Status & (0x52 << 24))))
489 {
490 cpuIERRLog(cpu, "Uncore FIVR Fault");
491 continue;
492 }
493 cpuIERRLog(cpu);
494 }
495 break;
496 }
497 case icx:
498 {
499 // First check the MCA_ERR_SRC_LOG to see if this is the CPU
500 // that caused the IERR
501 uint32_t mcaErrSrcLog = 0;
502 if (peci_RdPkgConfig(addr, 0, 5, 4, (uint8_t*)&mcaErrSrcLog,
503 &cc) != PECI_CC_SUCCESS)
504 {
505 continue;
506 }
507 // Check MSMI_INTERNAL (20) and IERR_INTERNAL (27)
508 if ((mcaErrSrcLog & (1 << 20)) || (mcaErrSrcLog & (1 << 27)))
509 {
510 // TODO: Light the CPU fault LED?
511 cpuIERRFound = true;
Jason M. Billsd1a19f62019-08-06 11:52:58 -0700512 incrementCPUErrorCount(cpu);
Jason M. Billsa3397932019-08-06 11:07:21 -0700513 // Next check if it's a CPU/VR mismatch by reading the
514 // IA32_MC4_STATUS MSR (0x411)
515 uint64_t mc4Status = 0;
516 if (peci_RdIAMSR(addr, 0, 0x411, &mc4Status, &cc) !=
517 PECI_CC_SUCCESS)
518 {
519 continue;
520 }
521 // TODO: Update MSEC/MSCOD_31_24 check
522 // Check MSEC bits 31:24 for
523 // MCA_SVID_VCCIN_VR_ICC_MAX_FAILURE (0x40),
524 // MCA_SVID_VCCIN_VR_VOUT_FAILURE (0x42), or
525 // MCA_SVID_CPU_VR_CAPABILITY_ERROR (0x43)
526 if ((mc4Status & (0x40 << 24)) ||
527 (mc4Status & (0x42 << 24)) ||
528 (mc4Status & (0x43 << 24)))
529 {
530 cpuIERRLog(cpu, "CPU/VR Mismatch");
531 continue;
532 }
533
534 // Next check if it's a Core FIVR fault by looking for a
535 // non-zero value of CORE_FIVR_ERR_LOG (B(31) D30 F2 offsets
536 // C0h and C4h) (Note: Bus 31 is accessed on PECI as bus 14)
537 uint32_t coreFIVRErrLog0 = 0;
538 uint32_t coreFIVRErrLog1 = 0;
539 if (peci_RdEndPointConfigPciLocal(
540 addr, 0, 14, 30, 2, 0xC0, sizeof(uint32_t),
541 (uint8_t*)&coreFIVRErrLog0, &cc) != PECI_CC_SUCCESS)
542 {
543 continue;
544 }
545 if (peci_RdEndPointConfigPciLocal(
546 addr, 0, 14, 30, 2, 0xC4, sizeof(uint32_t),
547 (uint8_t*)&coreFIVRErrLog1, &cc) != PECI_CC_SUCCESS)
548 {
549 continue;
550 }
551 if (coreFIVRErrLog0 || coreFIVRErrLog1)
552 {
553 cpuIERRLog(cpu, "Core FIVR Fault");
554 continue;
555 }
556
557 // Next check if it's an Uncore FIVR fault by looking for a
558 // non-zero value of UNCORE_FIVR_ERR_LOG (B(31) D30 F2
559 // offset 84h) (Note: Bus 31 is accessed on PECI as bus 14)
560 uint32_t uncoreFIVRErrLog = 0;
561 if (peci_RdEndPointConfigPciLocal(
562 addr, 0, 14, 30, 2, 0x84, sizeof(uint32_t),
563 (uint8_t*)&uncoreFIVRErrLog,
564 &cc) != PECI_CC_SUCCESS)
565 {
566 continue;
567 }
568 if (uncoreFIVRErrLog)
569 {
570 cpuIERRLog(cpu, "Uncore FIVR Fault");
571 continue;
572 }
573
574 // TODO: Update MSEC/MSCOD_31_24 check
575 // Last if CORE_FIVR_ERR_LOG and UNCORE_FIVR_ERR_LOG are
576 // both zero, but MSEC bits 31:24 have either
577 // MCA_FIVR_CATAS_OVERVOL_FAULT (0x51) or
578 // MCA_FIVR_CATAS_OVERCUR_FAULT (0x52), then log it as an
579 // uncore FIVR fault
580 if (!coreFIVRErrLog0 && !coreFIVRErrLog1 &&
581 !uncoreFIVRErrLog &&
582 ((mc4Status & (0x51 << 24)) ||
583 (mc4Status & (0x52 << 24))))
584 {
585 cpuIERRLog(cpu, "Uncore FIVR Fault");
586 continue;
587 }
588 cpuIERRLog(cpu);
589 }
590 break;
591 }
592 }
593 }
594 return cpuIERRFound;
595}
596
Jason M. Billsa15c2522019-08-16 10:01:44 -0700597static void caterrAssertHandler()
598{
Jason M. Billsa15c2522019-08-16 10:01:44 -0700599 caterrAssertTimer.expires_after(std::chrono::milliseconds(caterrTimeoutMs));
600 caterrAssertTimer.async_wait([](const boost::system::error_code ec) {
601 if (ec)
602 {
603 // operation_aborted is expected if timer is canceled
604 // before completion.
605 if (ec != boost::asio::error::operation_aborted)
606 {
607 std::cerr << "caterr timeout async_wait failed: "
608 << ec.message() << "\n";
609 }
Jason M. Billsa15c2522019-08-16 10:01:44 -0700610 return;
611 }
Jason M. Billsa3397932019-08-06 11:07:21 -0700612 std::cerr << "CATERR asserted for " << std::to_string(caterrTimeoutMs)
613 << " ms\n";
614 if (!checkIERRCPUs())
615 {
616 cpuIERRLog();
617 }
Jason M. Billsa15c2522019-08-16 10:01:44 -0700618 conn->async_method_call(
619 [](boost::system::error_code ec,
620 const std::variant<bool>& property) {
621 if (ec)
622 {
623 return;
624 }
625 const bool* reset = std::get_if<bool>(&property);
626 if (reset == nullptr)
627 {
628 std::cerr << "Unable to read reset on CATERR value\n";
629 return;
630 }
631 startCrashdumpAndRecovery(*reset);
632 },
633 "xyz.openbmc_project.Settings",
634 "/xyz/openbmc_project/control/processor_error_config",
635 "org.freedesktop.DBus.Properties", "Get",
636 "xyz.openbmc_project.Control.Processor.ErrConfig", "ResetOnCATERR");
637 });
638}
639
Jason M. Bills1490b142019-07-01 15:48:43 -0700640static void caterrHandler()
641{
642 if (!hostOff)
643 {
644 gpiod::line_event gpioLineEvent = caterrLine.event_read();
645
646 bool caterr =
647 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
648 if (caterr)
649 {
Jason M. Billsa15c2522019-08-16 10:01:44 -0700650 caterrAssertHandler();
Jason M. Bills1490b142019-07-01 15:48:43 -0700651 }
652 else
653 {
654 caterrAssertTimer.cancel();
655 }
656 }
657 caterrEvent.async_wait(boost::asio::posix::stream_descriptor::wait_read,
658 [](const boost::system::error_code ec) {
659 if (ec)
660 {
661 std::cerr << "caterr handler error: "
662 << ec.message() << "\n";
663 return;
664 }
665 caterrHandler();
666 });
667}
Jason M. Bills78c5eed2019-08-28 14:00:40 -0700668
Jason M. Billse94f5e12019-09-13 11:11:34 -0700669static void cpu1ThermtripAssertHandler()
670{
671 cpuThermTripLog(1);
672}
673
Jason M. Bills78c5eed2019-08-28 14:00:40 -0700674static void cpu1ThermtripHandler()
675{
676 if (!hostOff)
677 {
678 gpiod::line_event gpioLineEvent = cpu1ThermtripLine.event_read();
679
680 bool cpu1Thermtrip =
681 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
682 if (cpu1Thermtrip)
683 {
Jason M. Billse94f5e12019-09-13 11:11:34 -0700684 cpu1ThermtripAssertHandler();
Jason M. Bills78c5eed2019-08-28 14:00:40 -0700685 }
686 }
687 cpu1ThermtripEvent.async_wait(
688 boost::asio::posix::stream_descriptor::wait_read,
689 [](const boost::system::error_code ec) {
690 if (ec)
691 {
692 std::cerr << "CPU 1 Thermtrip handler error: " << ec.message()
693 << "\n";
694 return;
695 }
696 cpu1ThermtripHandler();
697 });
698}
699
Jason M. Billse94f5e12019-09-13 11:11:34 -0700700static void cpu2ThermtripAssertHandler()
701{
702 cpuThermTripLog(2);
703}
704
Jason M. Bills78c5eed2019-08-28 14:00:40 -0700705static void cpu2ThermtripHandler()
706{
707 if (!hostOff)
708 {
709 gpiod::line_event gpioLineEvent = cpu2ThermtripLine.event_read();
710
711 bool cpu2Thermtrip =
712 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
713 if (cpu2Thermtrip)
714 {
Jason M. Billse94f5e12019-09-13 11:11:34 -0700715 cpu2ThermtripAssertHandler();
Jason M. Bills78c5eed2019-08-28 14:00:40 -0700716 }
717 }
718 cpu2ThermtripEvent.async_wait(
719 boost::asio::posix::stream_descriptor::wait_read,
720 [](const boost::system::error_code ec) {
721 if (ec)
722 {
723 std::cerr << "CPU 2 Thermtrip handler error: " << ec.message()
724 << "\n";
725 return;
726 }
727 cpu2ThermtripHandler();
728 });
729}
730
Jason M. Billse94f5e12019-09-13 11:11:34 -0700731static void cpu1VRHotAssertHandler()
732{
733 cpuVRHotLog("CPU 1");
734}
735
Jason M. Bills250fa632019-08-28 15:58:25 -0700736static void cpu1VRHotHandler()
737{
738 if (!hostOff)
739 {
740 gpiod::line_event gpioLineEvent = cpu1VRHotLine.event_read();
741
742 bool cpu1VRHot =
743 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
744 if (cpu1VRHot)
745 {
Jason M. Billse94f5e12019-09-13 11:11:34 -0700746 cpu1VRHotAssertHandler();
Jason M. Bills250fa632019-08-28 15:58:25 -0700747 }
748 }
749 cpu1VRHotEvent.async_wait(boost::asio::posix::stream_descriptor::wait_read,
750 [](const boost::system::error_code ec) {
751 if (ec)
752 {
753 std::cerr << "CPU 1 VRHot handler error: "
754 << ec.message() << "\n";
755 return;
756 }
757 cpu1VRHotHandler();
758 });
759}
760
Jason M. Billse94f5e12019-09-13 11:11:34 -0700761static void cpu1MemABCDVRHotAssertHandler()
762{
763 cpuVRHotLog("CPU 1 Memory ABCD");
764}
765
Jason M. Bills9647ba72019-08-29 14:19:19 -0700766static void cpu1MemABCDVRHotHandler()
767{
768 if (!hostOff)
769 {
770 gpiod::line_event gpioLineEvent = cpu1MemABCDVRHotLine.event_read();
771
772 bool cpu1MemABCDVRHot =
773 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
774 if (cpu1MemABCDVRHot)
775 {
Jason M. Billse94f5e12019-09-13 11:11:34 -0700776 cpu1MemABCDVRHotAssertHandler();
Jason M. Bills9647ba72019-08-29 14:19:19 -0700777 }
778 }
779 cpu1MemABCDVRHotEvent.async_wait(
780 boost::asio::posix::stream_descriptor::wait_read,
781 [](const boost::system::error_code ec) {
782 if (ec)
783 {
784 std::cerr << "CPU 1 Memory ABCD VRHot handler error: "
785 << ec.message() << "\n";
786 return;
787 }
788 cpu1MemABCDVRHotHandler();
789 });
790}
791
Jason M. Billse94f5e12019-09-13 11:11:34 -0700792static void cpu1MemEFGHVRHotAssertHandler()
793{
794 cpuVRHotLog("CPU 1 Memory EFGH");
795}
796
Jason M. Bills9647ba72019-08-29 14:19:19 -0700797static void cpu1MemEFGHVRHotHandler()
798{
799 if (!hostOff)
800 {
801 gpiod::line_event gpioLineEvent = cpu1MemEFGHVRHotLine.event_read();
802
803 bool cpu1MemEFGHVRHot =
804 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
805 if (cpu1MemEFGHVRHot)
806 {
Jason M. Billse94f5e12019-09-13 11:11:34 -0700807 cpu1MemEFGHVRHotAssertHandler();
Jason M. Bills9647ba72019-08-29 14:19:19 -0700808 }
809 }
810 cpu1MemEFGHVRHotEvent.async_wait(
811 boost::asio::posix::stream_descriptor::wait_read,
812 [](const boost::system::error_code ec) {
813 if (ec)
814 {
815 std::cerr << "CPU 1 Memory EFGH VRHot handler error: "
816 << ec.message() << "\n";
817 return;
818 }
819 cpu1MemEFGHVRHotHandler();
820 });
821}
822
Jason M. Billse94f5e12019-09-13 11:11:34 -0700823static void cpu2VRHotAssertHandler()
824{
825 cpuVRHotLog("CPU 2");
826}
827
Jason M. Bills250fa632019-08-28 15:58:25 -0700828static void cpu2VRHotHandler()
829{
830 if (!hostOff)
831 {
832 gpiod::line_event gpioLineEvent = cpu2VRHotLine.event_read();
833
834 bool cpu2VRHot =
835 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
836 if (cpu2VRHot)
837 {
Jason M. Billse94f5e12019-09-13 11:11:34 -0700838 cpu2VRHotAssertHandler();
Jason M. Bills250fa632019-08-28 15:58:25 -0700839 }
840 }
841 cpu2VRHotEvent.async_wait(boost::asio::posix::stream_descriptor::wait_read,
842 [](const boost::system::error_code ec) {
843 if (ec)
844 {
845 std::cerr << "CPU 2 VRHot handler error: "
846 << ec.message() << "\n";
847 return;
848 }
849 cpu2VRHotHandler();
850 });
851}
852
Jason M. Billse94f5e12019-09-13 11:11:34 -0700853static void cpu2MemABCDVRHotAssertHandler()
854{
855 cpuVRHotLog("CPU 2 Memory ABCD");
856}
857
Jason M. Bills9647ba72019-08-29 14:19:19 -0700858static void cpu2MemABCDVRHotHandler()
859{
860 if (!hostOff)
861 {
862 gpiod::line_event gpioLineEvent = cpu2MemABCDVRHotLine.event_read();
863
864 bool cpu2MemABCDVRHot =
865 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
866 if (cpu2MemABCDVRHot)
867 {
Jason M. Billse94f5e12019-09-13 11:11:34 -0700868 cpu2MemABCDVRHotAssertHandler();
Jason M. Bills9647ba72019-08-29 14:19:19 -0700869 }
870 }
871 cpu2MemABCDVRHotEvent.async_wait(
872 boost::asio::posix::stream_descriptor::wait_read,
873 [](const boost::system::error_code ec) {
874 if (ec)
875 {
876 std::cerr << "CPU 2 Memory ABCD VRHot handler error: "
877 << ec.message() << "\n";
878 return;
879 }
880 cpu2MemABCDVRHotHandler();
881 });
882}
883
Jason M. Billse94f5e12019-09-13 11:11:34 -0700884static void cpu2MemEFGHVRHotAssertHandler()
885{
886 cpuVRHotLog("CPU 2 Memory EFGH");
887}
888
Jason M. Bills9647ba72019-08-29 14:19:19 -0700889static void cpu2MemEFGHVRHotHandler()
890{
891 if (!hostOff)
892 {
893 gpiod::line_event gpioLineEvent = cpu2MemEFGHVRHotLine.event_read();
894
895 bool cpu2MemEFGHVRHot =
896 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
897 if (cpu2MemEFGHVRHot)
898 {
Jason M. Billse94f5e12019-09-13 11:11:34 -0700899 cpu2MemEFGHVRHotAssertHandler();
Jason M. Bills9647ba72019-08-29 14:19:19 -0700900 }
901 }
902 cpu2MemEFGHVRHotEvent.async_wait(
903 boost::asio::posix::stream_descriptor::wait_read,
904 [](const boost::system::error_code ec) {
905 if (ec)
906 {
907 std::cerr << "CPU 2 Memory EFGH VRHot handler error: "
908 << ec.message() << "\n";
909 return;
910 }
911 cpu2MemEFGHVRHotHandler();
912 });
913}
914
Chen Yugange6c0f1c2019-08-02 20:36:42 +0800915static void pchThermtripHandler()
916{
917 if (!hostOff)
918 {
919 gpiod::line_event gpioLineEvent = pchThermtripLine.event_read();
920
921 bool pchThermtrip =
922 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
923 if (pchThermtrip)
924 {
Jason M. Bills08866542019-08-16 12:04:19 -0700925 ssbThermTripLog();
Chen Yugange6c0f1c2019-08-02 20:36:42 +0800926 }
927 }
928 pchThermtripEvent.async_wait(
929 boost::asio::posix::stream_descriptor::wait_read,
930 [](const boost::system::error_code ec) {
931 if (ec)
932 {
933 std::cerr << "PCH Thermal trip handler error: " << ec.message()
934 << "\n";
935 return;
936 }
937 pchThermtripHandler();
938 });
939}
940
Jason M. Billscbf78532019-08-16 15:32:11 -0700941static std::bitset<MAX_CPUS> checkERRPinCPUs(const int errPin)
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700942{
Jason M. Billscbf78532019-08-16 15:32:11 -0700943 int errPinSts = (1 << errPin);
944 std::bitset<MAX_CPUS> errPinCPUs = 0;
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700945 for (int cpu = 0, addr = MIN_CLIENT_ADDR; addr <= MAX_CLIENT_ADDR;
946 cpu++, addr++)
947 {
948 if (peci_Ping(addr) == PECI_CC_SUCCESS)
949 {
950 uint8_t cc = 0;
951 CPUModel model{};
952 uint8_t stepping = 0;
953 if (peci_GetCPUID(addr, &model, &stepping, &cc) != PECI_CC_SUCCESS)
954 {
955 std::cerr << "Cannot get CPUID!\n";
956 continue;
957 }
958
959 switch (model)
960 {
961 case skx:
962 {
963 // Check the ERRPINSTS to see if this is the CPU that caused
Jason M. Billscbf78532019-08-16 15:32:11 -0700964 // the ERRx (B(0) D8 F0 offset 210h)
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700965 uint32_t errpinsts = 0;
966 if (peci_RdPCIConfigLocal(
967 addr, 0, 8, 0, 0x210, sizeof(uint32_t),
968 (uint8_t*)&errpinsts, &cc) == PECI_CC_SUCCESS)
969 {
Jason M. Billscbf78532019-08-16 15:32:11 -0700970 errPinCPUs[cpu] = (errpinsts & errPinSts) != 0;
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700971 }
972 break;
973 }
974 case icx:
975 {
976 // Check the ERRPINSTS to see if this is the CPU that caused
Jason M. Billscbf78532019-08-16 15:32:11 -0700977 // the ERRx (B(30) D0 F3 offset 274h) (Note: Bus 30 is
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700978 // accessed on PECI as bus 13)
979 uint32_t errpinsts = 0;
980 if (peci_RdEndPointConfigPciLocal(
981 addr, 0, 13, 0, 3, 0x274, sizeof(uint32_t),
982 (uint8_t*)&errpinsts, &cc) == PECI_CC_SUCCESS)
983 {
Jason M. Billscbf78532019-08-16 15:32:11 -0700984 errPinCPUs[cpu] = (errpinsts & errPinSts) != 0;
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700985 }
986 break;
987 }
988 }
989 }
990 }
Jason M. Billscbf78532019-08-16 15:32:11 -0700991 return errPinCPUs;
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700992}
993
Jason M. Billscbf78532019-08-16 15:32:11 -0700994static void errXAssertHandler(const int errPin,
995 boost::asio::steady_timer& errXAssertTimer)
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700996{
Jason M. Billscbf78532019-08-16 15:32:11 -0700997 // ERRx status is not guaranteed through the timeout, so save which
998 // CPUs have it asserted
999 std::bitset<MAX_CPUS> errPinCPUs = checkERRPinCPUs(errPin);
1000 errXAssertTimer.expires_after(std::chrono::milliseconds(errTimeoutMs));
1001 errXAssertTimer.async_wait([errPin, errPinCPUs](
1002 const boost::system::error_code ec) {
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001003 if (ec)
1004 {
1005 // operation_aborted is expected if timer is canceled before
1006 // completion.
1007 if (ec != boost::asio::error::operation_aborted)
1008 {
1009 std::cerr << "err2 timeout async_wait failed: " << ec.message()
1010 << "\n";
1011 }
1012 return;
1013 }
Jason M. Billscbf78532019-08-16 15:32:11 -07001014 std::cerr << "ERR" << std::to_string(errPin) << " asserted for "
1015 << std::to_string(errTimeoutMs) << " ms\n";
1016 if (errPinCPUs.count())
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001017 {
Jason M. Billscbf78532019-08-16 15:32:11 -07001018 for (int i = 0; i < errPinCPUs.size(); i++)
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001019 {
Jason M. Billscbf78532019-08-16 15:32:11 -07001020 if (errPinCPUs[i])
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001021 {
Jason M. Billscbf78532019-08-16 15:32:11 -07001022 cpuERRXLog(errPin, i);
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001023 }
1024 }
1025 }
1026 else
1027 {
Jason M. Billscbf78532019-08-16 15:32:11 -07001028 cpuERRXLog(errPin);
1029 }
1030 });
1031}
1032
Jason M. Bills8c584392019-08-19 11:05:51 -07001033static void err0AssertHandler()
1034{
1035 // Handle the standard ERR0 detection and logging
1036 const static constexpr int err0 = 0;
1037 errXAssertHandler(err0, err0AssertTimer);
1038}
1039
1040static void err0Handler()
1041{
1042 if (!hostOff)
1043 {
1044 gpiod::line_event gpioLineEvent = err0Line.event_read();
1045
1046 bool err0 = gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
1047 if (err0)
1048 {
1049 err0AssertHandler();
1050 }
1051 else
1052 {
1053 err0AssertTimer.cancel();
1054 }
1055 }
1056 err0Event.async_wait(boost::asio::posix::stream_descriptor::wait_read,
1057 [](const boost::system::error_code ec) {
1058 if (ec)
1059 {
1060 std::cerr
1061 << "err0 handler error: " << ec.message()
1062 << "\n";
1063 return;
1064 }
1065 err0Handler();
1066 });
1067}
1068
Jason M. Bills75af3962019-08-19 11:07:17 -07001069static void err1AssertHandler()
1070{
1071 // Handle the standard ERR1 detection and logging
1072 const static constexpr int err1 = 1;
1073 errXAssertHandler(err1, err1AssertTimer);
1074}
1075
1076static void err1Handler()
1077{
1078 if (!hostOff)
1079 {
1080 gpiod::line_event gpioLineEvent = err1Line.event_read();
1081
1082 bool err1 = gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
1083 if (err1)
1084 {
1085 err1AssertHandler();
1086 }
1087 else
1088 {
1089 err1AssertTimer.cancel();
1090 }
1091 }
1092 err1Event.async_wait(boost::asio::posix::stream_descriptor::wait_read,
1093 [](const boost::system::error_code ec) {
1094 if (ec)
1095 {
1096 std::cerr
1097 << "err1 handler error: " << ec.message()
1098 << "\n";
1099 return;
1100 }
1101 err1Handler();
1102 });
1103}
1104
Jason M. Billscbf78532019-08-16 15:32:11 -07001105static void err2AssertHandler()
1106{
1107 // Handle the standard ERR2 detection and logging
1108 const static constexpr int err2 = 2;
1109 errXAssertHandler(err2, err2AssertTimer);
1110 // Also handle reset for ERR2
1111 err2AssertTimer.async_wait([](const boost::system::error_code ec) {
1112 if (ec)
1113 {
1114 // operation_aborted is expected if timer is canceled before
1115 // completion.
1116 if (ec != boost::asio::error::operation_aborted)
1117 {
1118 std::cerr << "err2 timeout async_wait failed: " << ec.message()
1119 << "\n";
1120 }
1121 return;
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001122 }
1123 conn->async_method_call(
1124 [](boost::system::error_code ec,
1125 const std::variant<bool>& property) {
1126 if (ec)
1127 {
1128 return;
1129 }
1130 const bool* reset = std::get_if<bool>(&property);
1131 if (reset == nullptr)
1132 {
1133 std::cerr << "Unable to read reset on ERR2 value\n";
1134 return;
1135 }
1136 startCrashdumpAndRecovery(*reset);
1137 },
1138 "xyz.openbmc_project.Settings",
1139 "/xyz/openbmc_project/control/processor_error_config",
1140 "org.freedesktop.DBus.Properties", "Get",
1141 "xyz.openbmc_project.Control.Processor.ErrConfig", "ResetOnERR2");
1142 });
1143}
1144
1145static void err2Handler()
1146{
1147 if (!hostOff)
1148 {
1149 gpiod::line_event gpioLineEvent = err2Line.event_read();
1150
1151 bool err2 = gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
1152 if (err2)
1153 {
1154 err2AssertHandler();
1155 }
1156 else
1157 {
1158 err2AssertTimer.cancel();
1159 }
1160 }
1161 err2Event.async_wait(boost::asio::posix::stream_descriptor::wait_read,
1162 [](const boost::system::error_code ec) {
1163 if (ec)
1164 {
1165 std::cerr
1166 << "err2 handler error: " << ec.message()
1167 << "\n";
1168 return;
1169 }
1170 err2Handler();
1171 });
1172}
1173
Jason M. Bills89922f82019-08-06 11:10:02 -07001174static void smiAssertHandler()
1175{
1176 smiAssertTimer.expires_after(std::chrono::milliseconds(smiTimeoutMs));
1177 smiAssertTimer.async_wait([](const boost::system::error_code ec) {
1178 if (ec)
1179 {
1180 // operation_aborted is expected if timer is canceled before
1181 // completion.
1182 if (ec != boost::asio::error::operation_aborted)
1183 {
1184 std::cerr << "smi timeout async_wait failed: " << ec.message()
1185 << "\n";
1186 }
1187 return;
1188 }
1189 std::cerr << "SMI asserted for " << std::to_string(smiTimeoutMs)
1190 << " ms\n";
1191 smiTimeoutLog();
1192 conn->async_method_call(
1193 [](boost::system::error_code ec,
1194 const std::variant<bool>& property) {
1195 if (ec)
1196 {
1197 return;
1198 }
1199 const bool* reset = std::get_if<bool>(&property);
1200 if (reset == nullptr)
1201 {
1202 std::cerr << "Unable to read reset on SMI value\n";
1203 return;
1204 }
1205 startCrashdumpAndRecovery(*reset);
1206 },
1207 "xyz.openbmc_project.Settings",
1208 "/xyz/openbmc_project/control/bmc_reset_disables",
1209 "org.freedesktop.DBus.Properties", "Get",
1210 "xyz.openbmc_project.Control.ResetDisables", "ResetOnSMI");
1211 });
1212}
1213
1214static void smiHandler()
1215{
1216 if (!hostOff)
1217 {
1218 gpiod::line_event gpioLineEvent = smiLine.event_read();
1219
1220 bool smi = gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
1221 if (smi)
1222 {
1223 smiAssertHandler();
1224 }
1225 else
1226 {
1227 smiAssertTimer.cancel();
1228 }
1229 }
1230 smiEvent.async_wait(boost::asio::posix::stream_descriptor::wait_read,
1231 [](const boost::system::error_code ec) {
1232 if (ec)
1233 {
1234 std::cerr
1235 << "smi handler error: " << ec.message()
1236 << "\n";
1237 return;
1238 }
1239 smiHandler();
1240 });
1241}
1242
Jason M. Billsa15c2522019-08-16 10:01:44 -07001243static void initializeErrorState()
1244{
1245 // Handle CPU_CATERR if it's asserted now
1246 if (caterrLine.get_value() == 0)
1247 {
1248 caterrAssertHandler();
1249 }
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001250
Jason M. Bills8c584392019-08-19 11:05:51 -07001251 // Handle CPU_ERR0 if it's asserted now
1252 if (err0Line.get_value() == 0)
1253 {
1254 err0AssertHandler();
1255 }
1256
Jason M. Bills75af3962019-08-19 11:07:17 -07001257 // Handle CPU_ERR1 if it's asserted now
1258 if (err1Line.get_value() == 0)
1259 {
1260 err1AssertHandler();
1261 }
1262
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001263 // Handle CPU_ERR2 if it's asserted now
1264 if (err2Line.get_value() == 0)
1265 {
1266 err2AssertHandler();
1267 }
Jason M. Bills89922f82019-08-06 11:10:02 -07001268
1269 // Handle SMI if it's asserted now
1270 if (smiLine.get_value() == 0)
1271 {
1272 smiAssertHandler();
1273 }
Jason M. Bills08866542019-08-16 12:04:19 -07001274
Jason M. Billse94f5e12019-09-13 11:11:34 -07001275 // Handle CPU1_THERMTRIP if it's asserted now
1276 if (cpu1ThermtripLine.get_value() == 0)
1277 {
1278 cpu1ThermtripAssertHandler();
1279 }
1280
1281 // Handle CPU2_THERMTRIP if it's asserted now
1282 if (cpu2ThermtripLine.get_value() == 0)
1283 {
1284 cpu2ThermtripAssertHandler();
1285 }
1286
1287 // Handle CPU1_VRHOT if it's asserted now
1288 if (cpu1VRHotLine.get_value() == 0)
1289 {
1290 cpu1VRHotAssertHandler();
1291 }
1292
1293 // Handle CPU1_MEM_ABCD_VRHOT if it's asserted now
1294 if (cpu1MemABCDVRHotLine.get_value() == 0)
1295 {
1296 cpu1MemABCDVRHotAssertHandler();
1297 }
1298
1299 // Handle CPU1_MEM_EFGH_VRHOT if it's asserted now
1300 if (cpu1MemEFGHVRHotLine.get_value() == 0)
1301 {
1302 cpu1MemEFGHVRHotAssertHandler();
1303 }
1304
1305 // Handle CPU2_VRHOT if it's asserted now
1306 if (cpu2VRHotLine.get_value() == 0)
1307 {
1308 cpu2VRHotAssertHandler();
1309 }
1310
1311 // Handle CPU2_MEM_ABCD_VRHOT if it's asserted now
1312 if (cpu2MemABCDVRHotLine.get_value() == 0)
1313 {
1314 cpu2MemABCDVRHotAssertHandler();
1315 }
1316
1317 // Handle CPU2_MEM_EFGH_VRHOT if it's asserted now
1318 if (cpu2MemEFGHVRHotLine.get_value() == 0)
1319 {
1320 cpu2MemEFGHVRHotAssertHandler();
1321 }
1322
Jason M. Bills08866542019-08-16 12:04:19 -07001323 // Handle PCH_BMC_THERMTRIP if it's asserted now
1324 if (pchThermtripLine.get_value() == 0)
1325 {
1326 ssbThermTripLog();
1327 }
Jason M. Billsa15c2522019-08-16 10:01:44 -07001328}
Jason M. Bills1490b142019-07-01 15:48:43 -07001329} // namespace host_error_monitor
1330
1331int main(int argc, char* argv[])
1332{
1333 // setup connection to dbus
1334 host_error_monitor::conn =
1335 std::make_shared<sdbusplus::asio::connection>(host_error_monitor::io);
1336
1337 // Host Error Monitor Object
1338 host_error_monitor::conn->request_name(
1339 "xyz.openbmc_project.HostErrorMonitor");
1340 sdbusplus::asio::object_server server =
1341 sdbusplus::asio::object_server(host_error_monitor::conn);
1342
1343 // Start tracking host state
1344 std::shared_ptr<sdbusplus::bus::match::match> hostStateMonitor =
1345 host_error_monitor::startHostStateMonitor();
1346
1347 // Initialize the host state
1348 host_error_monitor::initializeHostState();
1349
1350 // Request CPU_CATERR GPIO events
1351 if (!host_error_monitor::requestGPIOEvents(
1352 "CPU_CATERR", host_error_monitor::caterrHandler,
1353 host_error_monitor::caterrLine, host_error_monitor::caterrEvent))
1354 {
1355 return -1;
1356 }
1357
Jason M. Bills8c584392019-08-19 11:05:51 -07001358 // Request CPU_ERR0 GPIO events
1359 if (!host_error_monitor::requestGPIOEvents(
1360 "CPU_ERR0", host_error_monitor::err0Handler,
1361 host_error_monitor::err0Line, host_error_monitor::err0Event))
1362 {
1363 return -1;
1364 }
1365
Jason M. Bills75af3962019-08-19 11:07:17 -07001366 // Request CPU_ERR1 GPIO events
1367 if (!host_error_monitor::requestGPIOEvents(
1368 "CPU_ERR1", host_error_monitor::err1Handler,
1369 host_error_monitor::err1Line, host_error_monitor::err1Event))
1370 {
1371 return -1;
1372 }
1373
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001374 // Request CPU_ERR2 GPIO events
1375 if (!host_error_monitor::requestGPIOEvents(
1376 "CPU_ERR2", host_error_monitor::err2Handler,
1377 host_error_monitor::err2Line, host_error_monitor::err2Event))
1378 {
1379 return -1;
1380 }
1381
Jason M. Bills89922f82019-08-06 11:10:02 -07001382 // Request SMI GPIO events
1383 if (!host_error_monitor::requestGPIOEvents(
1384 "SMI", host_error_monitor::smiHandler, host_error_monitor::smiLine,
1385 host_error_monitor::smiEvent))
1386 {
1387 return -1;
1388 }
1389
Jason M. Bills78c5eed2019-08-28 14:00:40 -07001390 // Request CPU1_THERMTRIP GPIO events
1391 if (!host_error_monitor::requestGPIOEvents(
1392 "CPU1_THERMTRIP", host_error_monitor::cpu1ThermtripHandler,
1393 host_error_monitor::cpu1ThermtripLine,
1394 host_error_monitor::cpu1ThermtripEvent))
1395 {
1396 return -1;
1397 }
1398
1399 // Request CPU2_THERMTRIP GPIO events
1400 if (!host_error_monitor::requestGPIOEvents(
1401 "CPU2_THERMTRIP", host_error_monitor::cpu2ThermtripHandler,
1402 host_error_monitor::cpu2ThermtripLine,
1403 host_error_monitor::cpu2ThermtripEvent))
1404 {
1405 return -1;
1406 }
1407
Jason M. Bills250fa632019-08-28 15:58:25 -07001408 // Request CPU1_VRHOT GPIO events
1409 if (!host_error_monitor::requestGPIOEvents(
1410 "CPU1_VRHOT", host_error_monitor::cpu1VRHotHandler,
1411 host_error_monitor::cpu1VRHotLine,
1412 host_error_monitor::cpu1VRHotEvent))
1413 {
1414 return -1;
1415 }
1416
Jason M. Bills9647ba72019-08-29 14:19:19 -07001417 // Request CPU1_MEM_ABCD_VRHOT GPIO events
1418 if (!host_error_monitor::requestGPIOEvents(
1419 "CPU1_MEM_ABCD_VRHOT", host_error_monitor::cpu1MemABCDVRHotHandler,
1420 host_error_monitor::cpu1MemABCDVRHotLine,
1421 host_error_monitor::cpu1MemABCDVRHotEvent))
1422 {
1423 return -1;
1424 }
1425
1426 // Request CPU1_MEM_EFGH_VRHOT GPIO events
1427 if (!host_error_monitor::requestGPIOEvents(
1428 "CPU1_MEM_EFGH_VRHOT", host_error_monitor::cpu1MemEFGHVRHotHandler,
1429 host_error_monitor::cpu1MemEFGHVRHotLine,
1430 host_error_monitor::cpu1MemEFGHVRHotEvent))
1431 {
1432 return -1;
1433 }
1434
Jason M. Bills250fa632019-08-28 15:58:25 -07001435 // Request CPU2_VRHOT GPIO events
1436 if (!host_error_monitor::requestGPIOEvents(
1437 "CPU2_VRHOT", host_error_monitor::cpu2VRHotHandler,
1438 host_error_monitor::cpu2VRHotLine,
1439 host_error_monitor::cpu2VRHotEvent))
1440 {
1441 return -1;
1442 }
1443
Jason M. Bills9647ba72019-08-29 14:19:19 -07001444 // Request CPU2_MEM_ABCD_VRHOT GPIO events
1445 if (!host_error_monitor::requestGPIOEvents(
1446 "CPU2_MEM_ABCD_VRHOT", host_error_monitor::cpu2MemABCDVRHotHandler,
1447 host_error_monitor::cpu2MemABCDVRHotLine,
1448 host_error_monitor::cpu2MemABCDVRHotEvent))
1449 {
1450 return -1;
1451 }
1452
1453 // Request CPU2_MEM_EFGH_VRHOT GPIO events
1454 if (!host_error_monitor::requestGPIOEvents(
1455 "CPU2_MEM_EFGH_VRHOT", host_error_monitor::cpu2MemEFGHVRHotHandler,
1456 host_error_monitor::cpu2MemEFGHVRHotLine,
1457 host_error_monitor::cpu2MemEFGHVRHotEvent))
1458 {
1459 return -1;
1460 }
1461
Chen Yugange6c0f1c2019-08-02 20:36:42 +08001462 // Request PCH_BMC_THERMTRIP GPIO events
1463 if (!host_error_monitor::requestGPIOEvents(
1464 "PCH_BMC_THERMTRIP", host_error_monitor::pchThermtripHandler,
1465 host_error_monitor::pchThermtripLine,
1466 host_error_monitor::pchThermtripEvent))
1467 {
1468 return -1;
1469 }
1470
Jason M. Bills1490b142019-07-01 15:48:43 -07001471 host_error_monitor::io.run();
1472
1473 return 0;
1474}