blob: 0172ca2997ee807de495a15e88e826e6b96ac269 [file] [log] [blame]
/*
// Copyright (c) 2019 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
*/
Jason M. Bills6a2cb692019-08-06 11:03:49 -070016#include <peci.h>
Chen Yugange6c0f1c2019-08-02 20:36:42 +080017#include <systemd/sd-journal.h>
18
Jason M. Bills6a2cb692019-08-06 11:03:49 -070019#include <bitset>
Jason M. Bills1490b142019-07-01 15:48:43 -070020#include <boost/asio/posix/stream_descriptor.hpp>
21#include <gpiod.hpp>
22#include <iostream>
23#include <sdbusplus/asio/object_server.hpp>
Jason M. Billsd1a19f62019-08-06 11:52:58 -070024#include <variant>
Jason M. Bills1490b142019-07-01 15:48:43 -070025
26namespace host_error_monitor
27{
28static boost::asio::io_service io;
29static std::shared_ptr<sdbusplus::asio::connection> conn;
Jason M. Billsc4b91f22019-11-26 17:04:50 -080030static std::shared_ptr<sdbusplus::asio::dbus_interface> hostErrorTimeoutIface;
Jason M. Bills1490b142019-07-01 15:48:43 -070031
32static bool hostOff = true;
33
Jason M. Billsc4b91f22019-11-26 17:04:50 -080034static size_t caterrTimeoutMs = 2000;
35const static constexpr size_t caterrTimeoutMsMax = 600000; // 10 minutes maximum
Jason M. Billscbf78532019-08-16 15:32:11 -070036const static constexpr size_t errTimeoutMs = 90000;
Jason M. Bills89922f82019-08-06 11:10:02 -070037const static constexpr size_t smiTimeoutMs = 90000;
Jason M. Bills1490b142019-07-01 15:48:43 -070038const static constexpr size_t crashdumpTimeoutS = 300;
39
40// Timers
41// Timer for CATERR asserted
42static boost::asio::steady_timer caterrAssertTimer(io);
Jason M. Bills8c584392019-08-19 11:05:51 -070043// Timer for ERR0 asserted
44static boost::asio::steady_timer err0AssertTimer(io);
Jason M. Bills75af3962019-08-19 11:07:17 -070045// Timer for ERR1 asserted
46static boost::asio::steady_timer err1AssertTimer(io);
Jason M. Bills6a2cb692019-08-06 11:03:49 -070047// Timer for ERR2 asserted
48static boost::asio::steady_timer err2AssertTimer(io);
Jason M. Bills89922f82019-08-06 11:10:02 -070049// Timer for SMI asserted
50static boost::asio::steady_timer smiAssertTimer(io);
Jason M. Bills1490b142019-07-01 15:48:43 -070051
52// GPIO Lines and Event Descriptors
53static gpiod::line caterrLine;
54static boost::asio::posix::stream_descriptor caterrEvent(io);
Jason M. Bills8c584392019-08-19 11:05:51 -070055static gpiod::line err0Line;
56static boost::asio::posix::stream_descriptor err0Event(io);
Jason M. Bills75af3962019-08-19 11:07:17 -070057static gpiod::line err1Line;
58static boost::asio::posix::stream_descriptor err1Event(io);
Jason M. Bills6a2cb692019-08-06 11:03:49 -070059static gpiod::line err2Line;
60static boost::asio::posix::stream_descriptor err2Event(io);
Jason M. Bills89922f82019-08-06 11:10:02 -070061static gpiod::line smiLine;
62static boost::asio::posix::stream_descriptor smiEvent(io);
Jason M. Bills45e87e02019-09-09 14:45:38 -070063static gpiod::line cpu1FIVRFaultLine;
Jason M. Bills78c5eed2019-08-28 14:00:40 -070064static gpiod::line cpu1ThermtripLine;
65static boost::asio::posix::stream_descriptor cpu1ThermtripEvent(io);
Jason M. Bills45e87e02019-09-09 14:45:38 -070066static gpiod::line cpu2FIVRFaultLine;
Jason M. Bills78c5eed2019-08-28 14:00:40 -070067static gpiod::line cpu2ThermtripLine;
68static boost::asio::posix::stream_descriptor cpu2ThermtripEvent(io);
Jason M. Bills250fa632019-08-28 15:58:25 -070069static gpiod::line cpu1VRHotLine;
70static boost::asio::posix::stream_descriptor cpu1VRHotEvent(io);
71static gpiod::line cpu2VRHotLine;
Jason M. Bills9647ba72019-08-29 14:19:19 -070072static boost::asio::posix::stream_descriptor cpu1MemABCDVRHotEvent(io);
73static gpiod::line cpu1MemEFGHVRHotLine;
74static boost::asio::posix::stream_descriptor cpu1MemEFGHVRHotEvent(io);
75static gpiod::line cpu2MemABCDVRHotLine;
Jason M. Bills250fa632019-08-28 15:58:25 -070076static boost::asio::posix::stream_descriptor cpu2VRHotEvent(io);
Jason M. Bills9647ba72019-08-29 14:19:19 -070077static gpiod::line cpu1MemABCDVRHotLine;
78static boost::asio::posix::stream_descriptor cpu2MemABCDVRHotEvent(io);
79static gpiod::line cpu2MemEFGHVRHotLine;
80static boost::asio::posix::stream_descriptor cpu2MemEFGHVRHotEvent(io);
Chen Yugange6c0f1c2019-08-02 20:36:42 +080081//----------------------------------
82// PCH_BMC_THERMTRIP function related definition
83//----------------------------------
Chen Yugange6c0f1c2019-08-02 20:36:42 +080084static gpiod::line pchThermtripLine;
85static boost::asio::posix::stream_descriptor pchThermtripEvent(io);
Jason M. Bills1490b142019-07-01 15:48:43 -070086
Jason M. Billsa3397932019-08-06 11:07:21 -070087static void cpuIERRLog()
88{
89 sd_journal_send("MESSAGE=HostError: IERR", "PRIORITY=%i", LOG_INFO,
90 "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError",
91 "REDFISH_MESSAGE_ARGS=%s", "IERR", NULL);
92}
93
94static void cpuIERRLog(const int cpuNum)
95{
96 std::string msg = "IERR on CPU " + std::to_string(cpuNum + 1);
97
98 sd_journal_send("MESSAGE=HostError: %s", msg.c_str(), "PRIORITY=%i",
99 LOG_INFO, "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError",
100 "REDFISH_MESSAGE_ARGS=%s", msg.c_str(), NULL);
101}
102
103static void cpuIERRLog(const int cpuNum, const std::string& type)
104{
105 std::string msg = type + " IERR on CPU " + std::to_string(cpuNum + 1);
106
107 sd_journal_send("MESSAGE=HostError: %s", msg.c_str(), "PRIORITY=%i",
108 LOG_INFO, "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError",
109 "REDFISH_MESSAGE_ARGS=%s", msg.c_str(), NULL);
110}
111
Jason M. Billscbf78532019-08-16 15:32:11 -0700112static void cpuERRXLog(const int errPin)
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700113{
Jason M. Billscbf78532019-08-16 15:32:11 -0700114 std::string msg = "ERR" + std::to_string(errPin) + " Timeout";
115
116 sd_journal_send("MESSAGE=HostError: %s", msg.c_str(), "PRIORITY=%i",
117 LOG_INFO, "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError",
118 "REDFISH_MESSAGE_ARGS=%s", msg.c_str(), NULL);
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700119}
120
Jason M. Billscbf78532019-08-16 15:32:11 -0700121static void cpuERRXLog(const int errPin, const int cpuNum)
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700122{
Jason M. Billscbf78532019-08-16 15:32:11 -0700123 std::string msg = "ERR" + std::to_string(errPin) + " Timeout on CPU " +
124 std::to_string(cpuNum + 1);
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700125
126 sd_journal_send("MESSAGE=HostError: %s", msg.c_str(), "PRIORITY=%i",
127 LOG_INFO, "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError",
128 "REDFISH_MESSAGE_ARGS=%s", msg.c_str(), NULL);
129}
130
Jason M. Bills89922f82019-08-06 11:10:02 -0700131static void smiTimeoutLog()
132{
133 sd_journal_send("MESSAGE=HostError: SMI Timeout", "PRIORITY=%i", LOG_INFO,
134 "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError",
135 "REDFISH_MESSAGE_ARGS=%s", "SMI Timeout", NULL);
136}
137
Jason M. Bills45e87e02019-09-09 14:45:38 -0700138static void cpuBootFIVRFaultLog(const int cpuNum)
139{
140 std::string msg = "Boot FIVR Fault on CPU " + std::to_string(cpuNum);
141
142 sd_journal_send("MESSAGE=HostError: %s", msg.c_str(), "PRIORITY=%i",
143 LOG_INFO, "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError",
144 "REDFISH_MESSAGE_ARGS=%s", msg.c_str(), NULL);
145}
146
Jason M. Bills78c5eed2019-08-28 14:00:40 -0700147static void cpuThermTripLog(const int cpuNum)
148{
149 std::string msg = "CPU " + std::to_string(cpuNum) + " thermal trip";
150
151 sd_journal_send("MESSAGE=HostError: %s", msg.c_str(), "PRIORITY=%i",
152 LOG_INFO, "REDFISH_MESSAGE_ID=%s",
153 "OpenBMC.0.1.CPUThermalTrip", "REDFISH_MESSAGE_ARGS=%d",
154 cpuNum, NULL);
155}
156
Jason M. Bills250fa632019-08-28 15:58:25 -0700157static void cpuVRHotLog(const std::string& vr)
158{
159 std::string msg = vr + " Voltage Regulator Overheated.";
160
161 sd_journal_send("MESSAGE=HostError: %s", msg.c_str(), "PRIORITY=%i",
162 LOG_INFO, "REDFISH_MESSAGE_ID=%s",
163 "OpenBMC.0.1.VoltageRegulatorOverheated",
164 "REDFISH_MESSAGE_ARGS=%s", vr.c_str(), NULL);
165}
166
Jason M. Bills08866542019-08-16 12:04:19 -0700167static void ssbThermTripLog()
168{
169 sd_journal_send("MESSAGE=HostError: SSB thermal trip", "PRIORITY=%i",
170 LOG_INFO, "REDFISH_MESSAGE_ID=%s",
171 "OpenBMC.0.1.SsbThermalTrip", NULL);
172}
173
Jason M. Billsa15c2522019-08-16 10:01:44 -0700174static void initializeErrorState();
Jason M. Bills1490b142019-07-01 15:48:43 -0700175static void initializeHostState()
176{
177 conn->async_method_call(
178 [](boost::system::error_code ec,
179 const std::variant<std::string>& property) {
180 if (ec)
181 {
182 return;
183 }
184 const std::string* state = std::get_if<std::string>(&property);
185 if (state == nullptr)
186 {
187 std::cerr << "Unable to read host state value\n";
188 return;
189 }
190 hostOff = *state == "xyz.openbmc_project.State.Host.HostState.Off";
Jason M. Billsa15c2522019-08-16 10:01:44 -0700191 // If the system is on, initialize the error state
192 if (!hostOff)
193 {
194 initializeErrorState();
195 }
Jason M. Bills1490b142019-07-01 15:48:43 -0700196 },
197 "xyz.openbmc_project.State.Host", "/xyz/openbmc_project/state/host0",
198 "org.freedesktop.DBus.Properties", "Get",
199 "xyz.openbmc_project.State.Host", "CurrentHostState");
200}
201
202static std::shared_ptr<sdbusplus::bus::match::match> startHostStateMonitor()
203{
204 return std::make_shared<sdbusplus::bus::match::match>(
205 *conn,
206 "type='signal',interface='org.freedesktop.DBus.Properties',"
207 "member='PropertiesChanged',arg0namespace='xyz.openbmc_project.State."
208 "Host'",
209 [](sdbusplus::message::message& msg) {
210 std::string interfaceName;
211 boost::container::flat_map<std::string, std::variant<std::string>>
212 propertiesChanged;
213 std::string state;
214 try
215 {
216 msg.read(interfaceName, propertiesChanged);
217 state =
218 std::get<std::string>(propertiesChanged.begin()->second);
219 }
220 catch (std::exception& e)
221 {
222 std::cerr << "Unable to read host state\n";
223 return;
224 }
225 hostOff = state == "xyz.openbmc_project.State.Host.HostState.Off";
226
Jason M. Bills1490b142019-07-01 15:48:43 -0700227 if (hostOff)
228 {
Jason M. Billse94f5e12019-09-13 11:11:34 -0700229 // No host events should fire while off, so cancel any pending
230 // timers
Jason M. Bills1490b142019-07-01 15:48:43 -0700231 caterrAssertTimer.cancel();
Jason M. Bills8c584392019-08-19 11:05:51 -0700232 err0AssertTimer.cancel();
Jason M. Bills75af3962019-08-19 11:07:17 -0700233 err1AssertTimer.cancel();
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700234 err2AssertTimer.cancel();
Jason M. Bills89922f82019-08-06 11:10:02 -0700235 smiAssertTimer.cancel();
Jason M. Bills1490b142019-07-01 15:48:43 -0700236 }
Jason M. Billse94f5e12019-09-13 11:11:34 -0700237 else
238 {
239 // Handle any initial errors when the host turns on
240 initializeErrorState();
241 }
Jason M. Bills1490b142019-07-01 15:48:43 -0700242 });
243}
244
245static bool requestGPIOEvents(
246 const std::string& name, const std::function<void()>& handler,
247 gpiod::line& gpioLine,
248 boost::asio::posix::stream_descriptor& gpioEventDescriptor)
249{
250 // Find the GPIO line
251 gpioLine = gpiod::find_line(name);
252 if (!gpioLine)
253 {
254 std::cerr << "Failed to find the " << name << " line\n";
255 return false;
256 }
257
258 try
259 {
260 gpioLine.request(
261 {"host-error-monitor", gpiod::line_request::EVENT_BOTH_EDGES});
262 }
263 catch (std::exception&)
264 {
265 std::cerr << "Failed to request events for " << name << "\n";
266 return false;
267 }
268
269 int gpioLineFd = gpioLine.event_get_fd();
270 if (gpioLineFd < 0)
271 {
272 std::cerr << "Failed to get " << name << " fd\n";
273 return false;
274 }
275
276 gpioEventDescriptor.assign(gpioLineFd);
277
278 gpioEventDescriptor.async_wait(
279 boost::asio::posix::stream_descriptor::wait_read,
280 [&name, handler](const boost::system::error_code ec) {
281 if (ec)
282 {
283 std::cerr << name << " fd handler error: " << ec.message()
284 << "\n";
285 return;
286 }
287 handler();
288 });
289 return true;
290}
291
Jason M. Bills45e87e02019-09-09 14:45:38 -0700292static bool requestGPIOInput(const std::string& name, gpiod::line& gpioLine)
293{
294 // Find the GPIO line
295 gpioLine = gpiod::find_line(name);
296 if (!gpioLine)
297 {
298 std::cerr << "Failed to find the " << name << " line.\n";
299 return false;
300 }
301
302 // Request GPIO input
303 try
304 {
305 gpioLine.request({__FUNCTION__, gpiod::line_request::DIRECTION_INPUT});
306 }
307 catch (std::exception&)
308 {
309 std::cerr << "Failed to request " << name << " input\n";
310 return false;
311 }
312
313 return true;
314}
315
Jason M. Bills1490b142019-07-01 15:48:43 -0700316static void startPowerCycle()
317{
318 conn->async_method_call(
319 [](boost::system::error_code ec) {
320 if (ec)
321 {
322 std::cerr << "failed to set Chassis State\n";
323 }
324 },
325 "xyz.openbmc_project.State.Chassis",
326 "/xyz/openbmc_project/state/chassis0",
327 "org.freedesktop.DBus.Properties", "Set",
328 "xyz.openbmc_project.State.Chassis", "RequestedPowerTransition",
329 std::variant<std::string>{
330 "xyz.openbmc_project.State.Chassis.Transition.PowerCycle"});
331}
332
Jason M. Billsb61766b2019-11-26 17:02:44 -0800333static void startCrashdumpAndRecovery(bool recoverSystem,
334 const std::string& triggerType)
Jason M. Bills1490b142019-07-01 15:48:43 -0700335{
336 std::cout << "Starting crashdump\n";
337 static std::shared_ptr<sdbusplus::bus::match::match> crashdumpCompleteMatch;
338 static boost::asio::steady_timer crashdumpTimer(io);
339
340 crashdumpCompleteMatch = std::make_shared<sdbusplus::bus::match::match>(
341 *conn,
342 "type='signal',interface='org.freedesktop.DBus.Properties',"
343 "member='PropertiesChanged',arg0namespace='com.intel.crashdump'",
344 [recoverSystem](sdbusplus::message::message& msg) {
345 crashdumpTimer.cancel();
346 std::cout << "Crashdump completed\n";
347 if (recoverSystem)
348 {
349 std::cout << "Recovering the system\n";
350 startPowerCycle();
351 }
352 crashdumpCompleteMatch.reset();
353 });
354
355 crashdumpTimer.expires_after(std::chrono::seconds(crashdumpTimeoutS));
356 crashdumpTimer.async_wait([](const boost::system::error_code ec) {
357 if (ec)
358 {
359 // operation_aborted is expected if timer is canceled
360 if (ec != boost::asio::error::operation_aborted)
361 {
362 std::cerr << "Crashdump async_wait failed: " << ec.message()
363 << "\n";
364 }
365 std::cout << "Crashdump timer canceled\n";
366 return;
367 }
368 std::cerr << "Crashdump failed to complete before timeout\n";
369 crashdumpCompleteMatch.reset();
370 });
371
372 conn->async_method_call(
373 [](boost::system::error_code ec) {
374 if (ec)
375 {
376 std::cerr << "failed to start Crashdump\n";
377 crashdumpTimer.cancel();
378 crashdumpCompleteMatch.reset();
379 }
380 },
381 "com.intel.crashdump", "/com/intel/crashdump",
Jason M. Billsb61766b2019-11-26 17:02:44 -0800382 "com.intel.crashdump.Stored", "GenerateStoredLog", triggerType);
Jason M. Bills1490b142019-07-01 15:48:43 -0700383}
384
Jason M. Billsd1a19f62019-08-06 11:52:58 -0700385static void incrementCPUErrorCount(int cpuNum)
386{
387 std::string propertyName = "ErrorCountCPU" + std::to_string(cpuNum + 1);
388
389 // Get the current count
390 conn->async_method_call(
391 [propertyName](boost::system::error_code ec,
392 const std::variant<uint8_t>& property) {
393 if (ec)
394 {
395 std::cerr << "Failed to read " << propertyName << ": "
396 << ec.message() << "\n";
397 return;
398 }
399 const uint8_t* errorCountVariant = std::get_if<uint8_t>(&property);
400 if (errorCountVariant == nullptr)
401 {
402 std::cerr << propertyName << " invalid\n";
403 return;
404 }
405 uint8_t errorCount = *errorCountVariant;
406 if (errorCount == std::numeric_limits<uint8_t>::max())
407 {
408 std::cerr << "Maximum error count reached\n";
409 return;
410 }
411 // Increment the count
412 errorCount++;
413 conn->async_method_call(
414 [propertyName](boost::system::error_code ec) {
415 if (ec)
416 {
417 std::cerr << "Failed to set " << propertyName << ": "
418 << ec.message() << "\n";
419 }
420 },
421 "xyz.openbmc_project.Settings",
422 "/xyz/openbmc_project/control/processor_error_config",
423 "org.freedesktop.DBus.Properties", "Set",
424 "xyz.openbmc_project.Control.Processor.ErrConfig", propertyName,
425 std::variant<uint8_t>{errorCount});
426 },
427 "xyz.openbmc_project.Settings",
428 "/xyz/openbmc_project/control/processor_error_config",
429 "org.freedesktop.DBus.Properties", "Get",
430 "xyz.openbmc_project.Control.Processor.ErrConfig", propertyName);
431}
432
Jason M. Billsa3397932019-08-06 11:07:21 -0700433static bool checkIERRCPUs()
434{
435 bool cpuIERRFound = false;
436 for (int cpu = 0, addr = MIN_CLIENT_ADDR; addr <= MAX_CLIENT_ADDR;
437 cpu++, addr++)
438 {
439 uint8_t cc = 0;
440 CPUModel model{};
441 uint8_t stepping = 0;
442 if (peci_GetCPUID(addr, &model, &stepping, &cc) != PECI_CC_SUCCESS)
443 {
444 std::cerr << "Cannot get CPUID!\n";
445 continue;
446 }
447
448 switch (model)
449 {
450 case skx:
451 {
452 // First check the MCA_ERR_SRC_LOG to see if this is the CPU
453 // that caused the IERR
454 uint32_t mcaErrSrcLog = 0;
455 if (peci_RdPkgConfig(addr, 0, 5, 4, (uint8_t*)&mcaErrSrcLog,
456 &cc) != PECI_CC_SUCCESS)
457 {
458 continue;
459 }
460 // Check MSMI_INTERNAL (20) and IERR_INTERNAL (27)
461 if ((mcaErrSrcLog & (1 << 20)) || (mcaErrSrcLog & (1 << 27)))
462 {
463 // TODO: Light the CPU fault LED?
464 cpuIERRFound = true;
Jason M. Billsd1a19f62019-08-06 11:52:58 -0700465 incrementCPUErrorCount(cpu);
Jason M. Billsa3397932019-08-06 11:07:21 -0700466 // Next check if it's a CPU/VR mismatch by reading the
467 // IA32_MC4_STATUS MSR (0x411)
468 uint64_t mc4Status = 0;
469 if (peci_RdIAMSR(addr, 0, 0x411, &mc4Status, &cc) !=
470 PECI_CC_SUCCESS)
471 {
472 continue;
473 }
474 // Check MSEC bits 31:24 for
475 // MCA_SVID_VCCIN_VR_ICC_MAX_FAILURE (0x40),
476 // MCA_SVID_VCCIN_VR_VOUT_FAILURE (0x42), or
477 // MCA_SVID_CPU_VR_CAPABILITY_ERROR (0x43)
478 if ((mc4Status & (0x40 << 24)) ||
479 (mc4Status & (0x42 << 24)) ||
480 (mc4Status & (0x43 << 24)))
481 {
482 cpuIERRLog(cpu, "CPU/VR Mismatch");
483 continue;
484 }
485
486 // Next check if it's a Core FIVR fault by looking for a
487 // non-zero value of CORE_FIVR_ERR_LOG (B(1) D30 F2 offset
488 // 80h)
489 uint32_t coreFIVRErrLog = 0;
490 if (peci_RdPCIConfigLocal(
491 addr, 1, 30, 2, 0x80, sizeof(uint32_t),
492 (uint8_t*)&coreFIVRErrLog, &cc) != PECI_CC_SUCCESS)
493 {
494 continue;
495 }
496 if (coreFIVRErrLog)
497 {
498 cpuIERRLog(cpu, "Core FIVR Fault");
499 continue;
500 }
501
502 // Next check if it's an Uncore FIVR fault by looking for a
503 // non-zero value of UNCORE_FIVR_ERR_LOG (B(1) D30 F2 offset
504 // 84h)
505 uint32_t uncoreFIVRErrLog = 0;
506 if (peci_RdPCIConfigLocal(addr, 1, 30, 2, 0x84,
507 sizeof(uint32_t),
508 (uint8_t*)&uncoreFIVRErrLog,
509 &cc) != PECI_CC_SUCCESS)
510 {
511 continue;
512 }
513 if (uncoreFIVRErrLog)
514 {
515 cpuIERRLog(cpu, "Uncore FIVR Fault");
516 continue;
517 }
518
519 // Last if CORE_FIVR_ERR_LOG and UNCORE_FIVR_ERR_LOG are
520 // both zero, but MSEC bits 31:24 have either
521 // MCA_FIVR_CATAS_OVERVOL_FAULT (0x51) or
522 // MCA_FIVR_CATAS_OVERCUR_FAULT (0x52), then log it as an
523 // uncore FIVR fault
524 if (!coreFIVRErrLog && !uncoreFIVRErrLog &&
525 ((mc4Status & (0x51 << 24)) ||
526 (mc4Status & (0x52 << 24))))
527 {
528 cpuIERRLog(cpu, "Uncore FIVR Fault");
529 continue;
530 }
531 cpuIERRLog(cpu);
532 }
533 break;
534 }
535 case icx:
536 {
537 // First check the MCA_ERR_SRC_LOG to see if this is the CPU
538 // that caused the IERR
539 uint32_t mcaErrSrcLog = 0;
540 if (peci_RdPkgConfig(addr, 0, 5, 4, (uint8_t*)&mcaErrSrcLog,
541 &cc) != PECI_CC_SUCCESS)
542 {
543 continue;
544 }
545 // Check MSMI_INTERNAL (20) and IERR_INTERNAL (27)
546 if ((mcaErrSrcLog & (1 << 20)) || (mcaErrSrcLog & (1 << 27)))
547 {
548 // TODO: Light the CPU fault LED?
549 cpuIERRFound = true;
Jason M. Billsd1a19f62019-08-06 11:52:58 -0700550 incrementCPUErrorCount(cpu);
Jason M. Billsa3397932019-08-06 11:07:21 -0700551 // Next check if it's a CPU/VR mismatch by reading the
552 // IA32_MC4_STATUS MSR (0x411)
553 uint64_t mc4Status = 0;
554 if (peci_RdIAMSR(addr, 0, 0x411, &mc4Status, &cc) !=
555 PECI_CC_SUCCESS)
556 {
557 continue;
558 }
559 // TODO: Update MSEC/MSCOD_31_24 check
560 // Check MSEC bits 31:24 for
561 // MCA_SVID_VCCIN_VR_ICC_MAX_FAILURE (0x40),
562 // MCA_SVID_VCCIN_VR_VOUT_FAILURE (0x42), or
563 // MCA_SVID_CPU_VR_CAPABILITY_ERROR (0x43)
564 if ((mc4Status & (0x40 << 24)) ||
565 (mc4Status & (0x42 << 24)) ||
566 (mc4Status & (0x43 << 24)))
567 {
568 cpuIERRLog(cpu, "CPU/VR Mismatch");
569 continue;
570 }
571
572 // Next check if it's a Core FIVR fault by looking for a
573 // non-zero value of CORE_FIVR_ERR_LOG (B(31) D30 F2 offsets
574 // C0h and C4h) (Note: Bus 31 is accessed on PECI as bus 14)
575 uint32_t coreFIVRErrLog0 = 0;
576 uint32_t coreFIVRErrLog1 = 0;
577 if (peci_RdEndPointConfigPciLocal(
578 addr, 0, 14, 30, 2, 0xC0, sizeof(uint32_t),
579 (uint8_t*)&coreFIVRErrLog0, &cc) != PECI_CC_SUCCESS)
580 {
581 continue;
582 }
583 if (peci_RdEndPointConfigPciLocal(
584 addr, 0, 14, 30, 2, 0xC4, sizeof(uint32_t),
585 (uint8_t*)&coreFIVRErrLog1, &cc) != PECI_CC_SUCCESS)
586 {
587 continue;
588 }
589 if (coreFIVRErrLog0 || coreFIVRErrLog1)
590 {
591 cpuIERRLog(cpu, "Core FIVR Fault");
592 continue;
593 }
594
595 // Next check if it's an Uncore FIVR fault by looking for a
596 // non-zero value of UNCORE_FIVR_ERR_LOG (B(31) D30 F2
597 // offset 84h) (Note: Bus 31 is accessed on PECI as bus 14)
598 uint32_t uncoreFIVRErrLog = 0;
599 if (peci_RdEndPointConfigPciLocal(
600 addr, 0, 14, 30, 2, 0x84, sizeof(uint32_t),
601 (uint8_t*)&uncoreFIVRErrLog,
602 &cc) != PECI_CC_SUCCESS)
603 {
604 continue;
605 }
606 if (uncoreFIVRErrLog)
607 {
608 cpuIERRLog(cpu, "Uncore FIVR Fault");
609 continue;
610 }
611
612 // TODO: Update MSEC/MSCOD_31_24 check
613 // Last if CORE_FIVR_ERR_LOG and UNCORE_FIVR_ERR_LOG are
614 // both zero, but MSEC bits 31:24 have either
615 // MCA_FIVR_CATAS_OVERVOL_FAULT (0x51) or
616 // MCA_FIVR_CATAS_OVERCUR_FAULT (0x52), then log it as an
617 // uncore FIVR fault
618 if (!coreFIVRErrLog0 && !coreFIVRErrLog1 &&
619 !uncoreFIVRErrLog &&
620 ((mc4Status & (0x51 << 24)) ||
621 (mc4Status & (0x52 << 24))))
622 {
623 cpuIERRLog(cpu, "Uncore FIVR Fault");
624 continue;
625 }
626 cpuIERRLog(cpu);
627 }
628 break;
629 }
630 }
631 }
632 return cpuIERRFound;
633}
634
Jason M. Billsa15c2522019-08-16 10:01:44 -0700635static void caterrAssertHandler()
636{
Jason M. Billsa15c2522019-08-16 10:01:44 -0700637 caterrAssertTimer.expires_after(std::chrono::milliseconds(caterrTimeoutMs));
638 caterrAssertTimer.async_wait([](const boost::system::error_code ec) {
639 if (ec)
640 {
641 // operation_aborted is expected if timer is canceled
642 // before completion.
643 if (ec != boost::asio::error::operation_aborted)
644 {
645 std::cerr << "caterr timeout async_wait failed: "
646 << ec.message() << "\n";
647 }
Jason M. Billsa15c2522019-08-16 10:01:44 -0700648 return;
649 }
Jason M. Billsa3397932019-08-06 11:07:21 -0700650 std::cerr << "CATERR asserted for " << std::to_string(caterrTimeoutMs)
651 << " ms\n";
652 if (!checkIERRCPUs())
653 {
654 cpuIERRLog();
655 }
Jason M. Billsa15c2522019-08-16 10:01:44 -0700656 conn->async_method_call(
657 [](boost::system::error_code ec,
658 const std::variant<bool>& property) {
659 if (ec)
660 {
661 return;
662 }
663 const bool* reset = std::get_if<bool>(&property);
664 if (reset == nullptr)
665 {
666 std::cerr << "Unable to read reset on CATERR value\n";
667 return;
668 }
Jason M. Billsb61766b2019-11-26 17:02:44 -0800669 startCrashdumpAndRecovery(*reset, "IERR");
Jason M. Billsa15c2522019-08-16 10:01:44 -0700670 },
671 "xyz.openbmc_project.Settings",
672 "/xyz/openbmc_project/control/processor_error_config",
673 "org.freedesktop.DBus.Properties", "Get",
674 "xyz.openbmc_project.Control.Processor.ErrConfig", "ResetOnCATERR");
675 });
676}
677
Jason M. Bills1490b142019-07-01 15:48:43 -0700678static void caterrHandler()
679{
680 if (!hostOff)
681 {
682 gpiod::line_event gpioLineEvent = caterrLine.event_read();
683
684 bool caterr =
685 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
686 if (caterr)
687 {
Jason M. Billsa15c2522019-08-16 10:01:44 -0700688 caterrAssertHandler();
Jason M. Bills1490b142019-07-01 15:48:43 -0700689 }
690 else
691 {
692 caterrAssertTimer.cancel();
693 }
694 }
695 caterrEvent.async_wait(boost::asio::posix::stream_descriptor::wait_read,
696 [](const boost::system::error_code ec) {
697 if (ec)
698 {
699 std::cerr << "caterr handler error: "
700 << ec.message() << "\n";
701 return;
702 }
703 caterrHandler();
704 });
705}
Jason M. Bills78c5eed2019-08-28 14:00:40 -0700706
Jason M. Billse94f5e12019-09-13 11:11:34 -0700707static void cpu1ThermtripAssertHandler()
708{
Jason M. Bills45e87e02019-09-09 14:45:38 -0700709 if (cpu1FIVRFaultLine.get_value() == 0)
710 {
711 cpuBootFIVRFaultLog(1);
712 }
713 else
714 {
715 cpuThermTripLog(1);
716 }
Jason M. Billse94f5e12019-09-13 11:11:34 -0700717}
718
Jason M. Bills78c5eed2019-08-28 14:00:40 -0700719static void cpu1ThermtripHandler()
720{
721 if (!hostOff)
722 {
723 gpiod::line_event gpioLineEvent = cpu1ThermtripLine.event_read();
724
725 bool cpu1Thermtrip =
726 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
727 if (cpu1Thermtrip)
728 {
Jason M. Billse94f5e12019-09-13 11:11:34 -0700729 cpu1ThermtripAssertHandler();
Jason M. Bills78c5eed2019-08-28 14:00:40 -0700730 }
731 }
732 cpu1ThermtripEvent.async_wait(
733 boost::asio::posix::stream_descriptor::wait_read,
734 [](const boost::system::error_code ec) {
735 if (ec)
736 {
737 std::cerr << "CPU 1 Thermtrip handler error: " << ec.message()
738 << "\n";
739 return;
740 }
741 cpu1ThermtripHandler();
742 });
743}
744
Jason M. Billse94f5e12019-09-13 11:11:34 -0700745static void cpu2ThermtripAssertHandler()
746{
Jason M. Bills45e87e02019-09-09 14:45:38 -0700747 if (cpu2FIVRFaultLine.get_value() == 0)
748 {
749 cpuBootFIVRFaultLog(2);
750 }
751 else
752 {
753 cpuThermTripLog(2);
754 }
Jason M. Billse94f5e12019-09-13 11:11:34 -0700755}
756
Jason M. Bills78c5eed2019-08-28 14:00:40 -0700757static void cpu2ThermtripHandler()
758{
759 if (!hostOff)
760 {
761 gpiod::line_event gpioLineEvent = cpu2ThermtripLine.event_read();
762
763 bool cpu2Thermtrip =
764 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
765 if (cpu2Thermtrip)
766 {
Jason M. Billse94f5e12019-09-13 11:11:34 -0700767 cpu2ThermtripAssertHandler();
Jason M. Bills78c5eed2019-08-28 14:00:40 -0700768 }
769 }
770 cpu2ThermtripEvent.async_wait(
771 boost::asio::posix::stream_descriptor::wait_read,
772 [](const boost::system::error_code ec) {
773 if (ec)
774 {
775 std::cerr << "CPU 2 Thermtrip handler error: " << ec.message()
776 << "\n";
777 return;
778 }
779 cpu2ThermtripHandler();
780 });
781}
782
Jason M. Billse94f5e12019-09-13 11:11:34 -0700783static void cpu1VRHotAssertHandler()
784{
785 cpuVRHotLog("CPU 1");
786}
787
Jason M. Bills250fa632019-08-28 15:58:25 -0700788static void cpu1VRHotHandler()
789{
790 if (!hostOff)
791 {
792 gpiod::line_event gpioLineEvent = cpu1VRHotLine.event_read();
793
794 bool cpu1VRHot =
795 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
796 if (cpu1VRHot)
797 {
Jason M. Billse94f5e12019-09-13 11:11:34 -0700798 cpu1VRHotAssertHandler();
Jason M. Bills250fa632019-08-28 15:58:25 -0700799 }
800 }
801 cpu1VRHotEvent.async_wait(boost::asio::posix::stream_descriptor::wait_read,
802 [](const boost::system::error_code ec) {
803 if (ec)
804 {
805 std::cerr << "CPU 1 VRHot handler error: "
806 << ec.message() << "\n";
807 return;
808 }
809 cpu1VRHotHandler();
810 });
811}
812
Jason M. Billse94f5e12019-09-13 11:11:34 -0700813static void cpu1MemABCDVRHotAssertHandler()
814{
815 cpuVRHotLog("CPU 1 Memory ABCD");
816}
817
Jason M. Bills9647ba72019-08-29 14:19:19 -0700818static void cpu1MemABCDVRHotHandler()
819{
820 if (!hostOff)
821 {
822 gpiod::line_event gpioLineEvent = cpu1MemABCDVRHotLine.event_read();
823
824 bool cpu1MemABCDVRHot =
825 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
826 if (cpu1MemABCDVRHot)
827 {
Jason M. Billse94f5e12019-09-13 11:11:34 -0700828 cpu1MemABCDVRHotAssertHandler();
Jason M. Bills9647ba72019-08-29 14:19:19 -0700829 }
830 }
831 cpu1MemABCDVRHotEvent.async_wait(
832 boost::asio::posix::stream_descriptor::wait_read,
833 [](const boost::system::error_code ec) {
834 if (ec)
835 {
836 std::cerr << "CPU 1 Memory ABCD VRHot handler error: "
837 << ec.message() << "\n";
838 return;
839 }
840 cpu1MemABCDVRHotHandler();
841 });
842}
843
Jason M. Billse94f5e12019-09-13 11:11:34 -0700844static void cpu1MemEFGHVRHotAssertHandler()
845{
846 cpuVRHotLog("CPU 1 Memory EFGH");
847}
848
Jason M. Bills9647ba72019-08-29 14:19:19 -0700849static void cpu1MemEFGHVRHotHandler()
850{
851 if (!hostOff)
852 {
853 gpiod::line_event gpioLineEvent = cpu1MemEFGHVRHotLine.event_read();
854
855 bool cpu1MemEFGHVRHot =
856 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
857 if (cpu1MemEFGHVRHot)
858 {
Jason M. Billse94f5e12019-09-13 11:11:34 -0700859 cpu1MemEFGHVRHotAssertHandler();
Jason M. Bills9647ba72019-08-29 14:19:19 -0700860 }
861 }
862 cpu1MemEFGHVRHotEvent.async_wait(
863 boost::asio::posix::stream_descriptor::wait_read,
864 [](const boost::system::error_code ec) {
865 if (ec)
866 {
867 std::cerr << "CPU 1 Memory EFGH VRHot handler error: "
868 << ec.message() << "\n";
869 return;
870 }
871 cpu1MemEFGHVRHotHandler();
872 });
873}
874
Jason M. Billse94f5e12019-09-13 11:11:34 -0700875static void cpu2VRHotAssertHandler()
876{
877 cpuVRHotLog("CPU 2");
878}
879
Jason M. Bills250fa632019-08-28 15:58:25 -0700880static void cpu2VRHotHandler()
881{
882 if (!hostOff)
883 {
884 gpiod::line_event gpioLineEvent = cpu2VRHotLine.event_read();
885
886 bool cpu2VRHot =
887 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
888 if (cpu2VRHot)
889 {
Jason M. Billse94f5e12019-09-13 11:11:34 -0700890 cpu2VRHotAssertHandler();
Jason M. Bills250fa632019-08-28 15:58:25 -0700891 }
892 }
893 cpu2VRHotEvent.async_wait(boost::asio::posix::stream_descriptor::wait_read,
894 [](const boost::system::error_code ec) {
895 if (ec)
896 {
897 std::cerr << "CPU 2 VRHot handler error: "
898 << ec.message() << "\n";
899 return;
900 }
901 cpu2VRHotHandler();
902 });
903}
904
Jason M. Billse94f5e12019-09-13 11:11:34 -0700905static void cpu2MemABCDVRHotAssertHandler()
906{
907 cpuVRHotLog("CPU 2 Memory ABCD");
908}
909
Jason M. Bills9647ba72019-08-29 14:19:19 -0700910static void cpu2MemABCDVRHotHandler()
911{
912 if (!hostOff)
913 {
914 gpiod::line_event gpioLineEvent = cpu2MemABCDVRHotLine.event_read();
915
916 bool cpu2MemABCDVRHot =
917 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
918 if (cpu2MemABCDVRHot)
919 {
Jason M. Billse94f5e12019-09-13 11:11:34 -0700920 cpu2MemABCDVRHotAssertHandler();
Jason M. Bills9647ba72019-08-29 14:19:19 -0700921 }
922 }
923 cpu2MemABCDVRHotEvent.async_wait(
924 boost::asio::posix::stream_descriptor::wait_read,
925 [](const boost::system::error_code ec) {
926 if (ec)
927 {
928 std::cerr << "CPU 2 Memory ABCD VRHot handler error: "
929 << ec.message() << "\n";
930 return;
931 }
932 cpu2MemABCDVRHotHandler();
933 });
934}
935
Jason M. Billse94f5e12019-09-13 11:11:34 -0700936static void cpu2MemEFGHVRHotAssertHandler()
937{
938 cpuVRHotLog("CPU 2 Memory EFGH");
939}
940
Jason M. Bills9647ba72019-08-29 14:19:19 -0700941static void cpu2MemEFGHVRHotHandler()
942{
943 if (!hostOff)
944 {
945 gpiod::line_event gpioLineEvent = cpu2MemEFGHVRHotLine.event_read();
946
947 bool cpu2MemEFGHVRHot =
948 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
949 if (cpu2MemEFGHVRHot)
950 {
Jason M. Billse94f5e12019-09-13 11:11:34 -0700951 cpu2MemEFGHVRHotAssertHandler();
Jason M. Bills9647ba72019-08-29 14:19:19 -0700952 }
953 }
954 cpu2MemEFGHVRHotEvent.async_wait(
955 boost::asio::posix::stream_descriptor::wait_read,
956 [](const boost::system::error_code ec) {
957 if (ec)
958 {
959 std::cerr << "CPU 2 Memory EFGH VRHot handler error: "
960 << ec.message() << "\n";
961 return;
962 }
963 cpu2MemEFGHVRHotHandler();
964 });
965}
966
Chen Yugange6c0f1c2019-08-02 20:36:42 +0800967static void pchThermtripHandler()
968{
969 if (!hostOff)
970 {
971 gpiod::line_event gpioLineEvent = pchThermtripLine.event_read();
972
973 bool pchThermtrip =
974 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
975 if (pchThermtrip)
976 {
Jason M. Bills08866542019-08-16 12:04:19 -0700977 ssbThermTripLog();
Chen Yugange6c0f1c2019-08-02 20:36:42 +0800978 }
979 }
980 pchThermtripEvent.async_wait(
981 boost::asio::posix::stream_descriptor::wait_read,
982 [](const boost::system::error_code ec) {
983 if (ec)
984 {
985 std::cerr << "PCH Thermal trip handler error: " << ec.message()
986 << "\n";
987 return;
988 }
989 pchThermtripHandler();
990 });
991}
992
Jason M. Billscbf78532019-08-16 15:32:11 -0700993static std::bitset<MAX_CPUS> checkERRPinCPUs(const int errPin)
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700994{
Jason M. Billscbf78532019-08-16 15:32:11 -0700995 int errPinSts = (1 << errPin);
996 std::bitset<MAX_CPUS> errPinCPUs = 0;
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700997 for (int cpu = 0, addr = MIN_CLIENT_ADDR; addr <= MAX_CLIENT_ADDR;
998 cpu++, addr++)
999 {
1000 if (peci_Ping(addr) == PECI_CC_SUCCESS)
1001 {
1002 uint8_t cc = 0;
1003 CPUModel model{};
1004 uint8_t stepping = 0;
1005 if (peci_GetCPUID(addr, &model, &stepping, &cc) != PECI_CC_SUCCESS)
1006 {
1007 std::cerr << "Cannot get CPUID!\n";
1008 continue;
1009 }
1010
1011 switch (model)
1012 {
1013 case skx:
1014 {
1015 // Check the ERRPINSTS to see if this is the CPU that caused
Jason M. Billscbf78532019-08-16 15:32:11 -07001016 // the ERRx (B(0) D8 F0 offset 210h)
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001017 uint32_t errpinsts = 0;
1018 if (peci_RdPCIConfigLocal(
1019 addr, 0, 8, 0, 0x210, sizeof(uint32_t),
1020 (uint8_t*)&errpinsts, &cc) == PECI_CC_SUCCESS)
1021 {
Jason M. Billscbf78532019-08-16 15:32:11 -07001022 errPinCPUs[cpu] = (errpinsts & errPinSts) != 0;
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001023 }
1024 break;
1025 }
1026 case icx:
1027 {
1028 // Check the ERRPINSTS to see if this is the CPU that caused
Jason M. Billscbf78532019-08-16 15:32:11 -07001029 // the ERRx (B(30) D0 F3 offset 274h) (Note: Bus 30 is
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001030 // accessed on PECI as bus 13)
1031 uint32_t errpinsts = 0;
1032 if (peci_RdEndPointConfigPciLocal(
1033 addr, 0, 13, 0, 3, 0x274, sizeof(uint32_t),
1034 (uint8_t*)&errpinsts, &cc) == PECI_CC_SUCCESS)
1035 {
Jason M. Billscbf78532019-08-16 15:32:11 -07001036 errPinCPUs[cpu] = (errpinsts & errPinSts) != 0;
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001037 }
1038 break;
1039 }
1040 }
1041 }
1042 }
Jason M. Billscbf78532019-08-16 15:32:11 -07001043 return errPinCPUs;
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001044}
1045
Jason M. Billscbf78532019-08-16 15:32:11 -07001046static void errXAssertHandler(const int errPin,
1047 boost::asio::steady_timer& errXAssertTimer)
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001048{
Jason M. Billscbf78532019-08-16 15:32:11 -07001049 // ERRx status is not guaranteed through the timeout, so save which
1050 // CPUs have it asserted
1051 std::bitset<MAX_CPUS> errPinCPUs = checkERRPinCPUs(errPin);
1052 errXAssertTimer.expires_after(std::chrono::milliseconds(errTimeoutMs));
1053 errXAssertTimer.async_wait([errPin, errPinCPUs](
1054 const boost::system::error_code ec) {
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001055 if (ec)
1056 {
1057 // operation_aborted is expected if timer is canceled before
1058 // completion.
1059 if (ec != boost::asio::error::operation_aborted)
1060 {
1061 std::cerr << "err2 timeout async_wait failed: " << ec.message()
1062 << "\n";
1063 }
1064 return;
1065 }
Jason M. Billscbf78532019-08-16 15:32:11 -07001066 std::cerr << "ERR" << std::to_string(errPin) << " asserted for "
1067 << std::to_string(errTimeoutMs) << " ms\n";
1068 if (errPinCPUs.count())
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001069 {
Jason M. Billscbf78532019-08-16 15:32:11 -07001070 for (int i = 0; i < errPinCPUs.size(); i++)
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001071 {
Jason M. Billscbf78532019-08-16 15:32:11 -07001072 if (errPinCPUs[i])
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001073 {
Jason M. Billscbf78532019-08-16 15:32:11 -07001074 cpuERRXLog(errPin, i);
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001075 }
1076 }
1077 }
1078 else
1079 {
Jason M. Billscbf78532019-08-16 15:32:11 -07001080 cpuERRXLog(errPin);
1081 }
1082 });
1083}
1084
Jason M. Bills8c584392019-08-19 11:05:51 -07001085static void err0AssertHandler()
1086{
1087 // Handle the standard ERR0 detection and logging
1088 const static constexpr int err0 = 0;
1089 errXAssertHandler(err0, err0AssertTimer);
1090}
1091
1092static void err0Handler()
1093{
1094 if (!hostOff)
1095 {
1096 gpiod::line_event gpioLineEvent = err0Line.event_read();
1097
1098 bool err0 = gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
1099 if (err0)
1100 {
1101 err0AssertHandler();
1102 }
1103 else
1104 {
1105 err0AssertTimer.cancel();
1106 }
1107 }
1108 err0Event.async_wait(boost::asio::posix::stream_descriptor::wait_read,
1109 [](const boost::system::error_code ec) {
1110 if (ec)
1111 {
1112 std::cerr
1113 << "err0 handler error: " << ec.message()
1114 << "\n";
1115 return;
1116 }
1117 err0Handler();
1118 });
1119}
1120
Jason M. Bills75af3962019-08-19 11:07:17 -07001121static void err1AssertHandler()
1122{
1123 // Handle the standard ERR1 detection and logging
1124 const static constexpr int err1 = 1;
1125 errXAssertHandler(err1, err1AssertTimer);
1126}
1127
1128static void err1Handler()
1129{
1130 if (!hostOff)
1131 {
1132 gpiod::line_event gpioLineEvent = err1Line.event_read();
1133
1134 bool err1 = gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
1135 if (err1)
1136 {
1137 err1AssertHandler();
1138 }
1139 else
1140 {
1141 err1AssertTimer.cancel();
1142 }
1143 }
1144 err1Event.async_wait(boost::asio::posix::stream_descriptor::wait_read,
1145 [](const boost::system::error_code ec) {
1146 if (ec)
1147 {
1148 std::cerr
1149 << "err1 handler error: " << ec.message()
1150 << "\n";
1151 return;
1152 }
1153 err1Handler();
1154 });
1155}
1156
Jason M. Billscbf78532019-08-16 15:32:11 -07001157static void err2AssertHandler()
1158{
1159 // Handle the standard ERR2 detection and logging
1160 const static constexpr int err2 = 2;
1161 errXAssertHandler(err2, err2AssertTimer);
1162 // Also handle reset for ERR2
1163 err2AssertTimer.async_wait([](const boost::system::error_code ec) {
1164 if (ec)
1165 {
1166 // operation_aborted is expected if timer is canceled before
1167 // completion.
1168 if (ec != boost::asio::error::operation_aborted)
1169 {
1170 std::cerr << "err2 timeout async_wait failed: " << ec.message()
1171 << "\n";
1172 }
1173 return;
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001174 }
1175 conn->async_method_call(
1176 [](boost::system::error_code ec,
1177 const std::variant<bool>& property) {
1178 if (ec)
1179 {
1180 return;
1181 }
1182 const bool* reset = std::get_if<bool>(&property);
1183 if (reset == nullptr)
1184 {
1185 std::cerr << "Unable to read reset on ERR2 value\n";
1186 return;
1187 }
Jason M. Billsb61766b2019-11-26 17:02:44 -08001188 startCrashdumpAndRecovery(*reset, "ERR2 Timeout");
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001189 },
1190 "xyz.openbmc_project.Settings",
1191 "/xyz/openbmc_project/control/processor_error_config",
1192 "org.freedesktop.DBus.Properties", "Get",
1193 "xyz.openbmc_project.Control.Processor.ErrConfig", "ResetOnERR2");
1194 });
1195}
1196
1197static void err2Handler()
1198{
1199 if (!hostOff)
1200 {
1201 gpiod::line_event gpioLineEvent = err2Line.event_read();
1202
1203 bool err2 = gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
1204 if (err2)
1205 {
1206 err2AssertHandler();
1207 }
1208 else
1209 {
1210 err2AssertTimer.cancel();
1211 }
1212 }
1213 err2Event.async_wait(boost::asio::posix::stream_descriptor::wait_read,
1214 [](const boost::system::error_code ec) {
1215 if (ec)
1216 {
1217 std::cerr
1218 << "err2 handler error: " << ec.message()
1219 << "\n";
1220 return;
1221 }
1222 err2Handler();
1223 });
1224}
1225
Jason M. Bills89922f82019-08-06 11:10:02 -07001226static void smiAssertHandler()
1227{
1228 smiAssertTimer.expires_after(std::chrono::milliseconds(smiTimeoutMs));
1229 smiAssertTimer.async_wait([](const boost::system::error_code ec) {
1230 if (ec)
1231 {
1232 // operation_aborted is expected if timer is canceled before
1233 // completion.
1234 if (ec != boost::asio::error::operation_aborted)
1235 {
1236 std::cerr << "smi timeout async_wait failed: " << ec.message()
1237 << "\n";
1238 }
1239 return;
1240 }
1241 std::cerr << "SMI asserted for " << std::to_string(smiTimeoutMs)
1242 << " ms\n";
1243 smiTimeoutLog();
1244 conn->async_method_call(
1245 [](boost::system::error_code ec,
1246 const std::variant<bool>& property) {
1247 if (ec)
1248 {
1249 return;
1250 }
1251 const bool* reset = std::get_if<bool>(&property);
1252 if (reset == nullptr)
1253 {
1254 std::cerr << "Unable to read reset on SMI value\n";
1255 return;
1256 }
Jason M. Bills94785442020-01-07 15:22:09 -08001257#ifdef HOST_ERROR_CRASHDUMP_ON_SMI_TIMEOUT
Jason M. Billsb61766b2019-11-26 17:02:44 -08001258 startCrashdumpAndRecovery(*reset, "SMI Timeout");
Jason M. Bills94785442020-01-07 15:22:09 -08001259#else
1260 if (*reset)
1261 {
1262 std::cout << "Recovering the system\n";
1263 startPowerCycle();
1264 }
1265#endif
Jason M. Bills89922f82019-08-06 11:10:02 -07001266 },
1267 "xyz.openbmc_project.Settings",
1268 "/xyz/openbmc_project/control/bmc_reset_disables",
1269 "org.freedesktop.DBus.Properties", "Get",
1270 "xyz.openbmc_project.Control.ResetDisables", "ResetOnSMI");
1271 });
1272}
1273
1274static void smiHandler()
1275{
1276 if (!hostOff)
1277 {
1278 gpiod::line_event gpioLineEvent = smiLine.event_read();
1279
1280 bool smi = gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
1281 if (smi)
1282 {
1283 smiAssertHandler();
1284 }
1285 else
1286 {
1287 smiAssertTimer.cancel();
1288 }
1289 }
1290 smiEvent.async_wait(boost::asio::posix::stream_descriptor::wait_read,
1291 [](const boost::system::error_code ec) {
1292 if (ec)
1293 {
1294 std::cerr
1295 << "smi handler error: " << ec.message()
1296 << "\n";
1297 return;
1298 }
1299 smiHandler();
1300 });
1301}
1302
Jason M. Billsa15c2522019-08-16 10:01:44 -07001303static void initializeErrorState()
1304{
1305 // Handle CPU_CATERR if it's asserted now
1306 if (caterrLine.get_value() == 0)
1307 {
1308 caterrAssertHandler();
1309 }
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001310
Jason M. Bills8c584392019-08-19 11:05:51 -07001311 // Handle CPU_ERR0 if it's asserted now
1312 if (err0Line.get_value() == 0)
1313 {
1314 err0AssertHandler();
1315 }
1316
Jason M. Bills75af3962019-08-19 11:07:17 -07001317 // Handle CPU_ERR1 if it's asserted now
1318 if (err1Line.get_value() == 0)
1319 {
1320 err1AssertHandler();
1321 }
1322
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001323 // Handle CPU_ERR2 if it's asserted now
1324 if (err2Line.get_value() == 0)
1325 {
1326 err2AssertHandler();
1327 }
Jason M. Bills89922f82019-08-06 11:10:02 -07001328
1329 // Handle SMI if it's asserted now
1330 if (smiLine.get_value() == 0)
1331 {
1332 smiAssertHandler();
1333 }
Jason M. Bills08866542019-08-16 12:04:19 -07001334
Jason M. Billse94f5e12019-09-13 11:11:34 -07001335 // Handle CPU1_THERMTRIP if it's asserted now
1336 if (cpu1ThermtripLine.get_value() == 0)
1337 {
1338 cpu1ThermtripAssertHandler();
1339 }
1340
1341 // Handle CPU2_THERMTRIP if it's asserted now
1342 if (cpu2ThermtripLine.get_value() == 0)
1343 {
1344 cpu2ThermtripAssertHandler();
1345 }
1346
1347 // Handle CPU1_VRHOT if it's asserted now
1348 if (cpu1VRHotLine.get_value() == 0)
1349 {
1350 cpu1VRHotAssertHandler();
1351 }
1352
1353 // Handle CPU1_MEM_ABCD_VRHOT if it's asserted now
1354 if (cpu1MemABCDVRHotLine.get_value() == 0)
1355 {
1356 cpu1MemABCDVRHotAssertHandler();
1357 }
1358
1359 // Handle CPU1_MEM_EFGH_VRHOT if it's asserted now
1360 if (cpu1MemEFGHVRHotLine.get_value() == 0)
1361 {
1362 cpu1MemEFGHVRHotAssertHandler();
1363 }
1364
1365 // Handle CPU2_VRHOT if it's asserted now
1366 if (cpu2VRHotLine.get_value() == 0)
1367 {
1368 cpu2VRHotAssertHandler();
1369 }
1370
1371 // Handle CPU2_MEM_ABCD_VRHOT if it's asserted now
1372 if (cpu2MemABCDVRHotLine.get_value() == 0)
1373 {
1374 cpu2MemABCDVRHotAssertHandler();
1375 }
1376
1377 // Handle CPU2_MEM_EFGH_VRHOT if it's asserted now
1378 if (cpu2MemEFGHVRHotLine.get_value() == 0)
1379 {
1380 cpu2MemEFGHVRHotAssertHandler();
1381 }
1382
Jason M. Bills08866542019-08-16 12:04:19 -07001383 // Handle PCH_BMC_THERMTRIP if it's asserted now
1384 if (pchThermtripLine.get_value() == 0)
1385 {
1386 ssbThermTripLog();
1387 }
Jason M. Billsa15c2522019-08-16 10:01:44 -07001388}
Jason M. Bills1490b142019-07-01 15:48:43 -07001389} // namespace host_error_monitor
1390
1391int main(int argc, char* argv[])
1392{
1393 // setup connection to dbus
1394 host_error_monitor::conn =
1395 std::make_shared<sdbusplus::asio::connection>(host_error_monitor::io);
1396
Jason M. Billsc4b91f22019-11-26 17:04:50 -08001397 // Host Error Monitor Service
Jason M. Bills1490b142019-07-01 15:48:43 -07001398 host_error_monitor::conn->request_name(
1399 "xyz.openbmc_project.HostErrorMonitor");
1400 sdbusplus::asio::object_server server =
1401 sdbusplus::asio::object_server(host_error_monitor::conn);
1402
Jason M. Billsc4b91f22019-11-26 17:04:50 -08001403 // Restart Cause Interface
1404 host_error_monitor::hostErrorTimeoutIface =
1405 server.add_interface("/xyz/openbmc_project/host_error_monitor",
1406 "xyz.openbmc_project.HostErrorMonitor.Timeout");
1407
1408 host_error_monitor::hostErrorTimeoutIface->register_property(
1409 "IERRTimeoutMs", host_error_monitor::caterrTimeoutMs,
1410 [](const std::size_t& requested, std::size_t& resp) {
1411 if (requested > host_error_monitor::caterrTimeoutMsMax)
1412 {
1413 std::cerr << "IERRTimeoutMs update to " << requested
1414 << "ms rejected. Cannot be greater than "
1415 << host_error_monitor::caterrTimeoutMsMax << "ms.\n";
1416 return 0;
1417 }
1418 std::cerr << "IERRTimeoutMs updated to " << requested << "ms\n";
1419 host_error_monitor::caterrTimeoutMs = requested;
1420 resp = requested;
1421 return 1;
1422 },
1423 [](std::size_t& resp) { return host_error_monitor::caterrTimeoutMs; });
1424 host_error_monitor::hostErrorTimeoutIface->initialize();
1425
Jason M. Bills1490b142019-07-01 15:48:43 -07001426 // Start tracking host state
1427 std::shared_ptr<sdbusplus::bus::match::match> hostStateMonitor =
1428 host_error_monitor::startHostStateMonitor();
1429
1430 // Initialize the host state
1431 host_error_monitor::initializeHostState();
1432
1433 // Request CPU_CATERR GPIO events
1434 if (!host_error_monitor::requestGPIOEvents(
1435 "CPU_CATERR", host_error_monitor::caterrHandler,
1436 host_error_monitor::caterrLine, host_error_monitor::caterrEvent))
1437 {
1438 return -1;
1439 }
1440
Jason M. Bills8c584392019-08-19 11:05:51 -07001441 // Request CPU_ERR0 GPIO events
1442 if (!host_error_monitor::requestGPIOEvents(
1443 "CPU_ERR0", host_error_monitor::err0Handler,
1444 host_error_monitor::err0Line, host_error_monitor::err0Event))
1445 {
1446 return -1;
1447 }
1448
Jason M. Bills75af3962019-08-19 11:07:17 -07001449 // Request CPU_ERR1 GPIO events
1450 if (!host_error_monitor::requestGPIOEvents(
1451 "CPU_ERR1", host_error_monitor::err1Handler,
1452 host_error_monitor::err1Line, host_error_monitor::err1Event))
1453 {
1454 return -1;
1455 }
1456
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001457 // Request CPU_ERR2 GPIO events
1458 if (!host_error_monitor::requestGPIOEvents(
1459 "CPU_ERR2", host_error_monitor::err2Handler,
1460 host_error_monitor::err2Line, host_error_monitor::err2Event))
1461 {
1462 return -1;
1463 }
1464
Jason M. Bills89922f82019-08-06 11:10:02 -07001465 // Request SMI GPIO events
1466 if (!host_error_monitor::requestGPIOEvents(
1467 "SMI", host_error_monitor::smiHandler, host_error_monitor::smiLine,
1468 host_error_monitor::smiEvent))
1469 {
1470 return -1;
1471 }
1472
Jason M. Bills45e87e02019-09-09 14:45:38 -07001473 // Request CPU1_FIVR_FAULT GPIO input
1474 if (!host_error_monitor::requestGPIOInput(
1475 "CPU1_FIVR_FAULT", host_error_monitor::cpu1FIVRFaultLine))
1476 {
1477 return -1;
1478 }
1479
Jason M. Bills78c5eed2019-08-28 14:00:40 -07001480 // Request CPU1_THERMTRIP GPIO events
1481 if (!host_error_monitor::requestGPIOEvents(
1482 "CPU1_THERMTRIP", host_error_monitor::cpu1ThermtripHandler,
1483 host_error_monitor::cpu1ThermtripLine,
1484 host_error_monitor::cpu1ThermtripEvent))
1485 {
1486 return -1;
1487 }
1488
Jason M. Bills45e87e02019-09-09 14:45:38 -07001489 // Request CPU2_FIVR_FAULT GPIO input
1490 if (!host_error_monitor::requestGPIOInput(
1491 "CPU2_FIVR_FAULT", host_error_monitor::cpu2FIVRFaultLine))
1492 {
1493 return -1;
1494 }
1495
Jason M. Bills78c5eed2019-08-28 14:00:40 -07001496 // Request CPU2_THERMTRIP GPIO events
1497 if (!host_error_monitor::requestGPIOEvents(
1498 "CPU2_THERMTRIP", host_error_monitor::cpu2ThermtripHandler,
1499 host_error_monitor::cpu2ThermtripLine,
1500 host_error_monitor::cpu2ThermtripEvent))
1501 {
1502 return -1;
1503 }
1504
Jason M. Bills250fa632019-08-28 15:58:25 -07001505 // Request CPU1_VRHOT GPIO events
1506 if (!host_error_monitor::requestGPIOEvents(
1507 "CPU1_VRHOT", host_error_monitor::cpu1VRHotHandler,
1508 host_error_monitor::cpu1VRHotLine,
1509 host_error_monitor::cpu1VRHotEvent))
1510 {
1511 return -1;
1512 }
1513
Jason M. Bills9647ba72019-08-29 14:19:19 -07001514 // Request CPU1_MEM_ABCD_VRHOT GPIO events
1515 if (!host_error_monitor::requestGPIOEvents(
1516 "CPU1_MEM_ABCD_VRHOT", host_error_monitor::cpu1MemABCDVRHotHandler,
1517 host_error_monitor::cpu1MemABCDVRHotLine,
1518 host_error_monitor::cpu1MemABCDVRHotEvent))
1519 {
1520 return -1;
1521 }
1522
1523 // Request CPU1_MEM_EFGH_VRHOT GPIO events
1524 if (!host_error_monitor::requestGPIOEvents(
1525 "CPU1_MEM_EFGH_VRHOT", host_error_monitor::cpu1MemEFGHVRHotHandler,
1526 host_error_monitor::cpu1MemEFGHVRHotLine,
1527 host_error_monitor::cpu1MemEFGHVRHotEvent))
1528 {
1529 return -1;
1530 }
1531
Jason M. Bills250fa632019-08-28 15:58:25 -07001532 // Request CPU2_VRHOT GPIO events
1533 if (!host_error_monitor::requestGPIOEvents(
1534 "CPU2_VRHOT", host_error_monitor::cpu2VRHotHandler,
1535 host_error_monitor::cpu2VRHotLine,
1536 host_error_monitor::cpu2VRHotEvent))
1537 {
1538 return -1;
1539 }
1540
Jason M. Bills9647ba72019-08-29 14:19:19 -07001541 // Request CPU2_MEM_ABCD_VRHOT GPIO events
1542 if (!host_error_monitor::requestGPIOEvents(
1543 "CPU2_MEM_ABCD_VRHOT", host_error_monitor::cpu2MemABCDVRHotHandler,
1544 host_error_monitor::cpu2MemABCDVRHotLine,
1545 host_error_monitor::cpu2MemABCDVRHotEvent))
1546 {
1547 return -1;
1548 }
1549
1550 // Request CPU2_MEM_EFGH_VRHOT GPIO events
1551 if (!host_error_monitor::requestGPIOEvents(
1552 "CPU2_MEM_EFGH_VRHOT", host_error_monitor::cpu2MemEFGHVRHotHandler,
1553 host_error_monitor::cpu2MemEFGHVRHotLine,
1554 host_error_monitor::cpu2MemEFGHVRHotEvent))
1555 {
1556 return -1;
1557 }
1558
Chen Yugange6c0f1c2019-08-02 20:36:42 +08001559 // Request PCH_BMC_THERMTRIP GPIO events
1560 if (!host_error_monitor::requestGPIOEvents(
1561 "PCH_BMC_THERMTRIP", host_error_monitor::pchThermtripHandler,
1562 host_error_monitor::pchThermtripLine,
1563 host_error_monitor::pchThermtripEvent))
1564 {
1565 return -1;
1566 }
1567
Jason M. Bills1490b142019-07-01 15:48:43 -07001568 host_error_monitor::io.run();
1569
1570 return 0;
1571}