blob: f24925d85d92b96820fe87bb56b998ecf48ec305 [file] [log] [blame]
Jason M. Bills1490b142019-07-01 15:48:43 -07001/*
2// Copyright (c) 2019 Intel Corporation
3//
4// Licensed under the Apache License, Version 2.0 (the "License");
5// you may not use this file except in compliance with the License.
6// You may obtain a copy of the License at
7//
8// http://www.apache.org/licenses/LICENSE-2.0
9//
10// Unless required by applicable law or agreed to in writing, software
11// distributed under the License is distributed on an "AS IS" BASIS,
12// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13// See the License for the specific language governing permissions and
14// limitations under the License.
15*/
Jason M. Bills6a2cb692019-08-06 11:03:49 -070016#include <peci.h>
Chen Yugange6c0f1c2019-08-02 20:36:42 +080017#include <systemd/sd-journal.h>
18
Jason M. Bills6a2cb692019-08-06 11:03:49 -070019#include <bitset>
Jason M. Bills1490b142019-07-01 15:48:43 -070020#include <boost/asio/posix/stream_descriptor.hpp>
21#include <gpiod.hpp>
22#include <iostream>
23#include <sdbusplus/asio/object_server.hpp>
Jason M. Billsd1a19f62019-08-06 11:52:58 -070024#include <variant>
Jason M. Bills1490b142019-07-01 15:48:43 -070025
26namespace host_error_monitor
27{
28static boost::asio::io_service io;
29static std::shared_ptr<sdbusplus::asio::connection> conn;
Jason M. Billsc4b91f22019-11-26 17:04:50 -080030static std::shared_ptr<sdbusplus::asio::dbus_interface> hostErrorTimeoutIface;
Jason M. Bills1490b142019-07-01 15:48:43 -070031
32static bool hostOff = true;
33
Jason M. Billsc4b91f22019-11-26 17:04:50 -080034static size_t caterrTimeoutMs = 2000;
35const static constexpr size_t caterrTimeoutMsMax = 600000; // 10 minutes maximum
Jason M. Billscbf78532019-08-16 15:32:11 -070036const static constexpr size_t errTimeoutMs = 90000;
Jason M. Bills89922f82019-08-06 11:10:02 -070037const static constexpr size_t smiTimeoutMs = 90000;
Jason M. Bills1490b142019-07-01 15:48:43 -070038const static constexpr size_t crashdumpTimeoutS = 300;
39
40// Timers
41// Timer for CATERR asserted
42static boost::asio::steady_timer caterrAssertTimer(io);
Jason M. Bills8c584392019-08-19 11:05:51 -070043// Timer for ERR0 asserted
44static boost::asio::steady_timer err0AssertTimer(io);
Jason M. Bills75af3962019-08-19 11:07:17 -070045// Timer for ERR1 asserted
46static boost::asio::steady_timer err1AssertTimer(io);
Jason M. Bills6a2cb692019-08-06 11:03:49 -070047// Timer for ERR2 asserted
48static boost::asio::steady_timer err2AssertTimer(io);
Jason M. Bills89922f82019-08-06 11:10:02 -070049// Timer for SMI asserted
50static boost::asio::steady_timer smiAssertTimer(io);
Jason M. Bills1490b142019-07-01 15:48:43 -070051
52// GPIO Lines and Event Descriptors
53static gpiod::line caterrLine;
54static boost::asio::posix::stream_descriptor caterrEvent(io);
Jason M. Bills8c584392019-08-19 11:05:51 -070055static gpiod::line err0Line;
56static boost::asio::posix::stream_descriptor err0Event(io);
Jason M. Bills75af3962019-08-19 11:07:17 -070057static gpiod::line err1Line;
58static boost::asio::posix::stream_descriptor err1Event(io);
Jason M. Bills6a2cb692019-08-06 11:03:49 -070059static gpiod::line err2Line;
60static boost::asio::posix::stream_descriptor err2Event(io);
Jason M. Bills89922f82019-08-06 11:10:02 -070061static gpiod::line smiLine;
62static boost::asio::posix::stream_descriptor smiEvent(io);
Jason M. Bills45e87e02019-09-09 14:45:38 -070063static gpiod::line cpu1FIVRFaultLine;
Jason M. Bills78c5eed2019-08-28 14:00:40 -070064static gpiod::line cpu1ThermtripLine;
65static boost::asio::posix::stream_descriptor cpu1ThermtripEvent(io);
Jason M. Bills45e87e02019-09-09 14:45:38 -070066static gpiod::line cpu2FIVRFaultLine;
Jason M. Bills78c5eed2019-08-28 14:00:40 -070067static gpiod::line cpu2ThermtripLine;
68static boost::asio::posix::stream_descriptor cpu2ThermtripEvent(io);
Jason M. Bills250fa632019-08-28 15:58:25 -070069static gpiod::line cpu1VRHotLine;
70static boost::asio::posix::stream_descriptor cpu1VRHotEvent(io);
71static gpiod::line cpu2VRHotLine;
Jason M. Bills9647ba72019-08-29 14:19:19 -070072static boost::asio::posix::stream_descriptor cpu1MemABCDVRHotEvent(io);
73static gpiod::line cpu1MemEFGHVRHotLine;
74static boost::asio::posix::stream_descriptor cpu1MemEFGHVRHotEvent(io);
75static gpiod::line cpu2MemABCDVRHotLine;
Jason M. Bills250fa632019-08-28 15:58:25 -070076static boost::asio::posix::stream_descriptor cpu2VRHotEvent(io);
Jason M. Bills9647ba72019-08-29 14:19:19 -070077static gpiod::line cpu1MemABCDVRHotLine;
78static boost::asio::posix::stream_descriptor cpu2MemABCDVRHotEvent(io);
79static gpiod::line cpu2MemEFGHVRHotLine;
80static boost::asio::posix::stream_descriptor cpu2MemEFGHVRHotEvent(io);
Chen Yugange6c0f1c2019-08-02 20:36:42 +080081//----------------------------------
82// PCH_BMC_THERMTRIP function related definition
83//----------------------------------
Chen Yugange6c0f1c2019-08-02 20:36:42 +080084static gpiod::line pchThermtripLine;
85static boost::asio::posix::stream_descriptor pchThermtripEvent(io);
jayaprakash Mutyala009adbc2019-12-24 22:08:07 +000086//----------------------------------
87// CPU_MEM_THERM_EVENT function related definition
88//----------------------------------
89static gpiod::line cpu1MemtripLine;
90static boost::asio::posix::stream_descriptor cpu1MemtripEvent(io);
91static gpiod::line cpu2MemtripLine;
92static boost::asio::posix::stream_descriptor cpu2MemtripEvent(io);
Jason M. Bills1490b142019-07-01 15:48:43 -070093
Jason M. Billsa3397932019-08-06 11:07:21 -070094static void cpuIERRLog()
95{
96 sd_journal_send("MESSAGE=HostError: IERR", "PRIORITY=%i", LOG_INFO,
97 "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError",
98 "REDFISH_MESSAGE_ARGS=%s", "IERR", NULL);
99}
100
101static void cpuIERRLog(const int cpuNum)
102{
103 std::string msg = "IERR on CPU " + std::to_string(cpuNum + 1);
104
105 sd_journal_send("MESSAGE=HostError: %s", msg.c_str(), "PRIORITY=%i",
106 LOG_INFO, "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError",
107 "REDFISH_MESSAGE_ARGS=%s", msg.c_str(), NULL);
108}
109
110static void cpuIERRLog(const int cpuNum, const std::string& type)
111{
112 std::string msg = type + " IERR on CPU " + std::to_string(cpuNum + 1);
113
114 sd_journal_send("MESSAGE=HostError: %s", msg.c_str(), "PRIORITY=%i",
115 LOG_INFO, "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError",
116 "REDFISH_MESSAGE_ARGS=%s", msg.c_str(), NULL);
117}
118
Jason M. Billscbf78532019-08-16 15:32:11 -0700119static void cpuERRXLog(const int errPin)
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700120{
Jason M. Billscbf78532019-08-16 15:32:11 -0700121 std::string msg = "ERR" + std::to_string(errPin) + " Timeout";
122
123 sd_journal_send("MESSAGE=HostError: %s", msg.c_str(), "PRIORITY=%i",
124 LOG_INFO, "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError",
125 "REDFISH_MESSAGE_ARGS=%s", msg.c_str(), NULL);
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700126}
127
Jason M. Billscbf78532019-08-16 15:32:11 -0700128static void cpuERRXLog(const int errPin, const int cpuNum)
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700129{
Jason M. Billscbf78532019-08-16 15:32:11 -0700130 std::string msg = "ERR" + std::to_string(errPin) + " Timeout on CPU " +
131 std::to_string(cpuNum + 1);
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700132
133 sd_journal_send("MESSAGE=HostError: %s", msg.c_str(), "PRIORITY=%i",
134 LOG_INFO, "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError",
135 "REDFISH_MESSAGE_ARGS=%s", msg.c_str(), NULL);
136}
137
Jason M. Bills89922f82019-08-06 11:10:02 -0700138static void smiTimeoutLog()
139{
140 sd_journal_send("MESSAGE=HostError: SMI Timeout", "PRIORITY=%i", LOG_INFO,
141 "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError",
142 "REDFISH_MESSAGE_ARGS=%s", "SMI Timeout", NULL);
143}
144
Jason M. Bills45e87e02019-09-09 14:45:38 -0700145static void cpuBootFIVRFaultLog(const int cpuNum)
146{
147 std::string msg = "Boot FIVR Fault on CPU " + std::to_string(cpuNum);
148
149 sd_journal_send("MESSAGE=HostError: %s", msg.c_str(), "PRIORITY=%i",
150 LOG_INFO, "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError",
151 "REDFISH_MESSAGE_ARGS=%s", msg.c_str(), NULL);
152}
153
Jason M. Bills78c5eed2019-08-28 14:00:40 -0700154static void cpuThermTripLog(const int cpuNum)
155{
156 std::string msg = "CPU " + std::to_string(cpuNum) + " thermal trip";
157
158 sd_journal_send("MESSAGE=HostError: %s", msg.c_str(), "PRIORITY=%i",
159 LOG_INFO, "REDFISH_MESSAGE_ID=%s",
160 "OpenBMC.0.1.CPUThermalTrip", "REDFISH_MESSAGE_ARGS=%d",
161 cpuNum, NULL);
162}
163
jayaprakash Mutyala009adbc2019-12-24 22:08:07 +0000164static void memThermTripLog(const int cpuNum)
165{
166 std::string cpuNumber = "CPU " + std::to_string(cpuNum);
167 std::string msg = cpuNumber + " Memory Thermal trip.";
168
169 sd_journal_send("MESSAGE=HostError: %s", msg.c_str(), "PRIORITY=%i",
170 LOG_ERR, "REDFISH_MESSAGE_ID=%s",
171 "OpenBMC.0.1.MemoryThermTrip", "REDFISH_MESSAGE_ARGS=%s",
172 cpuNumber.c_str(), NULL);
173}
174
Jason M. Bills250fa632019-08-28 15:58:25 -0700175static void cpuVRHotLog(const std::string& vr)
176{
177 std::string msg = vr + " Voltage Regulator Overheated.";
178
179 sd_journal_send("MESSAGE=HostError: %s", msg.c_str(), "PRIORITY=%i",
180 LOG_INFO, "REDFISH_MESSAGE_ID=%s",
181 "OpenBMC.0.1.VoltageRegulatorOverheated",
182 "REDFISH_MESSAGE_ARGS=%s", vr.c_str(), NULL);
183}
184
Jason M. Bills08866542019-08-16 12:04:19 -0700185static void ssbThermTripLog()
186{
187 sd_journal_send("MESSAGE=HostError: SSB thermal trip", "PRIORITY=%i",
188 LOG_INFO, "REDFISH_MESSAGE_ID=%s",
189 "OpenBMC.0.1.SsbThermalTrip", NULL);
190}
191
Jason M. Billsa15c2522019-08-16 10:01:44 -0700192static void initializeErrorState();
Jason M. Bills1490b142019-07-01 15:48:43 -0700193static void initializeHostState()
194{
195 conn->async_method_call(
196 [](boost::system::error_code ec,
197 const std::variant<std::string>& property) {
198 if (ec)
199 {
200 return;
201 }
202 const std::string* state = std::get_if<std::string>(&property);
203 if (state == nullptr)
204 {
205 std::cerr << "Unable to read host state value\n";
206 return;
207 }
208 hostOff = *state == "xyz.openbmc_project.State.Host.HostState.Off";
Jason M. Billsa15c2522019-08-16 10:01:44 -0700209 // If the system is on, initialize the error state
210 if (!hostOff)
211 {
212 initializeErrorState();
213 }
Jason M. Bills1490b142019-07-01 15:48:43 -0700214 },
215 "xyz.openbmc_project.State.Host", "/xyz/openbmc_project/state/host0",
216 "org.freedesktop.DBus.Properties", "Get",
217 "xyz.openbmc_project.State.Host", "CurrentHostState");
218}
219
220static std::shared_ptr<sdbusplus::bus::match::match> startHostStateMonitor()
221{
222 return std::make_shared<sdbusplus::bus::match::match>(
223 *conn,
224 "type='signal',interface='org.freedesktop.DBus.Properties',"
225 "member='PropertiesChanged',arg0namespace='xyz.openbmc_project.State."
226 "Host'",
227 [](sdbusplus::message::message& msg) {
228 std::string interfaceName;
229 boost::container::flat_map<std::string, std::variant<std::string>>
230 propertiesChanged;
231 std::string state;
232 try
233 {
234 msg.read(interfaceName, propertiesChanged);
235 state =
236 std::get<std::string>(propertiesChanged.begin()->second);
237 }
238 catch (std::exception& e)
239 {
240 std::cerr << "Unable to read host state\n";
241 return;
242 }
243 hostOff = state == "xyz.openbmc_project.State.Host.HostState.Off";
244
Jason M. Bills1490b142019-07-01 15:48:43 -0700245 if (hostOff)
246 {
Jason M. Billse94f5e12019-09-13 11:11:34 -0700247 // No host events should fire while off, so cancel any pending
248 // timers
Jason M. Bills1490b142019-07-01 15:48:43 -0700249 caterrAssertTimer.cancel();
Jason M. Bills8c584392019-08-19 11:05:51 -0700250 err0AssertTimer.cancel();
Jason M. Bills75af3962019-08-19 11:07:17 -0700251 err1AssertTimer.cancel();
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700252 err2AssertTimer.cancel();
Jason M. Bills89922f82019-08-06 11:10:02 -0700253 smiAssertTimer.cancel();
Jason M. Bills1490b142019-07-01 15:48:43 -0700254 }
Jason M. Billse94f5e12019-09-13 11:11:34 -0700255 else
256 {
257 // Handle any initial errors when the host turns on
258 initializeErrorState();
259 }
Jason M. Bills1490b142019-07-01 15:48:43 -0700260 });
261}
262
263static bool requestGPIOEvents(
264 const std::string& name, const std::function<void()>& handler,
265 gpiod::line& gpioLine,
266 boost::asio::posix::stream_descriptor& gpioEventDescriptor)
267{
268 // Find the GPIO line
269 gpioLine = gpiod::find_line(name);
270 if (!gpioLine)
271 {
272 std::cerr << "Failed to find the " << name << " line\n";
273 return false;
274 }
275
276 try
277 {
278 gpioLine.request(
279 {"host-error-monitor", gpiod::line_request::EVENT_BOTH_EDGES});
280 }
281 catch (std::exception&)
282 {
283 std::cerr << "Failed to request events for " << name << "\n";
284 return false;
285 }
286
287 int gpioLineFd = gpioLine.event_get_fd();
288 if (gpioLineFd < 0)
289 {
290 std::cerr << "Failed to get " << name << " fd\n";
291 return false;
292 }
293
294 gpioEventDescriptor.assign(gpioLineFd);
295
296 gpioEventDescriptor.async_wait(
297 boost::asio::posix::stream_descriptor::wait_read,
298 [&name, handler](const boost::system::error_code ec) {
299 if (ec)
300 {
301 std::cerr << name << " fd handler error: " << ec.message()
302 << "\n";
303 return;
304 }
305 handler();
306 });
307 return true;
308}
309
Jason M. Bills45e87e02019-09-09 14:45:38 -0700310static bool requestGPIOInput(const std::string& name, gpiod::line& gpioLine)
311{
312 // Find the GPIO line
313 gpioLine = gpiod::find_line(name);
314 if (!gpioLine)
315 {
316 std::cerr << "Failed to find the " << name << " line.\n";
317 return false;
318 }
319
320 // Request GPIO input
321 try
322 {
323 gpioLine.request({__FUNCTION__, gpiod::line_request::DIRECTION_INPUT});
324 }
325 catch (std::exception&)
326 {
327 std::cerr << "Failed to request " << name << " input\n";
328 return false;
329 }
330
331 return true;
332}
333
Jason M. Bills1490b142019-07-01 15:48:43 -0700334static void startPowerCycle()
335{
336 conn->async_method_call(
337 [](boost::system::error_code ec) {
338 if (ec)
339 {
340 std::cerr << "failed to set Chassis State\n";
341 }
342 },
343 "xyz.openbmc_project.State.Chassis",
344 "/xyz/openbmc_project/state/chassis0",
345 "org.freedesktop.DBus.Properties", "Set",
346 "xyz.openbmc_project.State.Chassis", "RequestedPowerTransition",
347 std::variant<std::string>{
348 "xyz.openbmc_project.State.Chassis.Transition.PowerCycle"});
349}
350
Jason M. Billsb61766b2019-11-26 17:02:44 -0800351static void startCrashdumpAndRecovery(bool recoverSystem,
352 const std::string& triggerType)
Jason M. Bills1490b142019-07-01 15:48:43 -0700353{
354 std::cout << "Starting crashdump\n";
355 static std::shared_ptr<sdbusplus::bus::match::match> crashdumpCompleteMatch;
356 static boost::asio::steady_timer crashdumpTimer(io);
357
358 crashdumpCompleteMatch = std::make_shared<sdbusplus::bus::match::match>(
359 *conn,
360 "type='signal',interface='org.freedesktop.DBus.Properties',"
361 "member='PropertiesChanged',arg0namespace='com.intel.crashdump'",
362 [recoverSystem](sdbusplus::message::message& msg) {
363 crashdumpTimer.cancel();
364 std::cout << "Crashdump completed\n";
365 if (recoverSystem)
366 {
367 std::cout << "Recovering the system\n";
368 startPowerCycle();
369 }
370 crashdumpCompleteMatch.reset();
371 });
372
373 crashdumpTimer.expires_after(std::chrono::seconds(crashdumpTimeoutS));
374 crashdumpTimer.async_wait([](const boost::system::error_code ec) {
375 if (ec)
376 {
377 // operation_aborted is expected if timer is canceled
378 if (ec != boost::asio::error::operation_aborted)
379 {
380 std::cerr << "Crashdump async_wait failed: " << ec.message()
381 << "\n";
382 }
383 std::cout << "Crashdump timer canceled\n";
384 return;
385 }
386 std::cerr << "Crashdump failed to complete before timeout\n";
387 crashdumpCompleteMatch.reset();
388 });
389
390 conn->async_method_call(
391 [](boost::system::error_code ec) {
392 if (ec)
393 {
394 std::cerr << "failed to start Crashdump\n";
395 crashdumpTimer.cancel();
396 crashdumpCompleteMatch.reset();
397 }
398 },
399 "com.intel.crashdump", "/com/intel/crashdump",
Jason M. Billsb61766b2019-11-26 17:02:44 -0800400 "com.intel.crashdump.Stored", "GenerateStoredLog", triggerType);
Jason M. Bills1490b142019-07-01 15:48:43 -0700401}
402
Jason M. Billsd1a19f62019-08-06 11:52:58 -0700403static void incrementCPUErrorCount(int cpuNum)
404{
405 std::string propertyName = "ErrorCountCPU" + std::to_string(cpuNum + 1);
406
407 // Get the current count
408 conn->async_method_call(
409 [propertyName](boost::system::error_code ec,
410 const std::variant<uint8_t>& property) {
411 if (ec)
412 {
413 std::cerr << "Failed to read " << propertyName << ": "
414 << ec.message() << "\n";
415 return;
416 }
417 const uint8_t* errorCountVariant = std::get_if<uint8_t>(&property);
418 if (errorCountVariant == nullptr)
419 {
420 std::cerr << propertyName << " invalid\n";
421 return;
422 }
423 uint8_t errorCount = *errorCountVariant;
424 if (errorCount == std::numeric_limits<uint8_t>::max())
425 {
426 std::cerr << "Maximum error count reached\n";
427 return;
428 }
429 // Increment the count
430 errorCount++;
431 conn->async_method_call(
432 [propertyName](boost::system::error_code ec) {
433 if (ec)
434 {
435 std::cerr << "Failed to set " << propertyName << ": "
436 << ec.message() << "\n";
437 }
438 },
439 "xyz.openbmc_project.Settings",
440 "/xyz/openbmc_project/control/processor_error_config",
441 "org.freedesktop.DBus.Properties", "Set",
442 "xyz.openbmc_project.Control.Processor.ErrConfig", propertyName,
443 std::variant<uint8_t>{errorCount});
444 },
445 "xyz.openbmc_project.Settings",
446 "/xyz/openbmc_project/control/processor_error_config",
447 "org.freedesktop.DBus.Properties", "Get",
448 "xyz.openbmc_project.Control.Processor.ErrConfig", propertyName);
449}
450
Jason M. Billsa3397932019-08-06 11:07:21 -0700451static bool checkIERRCPUs()
452{
453 bool cpuIERRFound = false;
454 for (int cpu = 0, addr = MIN_CLIENT_ADDR; addr <= MAX_CLIENT_ADDR;
455 cpu++, addr++)
456 {
457 uint8_t cc = 0;
458 CPUModel model{};
459 uint8_t stepping = 0;
460 if (peci_GetCPUID(addr, &model, &stepping, &cc) != PECI_CC_SUCCESS)
461 {
462 std::cerr << "Cannot get CPUID!\n";
463 continue;
464 }
465
466 switch (model)
467 {
468 case skx:
469 {
470 // First check the MCA_ERR_SRC_LOG to see if this is the CPU
471 // that caused the IERR
472 uint32_t mcaErrSrcLog = 0;
473 if (peci_RdPkgConfig(addr, 0, 5, 4, (uint8_t*)&mcaErrSrcLog,
474 &cc) != PECI_CC_SUCCESS)
475 {
476 continue;
477 }
478 // Check MSMI_INTERNAL (20) and IERR_INTERNAL (27)
479 if ((mcaErrSrcLog & (1 << 20)) || (mcaErrSrcLog & (1 << 27)))
480 {
481 // TODO: Light the CPU fault LED?
482 cpuIERRFound = true;
Jason M. Billsd1a19f62019-08-06 11:52:58 -0700483 incrementCPUErrorCount(cpu);
Jason M. Billsa3397932019-08-06 11:07:21 -0700484 // Next check if it's a CPU/VR mismatch by reading the
485 // IA32_MC4_STATUS MSR (0x411)
486 uint64_t mc4Status = 0;
487 if (peci_RdIAMSR(addr, 0, 0x411, &mc4Status, &cc) !=
488 PECI_CC_SUCCESS)
489 {
490 continue;
491 }
492 // Check MSEC bits 31:24 for
493 // MCA_SVID_VCCIN_VR_ICC_MAX_FAILURE (0x40),
494 // MCA_SVID_VCCIN_VR_VOUT_FAILURE (0x42), or
495 // MCA_SVID_CPU_VR_CAPABILITY_ERROR (0x43)
496 if ((mc4Status & (0x40 << 24)) ||
497 (mc4Status & (0x42 << 24)) ||
498 (mc4Status & (0x43 << 24)))
499 {
500 cpuIERRLog(cpu, "CPU/VR Mismatch");
501 continue;
502 }
503
504 // Next check if it's a Core FIVR fault by looking for a
505 // non-zero value of CORE_FIVR_ERR_LOG (B(1) D30 F2 offset
506 // 80h)
507 uint32_t coreFIVRErrLog = 0;
508 if (peci_RdPCIConfigLocal(
509 addr, 1, 30, 2, 0x80, sizeof(uint32_t),
510 (uint8_t*)&coreFIVRErrLog, &cc) != PECI_CC_SUCCESS)
511 {
512 continue;
513 }
514 if (coreFIVRErrLog)
515 {
516 cpuIERRLog(cpu, "Core FIVR Fault");
517 continue;
518 }
519
520 // Next check if it's an Uncore FIVR fault by looking for a
521 // non-zero value of UNCORE_FIVR_ERR_LOG (B(1) D30 F2 offset
522 // 84h)
523 uint32_t uncoreFIVRErrLog = 0;
524 if (peci_RdPCIConfigLocal(addr, 1, 30, 2, 0x84,
525 sizeof(uint32_t),
526 (uint8_t*)&uncoreFIVRErrLog,
527 &cc) != PECI_CC_SUCCESS)
528 {
529 continue;
530 }
531 if (uncoreFIVRErrLog)
532 {
533 cpuIERRLog(cpu, "Uncore FIVR Fault");
534 continue;
535 }
536
537 // Last if CORE_FIVR_ERR_LOG and UNCORE_FIVR_ERR_LOG are
538 // both zero, but MSEC bits 31:24 have either
539 // MCA_FIVR_CATAS_OVERVOL_FAULT (0x51) or
540 // MCA_FIVR_CATAS_OVERCUR_FAULT (0x52), then log it as an
541 // uncore FIVR fault
542 if (!coreFIVRErrLog && !uncoreFIVRErrLog &&
543 ((mc4Status & (0x51 << 24)) ||
544 (mc4Status & (0x52 << 24))))
545 {
546 cpuIERRLog(cpu, "Uncore FIVR Fault");
547 continue;
548 }
549 cpuIERRLog(cpu);
550 }
551 break;
552 }
553 case icx:
554 {
555 // First check the MCA_ERR_SRC_LOG to see if this is the CPU
556 // that caused the IERR
557 uint32_t mcaErrSrcLog = 0;
558 if (peci_RdPkgConfig(addr, 0, 5, 4, (uint8_t*)&mcaErrSrcLog,
559 &cc) != PECI_CC_SUCCESS)
560 {
561 continue;
562 }
563 // Check MSMI_INTERNAL (20) and IERR_INTERNAL (27)
564 if ((mcaErrSrcLog & (1 << 20)) || (mcaErrSrcLog & (1 << 27)))
565 {
566 // TODO: Light the CPU fault LED?
567 cpuIERRFound = true;
Jason M. Billsd1a19f62019-08-06 11:52:58 -0700568 incrementCPUErrorCount(cpu);
Jason M. Billsa3397932019-08-06 11:07:21 -0700569 // Next check if it's a CPU/VR mismatch by reading the
570 // IA32_MC4_STATUS MSR (0x411)
571 uint64_t mc4Status = 0;
572 if (peci_RdIAMSR(addr, 0, 0x411, &mc4Status, &cc) !=
573 PECI_CC_SUCCESS)
574 {
575 continue;
576 }
577 // TODO: Update MSEC/MSCOD_31_24 check
578 // Check MSEC bits 31:24 for
579 // MCA_SVID_VCCIN_VR_ICC_MAX_FAILURE (0x40),
580 // MCA_SVID_VCCIN_VR_VOUT_FAILURE (0x42), or
581 // MCA_SVID_CPU_VR_CAPABILITY_ERROR (0x43)
582 if ((mc4Status & (0x40 << 24)) ||
583 (mc4Status & (0x42 << 24)) ||
584 (mc4Status & (0x43 << 24)))
585 {
586 cpuIERRLog(cpu, "CPU/VR Mismatch");
587 continue;
588 }
589
590 // Next check if it's a Core FIVR fault by looking for a
591 // non-zero value of CORE_FIVR_ERR_LOG (B(31) D30 F2 offsets
592 // C0h and C4h) (Note: Bus 31 is accessed on PECI as bus 14)
593 uint32_t coreFIVRErrLog0 = 0;
594 uint32_t coreFIVRErrLog1 = 0;
595 if (peci_RdEndPointConfigPciLocal(
596 addr, 0, 14, 30, 2, 0xC0, sizeof(uint32_t),
597 (uint8_t*)&coreFIVRErrLog0, &cc) != PECI_CC_SUCCESS)
598 {
599 continue;
600 }
601 if (peci_RdEndPointConfigPciLocal(
602 addr, 0, 14, 30, 2, 0xC4, sizeof(uint32_t),
603 (uint8_t*)&coreFIVRErrLog1, &cc) != PECI_CC_SUCCESS)
604 {
605 continue;
606 }
607 if (coreFIVRErrLog0 || coreFIVRErrLog1)
608 {
609 cpuIERRLog(cpu, "Core FIVR Fault");
610 continue;
611 }
612
613 // Next check if it's an Uncore FIVR fault by looking for a
614 // non-zero value of UNCORE_FIVR_ERR_LOG (B(31) D30 F2
615 // offset 84h) (Note: Bus 31 is accessed on PECI as bus 14)
616 uint32_t uncoreFIVRErrLog = 0;
617 if (peci_RdEndPointConfigPciLocal(
618 addr, 0, 14, 30, 2, 0x84, sizeof(uint32_t),
619 (uint8_t*)&uncoreFIVRErrLog,
620 &cc) != PECI_CC_SUCCESS)
621 {
622 continue;
623 }
624 if (uncoreFIVRErrLog)
625 {
626 cpuIERRLog(cpu, "Uncore FIVR Fault");
627 continue;
628 }
629
630 // TODO: Update MSEC/MSCOD_31_24 check
631 // Last if CORE_FIVR_ERR_LOG and UNCORE_FIVR_ERR_LOG are
632 // both zero, but MSEC bits 31:24 have either
633 // MCA_FIVR_CATAS_OVERVOL_FAULT (0x51) or
634 // MCA_FIVR_CATAS_OVERCUR_FAULT (0x52), then log it as an
635 // uncore FIVR fault
636 if (!coreFIVRErrLog0 && !coreFIVRErrLog1 &&
637 !uncoreFIVRErrLog &&
638 ((mc4Status & (0x51 << 24)) ||
639 (mc4Status & (0x52 << 24))))
640 {
641 cpuIERRLog(cpu, "Uncore FIVR Fault");
642 continue;
643 }
644 cpuIERRLog(cpu);
645 }
646 break;
647 }
648 }
649 }
650 return cpuIERRFound;
651}
652
Jason M. Billsa15c2522019-08-16 10:01:44 -0700653static void caterrAssertHandler()
654{
Jason M. Billsa15c2522019-08-16 10:01:44 -0700655 caterrAssertTimer.expires_after(std::chrono::milliseconds(caterrTimeoutMs));
656 caterrAssertTimer.async_wait([](const boost::system::error_code ec) {
657 if (ec)
658 {
659 // operation_aborted is expected if timer is canceled
660 // before completion.
661 if (ec != boost::asio::error::operation_aborted)
662 {
663 std::cerr << "caterr timeout async_wait failed: "
664 << ec.message() << "\n";
665 }
Jason M. Billsa15c2522019-08-16 10:01:44 -0700666 return;
667 }
Jason M. Billsa3397932019-08-06 11:07:21 -0700668 std::cerr << "CATERR asserted for " << std::to_string(caterrTimeoutMs)
669 << " ms\n";
670 if (!checkIERRCPUs())
671 {
672 cpuIERRLog();
673 }
Jason M. Billsa15c2522019-08-16 10:01:44 -0700674 conn->async_method_call(
675 [](boost::system::error_code ec,
676 const std::variant<bool>& property) {
677 if (ec)
678 {
679 return;
680 }
681 const bool* reset = std::get_if<bool>(&property);
682 if (reset == nullptr)
683 {
684 std::cerr << "Unable to read reset on CATERR value\n";
685 return;
686 }
Jason M. Billsb61766b2019-11-26 17:02:44 -0800687 startCrashdumpAndRecovery(*reset, "IERR");
Jason M. Billsa15c2522019-08-16 10:01:44 -0700688 },
689 "xyz.openbmc_project.Settings",
690 "/xyz/openbmc_project/control/processor_error_config",
691 "org.freedesktop.DBus.Properties", "Get",
692 "xyz.openbmc_project.Control.Processor.ErrConfig", "ResetOnCATERR");
693 });
694}
695
Jason M. Bills1490b142019-07-01 15:48:43 -0700696static void caterrHandler()
697{
698 if (!hostOff)
699 {
700 gpiod::line_event gpioLineEvent = caterrLine.event_read();
701
702 bool caterr =
703 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
704 if (caterr)
705 {
Jason M. Billsa15c2522019-08-16 10:01:44 -0700706 caterrAssertHandler();
Jason M. Bills1490b142019-07-01 15:48:43 -0700707 }
708 else
709 {
710 caterrAssertTimer.cancel();
711 }
712 }
713 caterrEvent.async_wait(boost::asio::posix::stream_descriptor::wait_read,
714 [](const boost::system::error_code ec) {
715 if (ec)
716 {
717 std::cerr << "caterr handler error: "
718 << ec.message() << "\n";
719 return;
720 }
721 caterrHandler();
722 });
723}
Jason M. Bills78c5eed2019-08-28 14:00:40 -0700724
Jason M. Billse94f5e12019-09-13 11:11:34 -0700725static void cpu1ThermtripAssertHandler()
726{
Jason M. Bills45e87e02019-09-09 14:45:38 -0700727 if (cpu1FIVRFaultLine.get_value() == 0)
728 {
729 cpuBootFIVRFaultLog(1);
730 }
731 else
732 {
733 cpuThermTripLog(1);
734 }
Jason M. Billse94f5e12019-09-13 11:11:34 -0700735}
736
Jason M. Bills78c5eed2019-08-28 14:00:40 -0700737static void cpu1ThermtripHandler()
738{
739 if (!hostOff)
740 {
741 gpiod::line_event gpioLineEvent = cpu1ThermtripLine.event_read();
742
743 bool cpu1Thermtrip =
744 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
745 if (cpu1Thermtrip)
746 {
Jason M. Billse94f5e12019-09-13 11:11:34 -0700747 cpu1ThermtripAssertHandler();
Jason M. Bills78c5eed2019-08-28 14:00:40 -0700748 }
749 }
750 cpu1ThermtripEvent.async_wait(
751 boost::asio::posix::stream_descriptor::wait_read,
752 [](const boost::system::error_code ec) {
753 if (ec)
754 {
755 std::cerr << "CPU 1 Thermtrip handler error: " << ec.message()
756 << "\n";
757 return;
758 }
759 cpu1ThermtripHandler();
760 });
761}
762
jayaprakash Mutyala009adbc2019-12-24 22:08:07 +0000763static void cpu1MemtripHandler()
764{
765 if (!hostOff)
766 {
767 gpiod::line_event gpioLineEvent = cpu1MemtripLine.event_read();
768
769 bool cpu1Memtrip =
770 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
771 if (cpu1Memtrip)
772 {
773 memThermTripLog(1);
774 }
775 }
776 cpu1MemtripEvent.async_wait(
777 boost::asio::posix::stream_descriptor::wait_read,
778 [](const boost::system::error_code ec) {
779 if (ec)
780 {
781 std::cerr << "CPU 1 Memory Thermaltrip handler error: "
782 << ec.message() << "\n";
783 return;
784 }
785 cpu1MemtripHandler();
786 });
787}
788
Jason M. Billse94f5e12019-09-13 11:11:34 -0700789static void cpu2ThermtripAssertHandler()
790{
Jason M. Bills45e87e02019-09-09 14:45:38 -0700791 if (cpu2FIVRFaultLine.get_value() == 0)
792 {
793 cpuBootFIVRFaultLog(2);
794 }
795 else
796 {
797 cpuThermTripLog(2);
798 }
Jason M. Billse94f5e12019-09-13 11:11:34 -0700799}
800
Jason M. Bills78c5eed2019-08-28 14:00:40 -0700801static void cpu2ThermtripHandler()
802{
803 if (!hostOff)
804 {
805 gpiod::line_event gpioLineEvent = cpu2ThermtripLine.event_read();
806
807 bool cpu2Thermtrip =
808 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
809 if (cpu2Thermtrip)
810 {
Jason M. Billse94f5e12019-09-13 11:11:34 -0700811 cpu2ThermtripAssertHandler();
Jason M. Bills78c5eed2019-08-28 14:00:40 -0700812 }
813 }
814 cpu2ThermtripEvent.async_wait(
815 boost::asio::posix::stream_descriptor::wait_read,
816 [](const boost::system::error_code ec) {
817 if (ec)
818 {
819 std::cerr << "CPU 2 Thermtrip handler error: " << ec.message()
820 << "\n";
821 return;
822 }
823 cpu2ThermtripHandler();
824 });
825}
826
jayaprakash Mutyala009adbc2019-12-24 22:08:07 +0000827static void cpu2MemtripHandler()
828{
829 if (!hostOff)
830 {
831 gpiod::line_event gpioLineEvent = cpu2MemtripLine.event_read();
832
833 bool cpu2Memtrip =
834 gpioLineEvent.event_type == gpiod::line_event::RISING_EDGE;
835 if (cpu2Memtrip)
836 {
837 memThermTripLog(2);
838 }
839 }
840 cpu2MemtripEvent.async_wait(
841 boost::asio::posix::stream_descriptor::wait_read,
842 [](const boost::system::error_code ec) {
843 if (ec)
844 {
845 std::cerr << "CPU 2 Memory Thermaltrip handler error: "
846 << ec.message() << "\n";
847 return;
848 }
849 cpu2MemtripHandler();
850 });
851}
852
Jason M. Billse94f5e12019-09-13 11:11:34 -0700853static void cpu1VRHotAssertHandler()
854{
855 cpuVRHotLog("CPU 1");
856}
857
Jason M. Bills250fa632019-08-28 15:58:25 -0700858static void cpu1VRHotHandler()
859{
860 if (!hostOff)
861 {
862 gpiod::line_event gpioLineEvent = cpu1VRHotLine.event_read();
863
864 bool cpu1VRHot =
865 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
866 if (cpu1VRHot)
867 {
Jason M. Billse94f5e12019-09-13 11:11:34 -0700868 cpu1VRHotAssertHandler();
Jason M. Bills250fa632019-08-28 15:58:25 -0700869 }
870 }
871 cpu1VRHotEvent.async_wait(boost::asio::posix::stream_descriptor::wait_read,
872 [](const boost::system::error_code ec) {
873 if (ec)
874 {
875 std::cerr << "CPU 1 VRHot handler error: "
876 << ec.message() << "\n";
877 return;
878 }
879 cpu1VRHotHandler();
880 });
881}
882
Jason M. Billse94f5e12019-09-13 11:11:34 -0700883static void cpu1MemABCDVRHotAssertHandler()
884{
885 cpuVRHotLog("CPU 1 Memory ABCD");
886}
887
Jason M. Bills9647ba72019-08-29 14:19:19 -0700888static void cpu1MemABCDVRHotHandler()
889{
890 if (!hostOff)
891 {
892 gpiod::line_event gpioLineEvent = cpu1MemABCDVRHotLine.event_read();
893
894 bool cpu1MemABCDVRHot =
895 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
896 if (cpu1MemABCDVRHot)
897 {
Jason M. Billse94f5e12019-09-13 11:11:34 -0700898 cpu1MemABCDVRHotAssertHandler();
Jason M. Bills9647ba72019-08-29 14:19:19 -0700899 }
900 }
901 cpu1MemABCDVRHotEvent.async_wait(
902 boost::asio::posix::stream_descriptor::wait_read,
903 [](const boost::system::error_code ec) {
904 if (ec)
905 {
906 std::cerr << "CPU 1 Memory ABCD VRHot handler error: "
907 << ec.message() << "\n";
908 return;
909 }
910 cpu1MemABCDVRHotHandler();
911 });
912}
913
Jason M. Billse94f5e12019-09-13 11:11:34 -0700914static void cpu1MemEFGHVRHotAssertHandler()
915{
916 cpuVRHotLog("CPU 1 Memory EFGH");
917}
918
Jason M. Bills9647ba72019-08-29 14:19:19 -0700919static void cpu1MemEFGHVRHotHandler()
920{
921 if (!hostOff)
922 {
923 gpiod::line_event gpioLineEvent = cpu1MemEFGHVRHotLine.event_read();
924
925 bool cpu1MemEFGHVRHot =
926 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
927 if (cpu1MemEFGHVRHot)
928 {
Jason M. Billse94f5e12019-09-13 11:11:34 -0700929 cpu1MemEFGHVRHotAssertHandler();
Jason M. Bills9647ba72019-08-29 14:19:19 -0700930 }
931 }
932 cpu1MemEFGHVRHotEvent.async_wait(
933 boost::asio::posix::stream_descriptor::wait_read,
934 [](const boost::system::error_code ec) {
935 if (ec)
936 {
937 std::cerr << "CPU 1 Memory EFGH VRHot handler error: "
938 << ec.message() << "\n";
939 return;
940 }
941 cpu1MemEFGHVRHotHandler();
942 });
943}
944
Jason M. Billse94f5e12019-09-13 11:11:34 -0700945static void cpu2VRHotAssertHandler()
946{
947 cpuVRHotLog("CPU 2");
948}
949
Jason M. Bills250fa632019-08-28 15:58:25 -0700950static void cpu2VRHotHandler()
951{
952 if (!hostOff)
953 {
954 gpiod::line_event gpioLineEvent = cpu2VRHotLine.event_read();
955
956 bool cpu2VRHot =
957 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
958 if (cpu2VRHot)
959 {
Jason M. Billse94f5e12019-09-13 11:11:34 -0700960 cpu2VRHotAssertHandler();
Jason M. Bills250fa632019-08-28 15:58:25 -0700961 }
962 }
963 cpu2VRHotEvent.async_wait(boost::asio::posix::stream_descriptor::wait_read,
964 [](const boost::system::error_code ec) {
965 if (ec)
966 {
967 std::cerr << "CPU 2 VRHot handler error: "
968 << ec.message() << "\n";
969 return;
970 }
971 cpu2VRHotHandler();
972 });
973}
974
Jason M. Billse94f5e12019-09-13 11:11:34 -0700975static void cpu2MemABCDVRHotAssertHandler()
976{
977 cpuVRHotLog("CPU 2 Memory ABCD");
978}
979
Jason M. Bills9647ba72019-08-29 14:19:19 -0700980static void cpu2MemABCDVRHotHandler()
981{
982 if (!hostOff)
983 {
984 gpiod::line_event gpioLineEvent = cpu2MemABCDVRHotLine.event_read();
985
986 bool cpu2MemABCDVRHot =
987 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
988 if (cpu2MemABCDVRHot)
989 {
Jason M. Billse94f5e12019-09-13 11:11:34 -0700990 cpu2MemABCDVRHotAssertHandler();
Jason M. Bills9647ba72019-08-29 14:19:19 -0700991 }
992 }
993 cpu2MemABCDVRHotEvent.async_wait(
994 boost::asio::posix::stream_descriptor::wait_read,
995 [](const boost::system::error_code ec) {
996 if (ec)
997 {
998 std::cerr << "CPU 2 Memory ABCD VRHot handler error: "
999 << ec.message() << "\n";
1000 return;
1001 }
1002 cpu2MemABCDVRHotHandler();
1003 });
1004}
1005
Jason M. Billse94f5e12019-09-13 11:11:34 -07001006static void cpu2MemEFGHVRHotAssertHandler()
1007{
1008 cpuVRHotLog("CPU 2 Memory EFGH");
1009}
1010
Jason M. Bills9647ba72019-08-29 14:19:19 -07001011static void cpu2MemEFGHVRHotHandler()
1012{
1013 if (!hostOff)
1014 {
1015 gpiod::line_event gpioLineEvent = cpu2MemEFGHVRHotLine.event_read();
1016
1017 bool cpu2MemEFGHVRHot =
1018 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
1019 if (cpu2MemEFGHVRHot)
1020 {
Jason M. Billse94f5e12019-09-13 11:11:34 -07001021 cpu2MemEFGHVRHotAssertHandler();
Jason M. Bills9647ba72019-08-29 14:19:19 -07001022 }
1023 }
1024 cpu2MemEFGHVRHotEvent.async_wait(
1025 boost::asio::posix::stream_descriptor::wait_read,
1026 [](const boost::system::error_code ec) {
1027 if (ec)
1028 {
1029 std::cerr << "CPU 2 Memory EFGH VRHot handler error: "
1030 << ec.message() << "\n";
1031 return;
1032 }
1033 cpu2MemEFGHVRHotHandler();
1034 });
1035}
1036
Chen Yugange6c0f1c2019-08-02 20:36:42 +08001037static void pchThermtripHandler()
1038{
1039 if (!hostOff)
1040 {
1041 gpiod::line_event gpioLineEvent = pchThermtripLine.event_read();
1042
1043 bool pchThermtrip =
1044 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
1045 if (pchThermtrip)
1046 {
Jason M. Bills08866542019-08-16 12:04:19 -07001047 ssbThermTripLog();
Chen Yugange6c0f1c2019-08-02 20:36:42 +08001048 }
1049 }
1050 pchThermtripEvent.async_wait(
1051 boost::asio::posix::stream_descriptor::wait_read,
1052 [](const boost::system::error_code ec) {
1053 if (ec)
1054 {
1055 std::cerr << "PCH Thermal trip handler error: " << ec.message()
1056 << "\n";
1057 return;
1058 }
1059 pchThermtripHandler();
1060 });
1061}
1062
Jason M. Billscbf78532019-08-16 15:32:11 -07001063static std::bitset<MAX_CPUS> checkERRPinCPUs(const int errPin)
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001064{
Jason M. Billscbf78532019-08-16 15:32:11 -07001065 int errPinSts = (1 << errPin);
1066 std::bitset<MAX_CPUS> errPinCPUs = 0;
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001067 for (int cpu = 0, addr = MIN_CLIENT_ADDR; addr <= MAX_CLIENT_ADDR;
1068 cpu++, addr++)
1069 {
1070 if (peci_Ping(addr) == PECI_CC_SUCCESS)
1071 {
1072 uint8_t cc = 0;
1073 CPUModel model{};
1074 uint8_t stepping = 0;
1075 if (peci_GetCPUID(addr, &model, &stepping, &cc) != PECI_CC_SUCCESS)
1076 {
1077 std::cerr << "Cannot get CPUID!\n";
1078 continue;
1079 }
1080
1081 switch (model)
1082 {
1083 case skx:
1084 {
1085 // Check the ERRPINSTS to see if this is the CPU that caused
Jason M. Billscbf78532019-08-16 15:32:11 -07001086 // the ERRx (B(0) D8 F0 offset 210h)
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001087 uint32_t errpinsts = 0;
1088 if (peci_RdPCIConfigLocal(
1089 addr, 0, 8, 0, 0x210, sizeof(uint32_t),
1090 (uint8_t*)&errpinsts, &cc) == PECI_CC_SUCCESS)
1091 {
Jason M. Billscbf78532019-08-16 15:32:11 -07001092 errPinCPUs[cpu] = (errpinsts & errPinSts) != 0;
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001093 }
1094 break;
1095 }
1096 case icx:
1097 {
1098 // Check the ERRPINSTS to see if this is the CPU that caused
Jason M. Billscbf78532019-08-16 15:32:11 -07001099 // the ERRx (B(30) D0 F3 offset 274h) (Note: Bus 30 is
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001100 // accessed on PECI as bus 13)
1101 uint32_t errpinsts = 0;
1102 if (peci_RdEndPointConfigPciLocal(
1103 addr, 0, 13, 0, 3, 0x274, sizeof(uint32_t),
1104 (uint8_t*)&errpinsts, &cc) == PECI_CC_SUCCESS)
1105 {
Jason M. Billscbf78532019-08-16 15:32:11 -07001106 errPinCPUs[cpu] = (errpinsts & errPinSts) != 0;
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001107 }
1108 break;
1109 }
1110 }
1111 }
1112 }
Jason M. Billscbf78532019-08-16 15:32:11 -07001113 return errPinCPUs;
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001114}
1115
Jason M. Billscbf78532019-08-16 15:32:11 -07001116static void errXAssertHandler(const int errPin,
1117 boost::asio::steady_timer& errXAssertTimer)
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001118{
Jason M. Billscbf78532019-08-16 15:32:11 -07001119 // ERRx status is not guaranteed through the timeout, so save which
1120 // CPUs have it asserted
1121 std::bitset<MAX_CPUS> errPinCPUs = checkERRPinCPUs(errPin);
1122 errXAssertTimer.expires_after(std::chrono::milliseconds(errTimeoutMs));
1123 errXAssertTimer.async_wait([errPin, errPinCPUs](
1124 const boost::system::error_code ec) {
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001125 if (ec)
1126 {
1127 // operation_aborted is expected if timer is canceled before
1128 // completion.
1129 if (ec != boost::asio::error::operation_aborted)
1130 {
1131 std::cerr << "err2 timeout async_wait failed: " << ec.message()
1132 << "\n";
1133 }
1134 return;
1135 }
Jason M. Billscbf78532019-08-16 15:32:11 -07001136 std::cerr << "ERR" << std::to_string(errPin) << " asserted for "
1137 << std::to_string(errTimeoutMs) << " ms\n";
1138 if (errPinCPUs.count())
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001139 {
Jason M. Billscbf78532019-08-16 15:32:11 -07001140 for (int i = 0; i < errPinCPUs.size(); i++)
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001141 {
Jason M. Billscbf78532019-08-16 15:32:11 -07001142 if (errPinCPUs[i])
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001143 {
Jason M. Billscbf78532019-08-16 15:32:11 -07001144 cpuERRXLog(errPin, i);
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001145 }
1146 }
1147 }
1148 else
1149 {
Jason M. Billscbf78532019-08-16 15:32:11 -07001150 cpuERRXLog(errPin);
1151 }
1152 });
1153}
1154
Jason M. Bills8c584392019-08-19 11:05:51 -07001155static void err0AssertHandler()
1156{
1157 // Handle the standard ERR0 detection and logging
1158 const static constexpr int err0 = 0;
1159 errXAssertHandler(err0, err0AssertTimer);
1160}
1161
1162static void err0Handler()
1163{
1164 if (!hostOff)
1165 {
1166 gpiod::line_event gpioLineEvent = err0Line.event_read();
1167
1168 bool err0 = gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
1169 if (err0)
1170 {
1171 err0AssertHandler();
1172 }
1173 else
1174 {
1175 err0AssertTimer.cancel();
1176 }
1177 }
1178 err0Event.async_wait(boost::asio::posix::stream_descriptor::wait_read,
1179 [](const boost::system::error_code ec) {
1180 if (ec)
1181 {
1182 std::cerr
1183 << "err0 handler error: " << ec.message()
1184 << "\n";
1185 return;
1186 }
1187 err0Handler();
1188 });
1189}
1190
Jason M. Bills75af3962019-08-19 11:07:17 -07001191static void err1AssertHandler()
1192{
1193 // Handle the standard ERR1 detection and logging
1194 const static constexpr int err1 = 1;
1195 errXAssertHandler(err1, err1AssertTimer);
1196}
1197
1198static void err1Handler()
1199{
1200 if (!hostOff)
1201 {
1202 gpiod::line_event gpioLineEvent = err1Line.event_read();
1203
1204 bool err1 = gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
1205 if (err1)
1206 {
1207 err1AssertHandler();
1208 }
1209 else
1210 {
1211 err1AssertTimer.cancel();
1212 }
1213 }
1214 err1Event.async_wait(boost::asio::posix::stream_descriptor::wait_read,
1215 [](const boost::system::error_code ec) {
1216 if (ec)
1217 {
1218 std::cerr
1219 << "err1 handler error: " << ec.message()
1220 << "\n";
1221 return;
1222 }
1223 err1Handler();
1224 });
1225}
1226
Jason M. Billscbf78532019-08-16 15:32:11 -07001227static void err2AssertHandler()
1228{
1229 // Handle the standard ERR2 detection and logging
1230 const static constexpr int err2 = 2;
1231 errXAssertHandler(err2, err2AssertTimer);
1232 // Also handle reset for ERR2
1233 err2AssertTimer.async_wait([](const boost::system::error_code ec) {
1234 if (ec)
1235 {
1236 // operation_aborted is expected if timer is canceled before
1237 // completion.
1238 if (ec != boost::asio::error::operation_aborted)
1239 {
1240 std::cerr << "err2 timeout async_wait failed: " << ec.message()
1241 << "\n";
1242 }
1243 return;
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001244 }
1245 conn->async_method_call(
1246 [](boost::system::error_code ec,
1247 const std::variant<bool>& property) {
1248 if (ec)
1249 {
1250 return;
1251 }
1252 const bool* reset = std::get_if<bool>(&property);
1253 if (reset == nullptr)
1254 {
1255 std::cerr << "Unable to read reset on ERR2 value\n";
1256 return;
1257 }
Jason M. Billsb61766b2019-11-26 17:02:44 -08001258 startCrashdumpAndRecovery(*reset, "ERR2 Timeout");
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001259 },
1260 "xyz.openbmc_project.Settings",
1261 "/xyz/openbmc_project/control/processor_error_config",
1262 "org.freedesktop.DBus.Properties", "Get",
1263 "xyz.openbmc_project.Control.Processor.ErrConfig", "ResetOnERR2");
1264 });
1265}
1266
1267static void err2Handler()
1268{
1269 if (!hostOff)
1270 {
1271 gpiod::line_event gpioLineEvent = err2Line.event_read();
1272
1273 bool err2 = gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
1274 if (err2)
1275 {
1276 err2AssertHandler();
1277 }
1278 else
1279 {
1280 err2AssertTimer.cancel();
1281 }
1282 }
1283 err2Event.async_wait(boost::asio::posix::stream_descriptor::wait_read,
1284 [](const boost::system::error_code ec) {
1285 if (ec)
1286 {
1287 std::cerr
1288 << "err2 handler error: " << ec.message()
1289 << "\n";
1290 return;
1291 }
1292 err2Handler();
1293 });
1294}
1295
Jason M. Bills89922f82019-08-06 11:10:02 -07001296static void smiAssertHandler()
1297{
1298 smiAssertTimer.expires_after(std::chrono::milliseconds(smiTimeoutMs));
1299 smiAssertTimer.async_wait([](const boost::system::error_code ec) {
1300 if (ec)
1301 {
1302 // operation_aborted is expected if timer is canceled before
1303 // completion.
1304 if (ec != boost::asio::error::operation_aborted)
1305 {
1306 std::cerr << "smi timeout async_wait failed: " << ec.message()
1307 << "\n";
1308 }
1309 return;
1310 }
1311 std::cerr << "SMI asserted for " << std::to_string(smiTimeoutMs)
1312 << " ms\n";
1313 smiTimeoutLog();
1314 conn->async_method_call(
1315 [](boost::system::error_code ec,
1316 const std::variant<bool>& property) {
1317 if (ec)
1318 {
1319 return;
1320 }
1321 const bool* reset = std::get_if<bool>(&property);
1322 if (reset == nullptr)
1323 {
1324 std::cerr << "Unable to read reset on SMI value\n";
1325 return;
1326 }
Jason M. Bills94785442020-01-07 15:22:09 -08001327#ifdef HOST_ERROR_CRASHDUMP_ON_SMI_TIMEOUT
Jason M. Billsb61766b2019-11-26 17:02:44 -08001328 startCrashdumpAndRecovery(*reset, "SMI Timeout");
Jason M. Bills94785442020-01-07 15:22:09 -08001329#else
1330 if (*reset)
1331 {
1332 std::cout << "Recovering the system\n";
1333 startPowerCycle();
1334 }
1335#endif
Jason M. Bills89922f82019-08-06 11:10:02 -07001336 },
1337 "xyz.openbmc_project.Settings",
1338 "/xyz/openbmc_project/control/bmc_reset_disables",
1339 "org.freedesktop.DBus.Properties", "Get",
1340 "xyz.openbmc_project.Control.ResetDisables", "ResetOnSMI");
1341 });
1342}
1343
1344static void smiHandler()
1345{
1346 if (!hostOff)
1347 {
1348 gpiod::line_event gpioLineEvent = smiLine.event_read();
1349
1350 bool smi = gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
1351 if (smi)
1352 {
1353 smiAssertHandler();
1354 }
1355 else
1356 {
1357 smiAssertTimer.cancel();
1358 }
1359 }
1360 smiEvent.async_wait(boost::asio::posix::stream_descriptor::wait_read,
1361 [](const boost::system::error_code ec) {
1362 if (ec)
1363 {
1364 std::cerr
1365 << "smi handler error: " << ec.message()
1366 << "\n";
1367 return;
1368 }
1369 smiHandler();
1370 });
1371}
1372
Jason M. Billsa15c2522019-08-16 10:01:44 -07001373static void initializeErrorState()
1374{
1375 // Handle CPU_CATERR if it's asserted now
1376 if (caterrLine.get_value() == 0)
1377 {
1378 caterrAssertHandler();
1379 }
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001380
Jason M. Bills8c584392019-08-19 11:05:51 -07001381 // Handle CPU_ERR0 if it's asserted now
1382 if (err0Line.get_value() == 0)
1383 {
1384 err0AssertHandler();
1385 }
1386
Jason M. Bills75af3962019-08-19 11:07:17 -07001387 // Handle CPU_ERR1 if it's asserted now
1388 if (err1Line.get_value() == 0)
1389 {
1390 err1AssertHandler();
1391 }
1392
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001393 // Handle CPU_ERR2 if it's asserted now
1394 if (err2Line.get_value() == 0)
1395 {
1396 err2AssertHandler();
1397 }
Jason M. Bills89922f82019-08-06 11:10:02 -07001398
1399 // Handle SMI if it's asserted now
1400 if (smiLine.get_value() == 0)
1401 {
1402 smiAssertHandler();
1403 }
Jason M. Bills08866542019-08-16 12:04:19 -07001404
Jason M. Billse94f5e12019-09-13 11:11:34 -07001405 // Handle CPU1_THERMTRIP if it's asserted now
1406 if (cpu1ThermtripLine.get_value() == 0)
1407 {
1408 cpu1ThermtripAssertHandler();
1409 }
1410
1411 // Handle CPU2_THERMTRIP if it's asserted now
1412 if (cpu2ThermtripLine.get_value() == 0)
1413 {
1414 cpu2ThermtripAssertHandler();
1415 }
1416
jayaprakash Mutyala009adbc2019-12-24 22:08:07 +00001417 // Handle CPU1_MEM_THERM_EVENT (CPU1 DIMM Thermal trip) if it's asserted now
1418 if (cpu1MemtripLine.get_value() == 0)
1419 {
1420 memThermTripLog(1);
1421 }
1422
1423 // Handle CPU2_MEM_THERM_EVENT (CPU2 DIMM Thermal trip) if it's asserted now
1424 if (cpu2MemtripLine.get_value() == 0)
1425 {
1426 memThermTripLog(2);
1427 }
1428
Jason M. Billse94f5e12019-09-13 11:11:34 -07001429 // Handle CPU1_VRHOT if it's asserted now
1430 if (cpu1VRHotLine.get_value() == 0)
1431 {
1432 cpu1VRHotAssertHandler();
1433 }
1434
1435 // Handle CPU1_MEM_ABCD_VRHOT if it's asserted now
1436 if (cpu1MemABCDVRHotLine.get_value() == 0)
1437 {
1438 cpu1MemABCDVRHotAssertHandler();
1439 }
1440
1441 // Handle CPU1_MEM_EFGH_VRHOT if it's asserted now
1442 if (cpu1MemEFGHVRHotLine.get_value() == 0)
1443 {
1444 cpu1MemEFGHVRHotAssertHandler();
1445 }
1446
1447 // Handle CPU2_VRHOT if it's asserted now
1448 if (cpu2VRHotLine.get_value() == 0)
1449 {
1450 cpu2VRHotAssertHandler();
1451 }
1452
1453 // Handle CPU2_MEM_ABCD_VRHOT if it's asserted now
1454 if (cpu2MemABCDVRHotLine.get_value() == 0)
1455 {
1456 cpu2MemABCDVRHotAssertHandler();
1457 }
1458
1459 // Handle CPU2_MEM_EFGH_VRHOT if it's asserted now
1460 if (cpu2MemEFGHVRHotLine.get_value() == 0)
1461 {
1462 cpu2MemEFGHVRHotAssertHandler();
1463 }
1464
Jason M. Bills08866542019-08-16 12:04:19 -07001465 // Handle PCH_BMC_THERMTRIP if it's asserted now
1466 if (pchThermtripLine.get_value() == 0)
1467 {
1468 ssbThermTripLog();
1469 }
Jason M. Billsa15c2522019-08-16 10:01:44 -07001470}
Jason M. Bills1490b142019-07-01 15:48:43 -07001471} // namespace host_error_monitor
1472
1473int main(int argc, char* argv[])
1474{
1475 // setup connection to dbus
1476 host_error_monitor::conn =
1477 std::make_shared<sdbusplus::asio::connection>(host_error_monitor::io);
1478
Jason M. Billsc4b91f22019-11-26 17:04:50 -08001479 // Host Error Monitor Service
Jason M. Bills1490b142019-07-01 15:48:43 -07001480 host_error_monitor::conn->request_name(
1481 "xyz.openbmc_project.HostErrorMonitor");
1482 sdbusplus::asio::object_server server =
1483 sdbusplus::asio::object_server(host_error_monitor::conn);
1484
Jason M. Billsc4b91f22019-11-26 17:04:50 -08001485 // Restart Cause Interface
1486 host_error_monitor::hostErrorTimeoutIface =
1487 server.add_interface("/xyz/openbmc_project/host_error_monitor",
1488 "xyz.openbmc_project.HostErrorMonitor.Timeout");
1489
1490 host_error_monitor::hostErrorTimeoutIface->register_property(
1491 "IERRTimeoutMs", host_error_monitor::caterrTimeoutMs,
1492 [](const std::size_t& requested, std::size_t& resp) {
1493 if (requested > host_error_monitor::caterrTimeoutMsMax)
1494 {
1495 std::cerr << "IERRTimeoutMs update to " << requested
1496 << "ms rejected. Cannot be greater than "
1497 << host_error_monitor::caterrTimeoutMsMax << "ms.\n";
1498 return 0;
1499 }
1500 std::cerr << "IERRTimeoutMs updated to " << requested << "ms\n";
1501 host_error_monitor::caterrTimeoutMs = requested;
1502 resp = requested;
1503 return 1;
1504 },
1505 [](std::size_t& resp) { return host_error_monitor::caterrTimeoutMs; });
1506 host_error_monitor::hostErrorTimeoutIface->initialize();
1507
Jason M. Bills1490b142019-07-01 15:48:43 -07001508 // Start tracking host state
1509 std::shared_ptr<sdbusplus::bus::match::match> hostStateMonitor =
1510 host_error_monitor::startHostStateMonitor();
1511
1512 // Initialize the host state
1513 host_error_monitor::initializeHostState();
1514
1515 // Request CPU_CATERR GPIO events
1516 if (!host_error_monitor::requestGPIOEvents(
1517 "CPU_CATERR", host_error_monitor::caterrHandler,
1518 host_error_monitor::caterrLine, host_error_monitor::caterrEvent))
1519 {
1520 return -1;
1521 }
1522
Jason M. Bills8c584392019-08-19 11:05:51 -07001523 // Request CPU_ERR0 GPIO events
1524 if (!host_error_monitor::requestGPIOEvents(
1525 "CPU_ERR0", host_error_monitor::err0Handler,
1526 host_error_monitor::err0Line, host_error_monitor::err0Event))
1527 {
1528 return -1;
1529 }
1530
Jason M. Bills75af3962019-08-19 11:07:17 -07001531 // Request CPU_ERR1 GPIO events
1532 if (!host_error_monitor::requestGPIOEvents(
1533 "CPU_ERR1", host_error_monitor::err1Handler,
1534 host_error_monitor::err1Line, host_error_monitor::err1Event))
1535 {
1536 return -1;
1537 }
1538
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001539 // Request CPU_ERR2 GPIO events
1540 if (!host_error_monitor::requestGPIOEvents(
1541 "CPU_ERR2", host_error_monitor::err2Handler,
1542 host_error_monitor::err2Line, host_error_monitor::err2Event))
1543 {
1544 return -1;
1545 }
1546
Jason M. Bills89922f82019-08-06 11:10:02 -07001547 // Request SMI GPIO events
1548 if (!host_error_monitor::requestGPIOEvents(
1549 "SMI", host_error_monitor::smiHandler, host_error_monitor::smiLine,
1550 host_error_monitor::smiEvent))
1551 {
1552 return -1;
1553 }
1554
Jason M. Bills45e87e02019-09-09 14:45:38 -07001555 // Request CPU1_FIVR_FAULT GPIO input
1556 if (!host_error_monitor::requestGPIOInput(
1557 "CPU1_FIVR_FAULT", host_error_monitor::cpu1FIVRFaultLine))
1558 {
1559 return -1;
1560 }
1561
Jason M. Bills78c5eed2019-08-28 14:00:40 -07001562 // Request CPU1_THERMTRIP GPIO events
1563 if (!host_error_monitor::requestGPIOEvents(
1564 "CPU1_THERMTRIP", host_error_monitor::cpu1ThermtripHandler,
1565 host_error_monitor::cpu1ThermtripLine,
1566 host_error_monitor::cpu1ThermtripEvent))
1567 {
1568 return -1;
1569 }
1570
Jason M. Bills45e87e02019-09-09 14:45:38 -07001571 // Request CPU2_FIVR_FAULT GPIO input
1572 if (!host_error_monitor::requestGPIOInput(
1573 "CPU2_FIVR_FAULT", host_error_monitor::cpu2FIVRFaultLine))
1574 {
1575 return -1;
1576 }
1577
Jason M. Bills78c5eed2019-08-28 14:00:40 -07001578 // Request CPU2_THERMTRIP GPIO events
1579 if (!host_error_monitor::requestGPIOEvents(
1580 "CPU2_THERMTRIP", host_error_monitor::cpu2ThermtripHandler,
1581 host_error_monitor::cpu2ThermtripLine,
1582 host_error_monitor::cpu2ThermtripEvent))
1583 {
1584 return -1;
1585 }
1586
Jason M. Bills250fa632019-08-28 15:58:25 -07001587 // Request CPU1_VRHOT GPIO events
1588 if (!host_error_monitor::requestGPIOEvents(
1589 "CPU1_VRHOT", host_error_monitor::cpu1VRHotHandler,
1590 host_error_monitor::cpu1VRHotLine,
1591 host_error_monitor::cpu1VRHotEvent))
1592 {
1593 return -1;
1594 }
1595
Jason M. Bills9647ba72019-08-29 14:19:19 -07001596 // Request CPU1_MEM_ABCD_VRHOT GPIO events
1597 if (!host_error_monitor::requestGPIOEvents(
1598 "CPU1_MEM_ABCD_VRHOT", host_error_monitor::cpu1MemABCDVRHotHandler,
1599 host_error_monitor::cpu1MemABCDVRHotLine,
1600 host_error_monitor::cpu1MemABCDVRHotEvent))
1601 {
1602 return -1;
1603 }
1604
1605 // Request CPU1_MEM_EFGH_VRHOT GPIO events
1606 if (!host_error_monitor::requestGPIOEvents(
1607 "CPU1_MEM_EFGH_VRHOT", host_error_monitor::cpu1MemEFGHVRHotHandler,
1608 host_error_monitor::cpu1MemEFGHVRHotLine,
1609 host_error_monitor::cpu1MemEFGHVRHotEvent))
1610 {
1611 return -1;
1612 }
1613
Jason M. Bills250fa632019-08-28 15:58:25 -07001614 // Request CPU2_VRHOT GPIO events
1615 if (!host_error_monitor::requestGPIOEvents(
1616 "CPU2_VRHOT", host_error_monitor::cpu2VRHotHandler,
1617 host_error_monitor::cpu2VRHotLine,
1618 host_error_monitor::cpu2VRHotEvent))
1619 {
1620 return -1;
1621 }
1622
Jason M. Bills9647ba72019-08-29 14:19:19 -07001623 // Request CPU2_MEM_ABCD_VRHOT GPIO events
1624 if (!host_error_monitor::requestGPIOEvents(
1625 "CPU2_MEM_ABCD_VRHOT", host_error_monitor::cpu2MemABCDVRHotHandler,
1626 host_error_monitor::cpu2MemABCDVRHotLine,
1627 host_error_monitor::cpu2MemABCDVRHotEvent))
1628 {
1629 return -1;
1630 }
1631
1632 // Request CPU2_MEM_EFGH_VRHOT GPIO events
1633 if (!host_error_monitor::requestGPIOEvents(
1634 "CPU2_MEM_EFGH_VRHOT", host_error_monitor::cpu2MemEFGHVRHotHandler,
1635 host_error_monitor::cpu2MemEFGHVRHotLine,
1636 host_error_monitor::cpu2MemEFGHVRHotEvent))
1637 {
1638 return -1;
1639 }
1640
Chen Yugange6c0f1c2019-08-02 20:36:42 +08001641 // Request PCH_BMC_THERMTRIP GPIO events
1642 if (!host_error_monitor::requestGPIOEvents(
1643 "PCH_BMC_THERMTRIP", host_error_monitor::pchThermtripHandler,
1644 host_error_monitor::pchThermtripLine,
1645 host_error_monitor::pchThermtripEvent))
1646 {
1647 return -1;
1648 }
1649
jayaprakash Mutyala009adbc2019-12-24 22:08:07 +00001650 // Request CPU1_MEM_THERM_EVENT GPIO events
1651 if (!host_error_monitor::requestGPIOEvents(
1652 "CPU1_MEM_THERM_EVENT", host_error_monitor::cpu1MemtripHandler,
1653 host_error_monitor::cpu1MemtripLine,
1654 host_error_monitor::cpu1MemtripEvent))
1655 {
1656 return -1;
1657 }
1658
1659 // Request CPU2_MEM_THERM_EVENT GPIO events
1660 if (!host_error_monitor::requestGPIOEvents(
1661 "CPU2_MEM_THERM_EVENT", host_error_monitor::cpu2MemtripHandler,
1662 host_error_monitor::cpu2MemtripLine,
1663 host_error_monitor::cpu2MemtripEvent))
1664 {
1665 return -1;
1666 }
1667
Jason M. Bills1490b142019-07-01 15:48:43 -07001668 host_error_monitor::io.run();
1669
1670 return 0;
1671}