blob: 6ea0b9ce3560efa5981a4740bcf1488b209a0aeb [file] [log] [blame]
Jason M. Bills1490b142019-07-01 15:48:43 -07001/*
2// Copyright (c) 2019 Intel Corporation
3//
4// Licensed under the Apache License, Version 2.0 (the "License");
5// you may not use this file except in compliance with the License.
6// You may obtain a copy of the License at
7//
8// http://www.apache.org/licenses/LICENSE-2.0
9//
10// Unless required by applicable law or agreed to in writing, software
11// distributed under the License is distributed on an "AS IS" BASIS,
12// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13// See the License for the specific language governing permissions and
14// limitations under the License.
15*/
Jason M. Bills6a2cb692019-08-06 11:03:49 -070016#include <peci.h>
Chen Yugange6c0f1c2019-08-02 20:36:42 +080017#include <systemd/sd-journal.h>
18
Jason M. Bills6a2cb692019-08-06 11:03:49 -070019#include <bitset>
Jason M. Bills1490b142019-07-01 15:48:43 -070020#include <boost/asio/posix/stream_descriptor.hpp>
21#include <gpiod.hpp>
22#include <iostream>
23#include <sdbusplus/asio/object_server.hpp>
Jason M. Billsd1a19f62019-08-06 11:52:58 -070024#include <variant>
Jason M. Bills1490b142019-07-01 15:48:43 -070025
26namespace host_error_monitor
27{
28static boost::asio::io_service io;
29static std::shared_ptr<sdbusplus::asio::connection> conn;
30
31static bool hostOff = true;
32
33const static constexpr size_t caterrTimeoutMs = 2000;
Jason M. Billscbf78532019-08-16 15:32:11 -070034const static constexpr size_t errTimeoutMs = 90000;
Jason M. Bills89922f82019-08-06 11:10:02 -070035const static constexpr size_t smiTimeoutMs = 90000;
Jason M. Bills1490b142019-07-01 15:48:43 -070036const static constexpr size_t crashdumpTimeoutS = 300;
37
38// Timers
39// Timer for CATERR asserted
40static boost::asio::steady_timer caterrAssertTimer(io);
Jason M. Bills8c584392019-08-19 11:05:51 -070041// Timer for ERR0 asserted
42static boost::asio::steady_timer err0AssertTimer(io);
Jason M. Bills75af3962019-08-19 11:07:17 -070043// Timer for ERR1 asserted
44static boost::asio::steady_timer err1AssertTimer(io);
Jason M. Bills6a2cb692019-08-06 11:03:49 -070045// Timer for ERR2 asserted
46static boost::asio::steady_timer err2AssertTimer(io);
Jason M. Bills89922f82019-08-06 11:10:02 -070047// Timer for SMI asserted
48static boost::asio::steady_timer smiAssertTimer(io);
Jason M. Bills1490b142019-07-01 15:48:43 -070049
50// GPIO Lines and Event Descriptors
51static gpiod::line caterrLine;
52static boost::asio::posix::stream_descriptor caterrEvent(io);
Jason M. Bills8c584392019-08-19 11:05:51 -070053static gpiod::line err0Line;
54static boost::asio::posix::stream_descriptor err0Event(io);
Jason M. Bills75af3962019-08-19 11:07:17 -070055static gpiod::line err1Line;
56static boost::asio::posix::stream_descriptor err1Event(io);
Jason M. Bills6a2cb692019-08-06 11:03:49 -070057static gpiod::line err2Line;
58static boost::asio::posix::stream_descriptor err2Event(io);
Jason M. Bills89922f82019-08-06 11:10:02 -070059static gpiod::line smiLine;
60static boost::asio::posix::stream_descriptor smiEvent(io);
Jason M. Bills78c5eed2019-08-28 14:00:40 -070061static gpiod::line cpu1ThermtripLine;
62static boost::asio::posix::stream_descriptor cpu1ThermtripEvent(io);
63static gpiod::line cpu2ThermtripLine;
64static boost::asio::posix::stream_descriptor cpu2ThermtripEvent(io);
Jason M. Bills250fa632019-08-28 15:58:25 -070065static gpiod::line cpu1VRHotLine;
66static boost::asio::posix::stream_descriptor cpu1VRHotEvent(io);
67static gpiod::line cpu2VRHotLine;
Jason M. Bills9647ba72019-08-29 14:19:19 -070068static boost::asio::posix::stream_descriptor cpu1MemABCDVRHotEvent(io);
69static gpiod::line cpu1MemEFGHVRHotLine;
70static boost::asio::posix::stream_descriptor cpu1MemEFGHVRHotEvent(io);
71static gpiod::line cpu2MemABCDVRHotLine;
Jason M. Bills250fa632019-08-28 15:58:25 -070072static boost::asio::posix::stream_descriptor cpu2VRHotEvent(io);
Jason M. Bills9647ba72019-08-29 14:19:19 -070073static gpiod::line cpu1MemABCDVRHotLine;
74static boost::asio::posix::stream_descriptor cpu2MemABCDVRHotEvent(io);
75static gpiod::line cpu2MemEFGHVRHotLine;
76static boost::asio::posix::stream_descriptor cpu2MemEFGHVRHotEvent(io);
Chen Yugange6c0f1c2019-08-02 20:36:42 +080077//----------------------------------
78// PCH_BMC_THERMTRIP function related definition
79//----------------------------------
Chen Yugange6c0f1c2019-08-02 20:36:42 +080080static gpiod::line pchThermtripLine;
81static boost::asio::posix::stream_descriptor pchThermtripEvent(io);
Jason M. Bills1490b142019-07-01 15:48:43 -070082
Jason M. Billsa3397932019-08-06 11:07:21 -070083static void cpuIERRLog()
84{
85 sd_journal_send("MESSAGE=HostError: IERR", "PRIORITY=%i", LOG_INFO,
86 "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError",
87 "REDFISH_MESSAGE_ARGS=%s", "IERR", NULL);
88}
89
90static void cpuIERRLog(const int cpuNum)
91{
92 std::string msg = "IERR on CPU " + std::to_string(cpuNum + 1);
93
94 sd_journal_send("MESSAGE=HostError: %s", msg.c_str(), "PRIORITY=%i",
95 LOG_INFO, "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError",
96 "REDFISH_MESSAGE_ARGS=%s", msg.c_str(), NULL);
97}
98
99static void cpuIERRLog(const int cpuNum, const std::string& type)
100{
101 std::string msg = type + " IERR on CPU " + std::to_string(cpuNum + 1);
102
103 sd_journal_send("MESSAGE=HostError: %s", msg.c_str(), "PRIORITY=%i",
104 LOG_INFO, "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError",
105 "REDFISH_MESSAGE_ARGS=%s", msg.c_str(), NULL);
106}
107
Jason M. Billscbf78532019-08-16 15:32:11 -0700108static void cpuERRXLog(const int errPin)
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700109{
Jason M. Billscbf78532019-08-16 15:32:11 -0700110 std::string msg = "ERR" + std::to_string(errPin) + " Timeout";
111
112 sd_journal_send("MESSAGE=HostError: %s", msg.c_str(), "PRIORITY=%i",
113 LOG_INFO, "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError",
114 "REDFISH_MESSAGE_ARGS=%s", msg.c_str(), NULL);
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700115}
116
Jason M. Billscbf78532019-08-16 15:32:11 -0700117static void cpuERRXLog(const int errPin, const int cpuNum)
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700118{
Jason M. Billscbf78532019-08-16 15:32:11 -0700119 std::string msg = "ERR" + std::to_string(errPin) + " Timeout on CPU " +
120 std::to_string(cpuNum + 1);
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700121
122 sd_journal_send("MESSAGE=HostError: %s", msg.c_str(), "PRIORITY=%i",
123 LOG_INFO, "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError",
124 "REDFISH_MESSAGE_ARGS=%s", msg.c_str(), NULL);
125}
126
Jason M. Bills89922f82019-08-06 11:10:02 -0700127static void smiTimeoutLog()
128{
129 sd_journal_send("MESSAGE=HostError: SMI Timeout", "PRIORITY=%i", LOG_INFO,
130 "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError",
131 "REDFISH_MESSAGE_ARGS=%s", "SMI Timeout", NULL);
132}
133
Jason M. Bills78c5eed2019-08-28 14:00:40 -0700134static void cpuThermTripLog(const int cpuNum)
135{
136 std::string msg = "CPU " + std::to_string(cpuNum) + " thermal trip";
137
138 sd_journal_send("MESSAGE=HostError: %s", msg.c_str(), "PRIORITY=%i",
139 LOG_INFO, "REDFISH_MESSAGE_ID=%s",
140 "OpenBMC.0.1.CPUThermalTrip", "REDFISH_MESSAGE_ARGS=%d",
141 cpuNum, NULL);
142}
143
Jason M. Bills250fa632019-08-28 15:58:25 -0700144static void cpuVRHotLog(const std::string& vr)
145{
146 std::string msg = vr + " Voltage Regulator Overheated.";
147
148 sd_journal_send("MESSAGE=HostError: %s", msg.c_str(), "PRIORITY=%i",
149 LOG_INFO, "REDFISH_MESSAGE_ID=%s",
150 "OpenBMC.0.1.VoltageRegulatorOverheated",
151 "REDFISH_MESSAGE_ARGS=%s", vr.c_str(), NULL);
152}
153
Jason M. Bills08866542019-08-16 12:04:19 -0700154static void ssbThermTripLog()
155{
156 sd_journal_send("MESSAGE=HostError: SSB thermal trip", "PRIORITY=%i",
157 LOG_INFO, "REDFISH_MESSAGE_ID=%s",
158 "OpenBMC.0.1.SsbThermalTrip", NULL);
159}
160
Jason M. Billsa15c2522019-08-16 10:01:44 -0700161static void initializeErrorState();
Jason M. Bills1490b142019-07-01 15:48:43 -0700162static void initializeHostState()
163{
164 conn->async_method_call(
165 [](boost::system::error_code ec,
166 const std::variant<std::string>& property) {
167 if (ec)
168 {
169 return;
170 }
171 const std::string* state = std::get_if<std::string>(&property);
172 if (state == nullptr)
173 {
174 std::cerr << "Unable to read host state value\n";
175 return;
176 }
177 hostOff = *state == "xyz.openbmc_project.State.Host.HostState.Off";
Jason M. Billsa15c2522019-08-16 10:01:44 -0700178 // If the system is on, initialize the error state
179 if (!hostOff)
180 {
181 initializeErrorState();
182 }
Jason M. Bills1490b142019-07-01 15:48:43 -0700183 },
184 "xyz.openbmc_project.State.Host", "/xyz/openbmc_project/state/host0",
185 "org.freedesktop.DBus.Properties", "Get",
186 "xyz.openbmc_project.State.Host", "CurrentHostState");
187}
188
189static std::shared_ptr<sdbusplus::bus::match::match> startHostStateMonitor()
190{
191 return std::make_shared<sdbusplus::bus::match::match>(
192 *conn,
193 "type='signal',interface='org.freedesktop.DBus.Properties',"
194 "member='PropertiesChanged',arg0namespace='xyz.openbmc_project.State."
195 "Host'",
196 [](sdbusplus::message::message& msg) {
197 std::string interfaceName;
198 boost::container::flat_map<std::string, std::variant<std::string>>
199 propertiesChanged;
200 std::string state;
201 try
202 {
203 msg.read(interfaceName, propertiesChanged);
204 state =
205 std::get<std::string>(propertiesChanged.begin()->second);
206 }
207 catch (std::exception& e)
208 {
209 std::cerr << "Unable to read host state\n";
210 return;
211 }
212 hostOff = state == "xyz.openbmc_project.State.Host.HostState.Off";
213
214 // No host events should fire while off, so cancel any pending
215 // timers
216 if (hostOff)
217 {
218 caterrAssertTimer.cancel();
Jason M. Bills8c584392019-08-19 11:05:51 -0700219 err0AssertTimer.cancel();
Jason M. Bills75af3962019-08-19 11:07:17 -0700220 err1AssertTimer.cancel();
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700221 err2AssertTimer.cancel();
Jason M. Bills89922f82019-08-06 11:10:02 -0700222 smiAssertTimer.cancel();
Jason M. Bills1490b142019-07-01 15:48:43 -0700223 }
224 });
225}
226
227static bool requestGPIOEvents(
228 const std::string& name, const std::function<void()>& handler,
229 gpiod::line& gpioLine,
230 boost::asio::posix::stream_descriptor& gpioEventDescriptor)
231{
232 // Find the GPIO line
233 gpioLine = gpiod::find_line(name);
234 if (!gpioLine)
235 {
236 std::cerr << "Failed to find the " << name << " line\n";
237 return false;
238 }
239
240 try
241 {
242 gpioLine.request(
243 {"host-error-monitor", gpiod::line_request::EVENT_BOTH_EDGES});
244 }
245 catch (std::exception&)
246 {
247 std::cerr << "Failed to request events for " << name << "\n";
248 return false;
249 }
250
251 int gpioLineFd = gpioLine.event_get_fd();
252 if (gpioLineFd < 0)
253 {
254 std::cerr << "Failed to get " << name << " fd\n";
255 return false;
256 }
257
258 gpioEventDescriptor.assign(gpioLineFd);
259
260 gpioEventDescriptor.async_wait(
261 boost::asio::posix::stream_descriptor::wait_read,
262 [&name, handler](const boost::system::error_code ec) {
263 if (ec)
264 {
265 std::cerr << name << " fd handler error: " << ec.message()
266 << "\n";
267 return;
268 }
269 handler();
270 });
271 return true;
272}
273
274static void startPowerCycle()
275{
276 conn->async_method_call(
277 [](boost::system::error_code ec) {
278 if (ec)
279 {
280 std::cerr << "failed to set Chassis State\n";
281 }
282 },
283 "xyz.openbmc_project.State.Chassis",
284 "/xyz/openbmc_project/state/chassis0",
285 "org.freedesktop.DBus.Properties", "Set",
286 "xyz.openbmc_project.State.Chassis", "RequestedPowerTransition",
287 std::variant<std::string>{
288 "xyz.openbmc_project.State.Chassis.Transition.PowerCycle"});
289}
290
291static void startCrashdumpAndRecovery(bool recoverSystem)
292{
293 std::cout << "Starting crashdump\n";
294 static std::shared_ptr<sdbusplus::bus::match::match> crashdumpCompleteMatch;
295 static boost::asio::steady_timer crashdumpTimer(io);
296
297 crashdumpCompleteMatch = std::make_shared<sdbusplus::bus::match::match>(
298 *conn,
299 "type='signal',interface='org.freedesktop.DBus.Properties',"
300 "member='PropertiesChanged',arg0namespace='com.intel.crashdump'",
301 [recoverSystem](sdbusplus::message::message& msg) {
302 crashdumpTimer.cancel();
303 std::cout << "Crashdump completed\n";
304 if (recoverSystem)
305 {
306 std::cout << "Recovering the system\n";
307 startPowerCycle();
308 }
309 crashdumpCompleteMatch.reset();
310 });
311
312 crashdumpTimer.expires_after(std::chrono::seconds(crashdumpTimeoutS));
313 crashdumpTimer.async_wait([](const boost::system::error_code ec) {
314 if (ec)
315 {
316 // operation_aborted is expected if timer is canceled
317 if (ec != boost::asio::error::operation_aborted)
318 {
319 std::cerr << "Crashdump async_wait failed: " << ec.message()
320 << "\n";
321 }
322 std::cout << "Crashdump timer canceled\n";
323 return;
324 }
325 std::cerr << "Crashdump failed to complete before timeout\n";
326 crashdumpCompleteMatch.reset();
327 });
328
329 conn->async_method_call(
330 [](boost::system::error_code ec) {
331 if (ec)
332 {
333 std::cerr << "failed to start Crashdump\n";
334 crashdumpTimer.cancel();
335 crashdumpCompleteMatch.reset();
336 }
337 },
338 "com.intel.crashdump", "/com/intel/crashdump",
339 "com.intel.crashdump.Stored", "GenerateStoredLog");
340}
341
Jason M. Billsd1a19f62019-08-06 11:52:58 -0700342static void incrementCPUErrorCount(int cpuNum)
343{
344 std::string propertyName = "ErrorCountCPU" + std::to_string(cpuNum + 1);
345
346 // Get the current count
347 conn->async_method_call(
348 [propertyName](boost::system::error_code ec,
349 const std::variant<uint8_t>& property) {
350 if (ec)
351 {
352 std::cerr << "Failed to read " << propertyName << ": "
353 << ec.message() << "\n";
354 return;
355 }
356 const uint8_t* errorCountVariant = std::get_if<uint8_t>(&property);
357 if (errorCountVariant == nullptr)
358 {
359 std::cerr << propertyName << " invalid\n";
360 return;
361 }
362 uint8_t errorCount = *errorCountVariant;
363 if (errorCount == std::numeric_limits<uint8_t>::max())
364 {
365 std::cerr << "Maximum error count reached\n";
366 return;
367 }
368 // Increment the count
369 errorCount++;
370 conn->async_method_call(
371 [propertyName](boost::system::error_code ec) {
372 if (ec)
373 {
374 std::cerr << "Failed to set " << propertyName << ": "
375 << ec.message() << "\n";
376 }
377 },
378 "xyz.openbmc_project.Settings",
379 "/xyz/openbmc_project/control/processor_error_config",
380 "org.freedesktop.DBus.Properties", "Set",
381 "xyz.openbmc_project.Control.Processor.ErrConfig", propertyName,
382 std::variant<uint8_t>{errorCount});
383 },
384 "xyz.openbmc_project.Settings",
385 "/xyz/openbmc_project/control/processor_error_config",
386 "org.freedesktop.DBus.Properties", "Get",
387 "xyz.openbmc_project.Control.Processor.ErrConfig", propertyName);
388}
389
Jason M. Billsa3397932019-08-06 11:07:21 -0700390static bool checkIERRCPUs()
391{
392 bool cpuIERRFound = false;
393 for (int cpu = 0, addr = MIN_CLIENT_ADDR; addr <= MAX_CLIENT_ADDR;
394 cpu++, addr++)
395 {
396 uint8_t cc = 0;
397 CPUModel model{};
398 uint8_t stepping = 0;
399 if (peci_GetCPUID(addr, &model, &stepping, &cc) != PECI_CC_SUCCESS)
400 {
401 std::cerr << "Cannot get CPUID!\n";
402 continue;
403 }
404
405 switch (model)
406 {
407 case skx:
408 {
409 // First check the MCA_ERR_SRC_LOG to see if this is the CPU
410 // that caused the IERR
411 uint32_t mcaErrSrcLog = 0;
412 if (peci_RdPkgConfig(addr, 0, 5, 4, (uint8_t*)&mcaErrSrcLog,
413 &cc) != PECI_CC_SUCCESS)
414 {
415 continue;
416 }
417 // Check MSMI_INTERNAL (20) and IERR_INTERNAL (27)
418 if ((mcaErrSrcLog & (1 << 20)) || (mcaErrSrcLog & (1 << 27)))
419 {
420 // TODO: Light the CPU fault LED?
421 cpuIERRFound = true;
Jason M. Billsd1a19f62019-08-06 11:52:58 -0700422 incrementCPUErrorCount(cpu);
Jason M. Billsa3397932019-08-06 11:07:21 -0700423 // Next check if it's a CPU/VR mismatch by reading the
424 // IA32_MC4_STATUS MSR (0x411)
425 uint64_t mc4Status = 0;
426 if (peci_RdIAMSR(addr, 0, 0x411, &mc4Status, &cc) !=
427 PECI_CC_SUCCESS)
428 {
429 continue;
430 }
431 // Check MSEC bits 31:24 for
432 // MCA_SVID_VCCIN_VR_ICC_MAX_FAILURE (0x40),
433 // MCA_SVID_VCCIN_VR_VOUT_FAILURE (0x42), or
434 // MCA_SVID_CPU_VR_CAPABILITY_ERROR (0x43)
435 if ((mc4Status & (0x40 << 24)) ||
436 (mc4Status & (0x42 << 24)) ||
437 (mc4Status & (0x43 << 24)))
438 {
439 cpuIERRLog(cpu, "CPU/VR Mismatch");
440 continue;
441 }
442
443 // Next check if it's a Core FIVR fault by looking for a
444 // non-zero value of CORE_FIVR_ERR_LOG (B(1) D30 F2 offset
445 // 80h)
446 uint32_t coreFIVRErrLog = 0;
447 if (peci_RdPCIConfigLocal(
448 addr, 1, 30, 2, 0x80, sizeof(uint32_t),
449 (uint8_t*)&coreFIVRErrLog, &cc) != PECI_CC_SUCCESS)
450 {
451 continue;
452 }
453 if (coreFIVRErrLog)
454 {
455 cpuIERRLog(cpu, "Core FIVR Fault");
456 continue;
457 }
458
459 // Next check if it's an Uncore FIVR fault by looking for a
460 // non-zero value of UNCORE_FIVR_ERR_LOG (B(1) D30 F2 offset
461 // 84h)
462 uint32_t uncoreFIVRErrLog = 0;
463 if (peci_RdPCIConfigLocal(addr, 1, 30, 2, 0x84,
464 sizeof(uint32_t),
465 (uint8_t*)&uncoreFIVRErrLog,
466 &cc) != PECI_CC_SUCCESS)
467 {
468 continue;
469 }
470 if (uncoreFIVRErrLog)
471 {
472 cpuIERRLog(cpu, "Uncore FIVR Fault");
473 continue;
474 }
475
476 // Last if CORE_FIVR_ERR_LOG and UNCORE_FIVR_ERR_LOG are
477 // both zero, but MSEC bits 31:24 have either
478 // MCA_FIVR_CATAS_OVERVOL_FAULT (0x51) or
479 // MCA_FIVR_CATAS_OVERCUR_FAULT (0x52), then log it as an
480 // uncore FIVR fault
481 if (!coreFIVRErrLog && !uncoreFIVRErrLog &&
482 ((mc4Status & (0x51 << 24)) ||
483 (mc4Status & (0x52 << 24))))
484 {
485 cpuIERRLog(cpu, "Uncore FIVR Fault");
486 continue;
487 }
488 cpuIERRLog(cpu);
489 }
490 break;
491 }
492 case icx:
493 {
494 // First check the MCA_ERR_SRC_LOG to see if this is the CPU
495 // that caused the IERR
496 uint32_t mcaErrSrcLog = 0;
497 if (peci_RdPkgConfig(addr, 0, 5, 4, (uint8_t*)&mcaErrSrcLog,
498 &cc) != PECI_CC_SUCCESS)
499 {
500 continue;
501 }
502 // Check MSMI_INTERNAL (20) and IERR_INTERNAL (27)
503 if ((mcaErrSrcLog & (1 << 20)) || (mcaErrSrcLog & (1 << 27)))
504 {
505 // TODO: Light the CPU fault LED?
506 cpuIERRFound = true;
Jason M. Billsd1a19f62019-08-06 11:52:58 -0700507 incrementCPUErrorCount(cpu);
Jason M. Billsa3397932019-08-06 11:07:21 -0700508 // Next check if it's a CPU/VR mismatch by reading the
509 // IA32_MC4_STATUS MSR (0x411)
510 uint64_t mc4Status = 0;
511 if (peci_RdIAMSR(addr, 0, 0x411, &mc4Status, &cc) !=
512 PECI_CC_SUCCESS)
513 {
514 continue;
515 }
516 // TODO: Update MSEC/MSCOD_31_24 check
517 // Check MSEC bits 31:24 for
518 // MCA_SVID_VCCIN_VR_ICC_MAX_FAILURE (0x40),
519 // MCA_SVID_VCCIN_VR_VOUT_FAILURE (0x42), or
520 // MCA_SVID_CPU_VR_CAPABILITY_ERROR (0x43)
521 if ((mc4Status & (0x40 << 24)) ||
522 (mc4Status & (0x42 << 24)) ||
523 (mc4Status & (0x43 << 24)))
524 {
525 cpuIERRLog(cpu, "CPU/VR Mismatch");
526 continue;
527 }
528
529 // Next check if it's a Core FIVR fault by looking for a
530 // non-zero value of CORE_FIVR_ERR_LOG (B(31) D30 F2 offsets
531 // C0h and C4h) (Note: Bus 31 is accessed on PECI as bus 14)
532 uint32_t coreFIVRErrLog0 = 0;
533 uint32_t coreFIVRErrLog1 = 0;
534 if (peci_RdEndPointConfigPciLocal(
535 addr, 0, 14, 30, 2, 0xC0, sizeof(uint32_t),
536 (uint8_t*)&coreFIVRErrLog0, &cc) != PECI_CC_SUCCESS)
537 {
538 continue;
539 }
540 if (peci_RdEndPointConfigPciLocal(
541 addr, 0, 14, 30, 2, 0xC4, sizeof(uint32_t),
542 (uint8_t*)&coreFIVRErrLog1, &cc) != PECI_CC_SUCCESS)
543 {
544 continue;
545 }
546 if (coreFIVRErrLog0 || coreFIVRErrLog1)
547 {
548 cpuIERRLog(cpu, "Core FIVR Fault");
549 continue;
550 }
551
552 // Next check if it's an Uncore FIVR fault by looking for a
553 // non-zero value of UNCORE_FIVR_ERR_LOG (B(31) D30 F2
554 // offset 84h) (Note: Bus 31 is accessed on PECI as bus 14)
555 uint32_t uncoreFIVRErrLog = 0;
556 if (peci_RdEndPointConfigPciLocal(
557 addr, 0, 14, 30, 2, 0x84, sizeof(uint32_t),
558 (uint8_t*)&uncoreFIVRErrLog,
559 &cc) != PECI_CC_SUCCESS)
560 {
561 continue;
562 }
563 if (uncoreFIVRErrLog)
564 {
565 cpuIERRLog(cpu, "Uncore FIVR Fault");
566 continue;
567 }
568
569 // TODO: Update MSEC/MSCOD_31_24 check
570 // Last if CORE_FIVR_ERR_LOG and UNCORE_FIVR_ERR_LOG are
571 // both zero, but MSEC bits 31:24 have either
572 // MCA_FIVR_CATAS_OVERVOL_FAULT (0x51) or
573 // MCA_FIVR_CATAS_OVERCUR_FAULT (0x52), then log it as an
574 // uncore FIVR fault
575 if (!coreFIVRErrLog0 && !coreFIVRErrLog1 &&
576 !uncoreFIVRErrLog &&
577 ((mc4Status & (0x51 << 24)) ||
578 (mc4Status & (0x52 << 24))))
579 {
580 cpuIERRLog(cpu, "Uncore FIVR Fault");
581 continue;
582 }
583 cpuIERRLog(cpu);
584 }
585 break;
586 }
587 }
588 }
589 return cpuIERRFound;
590}
591
Jason M. Billsa15c2522019-08-16 10:01:44 -0700592static void caterrAssertHandler()
593{
Jason M. Billsa15c2522019-08-16 10:01:44 -0700594 caterrAssertTimer.expires_after(std::chrono::milliseconds(caterrTimeoutMs));
595 caterrAssertTimer.async_wait([](const boost::system::error_code ec) {
596 if (ec)
597 {
598 // operation_aborted is expected if timer is canceled
599 // before completion.
600 if (ec != boost::asio::error::operation_aborted)
601 {
602 std::cerr << "caterr timeout async_wait failed: "
603 << ec.message() << "\n";
604 }
Jason M. Billsa15c2522019-08-16 10:01:44 -0700605 return;
606 }
Jason M. Billsa3397932019-08-06 11:07:21 -0700607 std::cerr << "CATERR asserted for " << std::to_string(caterrTimeoutMs)
608 << " ms\n";
609 if (!checkIERRCPUs())
610 {
611 cpuIERRLog();
612 }
Jason M. Billsa15c2522019-08-16 10:01:44 -0700613 conn->async_method_call(
614 [](boost::system::error_code ec,
615 const std::variant<bool>& property) {
616 if (ec)
617 {
618 return;
619 }
620 const bool* reset = std::get_if<bool>(&property);
621 if (reset == nullptr)
622 {
623 std::cerr << "Unable to read reset on CATERR value\n";
624 return;
625 }
626 startCrashdumpAndRecovery(*reset);
627 },
628 "xyz.openbmc_project.Settings",
629 "/xyz/openbmc_project/control/processor_error_config",
630 "org.freedesktop.DBus.Properties", "Get",
631 "xyz.openbmc_project.Control.Processor.ErrConfig", "ResetOnCATERR");
632 });
633}
634
Jason M. Bills1490b142019-07-01 15:48:43 -0700635static void caterrHandler()
636{
637 if (!hostOff)
638 {
639 gpiod::line_event gpioLineEvent = caterrLine.event_read();
640
641 bool caterr =
642 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
643 if (caterr)
644 {
Jason M. Billsa15c2522019-08-16 10:01:44 -0700645 caterrAssertHandler();
Jason M. Bills1490b142019-07-01 15:48:43 -0700646 }
647 else
648 {
649 caterrAssertTimer.cancel();
650 }
651 }
652 caterrEvent.async_wait(boost::asio::posix::stream_descriptor::wait_read,
653 [](const boost::system::error_code ec) {
654 if (ec)
655 {
656 std::cerr << "caterr handler error: "
657 << ec.message() << "\n";
658 return;
659 }
660 caterrHandler();
661 });
662}
Jason M. Bills78c5eed2019-08-28 14:00:40 -0700663
664static void cpu1ThermtripHandler()
665{
666 if (!hostOff)
667 {
668 gpiod::line_event gpioLineEvent = cpu1ThermtripLine.event_read();
669
670 bool cpu1Thermtrip =
671 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
672 if (cpu1Thermtrip)
673 {
674 cpuThermTripLog(1);
675 }
676 }
677 cpu1ThermtripEvent.async_wait(
678 boost::asio::posix::stream_descriptor::wait_read,
679 [](const boost::system::error_code ec) {
680 if (ec)
681 {
682 std::cerr << "CPU 1 Thermtrip handler error: " << ec.message()
683 << "\n";
684 return;
685 }
686 cpu1ThermtripHandler();
687 });
688}
689
690static void cpu2ThermtripHandler()
691{
692 if (!hostOff)
693 {
694 gpiod::line_event gpioLineEvent = cpu2ThermtripLine.event_read();
695
696 bool cpu2Thermtrip =
697 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
698 if (cpu2Thermtrip)
699 {
700 cpuThermTripLog(2);
701 }
702 }
703 cpu2ThermtripEvent.async_wait(
704 boost::asio::posix::stream_descriptor::wait_read,
705 [](const boost::system::error_code ec) {
706 if (ec)
707 {
708 std::cerr << "CPU 2 Thermtrip handler error: " << ec.message()
709 << "\n";
710 return;
711 }
712 cpu2ThermtripHandler();
713 });
714}
715
Jason M. Bills250fa632019-08-28 15:58:25 -0700716static void cpu1VRHotHandler()
717{
718 if (!hostOff)
719 {
720 gpiod::line_event gpioLineEvent = cpu1VRHotLine.event_read();
721
722 bool cpu1VRHot =
723 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
724 if (cpu1VRHot)
725 {
726 cpuVRHotLog("CPU 1");
727 }
728 }
729 cpu1VRHotEvent.async_wait(boost::asio::posix::stream_descriptor::wait_read,
730 [](const boost::system::error_code ec) {
731 if (ec)
732 {
733 std::cerr << "CPU 1 VRHot handler error: "
734 << ec.message() << "\n";
735 return;
736 }
737 cpu1VRHotHandler();
738 });
739}
740
Jason M. Bills9647ba72019-08-29 14:19:19 -0700741static void cpu1MemABCDVRHotHandler()
742{
743 if (!hostOff)
744 {
745 gpiod::line_event gpioLineEvent = cpu1MemABCDVRHotLine.event_read();
746
747 bool cpu1MemABCDVRHot =
748 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
749 if (cpu1MemABCDVRHot)
750 {
751 cpuVRHotLog("CPU 1 Memory ABCD");
752 }
753 }
754 cpu1MemABCDVRHotEvent.async_wait(
755 boost::asio::posix::stream_descriptor::wait_read,
756 [](const boost::system::error_code ec) {
757 if (ec)
758 {
759 std::cerr << "CPU 1 Memory ABCD VRHot handler error: "
760 << ec.message() << "\n";
761 return;
762 }
763 cpu1MemABCDVRHotHandler();
764 });
765}
766
767static void cpu1MemEFGHVRHotHandler()
768{
769 if (!hostOff)
770 {
771 gpiod::line_event gpioLineEvent = cpu1MemEFGHVRHotLine.event_read();
772
773 bool cpu1MemEFGHVRHot =
774 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
775 if (cpu1MemEFGHVRHot)
776 {
777 cpuVRHotLog("CPU 1 Memory EFGH");
778 }
779 }
780 cpu1MemEFGHVRHotEvent.async_wait(
781 boost::asio::posix::stream_descriptor::wait_read,
782 [](const boost::system::error_code ec) {
783 if (ec)
784 {
785 std::cerr << "CPU 1 Memory EFGH VRHot handler error: "
786 << ec.message() << "\n";
787 return;
788 }
789 cpu1MemEFGHVRHotHandler();
790 });
791}
792
Jason M. Bills250fa632019-08-28 15:58:25 -0700793static void cpu2VRHotHandler()
794{
795 if (!hostOff)
796 {
797 gpiod::line_event gpioLineEvent = cpu2VRHotLine.event_read();
798
799 bool cpu2VRHot =
800 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
801 if (cpu2VRHot)
802 {
803 cpuVRHotLog("CPU 2");
804 }
805 }
806 cpu2VRHotEvent.async_wait(boost::asio::posix::stream_descriptor::wait_read,
807 [](const boost::system::error_code ec) {
808 if (ec)
809 {
810 std::cerr << "CPU 2 VRHot handler error: "
811 << ec.message() << "\n";
812 return;
813 }
814 cpu2VRHotHandler();
815 });
816}
817
Jason M. Bills9647ba72019-08-29 14:19:19 -0700818static void cpu2MemABCDVRHotHandler()
819{
820 if (!hostOff)
821 {
822 gpiod::line_event gpioLineEvent = cpu2MemABCDVRHotLine.event_read();
823
824 bool cpu2MemABCDVRHot =
825 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
826 if (cpu2MemABCDVRHot)
827 {
828 cpuVRHotLog("CPU 2 Memory ABCD");
829 }
830 }
831 cpu2MemABCDVRHotEvent.async_wait(
832 boost::asio::posix::stream_descriptor::wait_read,
833 [](const boost::system::error_code ec) {
834 if (ec)
835 {
836 std::cerr << "CPU 2 Memory ABCD VRHot handler error: "
837 << ec.message() << "\n";
838 return;
839 }
840 cpu2MemABCDVRHotHandler();
841 });
842}
843
844static void cpu2MemEFGHVRHotHandler()
845{
846 if (!hostOff)
847 {
848 gpiod::line_event gpioLineEvent = cpu2MemEFGHVRHotLine.event_read();
849
850 bool cpu2MemEFGHVRHot =
851 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
852 if (cpu2MemEFGHVRHot)
853 {
854 cpuVRHotLog("CPU 2 Memory EFGH");
855 }
856 }
857 cpu2MemEFGHVRHotEvent.async_wait(
858 boost::asio::posix::stream_descriptor::wait_read,
859 [](const boost::system::error_code ec) {
860 if (ec)
861 {
862 std::cerr << "CPU 2 Memory EFGH VRHot handler error: "
863 << ec.message() << "\n";
864 return;
865 }
866 cpu2MemEFGHVRHotHandler();
867 });
868}
869
Chen Yugange6c0f1c2019-08-02 20:36:42 +0800870static void pchThermtripHandler()
871{
872 if (!hostOff)
873 {
874 gpiod::line_event gpioLineEvent = pchThermtripLine.event_read();
875
876 bool pchThermtrip =
877 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
878 if (pchThermtrip)
879 {
Jason M. Bills08866542019-08-16 12:04:19 -0700880 ssbThermTripLog();
Chen Yugange6c0f1c2019-08-02 20:36:42 +0800881 }
882 }
883 pchThermtripEvent.async_wait(
884 boost::asio::posix::stream_descriptor::wait_read,
885 [](const boost::system::error_code ec) {
886 if (ec)
887 {
888 std::cerr << "PCH Thermal trip handler error: " << ec.message()
889 << "\n";
890 return;
891 }
892 pchThermtripHandler();
893 });
894}
895
Jason M. Billscbf78532019-08-16 15:32:11 -0700896static std::bitset<MAX_CPUS> checkERRPinCPUs(const int errPin)
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700897{
Jason M. Billscbf78532019-08-16 15:32:11 -0700898 int errPinSts = (1 << errPin);
899 std::bitset<MAX_CPUS> errPinCPUs = 0;
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700900 for (int cpu = 0, addr = MIN_CLIENT_ADDR; addr <= MAX_CLIENT_ADDR;
901 cpu++, addr++)
902 {
903 if (peci_Ping(addr) == PECI_CC_SUCCESS)
904 {
905 uint8_t cc = 0;
906 CPUModel model{};
907 uint8_t stepping = 0;
908 if (peci_GetCPUID(addr, &model, &stepping, &cc) != PECI_CC_SUCCESS)
909 {
910 std::cerr << "Cannot get CPUID!\n";
911 continue;
912 }
913
914 switch (model)
915 {
916 case skx:
917 {
918 // Check the ERRPINSTS to see if this is the CPU that caused
Jason M. Billscbf78532019-08-16 15:32:11 -0700919 // the ERRx (B(0) D8 F0 offset 210h)
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700920 uint32_t errpinsts = 0;
921 if (peci_RdPCIConfigLocal(
922 addr, 0, 8, 0, 0x210, sizeof(uint32_t),
923 (uint8_t*)&errpinsts, &cc) == PECI_CC_SUCCESS)
924 {
Jason M. Billscbf78532019-08-16 15:32:11 -0700925 errPinCPUs[cpu] = (errpinsts & errPinSts) != 0;
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700926 }
927 break;
928 }
929 case icx:
930 {
931 // Check the ERRPINSTS to see if this is the CPU that caused
Jason M. Billscbf78532019-08-16 15:32:11 -0700932 // the ERRx (B(30) D0 F3 offset 274h) (Note: Bus 30 is
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700933 // accessed on PECI as bus 13)
934 uint32_t errpinsts = 0;
935 if (peci_RdEndPointConfigPciLocal(
936 addr, 0, 13, 0, 3, 0x274, sizeof(uint32_t),
937 (uint8_t*)&errpinsts, &cc) == PECI_CC_SUCCESS)
938 {
Jason M. Billscbf78532019-08-16 15:32:11 -0700939 errPinCPUs[cpu] = (errpinsts & errPinSts) != 0;
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700940 }
941 break;
942 }
943 }
944 }
945 }
Jason M. Billscbf78532019-08-16 15:32:11 -0700946 return errPinCPUs;
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700947}
948
Jason M. Billscbf78532019-08-16 15:32:11 -0700949static void errXAssertHandler(const int errPin,
950 boost::asio::steady_timer& errXAssertTimer)
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700951{
Jason M. Billscbf78532019-08-16 15:32:11 -0700952 // ERRx status is not guaranteed through the timeout, so save which
953 // CPUs have it asserted
954 std::bitset<MAX_CPUS> errPinCPUs = checkERRPinCPUs(errPin);
955 errXAssertTimer.expires_after(std::chrono::milliseconds(errTimeoutMs));
956 errXAssertTimer.async_wait([errPin, errPinCPUs](
957 const boost::system::error_code ec) {
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700958 if (ec)
959 {
960 // operation_aborted is expected if timer is canceled before
961 // completion.
962 if (ec != boost::asio::error::operation_aborted)
963 {
964 std::cerr << "err2 timeout async_wait failed: " << ec.message()
965 << "\n";
966 }
967 return;
968 }
Jason M. Billscbf78532019-08-16 15:32:11 -0700969 std::cerr << "ERR" << std::to_string(errPin) << " asserted for "
970 << std::to_string(errTimeoutMs) << " ms\n";
971 if (errPinCPUs.count())
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700972 {
Jason M. Billscbf78532019-08-16 15:32:11 -0700973 for (int i = 0; i < errPinCPUs.size(); i++)
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700974 {
Jason M. Billscbf78532019-08-16 15:32:11 -0700975 if (errPinCPUs[i])
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700976 {
Jason M. Billscbf78532019-08-16 15:32:11 -0700977 cpuERRXLog(errPin, i);
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700978 }
979 }
980 }
981 else
982 {
Jason M. Billscbf78532019-08-16 15:32:11 -0700983 cpuERRXLog(errPin);
984 }
985 });
986}
987
Jason M. Bills8c584392019-08-19 11:05:51 -0700988static void err0AssertHandler()
989{
990 // Handle the standard ERR0 detection and logging
991 const static constexpr int err0 = 0;
992 errXAssertHandler(err0, err0AssertTimer);
993}
994
995static void err0Handler()
996{
997 if (!hostOff)
998 {
999 gpiod::line_event gpioLineEvent = err0Line.event_read();
1000
1001 bool err0 = gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
1002 if (err0)
1003 {
1004 err0AssertHandler();
1005 }
1006 else
1007 {
1008 err0AssertTimer.cancel();
1009 }
1010 }
1011 err0Event.async_wait(boost::asio::posix::stream_descriptor::wait_read,
1012 [](const boost::system::error_code ec) {
1013 if (ec)
1014 {
1015 std::cerr
1016 << "err0 handler error: " << ec.message()
1017 << "\n";
1018 return;
1019 }
1020 err0Handler();
1021 });
1022}
1023
Jason M. Bills75af3962019-08-19 11:07:17 -07001024static void err1AssertHandler()
1025{
1026 // Handle the standard ERR1 detection and logging
1027 const static constexpr int err1 = 1;
1028 errXAssertHandler(err1, err1AssertTimer);
1029}
1030
1031static void err1Handler()
1032{
1033 if (!hostOff)
1034 {
1035 gpiod::line_event gpioLineEvent = err1Line.event_read();
1036
1037 bool err1 = gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
1038 if (err1)
1039 {
1040 err1AssertHandler();
1041 }
1042 else
1043 {
1044 err1AssertTimer.cancel();
1045 }
1046 }
1047 err1Event.async_wait(boost::asio::posix::stream_descriptor::wait_read,
1048 [](const boost::system::error_code ec) {
1049 if (ec)
1050 {
1051 std::cerr
1052 << "err1 handler error: " << ec.message()
1053 << "\n";
1054 return;
1055 }
1056 err1Handler();
1057 });
1058}
1059
Jason M. Billscbf78532019-08-16 15:32:11 -07001060static void err2AssertHandler()
1061{
1062 // Handle the standard ERR2 detection and logging
1063 const static constexpr int err2 = 2;
1064 errXAssertHandler(err2, err2AssertTimer);
1065 // Also handle reset for ERR2
1066 err2AssertTimer.async_wait([](const boost::system::error_code ec) {
1067 if (ec)
1068 {
1069 // operation_aborted is expected if timer is canceled before
1070 // completion.
1071 if (ec != boost::asio::error::operation_aborted)
1072 {
1073 std::cerr << "err2 timeout async_wait failed: " << ec.message()
1074 << "\n";
1075 }
1076 return;
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001077 }
1078 conn->async_method_call(
1079 [](boost::system::error_code ec,
1080 const std::variant<bool>& property) {
1081 if (ec)
1082 {
1083 return;
1084 }
1085 const bool* reset = std::get_if<bool>(&property);
1086 if (reset == nullptr)
1087 {
1088 std::cerr << "Unable to read reset on ERR2 value\n";
1089 return;
1090 }
1091 startCrashdumpAndRecovery(*reset);
1092 },
1093 "xyz.openbmc_project.Settings",
1094 "/xyz/openbmc_project/control/processor_error_config",
1095 "org.freedesktop.DBus.Properties", "Get",
1096 "xyz.openbmc_project.Control.Processor.ErrConfig", "ResetOnERR2");
1097 });
1098}
1099
1100static void err2Handler()
1101{
1102 if (!hostOff)
1103 {
1104 gpiod::line_event gpioLineEvent = err2Line.event_read();
1105
1106 bool err2 = gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
1107 if (err2)
1108 {
1109 err2AssertHandler();
1110 }
1111 else
1112 {
1113 err2AssertTimer.cancel();
1114 }
1115 }
1116 err2Event.async_wait(boost::asio::posix::stream_descriptor::wait_read,
1117 [](const boost::system::error_code ec) {
1118 if (ec)
1119 {
1120 std::cerr
1121 << "err2 handler error: " << ec.message()
1122 << "\n";
1123 return;
1124 }
1125 err2Handler();
1126 });
1127}
1128
Jason M. Bills89922f82019-08-06 11:10:02 -07001129static void smiAssertHandler()
1130{
1131 smiAssertTimer.expires_after(std::chrono::milliseconds(smiTimeoutMs));
1132 smiAssertTimer.async_wait([](const boost::system::error_code ec) {
1133 if (ec)
1134 {
1135 // operation_aborted is expected if timer is canceled before
1136 // completion.
1137 if (ec != boost::asio::error::operation_aborted)
1138 {
1139 std::cerr << "smi timeout async_wait failed: " << ec.message()
1140 << "\n";
1141 }
1142 return;
1143 }
1144 std::cerr << "SMI asserted for " << std::to_string(smiTimeoutMs)
1145 << " ms\n";
1146 smiTimeoutLog();
1147 conn->async_method_call(
1148 [](boost::system::error_code ec,
1149 const std::variant<bool>& property) {
1150 if (ec)
1151 {
1152 return;
1153 }
1154 const bool* reset = std::get_if<bool>(&property);
1155 if (reset == nullptr)
1156 {
1157 std::cerr << "Unable to read reset on SMI value\n";
1158 return;
1159 }
1160 startCrashdumpAndRecovery(*reset);
1161 },
1162 "xyz.openbmc_project.Settings",
1163 "/xyz/openbmc_project/control/bmc_reset_disables",
1164 "org.freedesktop.DBus.Properties", "Get",
1165 "xyz.openbmc_project.Control.ResetDisables", "ResetOnSMI");
1166 });
1167}
1168
1169static void smiHandler()
1170{
1171 if (!hostOff)
1172 {
1173 gpiod::line_event gpioLineEvent = smiLine.event_read();
1174
1175 bool smi = gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
1176 if (smi)
1177 {
1178 smiAssertHandler();
1179 }
1180 else
1181 {
1182 smiAssertTimer.cancel();
1183 }
1184 }
1185 smiEvent.async_wait(boost::asio::posix::stream_descriptor::wait_read,
1186 [](const boost::system::error_code ec) {
1187 if (ec)
1188 {
1189 std::cerr
1190 << "smi handler error: " << ec.message()
1191 << "\n";
1192 return;
1193 }
1194 smiHandler();
1195 });
1196}
1197
Jason M. Billsa15c2522019-08-16 10:01:44 -07001198static void initializeErrorState()
1199{
1200 // Handle CPU_CATERR if it's asserted now
1201 if (caterrLine.get_value() == 0)
1202 {
1203 caterrAssertHandler();
1204 }
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001205
Jason M. Bills8c584392019-08-19 11:05:51 -07001206 // Handle CPU_ERR0 if it's asserted now
1207 if (err0Line.get_value() == 0)
1208 {
1209 err0AssertHandler();
1210 }
1211
Jason M. Bills75af3962019-08-19 11:07:17 -07001212 // Handle CPU_ERR1 if it's asserted now
1213 if (err1Line.get_value() == 0)
1214 {
1215 err1AssertHandler();
1216 }
1217
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001218 // Handle CPU_ERR2 if it's asserted now
1219 if (err2Line.get_value() == 0)
1220 {
1221 err2AssertHandler();
1222 }
Jason M. Bills89922f82019-08-06 11:10:02 -07001223
1224 // Handle SMI if it's asserted now
1225 if (smiLine.get_value() == 0)
1226 {
1227 smiAssertHandler();
1228 }
Jason M. Bills08866542019-08-16 12:04:19 -07001229
1230 // Handle PCH_BMC_THERMTRIP if it's asserted now
1231 if (pchThermtripLine.get_value() == 0)
1232 {
1233 ssbThermTripLog();
1234 }
Jason M. Billsa15c2522019-08-16 10:01:44 -07001235}
Jason M. Bills1490b142019-07-01 15:48:43 -07001236} // namespace host_error_monitor
1237
1238int main(int argc, char* argv[])
1239{
1240 // setup connection to dbus
1241 host_error_monitor::conn =
1242 std::make_shared<sdbusplus::asio::connection>(host_error_monitor::io);
1243
1244 // Host Error Monitor Object
1245 host_error_monitor::conn->request_name(
1246 "xyz.openbmc_project.HostErrorMonitor");
1247 sdbusplus::asio::object_server server =
1248 sdbusplus::asio::object_server(host_error_monitor::conn);
1249
1250 // Start tracking host state
1251 std::shared_ptr<sdbusplus::bus::match::match> hostStateMonitor =
1252 host_error_monitor::startHostStateMonitor();
1253
1254 // Initialize the host state
1255 host_error_monitor::initializeHostState();
1256
1257 // Request CPU_CATERR GPIO events
1258 if (!host_error_monitor::requestGPIOEvents(
1259 "CPU_CATERR", host_error_monitor::caterrHandler,
1260 host_error_monitor::caterrLine, host_error_monitor::caterrEvent))
1261 {
1262 return -1;
1263 }
1264
Jason M. Bills8c584392019-08-19 11:05:51 -07001265 // Request CPU_ERR0 GPIO events
1266 if (!host_error_monitor::requestGPIOEvents(
1267 "CPU_ERR0", host_error_monitor::err0Handler,
1268 host_error_monitor::err0Line, host_error_monitor::err0Event))
1269 {
1270 return -1;
1271 }
1272
Jason M. Bills75af3962019-08-19 11:07:17 -07001273 // Request CPU_ERR1 GPIO events
1274 if (!host_error_monitor::requestGPIOEvents(
1275 "CPU_ERR1", host_error_monitor::err1Handler,
1276 host_error_monitor::err1Line, host_error_monitor::err1Event))
1277 {
1278 return -1;
1279 }
1280
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001281 // Request CPU_ERR2 GPIO events
1282 if (!host_error_monitor::requestGPIOEvents(
1283 "CPU_ERR2", host_error_monitor::err2Handler,
1284 host_error_monitor::err2Line, host_error_monitor::err2Event))
1285 {
1286 return -1;
1287 }
1288
Jason M. Bills89922f82019-08-06 11:10:02 -07001289 // Request SMI GPIO events
1290 if (!host_error_monitor::requestGPIOEvents(
1291 "SMI", host_error_monitor::smiHandler, host_error_monitor::smiLine,
1292 host_error_monitor::smiEvent))
1293 {
1294 return -1;
1295 }
1296
Jason M. Bills78c5eed2019-08-28 14:00:40 -07001297 // Request CPU1_THERMTRIP GPIO events
1298 if (!host_error_monitor::requestGPIOEvents(
1299 "CPU1_THERMTRIP", host_error_monitor::cpu1ThermtripHandler,
1300 host_error_monitor::cpu1ThermtripLine,
1301 host_error_monitor::cpu1ThermtripEvent))
1302 {
1303 return -1;
1304 }
1305
1306 // Request CPU2_THERMTRIP GPIO events
1307 if (!host_error_monitor::requestGPIOEvents(
1308 "CPU2_THERMTRIP", host_error_monitor::cpu2ThermtripHandler,
1309 host_error_monitor::cpu2ThermtripLine,
1310 host_error_monitor::cpu2ThermtripEvent))
1311 {
1312 return -1;
1313 }
1314
Jason M. Bills250fa632019-08-28 15:58:25 -07001315 // Request CPU1_VRHOT GPIO events
1316 if (!host_error_monitor::requestGPIOEvents(
1317 "CPU1_VRHOT", host_error_monitor::cpu1VRHotHandler,
1318 host_error_monitor::cpu1VRHotLine,
1319 host_error_monitor::cpu1VRHotEvent))
1320 {
1321 return -1;
1322 }
1323
Jason M. Bills9647ba72019-08-29 14:19:19 -07001324 // Request CPU1_MEM_ABCD_VRHOT GPIO events
1325 if (!host_error_monitor::requestGPIOEvents(
1326 "CPU1_MEM_ABCD_VRHOT", host_error_monitor::cpu1MemABCDVRHotHandler,
1327 host_error_monitor::cpu1MemABCDVRHotLine,
1328 host_error_monitor::cpu1MemABCDVRHotEvent))
1329 {
1330 return -1;
1331 }
1332
1333 // Request CPU1_MEM_EFGH_VRHOT GPIO events
1334 if (!host_error_monitor::requestGPIOEvents(
1335 "CPU1_MEM_EFGH_VRHOT", host_error_monitor::cpu1MemEFGHVRHotHandler,
1336 host_error_monitor::cpu1MemEFGHVRHotLine,
1337 host_error_monitor::cpu1MemEFGHVRHotEvent))
1338 {
1339 return -1;
1340 }
1341
Jason M. Bills250fa632019-08-28 15:58:25 -07001342 // Request CPU2_VRHOT GPIO events
1343 if (!host_error_monitor::requestGPIOEvents(
1344 "CPU2_VRHOT", host_error_monitor::cpu2VRHotHandler,
1345 host_error_monitor::cpu2VRHotLine,
1346 host_error_monitor::cpu2VRHotEvent))
1347 {
1348 return -1;
1349 }
1350
Jason M. Bills9647ba72019-08-29 14:19:19 -07001351 // Request CPU2_MEM_ABCD_VRHOT GPIO events
1352 if (!host_error_monitor::requestGPIOEvents(
1353 "CPU2_MEM_ABCD_VRHOT", host_error_monitor::cpu2MemABCDVRHotHandler,
1354 host_error_monitor::cpu2MemABCDVRHotLine,
1355 host_error_monitor::cpu2MemABCDVRHotEvent))
1356 {
1357 return -1;
1358 }
1359
1360 // Request CPU2_MEM_EFGH_VRHOT GPIO events
1361 if (!host_error_monitor::requestGPIOEvents(
1362 "CPU2_MEM_EFGH_VRHOT", host_error_monitor::cpu2MemEFGHVRHotHandler,
1363 host_error_monitor::cpu2MemEFGHVRHotLine,
1364 host_error_monitor::cpu2MemEFGHVRHotEvent))
1365 {
1366 return -1;
1367 }
1368
Chen Yugange6c0f1c2019-08-02 20:36:42 +08001369 // Request PCH_BMC_THERMTRIP GPIO events
1370 if (!host_error_monitor::requestGPIOEvents(
1371 "PCH_BMC_THERMTRIP", host_error_monitor::pchThermtripHandler,
1372 host_error_monitor::pchThermtripLine,
1373 host_error_monitor::pchThermtripEvent))
1374 {
1375 return -1;
1376 }
1377
Jason M. Bills1490b142019-07-01 15:48:43 -07001378 host_error_monitor::io.run();
1379
1380 return 0;
1381}