blob: f8de2d3feffac8a4e467b6eecb6053271d0b6922 [file] [log] [blame]
/*
// Copyright (c) 2019 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
*/
#include <peci.h>
#include <systemd/sd-journal.h>

#include <boost/asio/posix/stream_descriptor.hpp>
#include <boost/asio/steady_timer.hpp>
#include <boost/container/flat_map.hpp>
#include <gpiod.hpp>
#include <sdbusplus/asio/object_server.hpp>

#include <bitset>
#include <chrono>
#include <cstddef>
#include <cstdint>
#include <exception>
#include <functional>
#include <iostream>
#include <limits>
#include <memory>
#include <string>
#include <tuple>
#include <variant>
#include <vector>
Jason M. Bills1490b142019-07-01 15:48:43 -070026
27namespace host_error_monitor
28{
29static boost::asio::io_service io;
30static std::shared_ptr<sdbusplus::asio::connection> conn;
Jason M. Billsc4b91f22019-11-26 17:04:50 -080031static std::shared_ptr<sdbusplus::asio::dbus_interface> hostErrorTimeoutIface;
Jason M. Bills1490b142019-07-01 15:48:43 -070032
Yong Li1429ca82020-04-27 16:49:45 +080033using Association = std::tuple<std::string, std::string, std::string>;
34static std::shared_ptr<sdbusplus::asio::dbus_interface> associationSSBThermTrip;
35static std::shared_ptr<sdbusplus::asio::dbus_interface> associationCATAssert;
36
37static const constexpr char* rootPath = "/xyz/openbmc_project/CallbackManager";
38
Jason M. Bills1490b142019-07-01 15:48:43 -070039static bool hostOff = true;
40
Jason M. Billsc4b91f22019-11-26 17:04:50 -080041static size_t caterrTimeoutMs = 2000;
42const static constexpr size_t caterrTimeoutMsMax = 600000; // 10 minutes maximum
Jason M. Billscbf78532019-08-16 15:32:11 -070043const static constexpr size_t errTimeoutMs = 90000;
Jason M. Bills89922f82019-08-06 11:10:02 -070044const static constexpr size_t smiTimeoutMs = 90000;
Jason M. Bills1490b142019-07-01 15:48:43 -070045const static constexpr size_t crashdumpTimeoutS = 300;
46
47// Timers
48// Timer for CATERR asserted
49static boost::asio::steady_timer caterrAssertTimer(io);
Jason M. Bills8c584392019-08-19 11:05:51 -070050// Timer for ERR0 asserted
51static boost::asio::steady_timer err0AssertTimer(io);
Jason M. Bills75af3962019-08-19 11:07:17 -070052// Timer for ERR1 asserted
53static boost::asio::steady_timer err1AssertTimer(io);
Jason M. Bills6a2cb692019-08-06 11:03:49 -070054// Timer for ERR2 asserted
55static boost::asio::steady_timer err2AssertTimer(io);
Jason M. Bills89922f82019-08-06 11:10:02 -070056// Timer for SMI asserted
57static boost::asio::steady_timer smiAssertTimer(io);
Jason M. Bills1490b142019-07-01 15:48:43 -070058
59// GPIO Lines and Event Descriptors
60static gpiod::line caterrLine;
61static boost::asio::posix::stream_descriptor caterrEvent(io);
Jason M. Bills8c584392019-08-19 11:05:51 -070062static gpiod::line err0Line;
63static boost::asio::posix::stream_descriptor err0Event(io);
Jason M. Bills75af3962019-08-19 11:07:17 -070064static gpiod::line err1Line;
65static boost::asio::posix::stream_descriptor err1Event(io);
Jason M. Bills6a2cb692019-08-06 11:03:49 -070066static gpiod::line err2Line;
67static boost::asio::posix::stream_descriptor err2Event(io);
Jason M. Bills89922f82019-08-06 11:10:02 -070068static gpiod::line smiLine;
69static boost::asio::posix::stream_descriptor smiEvent(io);
Jason M. Bills45e87e02019-09-09 14:45:38 -070070static gpiod::line cpu1FIVRFaultLine;
Jason M. Bills78c5eed2019-08-28 14:00:40 -070071static gpiod::line cpu1ThermtripLine;
72static boost::asio::posix::stream_descriptor cpu1ThermtripEvent(io);
Jason M. Bills45e87e02019-09-09 14:45:38 -070073static gpiod::line cpu2FIVRFaultLine;
Jason M. Bills78c5eed2019-08-28 14:00:40 -070074static gpiod::line cpu2ThermtripLine;
75static boost::asio::posix::stream_descriptor cpu2ThermtripEvent(io);
Jason M. Bills250fa632019-08-28 15:58:25 -070076static gpiod::line cpu1VRHotLine;
77static boost::asio::posix::stream_descriptor cpu1VRHotEvent(io);
78static gpiod::line cpu2VRHotLine;
Jason M. Bills9647ba72019-08-29 14:19:19 -070079static boost::asio::posix::stream_descriptor cpu1MemABCDVRHotEvent(io);
80static gpiod::line cpu1MemEFGHVRHotLine;
81static boost::asio::posix::stream_descriptor cpu1MemEFGHVRHotEvent(io);
82static gpiod::line cpu2MemABCDVRHotLine;
Jason M. Bills250fa632019-08-28 15:58:25 -070083static boost::asio::posix::stream_descriptor cpu2VRHotEvent(io);
Jason M. Bills9647ba72019-08-29 14:19:19 -070084static gpiod::line cpu1MemABCDVRHotLine;
85static boost::asio::posix::stream_descriptor cpu2MemABCDVRHotEvent(io);
86static gpiod::line cpu2MemEFGHVRHotLine;
87static boost::asio::posix::stream_descriptor cpu2MemEFGHVRHotEvent(io);
Chen Yugange6c0f1c2019-08-02 20:36:42 +080088//----------------------------------
89// PCH_BMC_THERMTRIP function related definition
90//----------------------------------
Chen Yugange6c0f1c2019-08-02 20:36:42 +080091static gpiod::line pchThermtripLine;
92static boost::asio::posix::stream_descriptor pchThermtripEvent(io);
jayaprakash Mutyala009adbc2019-12-24 22:08:07 +000093//----------------------------------
94// CPU_MEM_THERM_EVENT function related definition
95//----------------------------------
96static gpiod::line cpu1MemtripLine;
97static boost::asio::posix::stream_descriptor cpu1MemtripEvent(io);
98static gpiod::line cpu2MemtripLine;
99static boost::asio::posix::stream_descriptor cpu2MemtripEvent(io);
jayaprakash Mutyala53099c42020-03-15 00:16:26 +0000100//---------------------------------
101// CPU_MISMATCH function related definition
102//---------------------------------
103static gpiod::line cpu1MismatchLine;
104static gpiod::line cpu2MismatchLine;
Jason M. Bills1490b142019-07-01 15:48:43 -0700105
Yong Li061eb032020-02-26 15:06:18 +0800106// beep function for CPU error
Yong Li8c798c72020-04-22 15:29:07 +0800107const static constexpr uint8_t beepCPUIERR = 4;
Yong Li061eb032020-02-26 15:06:18 +0800108const static constexpr uint8_t beepCPUErr2 = 5;
109
110static void beep(const uint8_t& beepPriority)
111{
112 conn->async_method_call(
113 [](boost::system::error_code ec) {
114 if (ec)
115 {
116 std::cerr << "beep returned error with "
117 "async_method_call (ec = "
118 << ec << ")\n";
119 return;
120 }
121 },
122 "xyz.openbmc_project.BeepCode", "/xyz/openbmc_project/BeepCode",
123 "xyz.openbmc_project.BeepCode", "Beep", uint8_t(beepPriority));
124}
125
Jason M. Billsa3397932019-08-06 11:07:21 -0700126static void cpuIERRLog()
127{
128 sd_journal_send("MESSAGE=HostError: IERR", "PRIORITY=%i", LOG_INFO,
129 "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError",
130 "REDFISH_MESSAGE_ARGS=%s", "IERR", NULL);
131}
132
133static void cpuIERRLog(const int cpuNum)
134{
135 std::string msg = "IERR on CPU " + std::to_string(cpuNum + 1);
136
137 sd_journal_send("MESSAGE=HostError: %s", msg.c_str(), "PRIORITY=%i",
138 LOG_INFO, "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError",
139 "REDFISH_MESSAGE_ARGS=%s", msg.c_str(), NULL);
140}
141
142static void cpuIERRLog(const int cpuNum, const std::string& type)
143{
144 std::string msg = type + " IERR on CPU " + std::to_string(cpuNum + 1);
145
146 sd_journal_send("MESSAGE=HostError: %s", msg.c_str(), "PRIORITY=%i",
147 LOG_INFO, "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError",
148 "REDFISH_MESSAGE_ARGS=%s", msg.c_str(), NULL);
149}
150
Jason M. Billscbf78532019-08-16 15:32:11 -0700151static void cpuERRXLog(const int errPin)
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700152{
Jason M. Billscbf78532019-08-16 15:32:11 -0700153 std::string msg = "ERR" + std::to_string(errPin) + " Timeout";
154
155 sd_journal_send("MESSAGE=HostError: %s", msg.c_str(), "PRIORITY=%i",
156 LOG_INFO, "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError",
157 "REDFISH_MESSAGE_ARGS=%s", msg.c_str(), NULL);
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700158}
159
Jason M. Billscbf78532019-08-16 15:32:11 -0700160static void cpuERRXLog(const int errPin, const int cpuNum)
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700161{
Jason M. Billscbf78532019-08-16 15:32:11 -0700162 std::string msg = "ERR" + std::to_string(errPin) + " Timeout on CPU " +
163 std::to_string(cpuNum + 1);
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700164
165 sd_journal_send("MESSAGE=HostError: %s", msg.c_str(), "PRIORITY=%i",
166 LOG_INFO, "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError",
167 "REDFISH_MESSAGE_ARGS=%s", msg.c_str(), NULL);
168}
169
Jason M. Bills89922f82019-08-06 11:10:02 -0700170static void smiTimeoutLog()
171{
172 sd_journal_send("MESSAGE=HostError: SMI Timeout", "PRIORITY=%i", LOG_INFO,
173 "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError",
174 "REDFISH_MESSAGE_ARGS=%s", "SMI Timeout", NULL);
175}
176
Jason M. Bills45e87e02019-09-09 14:45:38 -0700177static void cpuBootFIVRFaultLog(const int cpuNum)
178{
179 std::string msg = "Boot FIVR Fault on CPU " + std::to_string(cpuNum);
180
181 sd_journal_send("MESSAGE=HostError: %s", msg.c_str(), "PRIORITY=%i",
182 LOG_INFO, "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError",
183 "REDFISH_MESSAGE_ARGS=%s", msg.c_str(), NULL);
184}
185
Jason M. Bills78c5eed2019-08-28 14:00:40 -0700186static void cpuThermTripLog(const int cpuNum)
187{
188 std::string msg = "CPU " + std::to_string(cpuNum) + " thermal trip";
189
190 sd_journal_send("MESSAGE=HostError: %s", msg.c_str(), "PRIORITY=%i",
191 LOG_INFO, "REDFISH_MESSAGE_ID=%s",
192 "OpenBMC.0.1.CPUThermalTrip", "REDFISH_MESSAGE_ARGS=%d",
193 cpuNum, NULL);
194}
195
jayaprakash Mutyala009adbc2019-12-24 22:08:07 +0000196static void memThermTripLog(const int cpuNum)
197{
198 std::string cpuNumber = "CPU " + std::to_string(cpuNum);
199 std::string msg = cpuNumber + " Memory Thermal trip.";
200
201 sd_journal_send("MESSAGE=HostError: %s", msg.c_str(), "PRIORITY=%i",
202 LOG_ERR, "REDFISH_MESSAGE_ID=%s",
203 "OpenBMC.0.1.MemoryThermTrip", "REDFISH_MESSAGE_ARGS=%s",
204 cpuNumber.c_str(), NULL);
205}
206
jayaprakash Mutyala53099c42020-03-15 00:16:26 +0000207static void cpuMismatchLog(const int cpuNum)
208{
209 std::string msg = "CPU " + std::to_string(cpuNum) + " mismatch";
210
211 sd_journal_send("MESSAGE= %s", msg.c_str(), "PRIORITY=%i", LOG_ERR,
212 "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUMismatch",
213 "REDFISH_MESSAGE_ARGS=%d", cpuNum, NULL);
214}
215
Jason M. Bills250fa632019-08-28 15:58:25 -0700216static void cpuVRHotLog(const std::string& vr)
217{
218 std::string msg = vr + " Voltage Regulator Overheated.";
219
220 sd_journal_send("MESSAGE=HostError: %s", msg.c_str(), "PRIORITY=%i",
221 LOG_INFO, "REDFISH_MESSAGE_ID=%s",
222 "OpenBMC.0.1.VoltageRegulatorOverheated",
223 "REDFISH_MESSAGE_ARGS=%s", vr.c_str(), NULL);
224}
225
Jason M. Bills08866542019-08-16 12:04:19 -0700226static void ssbThermTripLog()
227{
228 sd_journal_send("MESSAGE=HostError: SSB thermal trip", "PRIORITY=%i",
229 LOG_INFO, "REDFISH_MESSAGE_ID=%s",
230 "OpenBMC.0.1.SsbThermalTrip", NULL);
231}
232
Jason M. Billsa15c2522019-08-16 10:01:44 -0700233static void initializeErrorState();
Jason M. Bills1490b142019-07-01 15:48:43 -0700234static void initializeHostState()
235{
236 conn->async_method_call(
237 [](boost::system::error_code ec,
238 const std::variant<std::string>& property) {
239 if (ec)
240 {
241 return;
242 }
243 const std::string* state = std::get_if<std::string>(&property);
244 if (state == nullptr)
245 {
246 std::cerr << "Unable to read host state value\n";
247 return;
248 }
249 hostOff = *state == "xyz.openbmc_project.State.Host.HostState.Off";
Jason M. Billsa15c2522019-08-16 10:01:44 -0700250 // If the system is on, initialize the error state
251 if (!hostOff)
252 {
253 initializeErrorState();
254 }
Jason M. Bills1490b142019-07-01 15:48:43 -0700255 },
256 "xyz.openbmc_project.State.Host", "/xyz/openbmc_project/state/host0",
257 "org.freedesktop.DBus.Properties", "Get",
258 "xyz.openbmc_project.State.Host", "CurrentHostState");
259}
260
261static std::shared_ptr<sdbusplus::bus::match::match> startHostStateMonitor()
262{
263 return std::make_shared<sdbusplus::bus::match::match>(
264 *conn,
265 "type='signal',interface='org.freedesktop.DBus.Properties',"
266 "member='PropertiesChanged',arg0namespace='xyz.openbmc_project.State."
267 "Host'",
268 [](sdbusplus::message::message& msg) {
269 std::string interfaceName;
270 boost::container::flat_map<std::string, std::variant<std::string>>
271 propertiesChanged;
272 std::string state;
273 try
274 {
275 msg.read(interfaceName, propertiesChanged);
276 state =
277 std::get<std::string>(propertiesChanged.begin()->second);
278 }
279 catch (std::exception& e)
280 {
281 std::cerr << "Unable to read host state\n";
282 return;
283 }
284 hostOff = state == "xyz.openbmc_project.State.Host.HostState.Off";
285
Jason M. Bills1490b142019-07-01 15:48:43 -0700286 if (hostOff)
287 {
Jason M. Billse94f5e12019-09-13 11:11:34 -0700288 // No host events should fire while off, so cancel any pending
289 // timers
Jason M. Bills1490b142019-07-01 15:48:43 -0700290 caterrAssertTimer.cancel();
Jason M. Bills8c584392019-08-19 11:05:51 -0700291 err0AssertTimer.cancel();
Jason M. Bills75af3962019-08-19 11:07:17 -0700292 err1AssertTimer.cancel();
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700293 err2AssertTimer.cancel();
Jason M. Bills89922f82019-08-06 11:10:02 -0700294 smiAssertTimer.cancel();
Jason M. Bills1490b142019-07-01 15:48:43 -0700295 }
Jason M. Billse94f5e12019-09-13 11:11:34 -0700296 else
297 {
298 // Handle any initial errors when the host turns on
299 initializeErrorState();
300 }
Jason M. Bills1490b142019-07-01 15:48:43 -0700301 });
302}
303
304static bool requestGPIOEvents(
305 const std::string& name, const std::function<void()>& handler,
306 gpiod::line& gpioLine,
307 boost::asio::posix::stream_descriptor& gpioEventDescriptor)
308{
309 // Find the GPIO line
310 gpioLine = gpiod::find_line(name);
311 if (!gpioLine)
312 {
313 std::cerr << "Failed to find the " << name << " line\n";
314 return false;
315 }
316
317 try
318 {
319 gpioLine.request(
320 {"host-error-monitor", gpiod::line_request::EVENT_BOTH_EDGES});
321 }
322 catch (std::exception&)
323 {
324 std::cerr << "Failed to request events for " << name << "\n";
325 return false;
326 }
327
328 int gpioLineFd = gpioLine.event_get_fd();
329 if (gpioLineFd < 0)
330 {
331 std::cerr << "Failed to get " << name << " fd\n";
332 return false;
333 }
334
335 gpioEventDescriptor.assign(gpioLineFd);
336
337 gpioEventDescriptor.async_wait(
338 boost::asio::posix::stream_descriptor::wait_read,
339 [&name, handler](const boost::system::error_code ec) {
340 if (ec)
341 {
342 std::cerr << name << " fd handler error: " << ec.message()
343 << "\n";
344 return;
345 }
346 handler();
347 });
348 return true;
349}
350
Jason M. Bills45e87e02019-09-09 14:45:38 -0700351static bool requestGPIOInput(const std::string& name, gpiod::line& gpioLine)
352{
353 // Find the GPIO line
354 gpioLine = gpiod::find_line(name);
355 if (!gpioLine)
356 {
357 std::cerr << "Failed to find the " << name << " line.\n";
358 return false;
359 }
360
361 // Request GPIO input
362 try
363 {
364 gpioLine.request({__FUNCTION__, gpiod::line_request::DIRECTION_INPUT});
365 }
366 catch (std::exception&)
367 {
368 std::cerr << "Failed to request " << name << " input\n";
369 return false;
370 }
371
372 return true;
373}
374
Jason M. Bills1490b142019-07-01 15:48:43 -0700375static void startPowerCycle()
376{
377 conn->async_method_call(
378 [](boost::system::error_code ec) {
379 if (ec)
380 {
381 std::cerr << "failed to set Chassis State\n";
382 }
383 },
384 "xyz.openbmc_project.State.Chassis",
385 "/xyz/openbmc_project/state/chassis0",
386 "org.freedesktop.DBus.Properties", "Set",
387 "xyz.openbmc_project.State.Chassis", "RequestedPowerTransition",
388 std::variant<std::string>{
389 "xyz.openbmc_project.State.Chassis.Transition.PowerCycle"});
390}
391
Jason M. Billsb61766b2019-11-26 17:02:44 -0800392static void startCrashdumpAndRecovery(bool recoverSystem,
393 const std::string& triggerType)
Jason M. Bills1490b142019-07-01 15:48:43 -0700394{
395 std::cout << "Starting crashdump\n";
396 static std::shared_ptr<sdbusplus::bus::match::match> crashdumpCompleteMatch;
397 static boost::asio::steady_timer crashdumpTimer(io);
398
399 crashdumpCompleteMatch = std::make_shared<sdbusplus::bus::match::match>(
400 *conn,
401 "type='signal',interface='org.freedesktop.DBus.Properties',"
402 "member='PropertiesChanged',arg0namespace='com.intel.crashdump'",
403 [recoverSystem](sdbusplus::message::message& msg) {
404 crashdumpTimer.cancel();
405 std::cout << "Crashdump completed\n";
406 if (recoverSystem)
407 {
408 std::cout << "Recovering the system\n";
409 startPowerCycle();
410 }
411 crashdumpCompleteMatch.reset();
412 });
413
414 crashdumpTimer.expires_after(std::chrono::seconds(crashdumpTimeoutS));
415 crashdumpTimer.async_wait([](const boost::system::error_code ec) {
416 if (ec)
417 {
418 // operation_aborted is expected if timer is canceled
419 if (ec != boost::asio::error::operation_aborted)
420 {
421 std::cerr << "Crashdump async_wait failed: " << ec.message()
422 << "\n";
423 }
424 std::cout << "Crashdump timer canceled\n";
425 return;
426 }
427 std::cerr << "Crashdump failed to complete before timeout\n";
428 crashdumpCompleteMatch.reset();
429 });
430
431 conn->async_method_call(
432 [](boost::system::error_code ec) {
433 if (ec)
434 {
435 std::cerr << "failed to start Crashdump\n";
436 crashdumpTimer.cancel();
437 crashdumpCompleteMatch.reset();
438 }
439 },
440 "com.intel.crashdump", "/com/intel/crashdump",
Jason M. Billsb61766b2019-11-26 17:02:44 -0800441 "com.intel.crashdump.Stored", "GenerateStoredLog", triggerType);
Jason M. Bills1490b142019-07-01 15:48:43 -0700442}
443
Jason M. Billsd1a19f62019-08-06 11:52:58 -0700444static void incrementCPUErrorCount(int cpuNum)
445{
446 std::string propertyName = "ErrorCountCPU" + std::to_string(cpuNum + 1);
447
448 // Get the current count
449 conn->async_method_call(
450 [propertyName](boost::system::error_code ec,
451 const std::variant<uint8_t>& property) {
452 if (ec)
453 {
454 std::cerr << "Failed to read " << propertyName << ": "
455 << ec.message() << "\n";
456 return;
457 }
458 const uint8_t* errorCountVariant = std::get_if<uint8_t>(&property);
459 if (errorCountVariant == nullptr)
460 {
461 std::cerr << propertyName << " invalid\n";
462 return;
463 }
464 uint8_t errorCount = *errorCountVariant;
465 if (errorCount == std::numeric_limits<uint8_t>::max())
466 {
467 std::cerr << "Maximum error count reached\n";
468 return;
469 }
470 // Increment the count
471 errorCount++;
472 conn->async_method_call(
473 [propertyName](boost::system::error_code ec) {
474 if (ec)
475 {
476 std::cerr << "Failed to set " << propertyName << ": "
477 << ec.message() << "\n";
478 }
479 },
480 "xyz.openbmc_project.Settings",
481 "/xyz/openbmc_project/control/processor_error_config",
482 "org.freedesktop.DBus.Properties", "Set",
483 "xyz.openbmc_project.Control.Processor.ErrConfig", propertyName,
484 std::variant<uint8_t>{errorCount});
485 },
486 "xyz.openbmc_project.Settings",
487 "/xyz/openbmc_project/control/processor_error_config",
488 "org.freedesktop.DBus.Properties", "Get",
489 "xyz.openbmc_project.Control.Processor.ErrConfig", propertyName);
490}
491
Jason M. Billsa3397932019-08-06 11:07:21 -0700492static bool checkIERRCPUs()
493{
494 bool cpuIERRFound = false;
495 for (int cpu = 0, addr = MIN_CLIENT_ADDR; addr <= MAX_CLIENT_ADDR;
496 cpu++, addr++)
497 {
498 uint8_t cc = 0;
499 CPUModel model{};
500 uint8_t stepping = 0;
501 if (peci_GetCPUID(addr, &model, &stepping, &cc) != PECI_CC_SUCCESS)
502 {
503 std::cerr << "Cannot get CPUID!\n";
504 continue;
505 }
506
507 switch (model)
508 {
509 case skx:
510 {
511 // First check the MCA_ERR_SRC_LOG to see if this is the CPU
512 // that caused the IERR
513 uint32_t mcaErrSrcLog = 0;
514 if (peci_RdPkgConfig(addr, 0, 5, 4, (uint8_t*)&mcaErrSrcLog,
515 &cc) != PECI_CC_SUCCESS)
516 {
517 continue;
518 }
519 // Check MSMI_INTERNAL (20) and IERR_INTERNAL (27)
520 if ((mcaErrSrcLog & (1 << 20)) || (mcaErrSrcLog & (1 << 27)))
521 {
522 // TODO: Light the CPU fault LED?
523 cpuIERRFound = true;
Jason M. Billsd1a19f62019-08-06 11:52:58 -0700524 incrementCPUErrorCount(cpu);
Jason M. Billsa3397932019-08-06 11:07:21 -0700525 // Next check if it's a CPU/VR mismatch by reading the
526 // IA32_MC4_STATUS MSR (0x411)
527 uint64_t mc4Status = 0;
528 if (peci_RdIAMSR(addr, 0, 0x411, &mc4Status, &cc) !=
529 PECI_CC_SUCCESS)
530 {
531 continue;
532 }
533 // Check MSEC bits 31:24 for
534 // MCA_SVID_VCCIN_VR_ICC_MAX_FAILURE (0x40),
535 // MCA_SVID_VCCIN_VR_VOUT_FAILURE (0x42), or
536 // MCA_SVID_CPU_VR_CAPABILITY_ERROR (0x43)
537 if ((mc4Status & (0x40 << 24)) ||
538 (mc4Status & (0x42 << 24)) ||
539 (mc4Status & (0x43 << 24)))
540 {
541 cpuIERRLog(cpu, "CPU/VR Mismatch");
542 continue;
543 }
544
545 // Next check if it's a Core FIVR fault by looking for a
546 // non-zero value of CORE_FIVR_ERR_LOG (B(1) D30 F2 offset
547 // 80h)
548 uint32_t coreFIVRErrLog = 0;
549 if (peci_RdPCIConfigLocal(
550 addr, 1, 30, 2, 0x80, sizeof(uint32_t),
551 (uint8_t*)&coreFIVRErrLog, &cc) != PECI_CC_SUCCESS)
552 {
553 continue;
554 }
555 if (coreFIVRErrLog)
556 {
557 cpuIERRLog(cpu, "Core FIVR Fault");
558 continue;
559 }
560
561 // Next check if it's an Uncore FIVR fault by looking for a
562 // non-zero value of UNCORE_FIVR_ERR_LOG (B(1) D30 F2 offset
563 // 84h)
564 uint32_t uncoreFIVRErrLog = 0;
565 if (peci_RdPCIConfigLocal(addr, 1, 30, 2, 0x84,
566 sizeof(uint32_t),
567 (uint8_t*)&uncoreFIVRErrLog,
568 &cc) != PECI_CC_SUCCESS)
569 {
570 continue;
571 }
572 if (uncoreFIVRErrLog)
573 {
574 cpuIERRLog(cpu, "Uncore FIVR Fault");
575 continue;
576 }
577
578 // Last if CORE_FIVR_ERR_LOG and UNCORE_FIVR_ERR_LOG are
579 // both zero, but MSEC bits 31:24 have either
580 // MCA_FIVR_CATAS_OVERVOL_FAULT (0x51) or
581 // MCA_FIVR_CATAS_OVERCUR_FAULT (0x52), then log it as an
582 // uncore FIVR fault
583 if (!coreFIVRErrLog && !uncoreFIVRErrLog &&
584 ((mc4Status & (0x51 << 24)) ||
585 (mc4Status & (0x52 << 24))))
586 {
587 cpuIERRLog(cpu, "Uncore FIVR Fault");
588 continue;
589 }
590 cpuIERRLog(cpu);
591 }
592 break;
593 }
594 case icx:
595 {
596 // First check the MCA_ERR_SRC_LOG to see if this is the CPU
597 // that caused the IERR
598 uint32_t mcaErrSrcLog = 0;
599 if (peci_RdPkgConfig(addr, 0, 5, 4, (uint8_t*)&mcaErrSrcLog,
600 &cc) != PECI_CC_SUCCESS)
601 {
602 continue;
603 }
604 // Check MSMI_INTERNAL (20) and IERR_INTERNAL (27)
605 if ((mcaErrSrcLog & (1 << 20)) || (mcaErrSrcLog & (1 << 27)))
606 {
607 // TODO: Light the CPU fault LED?
608 cpuIERRFound = true;
Jason M. Billsd1a19f62019-08-06 11:52:58 -0700609 incrementCPUErrorCount(cpu);
Jason M. Billsa3397932019-08-06 11:07:21 -0700610 // Next check if it's a CPU/VR mismatch by reading the
611 // IA32_MC4_STATUS MSR (0x411)
612 uint64_t mc4Status = 0;
613 if (peci_RdIAMSR(addr, 0, 0x411, &mc4Status, &cc) !=
614 PECI_CC_SUCCESS)
615 {
616 continue;
617 }
618 // TODO: Update MSEC/MSCOD_31_24 check
619 // Check MSEC bits 31:24 for
620 // MCA_SVID_VCCIN_VR_ICC_MAX_FAILURE (0x40),
621 // MCA_SVID_VCCIN_VR_VOUT_FAILURE (0x42), or
622 // MCA_SVID_CPU_VR_CAPABILITY_ERROR (0x43)
623 if ((mc4Status & (0x40 << 24)) ||
624 (mc4Status & (0x42 << 24)) ||
625 (mc4Status & (0x43 << 24)))
626 {
627 cpuIERRLog(cpu, "CPU/VR Mismatch");
628 continue;
629 }
630
631 // Next check if it's a Core FIVR fault by looking for a
632 // non-zero value of CORE_FIVR_ERR_LOG (B(31) D30 F2 offsets
633 // C0h and C4h) (Note: Bus 31 is accessed on PECI as bus 14)
634 uint32_t coreFIVRErrLog0 = 0;
635 uint32_t coreFIVRErrLog1 = 0;
636 if (peci_RdEndPointConfigPciLocal(
637 addr, 0, 14, 30, 2, 0xC0, sizeof(uint32_t),
638 (uint8_t*)&coreFIVRErrLog0, &cc) != PECI_CC_SUCCESS)
639 {
640 continue;
641 }
642 if (peci_RdEndPointConfigPciLocal(
643 addr, 0, 14, 30, 2, 0xC4, sizeof(uint32_t),
644 (uint8_t*)&coreFIVRErrLog1, &cc) != PECI_CC_SUCCESS)
645 {
646 continue;
647 }
648 if (coreFIVRErrLog0 || coreFIVRErrLog1)
649 {
650 cpuIERRLog(cpu, "Core FIVR Fault");
651 continue;
652 }
653
654 // Next check if it's an Uncore FIVR fault by looking for a
655 // non-zero value of UNCORE_FIVR_ERR_LOG (B(31) D30 F2
656 // offset 84h) (Note: Bus 31 is accessed on PECI as bus 14)
657 uint32_t uncoreFIVRErrLog = 0;
658 if (peci_RdEndPointConfigPciLocal(
659 addr, 0, 14, 30, 2, 0x84, sizeof(uint32_t),
660 (uint8_t*)&uncoreFIVRErrLog,
661 &cc) != PECI_CC_SUCCESS)
662 {
663 continue;
664 }
665 if (uncoreFIVRErrLog)
666 {
667 cpuIERRLog(cpu, "Uncore FIVR Fault");
668 continue;
669 }
670
671 // TODO: Update MSEC/MSCOD_31_24 check
672 // Last if CORE_FIVR_ERR_LOG and UNCORE_FIVR_ERR_LOG are
673 // both zero, but MSEC bits 31:24 have either
674 // MCA_FIVR_CATAS_OVERVOL_FAULT (0x51) or
675 // MCA_FIVR_CATAS_OVERCUR_FAULT (0x52), then log it as an
676 // uncore FIVR fault
677 if (!coreFIVRErrLog0 && !coreFIVRErrLog1 &&
678 !uncoreFIVRErrLog &&
679 ((mc4Status & (0x51 << 24)) ||
680 (mc4Status & (0x52 << 24))))
681 {
682 cpuIERRLog(cpu, "Uncore FIVR Fault");
683 continue;
684 }
685 cpuIERRLog(cpu);
686 }
687 break;
688 }
689 }
690 }
691 return cpuIERRFound;
692}
693
Jason M. Billsa15c2522019-08-16 10:01:44 -0700694static void caterrAssertHandler()
695{
Jason M. Billsa15c2522019-08-16 10:01:44 -0700696 caterrAssertTimer.expires_after(std::chrono::milliseconds(caterrTimeoutMs));
697 caterrAssertTimer.async_wait([](const boost::system::error_code ec) {
698 if (ec)
699 {
700 // operation_aborted is expected if timer is canceled
701 // before completion.
702 if (ec != boost::asio::error::operation_aborted)
703 {
704 std::cerr << "caterr timeout async_wait failed: "
705 << ec.message() << "\n";
706 }
Jason M. Billsa15c2522019-08-16 10:01:44 -0700707 return;
708 }
Jason M. Billsa3397932019-08-06 11:07:21 -0700709 std::cerr << "CATERR asserted for " << std::to_string(caterrTimeoutMs)
710 << " ms\n";
Yong Li8c798c72020-04-22 15:29:07 +0800711 beep(beepCPUIERR);
Jason M. Billsa3397932019-08-06 11:07:21 -0700712 if (!checkIERRCPUs())
713 {
714 cpuIERRLog();
715 }
Jason M. Billsa15c2522019-08-16 10:01:44 -0700716 conn->async_method_call(
717 [](boost::system::error_code ec,
718 const std::variant<bool>& property) {
719 if (ec)
720 {
721 return;
722 }
723 const bool* reset = std::get_if<bool>(&property);
724 if (reset == nullptr)
725 {
726 std::cerr << "Unable to read reset on CATERR value\n";
727 return;
728 }
Jason M. Billsb61766b2019-11-26 17:02:44 -0800729 startCrashdumpAndRecovery(*reset, "IERR");
Jason M. Billsa15c2522019-08-16 10:01:44 -0700730 },
731 "xyz.openbmc_project.Settings",
732 "/xyz/openbmc_project/control/processor_error_config",
733 "org.freedesktop.DBus.Properties", "Get",
734 "xyz.openbmc_project.Control.Processor.ErrConfig", "ResetOnCATERR");
735 });
736}
737
Jason M. Bills1490b142019-07-01 15:48:43 -0700738static void caterrHandler()
739{
740 if (!hostOff)
741 {
742 gpiod::line_event gpioLineEvent = caterrLine.event_read();
743
744 bool caterr =
745 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
Yong Li1429ca82020-04-27 16:49:45 +0800746
747 std::vector<Association> associations;
Jason M. Bills1490b142019-07-01 15:48:43 -0700748 if (caterr)
749 {
Jason M. Billsa15c2522019-08-16 10:01:44 -0700750 caterrAssertHandler();
Yong Li1429ca82020-04-27 16:49:45 +0800751 associations.emplace_back(
752 "", "critical",
753 "/xyz/openbmc_project/host_error_monitor/cat_error");
754 associations.emplace_back("", "critical",
755 host_error_monitor::rootPath);
Jason M. Bills1490b142019-07-01 15:48:43 -0700756 }
757 else
758 {
759 caterrAssertTimer.cancel();
Yong Li1429ca82020-04-27 16:49:45 +0800760 associations.emplace_back("", "", "");
Jason M. Bills1490b142019-07-01 15:48:43 -0700761 }
Yong Li1429ca82020-04-27 16:49:45 +0800762 host_error_monitor::associationCATAssert->set_property("Associations",
763 associations);
Jason M. Bills1490b142019-07-01 15:48:43 -0700764 }
765 caterrEvent.async_wait(boost::asio::posix::stream_descriptor::wait_read,
766 [](const boost::system::error_code ec) {
767 if (ec)
768 {
769 std::cerr << "caterr handler error: "
770 << ec.message() << "\n";
771 return;
772 }
773 caterrHandler();
774 });
775}
Jason M. Bills78c5eed2019-08-28 14:00:40 -0700776
Jason M. Billse94f5e12019-09-13 11:11:34 -0700777static void cpu1ThermtripAssertHandler()
778{
Jason M. Bills45e87e02019-09-09 14:45:38 -0700779 if (cpu1FIVRFaultLine.get_value() == 0)
780 {
781 cpuBootFIVRFaultLog(1);
782 }
783 else
784 {
785 cpuThermTripLog(1);
786 }
Jason M. Billse94f5e12019-09-13 11:11:34 -0700787}
788
Jason M. Bills78c5eed2019-08-28 14:00:40 -0700789static void cpu1ThermtripHandler()
790{
Jason M. Bills84951142020-04-17 15:57:11 -0700791 gpiod::line_event gpioLineEvent = cpu1ThermtripLine.event_read();
Jason M. Bills78c5eed2019-08-28 14:00:40 -0700792
Jason M. Bills84951142020-04-17 15:57:11 -0700793 bool cpu1Thermtrip =
794 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
795 if (cpu1Thermtrip)
796 {
797 cpu1ThermtripAssertHandler();
Jason M. Bills78c5eed2019-08-28 14:00:40 -0700798 }
Jason M. Bills84951142020-04-17 15:57:11 -0700799
Jason M. Bills78c5eed2019-08-28 14:00:40 -0700800 cpu1ThermtripEvent.async_wait(
801 boost::asio::posix::stream_descriptor::wait_read,
802 [](const boost::system::error_code ec) {
803 if (ec)
804 {
805 std::cerr << "CPU 1 Thermtrip handler error: " << ec.message()
806 << "\n";
807 return;
808 }
809 cpu1ThermtripHandler();
810 });
811}
812
jayaprakash Mutyala009adbc2019-12-24 22:08:07 +0000813static void cpu1MemtripHandler()
814{
Jason M. Bills5287c022020-05-19 11:16:09 -0700815 gpiod::line_event gpioLineEvent = cpu1MemtripLine.event_read();
jayaprakash Mutyala009adbc2019-12-24 22:08:07 +0000816
Jason M. Bills5287c022020-05-19 11:16:09 -0700817 bool cpu1Memtrip =
818 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
819 if (cpu1Memtrip)
820 {
821 memThermTripLog(1);
jayaprakash Mutyala009adbc2019-12-24 22:08:07 +0000822 }
Jason M. Bills5287c022020-05-19 11:16:09 -0700823
jayaprakash Mutyala009adbc2019-12-24 22:08:07 +0000824 cpu1MemtripEvent.async_wait(
825 boost::asio::posix::stream_descriptor::wait_read,
826 [](const boost::system::error_code ec) {
827 if (ec)
828 {
829 std::cerr << "CPU 1 Memory Thermaltrip handler error: "
830 << ec.message() << "\n";
831 return;
832 }
833 cpu1MemtripHandler();
834 });
835}
836
Jason M. Billse94f5e12019-09-13 11:11:34 -0700837static void cpu2ThermtripAssertHandler()
838{
Jason M. Bills45e87e02019-09-09 14:45:38 -0700839 if (cpu2FIVRFaultLine.get_value() == 0)
840 {
841 cpuBootFIVRFaultLog(2);
842 }
843 else
844 {
845 cpuThermTripLog(2);
846 }
Jason M. Billse94f5e12019-09-13 11:11:34 -0700847}
848
Jason M. Bills78c5eed2019-08-28 14:00:40 -0700849static void cpu2ThermtripHandler()
850{
Jason M. Bills84951142020-04-17 15:57:11 -0700851 gpiod::line_event gpioLineEvent = cpu2ThermtripLine.event_read();
Jason M. Bills78c5eed2019-08-28 14:00:40 -0700852
Jason M. Bills84951142020-04-17 15:57:11 -0700853 bool cpu2Thermtrip =
854 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
855 if (cpu2Thermtrip)
856 {
857 cpu2ThermtripAssertHandler();
Jason M. Bills78c5eed2019-08-28 14:00:40 -0700858 }
Jason M. Bills84951142020-04-17 15:57:11 -0700859
Jason M. Bills78c5eed2019-08-28 14:00:40 -0700860 cpu2ThermtripEvent.async_wait(
861 boost::asio::posix::stream_descriptor::wait_read,
862 [](const boost::system::error_code ec) {
863 if (ec)
864 {
865 std::cerr << "CPU 2 Thermtrip handler error: " << ec.message()
866 << "\n";
867 return;
868 }
869 cpu2ThermtripHandler();
870 });
871}
872
jayaprakash Mutyala009adbc2019-12-24 22:08:07 +0000873static void cpu2MemtripHandler()
874{
Jason M. Bills5287c022020-05-19 11:16:09 -0700875 gpiod::line_event gpioLineEvent = cpu2MemtripLine.event_read();
jayaprakash Mutyala009adbc2019-12-24 22:08:07 +0000876
Jason M. Bills5287c022020-05-19 11:16:09 -0700877 bool cpu2Memtrip =
878 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
879 if (cpu2Memtrip)
880 {
881 memThermTripLog(2);
jayaprakash Mutyala009adbc2019-12-24 22:08:07 +0000882 }
Jason M. Bills5287c022020-05-19 11:16:09 -0700883
jayaprakash Mutyala009adbc2019-12-24 22:08:07 +0000884 cpu2MemtripEvent.async_wait(
885 boost::asio::posix::stream_descriptor::wait_read,
886 [](const boost::system::error_code ec) {
887 if (ec)
888 {
889 std::cerr << "CPU 2 Memory Thermaltrip handler error: "
890 << ec.message() << "\n";
891 return;
892 }
893 cpu2MemtripHandler();
894 });
895}
896
Jason M. Billse94f5e12019-09-13 11:11:34 -0700897static void cpu1VRHotAssertHandler()
898{
899 cpuVRHotLog("CPU 1");
900}
901
Jason M. Bills250fa632019-08-28 15:58:25 -0700902static void cpu1VRHotHandler()
903{
Jason M. Bills84951142020-04-17 15:57:11 -0700904 gpiod::line_event gpioLineEvent = cpu1VRHotLine.event_read();
Jason M. Bills250fa632019-08-28 15:58:25 -0700905
Jason M. Bills84951142020-04-17 15:57:11 -0700906 bool cpu1VRHot =
907 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
908 if (cpu1VRHot)
909 {
910 cpu1VRHotAssertHandler();
Jason M. Bills250fa632019-08-28 15:58:25 -0700911 }
Jason M. Bills84951142020-04-17 15:57:11 -0700912
Jason M. Bills250fa632019-08-28 15:58:25 -0700913 cpu1VRHotEvent.async_wait(boost::asio::posix::stream_descriptor::wait_read,
914 [](const boost::system::error_code ec) {
915 if (ec)
916 {
917 std::cerr << "CPU 1 VRHot handler error: "
918 << ec.message() << "\n";
919 return;
920 }
921 cpu1VRHotHandler();
922 });
923}
924
Jason M. Billse94f5e12019-09-13 11:11:34 -0700925static void cpu1MemABCDVRHotAssertHandler()
926{
927 cpuVRHotLog("CPU 1 Memory ABCD");
928}
929
Jason M. Bills9647ba72019-08-29 14:19:19 -0700930static void cpu1MemABCDVRHotHandler()
931{
Jason M. Bills84951142020-04-17 15:57:11 -0700932 gpiod::line_event gpioLineEvent = cpu1MemABCDVRHotLine.event_read();
Jason M. Bills9647ba72019-08-29 14:19:19 -0700933
Jason M. Bills84951142020-04-17 15:57:11 -0700934 bool cpu1MemABCDVRHot =
935 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
936 if (cpu1MemABCDVRHot)
937 {
938 cpu1MemABCDVRHotAssertHandler();
Jason M. Bills9647ba72019-08-29 14:19:19 -0700939 }
Jason M. Bills84951142020-04-17 15:57:11 -0700940
Jason M. Bills9647ba72019-08-29 14:19:19 -0700941 cpu1MemABCDVRHotEvent.async_wait(
942 boost::asio::posix::stream_descriptor::wait_read,
943 [](const boost::system::error_code ec) {
944 if (ec)
945 {
946 std::cerr << "CPU 1 Memory ABCD VRHot handler error: "
947 << ec.message() << "\n";
948 return;
949 }
950 cpu1MemABCDVRHotHandler();
951 });
952}
953
Jason M. Billse94f5e12019-09-13 11:11:34 -0700954static void cpu1MemEFGHVRHotAssertHandler()
955{
956 cpuVRHotLog("CPU 1 Memory EFGH");
957}
958
Jason M. Bills9647ba72019-08-29 14:19:19 -0700959static void cpu1MemEFGHVRHotHandler()
960{
Jason M. Bills84951142020-04-17 15:57:11 -0700961 gpiod::line_event gpioLineEvent = cpu1MemEFGHVRHotLine.event_read();
Jason M. Bills9647ba72019-08-29 14:19:19 -0700962
Jason M. Bills84951142020-04-17 15:57:11 -0700963 bool cpu1MemEFGHVRHot =
964 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
965 if (cpu1MemEFGHVRHot)
966 {
967 cpu1MemEFGHVRHotAssertHandler();
Jason M. Bills9647ba72019-08-29 14:19:19 -0700968 }
Jason M. Bills84951142020-04-17 15:57:11 -0700969
Jason M. Bills9647ba72019-08-29 14:19:19 -0700970 cpu1MemEFGHVRHotEvent.async_wait(
971 boost::asio::posix::stream_descriptor::wait_read,
972 [](const boost::system::error_code ec) {
973 if (ec)
974 {
975 std::cerr << "CPU 1 Memory EFGH VRHot handler error: "
976 << ec.message() << "\n";
977 return;
978 }
979 cpu1MemEFGHVRHotHandler();
980 });
981}
982
Jason M. Billse94f5e12019-09-13 11:11:34 -0700983static void cpu2VRHotAssertHandler()
984{
985 cpuVRHotLog("CPU 2");
986}
987
Jason M. Bills250fa632019-08-28 15:58:25 -0700988static void cpu2VRHotHandler()
989{
Jason M. Bills84951142020-04-17 15:57:11 -0700990 gpiod::line_event gpioLineEvent = cpu2VRHotLine.event_read();
Jason M. Bills250fa632019-08-28 15:58:25 -0700991
Jason M. Bills84951142020-04-17 15:57:11 -0700992 bool cpu2VRHot =
993 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
994 if (cpu2VRHot)
995 {
996 cpu2VRHotAssertHandler();
Jason M. Bills250fa632019-08-28 15:58:25 -0700997 }
Jason M. Bills84951142020-04-17 15:57:11 -0700998
Jason M. Bills250fa632019-08-28 15:58:25 -0700999 cpu2VRHotEvent.async_wait(boost::asio::posix::stream_descriptor::wait_read,
1000 [](const boost::system::error_code ec) {
1001 if (ec)
1002 {
1003 std::cerr << "CPU 2 VRHot handler error: "
1004 << ec.message() << "\n";
1005 return;
1006 }
1007 cpu2VRHotHandler();
1008 });
1009}
1010
Jason M. Billse94f5e12019-09-13 11:11:34 -07001011static void cpu2MemABCDVRHotAssertHandler()
1012{
1013 cpuVRHotLog("CPU 2 Memory ABCD");
1014}
1015
Jason M. Bills9647ba72019-08-29 14:19:19 -07001016static void cpu2MemABCDVRHotHandler()
1017{
Jason M. Bills84951142020-04-17 15:57:11 -07001018 gpiod::line_event gpioLineEvent = cpu2MemABCDVRHotLine.event_read();
Jason M. Bills9647ba72019-08-29 14:19:19 -07001019
Jason M. Bills84951142020-04-17 15:57:11 -07001020 bool cpu2MemABCDVRHot =
1021 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
1022 if (cpu2MemABCDVRHot)
1023 {
1024 cpu2MemABCDVRHotAssertHandler();
Jason M. Bills9647ba72019-08-29 14:19:19 -07001025 }
Jason M. Bills84951142020-04-17 15:57:11 -07001026
Jason M. Bills9647ba72019-08-29 14:19:19 -07001027 cpu2MemABCDVRHotEvent.async_wait(
1028 boost::asio::posix::stream_descriptor::wait_read,
1029 [](const boost::system::error_code ec) {
1030 if (ec)
1031 {
1032 std::cerr << "CPU 2 Memory ABCD VRHot handler error: "
1033 << ec.message() << "\n";
1034 return;
1035 }
1036 cpu2MemABCDVRHotHandler();
1037 });
1038}
1039
Jason M. Billse94f5e12019-09-13 11:11:34 -07001040static void cpu2MemEFGHVRHotAssertHandler()
1041{
1042 cpuVRHotLog("CPU 2 Memory EFGH");
1043}
1044
Jason M. Bills9647ba72019-08-29 14:19:19 -07001045static void cpu2MemEFGHVRHotHandler()
1046{
Jason M. Bills84951142020-04-17 15:57:11 -07001047 gpiod::line_event gpioLineEvent = cpu2MemEFGHVRHotLine.event_read();
Jason M. Bills9647ba72019-08-29 14:19:19 -07001048
Jason M. Bills84951142020-04-17 15:57:11 -07001049 bool cpu2MemEFGHVRHot =
1050 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
1051 if (cpu2MemEFGHVRHot)
1052 {
1053 cpu2MemEFGHVRHotAssertHandler();
Jason M. Bills9647ba72019-08-29 14:19:19 -07001054 }
Jason M. Bills84951142020-04-17 15:57:11 -07001055
Jason M. Bills9647ba72019-08-29 14:19:19 -07001056 cpu2MemEFGHVRHotEvent.async_wait(
1057 boost::asio::posix::stream_descriptor::wait_read,
1058 [](const boost::system::error_code ec) {
1059 if (ec)
1060 {
1061 std::cerr << "CPU 2 Memory EFGH VRHot handler error: "
1062 << ec.message() << "\n";
1063 return;
1064 }
1065 cpu2MemEFGHVRHotHandler();
1066 });
1067}
1068
Chen Yugange6c0f1c2019-08-02 20:36:42 +08001069static void pchThermtripHandler()
1070{
Yong Li1429ca82020-04-27 16:49:45 +08001071 std::vector<Association> associations;
1072
Jason M. Bills84951142020-04-17 15:57:11 -07001073 gpiod::line_event gpioLineEvent = pchThermtripLine.event_read();
Chen Yugange6c0f1c2019-08-02 20:36:42 +08001074
Jason M. Bills84951142020-04-17 15:57:11 -07001075 bool pchThermtrip =
1076 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
1077 if (pchThermtrip)
1078 {
1079 ssbThermTripLog();
Yong Li1429ca82020-04-27 16:49:45 +08001080 associations.emplace_back(
1081 "", "critical",
1082 "/xyz/openbmc_project/host_error_monitor/ssb_thermal_trip");
1083 associations.emplace_back("", "critical", host_error_monitor::rootPath);
Chen Yugange6c0f1c2019-08-02 20:36:42 +08001084 }
Yong Li1429ca82020-04-27 16:49:45 +08001085 else
1086 {
1087 associations.emplace_back("", "", "");
1088 }
1089 host_error_monitor::associationSSBThermTrip->set_property("Associations",
1090 associations);
Jason M. Bills84951142020-04-17 15:57:11 -07001091
Chen Yugange6c0f1c2019-08-02 20:36:42 +08001092 pchThermtripEvent.async_wait(
1093 boost::asio::posix::stream_descriptor::wait_read,
1094 [](const boost::system::error_code ec) {
1095 if (ec)
1096 {
1097 std::cerr << "PCH Thermal trip handler error: " << ec.message()
1098 << "\n";
1099 return;
1100 }
1101 pchThermtripHandler();
1102 });
1103}
1104
Jason M. Billscbf78532019-08-16 15:32:11 -07001105static std::bitset<MAX_CPUS> checkERRPinCPUs(const int errPin)
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001106{
Jason M. Billscbf78532019-08-16 15:32:11 -07001107 int errPinSts = (1 << errPin);
1108 std::bitset<MAX_CPUS> errPinCPUs = 0;
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001109 for (int cpu = 0, addr = MIN_CLIENT_ADDR; addr <= MAX_CLIENT_ADDR;
1110 cpu++, addr++)
1111 {
1112 if (peci_Ping(addr) == PECI_CC_SUCCESS)
1113 {
1114 uint8_t cc = 0;
1115 CPUModel model{};
1116 uint8_t stepping = 0;
1117 if (peci_GetCPUID(addr, &model, &stepping, &cc) != PECI_CC_SUCCESS)
1118 {
1119 std::cerr << "Cannot get CPUID!\n";
1120 continue;
1121 }
1122
1123 switch (model)
1124 {
1125 case skx:
1126 {
1127 // Check the ERRPINSTS to see if this is the CPU that caused
Jason M. Billscbf78532019-08-16 15:32:11 -07001128 // the ERRx (B(0) D8 F0 offset 210h)
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001129 uint32_t errpinsts = 0;
1130 if (peci_RdPCIConfigLocal(
1131 addr, 0, 8, 0, 0x210, sizeof(uint32_t),
1132 (uint8_t*)&errpinsts, &cc) == PECI_CC_SUCCESS)
1133 {
Jason M. Billscbf78532019-08-16 15:32:11 -07001134 errPinCPUs[cpu] = (errpinsts & errPinSts) != 0;
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001135 }
1136 break;
1137 }
1138 case icx:
1139 {
1140 // Check the ERRPINSTS to see if this is the CPU that caused
Jason M. Billscbf78532019-08-16 15:32:11 -07001141 // the ERRx (B(30) D0 F3 offset 274h) (Note: Bus 30 is
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001142 // accessed on PECI as bus 13)
1143 uint32_t errpinsts = 0;
1144 if (peci_RdEndPointConfigPciLocal(
1145 addr, 0, 13, 0, 3, 0x274, sizeof(uint32_t),
1146 (uint8_t*)&errpinsts, &cc) == PECI_CC_SUCCESS)
1147 {
Jason M. Billscbf78532019-08-16 15:32:11 -07001148 errPinCPUs[cpu] = (errpinsts & errPinSts) != 0;
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001149 }
1150 break;
1151 }
1152 }
1153 }
1154 }
Jason M. Billscbf78532019-08-16 15:32:11 -07001155 return errPinCPUs;
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001156}
1157
Jason M. Billscbf78532019-08-16 15:32:11 -07001158static void errXAssertHandler(const int errPin,
1159 boost::asio::steady_timer& errXAssertTimer)
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001160{
Jason M. Billscbf78532019-08-16 15:32:11 -07001161 // ERRx status is not guaranteed through the timeout, so save which
1162 // CPUs have it asserted
1163 std::bitset<MAX_CPUS> errPinCPUs = checkERRPinCPUs(errPin);
1164 errXAssertTimer.expires_after(std::chrono::milliseconds(errTimeoutMs));
1165 errXAssertTimer.async_wait([errPin, errPinCPUs](
1166 const boost::system::error_code ec) {
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001167 if (ec)
1168 {
1169 // operation_aborted is expected if timer is canceled before
1170 // completion.
1171 if (ec != boost::asio::error::operation_aborted)
1172 {
1173 std::cerr << "err2 timeout async_wait failed: " << ec.message()
1174 << "\n";
1175 }
1176 return;
1177 }
Jason M. Billscbf78532019-08-16 15:32:11 -07001178 std::cerr << "ERR" << std::to_string(errPin) << " asserted for "
1179 << std::to_string(errTimeoutMs) << " ms\n";
1180 if (errPinCPUs.count())
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001181 {
Jason M. Billscbf78532019-08-16 15:32:11 -07001182 for (int i = 0; i < errPinCPUs.size(); i++)
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001183 {
Jason M. Billscbf78532019-08-16 15:32:11 -07001184 if (errPinCPUs[i])
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001185 {
Jason M. Billscbf78532019-08-16 15:32:11 -07001186 cpuERRXLog(errPin, i);
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001187 }
1188 }
1189 }
1190 else
1191 {
Jason M. Billscbf78532019-08-16 15:32:11 -07001192 cpuERRXLog(errPin);
1193 }
1194 });
1195}
1196
Jason M. Bills8c584392019-08-19 11:05:51 -07001197static void err0AssertHandler()
1198{
1199 // Handle the standard ERR0 detection and logging
1200 const static constexpr int err0 = 0;
1201 errXAssertHandler(err0, err0AssertTimer);
1202}
1203
1204static void err0Handler()
1205{
1206 if (!hostOff)
1207 {
1208 gpiod::line_event gpioLineEvent = err0Line.event_read();
1209
1210 bool err0 = gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
1211 if (err0)
1212 {
1213 err0AssertHandler();
1214 }
1215 else
1216 {
1217 err0AssertTimer.cancel();
1218 }
1219 }
1220 err0Event.async_wait(boost::asio::posix::stream_descriptor::wait_read,
1221 [](const boost::system::error_code ec) {
1222 if (ec)
1223 {
1224 std::cerr
1225 << "err0 handler error: " << ec.message()
1226 << "\n";
1227 return;
1228 }
1229 err0Handler();
1230 });
1231}
1232
Jason M. Bills75af3962019-08-19 11:07:17 -07001233static void err1AssertHandler()
1234{
1235 // Handle the standard ERR1 detection and logging
1236 const static constexpr int err1 = 1;
1237 errXAssertHandler(err1, err1AssertTimer);
1238}
1239
1240static void err1Handler()
1241{
1242 if (!hostOff)
1243 {
1244 gpiod::line_event gpioLineEvent = err1Line.event_read();
1245
1246 bool err1 = gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
1247 if (err1)
1248 {
1249 err1AssertHandler();
1250 }
1251 else
1252 {
1253 err1AssertTimer.cancel();
1254 }
1255 }
1256 err1Event.async_wait(boost::asio::posix::stream_descriptor::wait_read,
1257 [](const boost::system::error_code ec) {
1258 if (ec)
1259 {
1260 std::cerr
1261 << "err1 handler error: " << ec.message()
1262 << "\n";
1263 return;
1264 }
1265 err1Handler();
1266 });
1267}
1268
Jason M. Billscbf78532019-08-16 15:32:11 -07001269static void err2AssertHandler()
1270{
1271 // Handle the standard ERR2 detection and logging
1272 const static constexpr int err2 = 2;
1273 errXAssertHandler(err2, err2AssertTimer);
1274 // Also handle reset for ERR2
1275 err2AssertTimer.async_wait([](const boost::system::error_code ec) {
1276 if (ec)
1277 {
1278 // operation_aborted is expected if timer is canceled before
1279 // completion.
1280 if (ec != boost::asio::error::operation_aborted)
1281 {
1282 std::cerr << "err2 timeout async_wait failed: " << ec.message()
1283 << "\n";
1284 }
1285 return;
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001286 }
1287 conn->async_method_call(
1288 [](boost::system::error_code ec,
1289 const std::variant<bool>& property) {
1290 if (ec)
1291 {
1292 return;
1293 }
1294 const bool* reset = std::get_if<bool>(&property);
1295 if (reset == nullptr)
1296 {
1297 std::cerr << "Unable to read reset on ERR2 value\n";
1298 return;
1299 }
Jason M. Billsb61766b2019-11-26 17:02:44 -08001300 startCrashdumpAndRecovery(*reset, "ERR2 Timeout");
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001301 },
1302 "xyz.openbmc_project.Settings",
1303 "/xyz/openbmc_project/control/processor_error_config",
1304 "org.freedesktop.DBus.Properties", "Get",
1305 "xyz.openbmc_project.Control.Processor.ErrConfig", "ResetOnERR2");
Yong Li061eb032020-02-26 15:06:18 +08001306
1307 beep(beepCPUErr2);
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001308 });
1309}
1310
1311static void err2Handler()
1312{
1313 if (!hostOff)
1314 {
1315 gpiod::line_event gpioLineEvent = err2Line.event_read();
1316
1317 bool err2 = gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
1318 if (err2)
1319 {
1320 err2AssertHandler();
1321 }
1322 else
1323 {
1324 err2AssertTimer.cancel();
1325 }
1326 }
1327 err2Event.async_wait(boost::asio::posix::stream_descriptor::wait_read,
1328 [](const boost::system::error_code ec) {
1329 if (ec)
1330 {
1331 std::cerr
1332 << "err2 handler error: " << ec.message()
1333 << "\n";
1334 return;
1335 }
1336 err2Handler();
1337 });
1338}
1339
Jason M. Bills89922f82019-08-06 11:10:02 -07001340static void smiAssertHandler()
1341{
1342 smiAssertTimer.expires_after(std::chrono::milliseconds(smiTimeoutMs));
1343 smiAssertTimer.async_wait([](const boost::system::error_code ec) {
1344 if (ec)
1345 {
1346 // operation_aborted is expected if timer is canceled before
1347 // completion.
1348 if (ec != boost::asio::error::operation_aborted)
1349 {
1350 std::cerr << "smi timeout async_wait failed: " << ec.message()
1351 << "\n";
1352 }
1353 return;
1354 }
1355 std::cerr << "SMI asserted for " << std::to_string(smiTimeoutMs)
1356 << " ms\n";
1357 smiTimeoutLog();
1358 conn->async_method_call(
1359 [](boost::system::error_code ec,
1360 const std::variant<bool>& property) {
1361 if (ec)
1362 {
1363 return;
1364 }
1365 const bool* reset = std::get_if<bool>(&property);
1366 if (reset == nullptr)
1367 {
1368 std::cerr << "Unable to read reset on SMI value\n";
1369 return;
1370 }
Jason M. Bills94785442020-01-07 15:22:09 -08001371#ifdef HOST_ERROR_CRASHDUMP_ON_SMI_TIMEOUT
Jason M. Billsb61766b2019-11-26 17:02:44 -08001372 startCrashdumpAndRecovery(*reset, "SMI Timeout");
Jason M. Bills94785442020-01-07 15:22:09 -08001373#else
1374 if (*reset)
1375 {
1376 std::cout << "Recovering the system\n";
1377 startPowerCycle();
1378 }
1379#endif
Jason M. Bills89922f82019-08-06 11:10:02 -07001380 },
1381 "xyz.openbmc_project.Settings",
1382 "/xyz/openbmc_project/control/bmc_reset_disables",
1383 "org.freedesktop.DBus.Properties", "Get",
1384 "xyz.openbmc_project.Control.ResetDisables", "ResetOnSMI");
1385 });
1386}
1387
1388static void smiHandler()
1389{
1390 if (!hostOff)
1391 {
1392 gpiod::line_event gpioLineEvent = smiLine.event_read();
1393
1394 bool smi = gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
1395 if (smi)
1396 {
1397 smiAssertHandler();
1398 }
1399 else
1400 {
1401 smiAssertTimer.cancel();
1402 }
1403 }
1404 smiEvent.async_wait(boost::asio::posix::stream_descriptor::wait_read,
1405 [](const boost::system::error_code ec) {
1406 if (ec)
1407 {
1408 std::cerr
1409 << "smi handler error: " << ec.message()
1410 << "\n";
1411 return;
1412 }
1413 smiHandler();
1414 });
1415}
1416
Jason M. Billsa15c2522019-08-16 10:01:44 -07001417static void initializeErrorState()
1418{
jayaprakash Mutyala53099c42020-03-15 00:16:26 +00001419 // Handle CPU1_MISMATCH if it's asserted now
1420 if (cpu1MismatchLine.get_value() == 1)
1421 {
1422 cpuMismatchLog(1);
1423 }
1424
1425 // Handle CPU2_MISMATCH if it's asserted now
1426 if (cpu2MismatchLine.get_value() == 1)
1427 {
1428 cpuMismatchLog(2);
1429 }
1430
Jason M. Billsa15c2522019-08-16 10:01:44 -07001431 // Handle CPU_CATERR if it's asserted now
1432 if (caterrLine.get_value() == 0)
1433 {
1434 caterrAssertHandler();
Yong Li1429ca82020-04-27 16:49:45 +08001435 std::vector<Association> associations;
1436 associations.emplace_back(
1437 "", "critical", "/xyz/openbmc_project/host_error_monitor/cat_err");
1438 associations.emplace_back("", "critical", host_error_monitor::rootPath);
1439 host_error_monitor::associationCATAssert->set_property("Associations",
1440 associations);
Jason M. Billsa15c2522019-08-16 10:01:44 -07001441 }
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001442
Jason M. Bills8c584392019-08-19 11:05:51 -07001443 // Handle CPU_ERR0 if it's asserted now
1444 if (err0Line.get_value() == 0)
1445 {
1446 err0AssertHandler();
1447 }
1448
Jason M. Bills75af3962019-08-19 11:07:17 -07001449 // Handle CPU_ERR1 if it's asserted now
1450 if (err1Line.get_value() == 0)
1451 {
1452 err1AssertHandler();
1453 }
1454
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001455 // Handle CPU_ERR2 if it's asserted now
1456 if (err2Line.get_value() == 0)
1457 {
1458 err2AssertHandler();
1459 }
Jason M. Bills89922f82019-08-06 11:10:02 -07001460
1461 // Handle SMI if it's asserted now
1462 if (smiLine.get_value() == 0)
1463 {
1464 smiAssertHandler();
1465 }
Jason M. Bills08866542019-08-16 12:04:19 -07001466
Jason M. Billse94f5e12019-09-13 11:11:34 -07001467 // Handle CPU1_THERMTRIP if it's asserted now
1468 if (cpu1ThermtripLine.get_value() == 0)
1469 {
1470 cpu1ThermtripAssertHandler();
1471 }
1472
1473 // Handle CPU2_THERMTRIP if it's asserted now
1474 if (cpu2ThermtripLine.get_value() == 0)
1475 {
1476 cpu2ThermtripAssertHandler();
1477 }
1478
jayaprakash Mutyala009adbc2019-12-24 22:08:07 +00001479 // Handle CPU1_MEM_THERM_EVENT (CPU1 DIMM Thermal trip) if it's asserted now
1480 if (cpu1MemtripLine.get_value() == 0)
1481 {
1482 memThermTripLog(1);
1483 }
1484
1485 // Handle CPU2_MEM_THERM_EVENT (CPU2 DIMM Thermal trip) if it's asserted now
1486 if (cpu2MemtripLine.get_value() == 0)
1487 {
1488 memThermTripLog(2);
1489 }
1490
Jason M. Billse94f5e12019-09-13 11:11:34 -07001491 // Handle CPU1_VRHOT if it's asserted now
1492 if (cpu1VRHotLine.get_value() == 0)
1493 {
1494 cpu1VRHotAssertHandler();
1495 }
1496
1497 // Handle CPU1_MEM_ABCD_VRHOT if it's asserted now
1498 if (cpu1MemABCDVRHotLine.get_value() == 0)
1499 {
1500 cpu1MemABCDVRHotAssertHandler();
1501 }
1502
1503 // Handle CPU1_MEM_EFGH_VRHOT if it's asserted now
1504 if (cpu1MemEFGHVRHotLine.get_value() == 0)
1505 {
1506 cpu1MemEFGHVRHotAssertHandler();
1507 }
1508
1509 // Handle CPU2_VRHOT if it's asserted now
1510 if (cpu2VRHotLine.get_value() == 0)
1511 {
1512 cpu2VRHotAssertHandler();
1513 }
1514
1515 // Handle CPU2_MEM_ABCD_VRHOT if it's asserted now
1516 if (cpu2MemABCDVRHotLine.get_value() == 0)
1517 {
1518 cpu2MemABCDVRHotAssertHandler();
1519 }
1520
1521 // Handle CPU2_MEM_EFGH_VRHOT if it's asserted now
1522 if (cpu2MemEFGHVRHotLine.get_value() == 0)
1523 {
1524 cpu2MemEFGHVRHotAssertHandler();
1525 }
1526
Jason M. Bills08866542019-08-16 12:04:19 -07001527 // Handle PCH_BMC_THERMTRIP if it's asserted now
1528 if (pchThermtripLine.get_value() == 0)
1529 {
1530 ssbThermTripLog();
Yong Li1429ca82020-04-27 16:49:45 +08001531 std::vector<Association> associations;
1532 associations.emplace_back(
1533 "", "critical",
1534 "/xyz/openbmc_project/host_error_monitor/ssb_thermal_trip");
1535 associations.emplace_back("", "critical", host_error_monitor::rootPath);
1536 host_error_monitor::associationSSBThermTrip->set_property(
1537 "Associations", associations);
Jason M. Bills08866542019-08-16 12:04:19 -07001538 }
Jason M. Billsa15c2522019-08-16 10:01:44 -07001539}
Jason M. Bills1490b142019-07-01 15:48:43 -07001540} // namespace host_error_monitor
1541
1542int main(int argc, char* argv[])
1543{
1544 // setup connection to dbus
1545 host_error_monitor::conn =
1546 std::make_shared<sdbusplus::asio::connection>(host_error_monitor::io);
1547
Jason M. Billsc4b91f22019-11-26 17:04:50 -08001548 // Host Error Monitor Service
Jason M. Bills1490b142019-07-01 15:48:43 -07001549 host_error_monitor::conn->request_name(
1550 "xyz.openbmc_project.HostErrorMonitor");
1551 sdbusplus::asio::object_server server =
1552 sdbusplus::asio::object_server(host_error_monitor::conn);
1553
Yong Li1429ca82020-04-27 16:49:45 +08001554 // Associations interface for led status
1555 std::vector<host_error_monitor::Association> associations;
1556 associations.emplace_back("", "", "");
1557 host_error_monitor::associationSSBThermTrip = server.add_interface(
1558 "/xyz/openbmc_project/host_error_monitor/ssb_thermal_trip",
1559 "xyz.openbmc_project.Association.Definitions");
1560 host_error_monitor::associationSSBThermTrip->register_property(
1561 "Associations", associations);
1562 host_error_monitor::associationSSBThermTrip->initialize();
1563
1564 host_error_monitor::associationCATAssert = server.add_interface(
1565 "/xyz/openbmc_project/host_error_monitor/cat_assert",
1566 "xyz.openbmc_project.Association.Definitions");
1567 host_error_monitor::associationCATAssert->register_property("Associations",
1568 associations);
1569 host_error_monitor::associationCATAssert->initialize();
1570
Jason M. Billsc4b91f22019-11-26 17:04:50 -08001571 // Restart Cause Interface
1572 host_error_monitor::hostErrorTimeoutIface =
1573 server.add_interface("/xyz/openbmc_project/host_error_monitor",
1574 "xyz.openbmc_project.HostErrorMonitor.Timeout");
1575
1576 host_error_monitor::hostErrorTimeoutIface->register_property(
1577 "IERRTimeoutMs", host_error_monitor::caterrTimeoutMs,
1578 [](const std::size_t& requested, std::size_t& resp) {
1579 if (requested > host_error_monitor::caterrTimeoutMsMax)
1580 {
1581 std::cerr << "IERRTimeoutMs update to " << requested
1582 << "ms rejected. Cannot be greater than "
1583 << host_error_monitor::caterrTimeoutMsMax << "ms.\n";
1584 return 0;
1585 }
1586 std::cerr << "IERRTimeoutMs updated to " << requested << "ms\n";
1587 host_error_monitor::caterrTimeoutMs = requested;
1588 resp = requested;
1589 return 1;
1590 },
1591 [](std::size_t& resp) { return host_error_monitor::caterrTimeoutMs; });
1592 host_error_monitor::hostErrorTimeoutIface->initialize();
1593
Jason M. Bills1490b142019-07-01 15:48:43 -07001594 // Start tracking host state
1595 std::shared_ptr<sdbusplus::bus::match::match> hostStateMonitor =
1596 host_error_monitor::startHostStateMonitor();
1597
jayaprakash Mutyala53099c42020-03-15 00:16:26 +00001598 // Request CPU1_MISMATCH GPIO events
1599 if (!host_error_monitor::requestGPIOInput(
1600 "CPU1_MISMATCH", host_error_monitor::cpu1MismatchLine))
1601 {
1602 return -1;
1603 }
1604
1605 // Request CPU2_MISMATCH GPIO events
1606 if (!host_error_monitor::requestGPIOInput(
1607 "CPU2_MISMATCH", host_error_monitor::cpu2MismatchLine))
1608 {
1609 return -1;
1610 }
1611
Jason M. Bills1490b142019-07-01 15:48:43 -07001612 // Initialize the host state
1613 host_error_monitor::initializeHostState();
1614
1615 // Request CPU_CATERR GPIO events
1616 if (!host_error_monitor::requestGPIOEvents(
1617 "CPU_CATERR", host_error_monitor::caterrHandler,
1618 host_error_monitor::caterrLine, host_error_monitor::caterrEvent))
1619 {
1620 return -1;
1621 }
1622
Jason M. Bills8c584392019-08-19 11:05:51 -07001623 // Request CPU_ERR0 GPIO events
1624 if (!host_error_monitor::requestGPIOEvents(
1625 "CPU_ERR0", host_error_monitor::err0Handler,
1626 host_error_monitor::err0Line, host_error_monitor::err0Event))
1627 {
1628 return -1;
1629 }
1630
Jason M. Bills75af3962019-08-19 11:07:17 -07001631 // Request CPU_ERR1 GPIO events
1632 if (!host_error_monitor::requestGPIOEvents(
1633 "CPU_ERR1", host_error_monitor::err1Handler,
1634 host_error_monitor::err1Line, host_error_monitor::err1Event))
1635 {
1636 return -1;
1637 }
1638
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001639 // Request CPU_ERR2 GPIO events
1640 if (!host_error_monitor::requestGPIOEvents(
1641 "CPU_ERR2", host_error_monitor::err2Handler,
1642 host_error_monitor::err2Line, host_error_monitor::err2Event))
1643 {
1644 return -1;
1645 }
1646
Jason M. Bills89922f82019-08-06 11:10:02 -07001647 // Request SMI GPIO events
1648 if (!host_error_monitor::requestGPIOEvents(
1649 "SMI", host_error_monitor::smiHandler, host_error_monitor::smiLine,
1650 host_error_monitor::smiEvent))
1651 {
1652 return -1;
1653 }
1654
Jason M. Bills45e87e02019-09-09 14:45:38 -07001655 // Request CPU1_FIVR_FAULT GPIO input
1656 if (!host_error_monitor::requestGPIOInput(
1657 "CPU1_FIVR_FAULT", host_error_monitor::cpu1FIVRFaultLine))
1658 {
1659 return -1;
1660 }
1661
Jason M. Bills78c5eed2019-08-28 14:00:40 -07001662 // Request CPU1_THERMTRIP GPIO events
1663 if (!host_error_monitor::requestGPIOEvents(
1664 "CPU1_THERMTRIP", host_error_monitor::cpu1ThermtripHandler,
1665 host_error_monitor::cpu1ThermtripLine,
1666 host_error_monitor::cpu1ThermtripEvent))
1667 {
1668 return -1;
1669 }
1670
Jason M. Bills45e87e02019-09-09 14:45:38 -07001671 // Request CPU2_FIVR_FAULT GPIO input
1672 if (!host_error_monitor::requestGPIOInput(
1673 "CPU2_FIVR_FAULT", host_error_monitor::cpu2FIVRFaultLine))
1674 {
1675 return -1;
1676 }
1677
Jason M. Bills78c5eed2019-08-28 14:00:40 -07001678 // Request CPU2_THERMTRIP GPIO events
1679 if (!host_error_monitor::requestGPIOEvents(
1680 "CPU2_THERMTRIP", host_error_monitor::cpu2ThermtripHandler,
1681 host_error_monitor::cpu2ThermtripLine,
1682 host_error_monitor::cpu2ThermtripEvent))
1683 {
1684 return -1;
1685 }
1686
Jason M. Bills250fa632019-08-28 15:58:25 -07001687 // Request CPU1_VRHOT GPIO events
1688 if (!host_error_monitor::requestGPIOEvents(
1689 "CPU1_VRHOT", host_error_monitor::cpu1VRHotHandler,
1690 host_error_monitor::cpu1VRHotLine,
1691 host_error_monitor::cpu1VRHotEvent))
1692 {
1693 return -1;
1694 }
1695
Jason M. Bills9647ba72019-08-29 14:19:19 -07001696 // Request CPU1_MEM_ABCD_VRHOT GPIO events
1697 if (!host_error_monitor::requestGPIOEvents(
1698 "CPU1_MEM_ABCD_VRHOT", host_error_monitor::cpu1MemABCDVRHotHandler,
1699 host_error_monitor::cpu1MemABCDVRHotLine,
1700 host_error_monitor::cpu1MemABCDVRHotEvent))
1701 {
1702 return -1;
1703 }
1704
1705 // Request CPU1_MEM_EFGH_VRHOT GPIO events
1706 if (!host_error_monitor::requestGPIOEvents(
1707 "CPU1_MEM_EFGH_VRHOT", host_error_monitor::cpu1MemEFGHVRHotHandler,
1708 host_error_monitor::cpu1MemEFGHVRHotLine,
1709 host_error_monitor::cpu1MemEFGHVRHotEvent))
1710 {
1711 return -1;
1712 }
1713
Jason M. Bills250fa632019-08-28 15:58:25 -07001714 // Request CPU2_VRHOT GPIO events
1715 if (!host_error_monitor::requestGPIOEvents(
1716 "CPU2_VRHOT", host_error_monitor::cpu2VRHotHandler,
1717 host_error_monitor::cpu2VRHotLine,
1718 host_error_monitor::cpu2VRHotEvent))
1719 {
1720 return -1;
1721 }
1722
Jason M. Bills9647ba72019-08-29 14:19:19 -07001723 // Request CPU2_MEM_ABCD_VRHOT GPIO events
1724 if (!host_error_monitor::requestGPIOEvents(
1725 "CPU2_MEM_ABCD_VRHOT", host_error_monitor::cpu2MemABCDVRHotHandler,
1726 host_error_monitor::cpu2MemABCDVRHotLine,
1727 host_error_monitor::cpu2MemABCDVRHotEvent))
1728 {
1729 return -1;
1730 }
1731
1732 // Request CPU2_MEM_EFGH_VRHOT GPIO events
1733 if (!host_error_monitor::requestGPIOEvents(
1734 "CPU2_MEM_EFGH_VRHOT", host_error_monitor::cpu2MemEFGHVRHotHandler,
1735 host_error_monitor::cpu2MemEFGHVRHotLine,
1736 host_error_monitor::cpu2MemEFGHVRHotEvent))
1737 {
1738 return -1;
1739 }
1740
Chen Yugange6c0f1c2019-08-02 20:36:42 +08001741 // Request PCH_BMC_THERMTRIP GPIO events
1742 if (!host_error_monitor::requestGPIOEvents(
1743 "PCH_BMC_THERMTRIP", host_error_monitor::pchThermtripHandler,
1744 host_error_monitor::pchThermtripLine,
1745 host_error_monitor::pchThermtripEvent))
1746 {
1747 return -1;
1748 }
1749
jayaprakash Mutyala009adbc2019-12-24 22:08:07 +00001750 // Request CPU1_MEM_THERM_EVENT GPIO events
1751 if (!host_error_monitor::requestGPIOEvents(
1752 "CPU1_MEM_THERM_EVENT", host_error_monitor::cpu1MemtripHandler,
1753 host_error_monitor::cpu1MemtripLine,
1754 host_error_monitor::cpu1MemtripEvent))
1755 {
1756 return -1;
1757 }
1758
1759 // Request CPU2_MEM_THERM_EVENT GPIO events
1760 if (!host_error_monitor::requestGPIOEvents(
1761 "CPU2_MEM_THERM_EVENT", host_error_monitor::cpu2MemtripHandler,
1762 host_error_monitor::cpu2MemtripLine,
1763 host_error_monitor::cpu2MemtripEvent))
1764 {
1765 return -1;
1766 }
1767
Jason M. Bills1490b142019-07-01 15:48:43 -07001768 host_error_monitor::io.run();
1769
1770 return 0;
1771}