blob: f1cab3d75861c8dec7c1750f6a1879d2f6e3c7db [file] [log] [blame]
/*
// Copyright (c) 2019 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
*/
Jason M. Bills6a2cb692019-08-06 11:03:49 -070016#include <peci.h>
Chen Yugange6c0f1c2019-08-02 20:36:42 +080017#include <systemd/sd-journal.h>
18
Jason M. Bills1490b142019-07-01 15:48:43 -070019#include <boost/asio/posix/stream_descriptor.hpp>
20#include <gpiod.hpp>
Jason M. Bills1490b142019-07-01 15:48:43 -070021#include <sdbusplus/asio/object_server.hpp>
Jason M. Bills48e5dff2020-06-10 13:47:47 -070022
23#include <bitset>
24#include <iostream>
Jason M. Billsd1a19f62019-08-06 11:52:58 -070025#include <variant>
Jason M. Bills1490b142019-07-01 15:48:43 -070026
27namespace host_error_monitor
28{
29static boost::asio::io_service io;
30static std::shared_ptr<sdbusplus::asio::connection> conn;
Jason M. Billsc4b91f22019-11-26 17:04:50 -080031static std::shared_ptr<sdbusplus::asio::dbus_interface> hostErrorTimeoutIface;
Jason M. Bills1490b142019-07-01 15:48:43 -070032
Yong Li1429ca82020-04-27 16:49:45 +080033using Association = std::tuple<std::string, std::string, std::string>;
34static std::shared_ptr<sdbusplus::asio::dbus_interface> associationSSBThermTrip;
35static std::shared_ptr<sdbusplus::asio::dbus_interface> associationCATAssert;
36
37static const constexpr char* rootPath = "/xyz/openbmc_project/CallbackManager";
38
Jason M. Bills1490b142019-07-01 15:48:43 -070039static bool hostOff = true;
40
// CATERR assertion timeout in milliseconds; the only timeout that is not const
static size_t caterrTimeoutMs = 2000;
static constexpr size_t caterrTimeoutMsMax = 600000; // 10 minutes maximum
// Fixed timeouts (milliseconds unless the name ends in S, which is seconds)
static constexpr size_t errTimeoutMs = 90000;
static constexpr size_t smiTimeoutMs = 90000;
static constexpr size_t crashdumpTimeoutS = 300;
46
47// Timers
48// Timer for CATERR asserted
49static boost::asio::steady_timer caterrAssertTimer(io);
Jason M. Bills8c584392019-08-19 11:05:51 -070050// Timer for ERR0 asserted
51static boost::asio::steady_timer err0AssertTimer(io);
Jason M. Bills75af3962019-08-19 11:07:17 -070052// Timer for ERR1 asserted
53static boost::asio::steady_timer err1AssertTimer(io);
Jason M. Bills6a2cb692019-08-06 11:03:49 -070054// Timer for ERR2 asserted
55static boost::asio::steady_timer err2AssertTimer(io);
Jason M. Bills89922f82019-08-06 11:10:02 -070056// Timer for SMI asserted
57static boost::asio::steady_timer smiAssertTimer(io);
Jason M. Bills1490b142019-07-01 15:48:43 -070058
59// GPIO Lines and Event Descriptors
60static gpiod::line caterrLine;
61static boost::asio::posix::stream_descriptor caterrEvent(io);
Jason M. Bills8c584392019-08-19 11:05:51 -070062static gpiod::line err0Line;
63static boost::asio::posix::stream_descriptor err0Event(io);
Jason M. Bills75af3962019-08-19 11:07:17 -070064static gpiod::line err1Line;
65static boost::asio::posix::stream_descriptor err1Event(io);
Jason M. Bills6a2cb692019-08-06 11:03:49 -070066static gpiod::line err2Line;
67static boost::asio::posix::stream_descriptor err2Event(io);
Jason M. Bills89922f82019-08-06 11:10:02 -070068static gpiod::line smiLine;
69static boost::asio::posix::stream_descriptor smiEvent(io);
Jason M. Bills45e87e02019-09-09 14:45:38 -070070static gpiod::line cpu1FIVRFaultLine;
Jason M. Bills78c5eed2019-08-28 14:00:40 -070071static gpiod::line cpu1ThermtripLine;
72static boost::asio::posix::stream_descriptor cpu1ThermtripEvent(io);
Jason M. Bills45e87e02019-09-09 14:45:38 -070073static gpiod::line cpu2FIVRFaultLine;
Jason M. Bills78c5eed2019-08-28 14:00:40 -070074static gpiod::line cpu2ThermtripLine;
75static boost::asio::posix::stream_descriptor cpu2ThermtripEvent(io);
Jason M. Bills250fa632019-08-28 15:58:25 -070076static gpiod::line cpu1VRHotLine;
77static boost::asio::posix::stream_descriptor cpu1VRHotEvent(io);
78static gpiod::line cpu2VRHotLine;
Jason M. Bills9647ba72019-08-29 14:19:19 -070079static boost::asio::posix::stream_descriptor cpu1MemABCDVRHotEvent(io);
80static gpiod::line cpu1MemEFGHVRHotLine;
81static boost::asio::posix::stream_descriptor cpu1MemEFGHVRHotEvent(io);
82static gpiod::line cpu2MemABCDVRHotLine;
Jason M. Bills250fa632019-08-28 15:58:25 -070083static boost::asio::posix::stream_descriptor cpu2VRHotEvent(io);
Jason M. Bills9647ba72019-08-29 14:19:19 -070084static gpiod::line cpu1MemABCDVRHotLine;
85static boost::asio::posix::stream_descriptor cpu2MemABCDVRHotEvent(io);
86static gpiod::line cpu2MemEFGHVRHotLine;
87static boost::asio::posix::stream_descriptor cpu2MemEFGHVRHotEvent(io);
Chen Yugange6c0f1c2019-08-02 20:36:42 +080088//----------------------------------
89// PCH_BMC_THERMTRIP function related definition
90//----------------------------------
Chen Yugange6c0f1c2019-08-02 20:36:42 +080091static gpiod::line pchThermtripLine;
92static boost::asio::posix::stream_descriptor pchThermtripEvent(io);
jayaprakash Mutyala009adbc2019-12-24 22:08:07 +000093//----------------------------------
94// CPU_MEM_THERM_EVENT function related definition
95//----------------------------------
96static gpiod::line cpu1MemtripLine;
97static boost::asio::posix::stream_descriptor cpu1MemtripEvent(io);
98static gpiod::line cpu2MemtripLine;
99static boost::asio::posix::stream_descriptor cpu2MemtripEvent(io);
jayaprakash Mutyala53099c42020-03-15 00:16:26 +0000100//---------------------------------
101// CPU_MISMATCH function related definition
102//---------------------------------
103static gpiod::line cpu1MismatchLine;
104static gpiod::line cpu2MismatchLine;
Jason M. Bills1490b142019-07-01 15:48:43 -0700105
// Beep priorities passed to beep() for CPU errors
static constexpr uint8_t beepCPUIERR = 4;
static constexpr uint8_t beepCPUErr2 = 5;
109
110static void beep(const uint8_t& beepPriority)
111{
112 conn->async_method_call(
113 [](boost::system::error_code ec) {
114 if (ec)
115 {
116 std::cerr << "beep returned error with "
117 "async_method_call (ec = "
118 << ec << ")\n";
119 return;
120 }
121 },
122 "xyz.openbmc_project.BeepCode", "/xyz/openbmc_project/BeepCode",
123 "xyz.openbmc_project.BeepCode", "Beep", uint8_t(beepPriority));
124}
125
Jason M. Billsa3397932019-08-06 11:07:21 -0700126static void cpuIERRLog()
127{
128 sd_journal_send("MESSAGE=HostError: IERR", "PRIORITY=%i", LOG_INFO,
129 "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError",
130 "REDFISH_MESSAGE_ARGS=%s", "IERR", NULL);
131}
132
133static void cpuIERRLog(const int cpuNum)
134{
135 std::string msg = "IERR on CPU " + std::to_string(cpuNum + 1);
136
137 sd_journal_send("MESSAGE=HostError: %s", msg.c_str(), "PRIORITY=%i",
138 LOG_INFO, "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError",
139 "REDFISH_MESSAGE_ARGS=%s", msg.c_str(), NULL);
140}
141
142static void cpuIERRLog(const int cpuNum, const std::string& type)
143{
144 std::string msg = type + " IERR on CPU " + std::to_string(cpuNum + 1);
145
146 sd_journal_send("MESSAGE=HostError: %s", msg.c_str(), "PRIORITY=%i",
147 LOG_INFO, "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError",
148 "REDFISH_MESSAGE_ARGS=%s", msg.c_str(), NULL);
149}
150
Jason M. Billscbf78532019-08-16 15:32:11 -0700151static void cpuERRXLog(const int errPin)
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700152{
Jason M. Billscbf78532019-08-16 15:32:11 -0700153 std::string msg = "ERR" + std::to_string(errPin) + " Timeout";
154
155 sd_journal_send("MESSAGE=HostError: %s", msg.c_str(), "PRIORITY=%i",
156 LOG_INFO, "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError",
157 "REDFISH_MESSAGE_ARGS=%s", msg.c_str(), NULL);
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700158}
159
Jason M. Billscbf78532019-08-16 15:32:11 -0700160static void cpuERRXLog(const int errPin, const int cpuNum)
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700161{
Jason M. Billscbf78532019-08-16 15:32:11 -0700162 std::string msg = "ERR" + std::to_string(errPin) + " Timeout on CPU " +
163 std::to_string(cpuNum + 1);
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700164
165 sd_journal_send("MESSAGE=HostError: %s", msg.c_str(), "PRIORITY=%i",
166 LOG_INFO, "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError",
167 "REDFISH_MESSAGE_ARGS=%s", msg.c_str(), NULL);
168}
169
Jason M. Bills89922f82019-08-06 11:10:02 -0700170static void smiTimeoutLog()
171{
172 sd_journal_send("MESSAGE=HostError: SMI Timeout", "PRIORITY=%i", LOG_INFO,
173 "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError",
174 "REDFISH_MESSAGE_ARGS=%s", "SMI Timeout", NULL);
175}
176
Jason M. Bills45e87e02019-09-09 14:45:38 -0700177static void cpuBootFIVRFaultLog(const int cpuNum)
178{
179 std::string msg = "Boot FIVR Fault on CPU " + std::to_string(cpuNum);
180
181 sd_journal_send("MESSAGE=HostError: %s", msg.c_str(), "PRIORITY=%i",
182 LOG_INFO, "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError",
183 "REDFISH_MESSAGE_ARGS=%s", msg.c_str(), NULL);
184}
185
Jason M. Bills78c5eed2019-08-28 14:00:40 -0700186static void cpuThermTripLog(const int cpuNum)
187{
188 std::string msg = "CPU " + std::to_string(cpuNum) + " thermal trip";
189
190 sd_journal_send("MESSAGE=HostError: %s", msg.c_str(), "PRIORITY=%i",
191 LOG_INFO, "REDFISH_MESSAGE_ID=%s",
192 "OpenBMC.0.1.CPUThermalTrip", "REDFISH_MESSAGE_ARGS=%d",
193 cpuNum, NULL);
194}
195
jayaprakash Mutyala009adbc2019-12-24 22:08:07 +0000196static void memThermTripLog(const int cpuNum)
197{
198 std::string cpuNumber = "CPU " + std::to_string(cpuNum);
199 std::string msg = cpuNumber + " Memory Thermal trip.";
200
201 sd_journal_send("MESSAGE=HostError: %s", msg.c_str(), "PRIORITY=%i",
202 LOG_ERR, "REDFISH_MESSAGE_ID=%s",
203 "OpenBMC.0.1.MemoryThermTrip", "REDFISH_MESSAGE_ARGS=%s",
204 cpuNumber.c_str(), NULL);
205}
206
jayaprakash Mutyala53099c42020-03-15 00:16:26 +0000207static void cpuMismatchLog(const int cpuNum)
208{
209 std::string msg = "CPU " + std::to_string(cpuNum) + " mismatch";
210
211 sd_journal_send("MESSAGE= %s", msg.c_str(), "PRIORITY=%i", LOG_ERR,
212 "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUMismatch",
213 "REDFISH_MESSAGE_ARGS=%d", cpuNum, NULL);
214}
215
Jason M. Bills250fa632019-08-28 15:58:25 -0700216static void cpuVRHotLog(const std::string& vr)
217{
218 std::string msg = vr + " Voltage Regulator Overheated.";
219
220 sd_journal_send("MESSAGE=HostError: %s", msg.c_str(), "PRIORITY=%i",
221 LOG_INFO, "REDFISH_MESSAGE_ID=%s",
222 "OpenBMC.0.1.VoltageRegulatorOverheated",
223 "REDFISH_MESSAGE_ARGS=%s", vr.c_str(), NULL);
224}
225
Jason M. Bills08866542019-08-16 12:04:19 -0700226static void ssbThermTripLog()
227{
228 sd_journal_send("MESSAGE=HostError: SSB thermal trip", "PRIORITY=%i",
229 LOG_INFO, "REDFISH_MESSAGE_ID=%s",
230 "OpenBMC.0.1.SsbThermalTrip", NULL);
231}
232
Jason M. Billsa15c2522019-08-16 10:01:44 -0700233static void initializeErrorState();
Jason M. Bills1490b142019-07-01 15:48:43 -0700234static void initializeHostState()
235{
236 conn->async_method_call(
237 [](boost::system::error_code ec,
238 const std::variant<std::string>& property) {
239 if (ec)
240 {
241 return;
242 }
243 const std::string* state = std::get_if<std::string>(&property);
244 if (state == nullptr)
245 {
246 std::cerr << "Unable to read host state value\n";
247 return;
248 }
249 hostOff = *state == "xyz.openbmc_project.State.Host.HostState.Off";
Jason M. Billsa15c2522019-08-16 10:01:44 -0700250 // If the system is on, initialize the error state
251 if (!hostOff)
252 {
253 initializeErrorState();
254 }
Jason M. Bills1490b142019-07-01 15:48:43 -0700255 },
256 "xyz.openbmc_project.State.Host", "/xyz/openbmc_project/state/host0",
257 "org.freedesktop.DBus.Properties", "Get",
258 "xyz.openbmc_project.State.Host", "CurrentHostState");
259}
260
261static std::shared_ptr<sdbusplus::bus::match::match> startHostStateMonitor()
262{
263 return std::make_shared<sdbusplus::bus::match::match>(
264 *conn,
265 "type='signal',interface='org.freedesktop.DBus.Properties',"
266 "member='PropertiesChanged',arg0namespace='xyz.openbmc_project.State."
267 "Host'",
268 [](sdbusplus::message::message& msg) {
269 std::string interfaceName;
270 boost::container::flat_map<std::string, std::variant<std::string>>
271 propertiesChanged;
Jason M. Bills1490b142019-07-01 15:48:43 -0700272 try
273 {
274 msg.read(interfaceName, propertiesChanged);
Jason M. Bills1490b142019-07-01 15:48:43 -0700275 }
276 catch (std::exception& e)
277 {
278 std::cerr << "Unable to read host state\n";
279 return;
280 }
Jason M. Bills566ccc42020-06-18 16:38:26 -0700281 // We only want to check for CurrentHostState
282 if (propertiesChanged.begin()->first != "CurrentHostState")
283 {
284 return;
285 }
286 std::string* state =
287 std::get_if<std::string>(&(propertiesChanged.begin()->second));
288 if (state == nullptr)
289 {
290 std::cerr << propertiesChanged.begin()->first
291 << " property invalid\n";
292 return;
293 }
294
295 hostOff = *state == "xyz.openbmc_project.State.Host.HostState.Off";
Jason M. Bills1490b142019-07-01 15:48:43 -0700296
Jason M. Bills1490b142019-07-01 15:48:43 -0700297 if (hostOff)
298 {
Jason M. Billse94f5e12019-09-13 11:11:34 -0700299 // No host events should fire while off, so cancel any pending
300 // timers
Jason M. Bills1490b142019-07-01 15:48:43 -0700301 caterrAssertTimer.cancel();
Jason M. Bills8c584392019-08-19 11:05:51 -0700302 err0AssertTimer.cancel();
Jason M. Bills75af3962019-08-19 11:07:17 -0700303 err1AssertTimer.cancel();
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700304 err2AssertTimer.cancel();
Jason M. Bills89922f82019-08-06 11:10:02 -0700305 smiAssertTimer.cancel();
Jason M. Bills1490b142019-07-01 15:48:43 -0700306 }
Jason M. Billse94f5e12019-09-13 11:11:34 -0700307 else
308 {
309 // Handle any initial errors when the host turns on
310 initializeErrorState();
311 }
Jason M. Bills1490b142019-07-01 15:48:43 -0700312 });
313}
314
315static bool requestGPIOEvents(
316 const std::string& name, const std::function<void()>& handler,
317 gpiod::line& gpioLine,
318 boost::asio::posix::stream_descriptor& gpioEventDescriptor)
319{
320 // Find the GPIO line
321 gpioLine = gpiod::find_line(name);
322 if (!gpioLine)
323 {
324 std::cerr << "Failed to find the " << name << " line\n";
325 return false;
326 }
327
328 try
329 {
330 gpioLine.request(
331 {"host-error-monitor", gpiod::line_request::EVENT_BOTH_EDGES});
332 }
333 catch (std::exception&)
334 {
335 std::cerr << "Failed to request events for " << name << "\n";
336 return false;
337 }
338
339 int gpioLineFd = gpioLine.event_get_fd();
340 if (gpioLineFd < 0)
341 {
342 std::cerr << "Failed to get " << name << " fd\n";
343 return false;
344 }
345
346 gpioEventDescriptor.assign(gpioLineFd);
347
348 gpioEventDescriptor.async_wait(
349 boost::asio::posix::stream_descriptor::wait_read,
350 [&name, handler](const boost::system::error_code ec) {
351 if (ec)
352 {
353 std::cerr << name << " fd handler error: " << ec.message()
354 << "\n";
355 return;
356 }
357 handler();
358 });
359 return true;
360}
361
Jason M. Bills45e87e02019-09-09 14:45:38 -0700362static bool requestGPIOInput(const std::string& name, gpiod::line& gpioLine)
363{
364 // Find the GPIO line
365 gpioLine = gpiod::find_line(name);
366 if (!gpioLine)
367 {
368 std::cerr << "Failed to find the " << name << " line.\n";
369 return false;
370 }
371
372 // Request GPIO input
373 try
374 {
375 gpioLine.request({__FUNCTION__, gpiod::line_request::DIRECTION_INPUT});
376 }
377 catch (std::exception&)
378 {
379 std::cerr << "Failed to request " << name << " input\n";
380 return false;
381 }
382
383 return true;
384}
385
Jason M. Bills1490b142019-07-01 15:48:43 -0700386static void startPowerCycle()
387{
388 conn->async_method_call(
389 [](boost::system::error_code ec) {
390 if (ec)
391 {
392 std::cerr << "failed to set Chassis State\n";
393 }
394 },
395 "xyz.openbmc_project.State.Chassis",
396 "/xyz/openbmc_project/state/chassis0",
397 "org.freedesktop.DBus.Properties", "Set",
398 "xyz.openbmc_project.State.Chassis", "RequestedPowerTransition",
399 std::variant<std::string>{
400 "xyz.openbmc_project.State.Chassis.Transition.PowerCycle"});
401}
402
Jason M. Billsb61766b2019-11-26 17:02:44 -0800403static void startCrashdumpAndRecovery(bool recoverSystem,
404 const std::string& triggerType)
Jason M. Bills1490b142019-07-01 15:48:43 -0700405{
406 std::cout << "Starting crashdump\n";
407 static std::shared_ptr<sdbusplus::bus::match::match> crashdumpCompleteMatch;
408 static boost::asio::steady_timer crashdumpTimer(io);
409
410 crashdumpCompleteMatch = std::make_shared<sdbusplus::bus::match::match>(
411 *conn,
412 "type='signal',interface='org.freedesktop.DBus.Properties',"
413 "member='PropertiesChanged',arg0namespace='com.intel.crashdump'",
414 [recoverSystem](sdbusplus::message::message& msg) {
415 crashdumpTimer.cancel();
416 std::cout << "Crashdump completed\n";
417 if (recoverSystem)
418 {
419 std::cout << "Recovering the system\n";
420 startPowerCycle();
421 }
422 crashdumpCompleteMatch.reset();
423 });
424
425 crashdumpTimer.expires_after(std::chrono::seconds(crashdumpTimeoutS));
426 crashdumpTimer.async_wait([](const boost::system::error_code ec) {
427 if (ec)
428 {
429 // operation_aborted is expected if timer is canceled
430 if (ec != boost::asio::error::operation_aborted)
431 {
432 std::cerr << "Crashdump async_wait failed: " << ec.message()
433 << "\n";
434 }
435 std::cout << "Crashdump timer canceled\n";
436 return;
437 }
438 std::cerr << "Crashdump failed to complete before timeout\n";
439 crashdumpCompleteMatch.reset();
440 });
441
442 conn->async_method_call(
443 [](boost::system::error_code ec) {
444 if (ec)
445 {
446 std::cerr << "failed to start Crashdump\n";
447 crashdumpTimer.cancel();
448 crashdumpCompleteMatch.reset();
449 }
450 },
451 "com.intel.crashdump", "/com/intel/crashdump",
Jason M. Billsb61766b2019-11-26 17:02:44 -0800452 "com.intel.crashdump.Stored", "GenerateStoredLog", triggerType);
Jason M. Bills1490b142019-07-01 15:48:43 -0700453}
454
Jason M. Billsd1a19f62019-08-06 11:52:58 -0700455static void incrementCPUErrorCount(int cpuNum)
456{
457 std::string propertyName = "ErrorCountCPU" + std::to_string(cpuNum + 1);
458
459 // Get the current count
460 conn->async_method_call(
461 [propertyName](boost::system::error_code ec,
462 const std::variant<uint8_t>& property) {
463 if (ec)
464 {
465 std::cerr << "Failed to read " << propertyName << ": "
466 << ec.message() << "\n";
467 return;
468 }
469 const uint8_t* errorCountVariant = std::get_if<uint8_t>(&property);
470 if (errorCountVariant == nullptr)
471 {
472 std::cerr << propertyName << " invalid\n";
473 return;
474 }
475 uint8_t errorCount = *errorCountVariant;
476 if (errorCount == std::numeric_limits<uint8_t>::max())
477 {
478 std::cerr << "Maximum error count reached\n";
479 return;
480 }
481 // Increment the count
482 errorCount++;
483 conn->async_method_call(
484 [propertyName](boost::system::error_code ec) {
485 if (ec)
486 {
487 std::cerr << "Failed to set " << propertyName << ": "
488 << ec.message() << "\n";
489 }
490 },
491 "xyz.openbmc_project.Settings",
492 "/xyz/openbmc_project/control/processor_error_config",
493 "org.freedesktop.DBus.Properties", "Set",
494 "xyz.openbmc_project.Control.Processor.ErrConfig", propertyName,
495 std::variant<uint8_t>{errorCount});
496 },
497 "xyz.openbmc_project.Settings",
498 "/xyz/openbmc_project/control/processor_error_config",
499 "org.freedesktop.DBus.Properties", "Get",
500 "xyz.openbmc_project.Control.Processor.ErrConfig", propertyName);
501}
502
Jason M. Billsa3397932019-08-06 11:07:21 -0700503static bool checkIERRCPUs()
504{
505 bool cpuIERRFound = false;
506 for (int cpu = 0, addr = MIN_CLIENT_ADDR; addr <= MAX_CLIENT_ADDR;
507 cpu++, addr++)
508 {
509 uint8_t cc = 0;
510 CPUModel model{};
511 uint8_t stepping = 0;
512 if (peci_GetCPUID(addr, &model, &stepping, &cc) != PECI_CC_SUCCESS)
513 {
514 std::cerr << "Cannot get CPUID!\n";
515 continue;
516 }
517
518 switch (model)
519 {
520 case skx:
521 {
522 // First check the MCA_ERR_SRC_LOG to see if this is the CPU
523 // that caused the IERR
524 uint32_t mcaErrSrcLog = 0;
525 if (peci_RdPkgConfig(addr, 0, 5, 4, (uint8_t*)&mcaErrSrcLog,
526 &cc) != PECI_CC_SUCCESS)
527 {
528 continue;
529 }
530 // Check MSMI_INTERNAL (20) and IERR_INTERNAL (27)
531 if ((mcaErrSrcLog & (1 << 20)) || (mcaErrSrcLog & (1 << 27)))
532 {
533 // TODO: Light the CPU fault LED?
534 cpuIERRFound = true;
Jason M. Billsd1a19f62019-08-06 11:52:58 -0700535 incrementCPUErrorCount(cpu);
Jason M. Billsa3397932019-08-06 11:07:21 -0700536 // Next check if it's a CPU/VR mismatch by reading the
537 // IA32_MC4_STATUS MSR (0x411)
538 uint64_t mc4Status = 0;
539 if (peci_RdIAMSR(addr, 0, 0x411, &mc4Status, &cc) !=
540 PECI_CC_SUCCESS)
541 {
542 continue;
543 }
544 // Check MSEC bits 31:24 for
545 // MCA_SVID_VCCIN_VR_ICC_MAX_FAILURE (0x40),
546 // MCA_SVID_VCCIN_VR_VOUT_FAILURE (0x42), or
547 // MCA_SVID_CPU_VR_CAPABILITY_ERROR (0x43)
548 if ((mc4Status & (0x40 << 24)) ||
549 (mc4Status & (0x42 << 24)) ||
550 (mc4Status & (0x43 << 24)))
551 {
552 cpuIERRLog(cpu, "CPU/VR Mismatch");
553 continue;
554 }
555
556 // Next check if it's a Core FIVR fault by looking for a
557 // non-zero value of CORE_FIVR_ERR_LOG (B(1) D30 F2 offset
558 // 80h)
559 uint32_t coreFIVRErrLog = 0;
560 if (peci_RdPCIConfigLocal(
561 addr, 1, 30, 2, 0x80, sizeof(uint32_t),
562 (uint8_t*)&coreFIVRErrLog, &cc) != PECI_CC_SUCCESS)
563 {
564 continue;
565 }
566 if (coreFIVRErrLog)
567 {
568 cpuIERRLog(cpu, "Core FIVR Fault");
569 continue;
570 }
571
572 // Next check if it's an Uncore FIVR fault by looking for a
573 // non-zero value of UNCORE_FIVR_ERR_LOG (B(1) D30 F2 offset
574 // 84h)
575 uint32_t uncoreFIVRErrLog = 0;
576 if (peci_RdPCIConfigLocal(addr, 1, 30, 2, 0x84,
577 sizeof(uint32_t),
578 (uint8_t*)&uncoreFIVRErrLog,
579 &cc) != PECI_CC_SUCCESS)
580 {
581 continue;
582 }
583 if (uncoreFIVRErrLog)
584 {
585 cpuIERRLog(cpu, "Uncore FIVR Fault");
586 continue;
587 }
588
589 // Last if CORE_FIVR_ERR_LOG and UNCORE_FIVR_ERR_LOG are
590 // both zero, but MSEC bits 31:24 have either
591 // MCA_FIVR_CATAS_OVERVOL_FAULT (0x51) or
592 // MCA_FIVR_CATAS_OVERCUR_FAULT (0x52), then log it as an
593 // uncore FIVR fault
594 if (!coreFIVRErrLog && !uncoreFIVRErrLog &&
595 ((mc4Status & (0x51 << 24)) ||
596 (mc4Status & (0x52 << 24))))
597 {
598 cpuIERRLog(cpu, "Uncore FIVR Fault");
599 continue;
600 }
601 cpuIERRLog(cpu);
602 }
603 break;
604 }
605 case icx:
606 {
607 // First check the MCA_ERR_SRC_LOG to see if this is the CPU
608 // that caused the IERR
609 uint32_t mcaErrSrcLog = 0;
610 if (peci_RdPkgConfig(addr, 0, 5, 4, (uint8_t*)&mcaErrSrcLog,
611 &cc) != PECI_CC_SUCCESS)
612 {
613 continue;
614 }
615 // Check MSMI_INTERNAL (20) and IERR_INTERNAL (27)
616 if ((mcaErrSrcLog & (1 << 20)) || (mcaErrSrcLog & (1 << 27)))
617 {
618 // TODO: Light the CPU fault LED?
619 cpuIERRFound = true;
Jason M. Billsd1a19f62019-08-06 11:52:58 -0700620 incrementCPUErrorCount(cpu);
Jason M. Billsa3397932019-08-06 11:07:21 -0700621 // Next check if it's a CPU/VR mismatch by reading the
622 // IA32_MC4_STATUS MSR (0x411)
623 uint64_t mc4Status = 0;
624 if (peci_RdIAMSR(addr, 0, 0x411, &mc4Status, &cc) !=
625 PECI_CC_SUCCESS)
626 {
627 continue;
628 }
629 // TODO: Update MSEC/MSCOD_31_24 check
630 // Check MSEC bits 31:24 for
631 // MCA_SVID_VCCIN_VR_ICC_MAX_FAILURE (0x40),
632 // MCA_SVID_VCCIN_VR_VOUT_FAILURE (0x42), or
633 // MCA_SVID_CPU_VR_CAPABILITY_ERROR (0x43)
634 if ((mc4Status & (0x40 << 24)) ||
635 (mc4Status & (0x42 << 24)) ||
636 (mc4Status & (0x43 << 24)))
637 {
638 cpuIERRLog(cpu, "CPU/VR Mismatch");
639 continue;
640 }
641
642 // Next check if it's a Core FIVR fault by looking for a
643 // non-zero value of CORE_FIVR_ERR_LOG (B(31) D30 F2 offsets
644 // C0h and C4h) (Note: Bus 31 is accessed on PECI as bus 14)
645 uint32_t coreFIVRErrLog0 = 0;
646 uint32_t coreFIVRErrLog1 = 0;
647 if (peci_RdEndPointConfigPciLocal(
648 addr, 0, 14, 30, 2, 0xC0, sizeof(uint32_t),
649 (uint8_t*)&coreFIVRErrLog0, &cc) != PECI_CC_SUCCESS)
650 {
651 continue;
652 }
653 if (peci_RdEndPointConfigPciLocal(
654 addr, 0, 14, 30, 2, 0xC4, sizeof(uint32_t),
655 (uint8_t*)&coreFIVRErrLog1, &cc) != PECI_CC_SUCCESS)
656 {
657 continue;
658 }
659 if (coreFIVRErrLog0 || coreFIVRErrLog1)
660 {
661 cpuIERRLog(cpu, "Core FIVR Fault");
662 continue;
663 }
664
665 // Next check if it's an Uncore FIVR fault by looking for a
666 // non-zero value of UNCORE_FIVR_ERR_LOG (B(31) D30 F2
667 // offset 84h) (Note: Bus 31 is accessed on PECI as bus 14)
668 uint32_t uncoreFIVRErrLog = 0;
669 if (peci_RdEndPointConfigPciLocal(
670 addr, 0, 14, 30, 2, 0x84, sizeof(uint32_t),
671 (uint8_t*)&uncoreFIVRErrLog,
672 &cc) != PECI_CC_SUCCESS)
673 {
674 continue;
675 }
676 if (uncoreFIVRErrLog)
677 {
678 cpuIERRLog(cpu, "Uncore FIVR Fault");
679 continue;
680 }
681
682 // TODO: Update MSEC/MSCOD_31_24 check
683 // Last if CORE_FIVR_ERR_LOG and UNCORE_FIVR_ERR_LOG are
684 // both zero, but MSEC bits 31:24 have either
685 // MCA_FIVR_CATAS_OVERVOL_FAULT (0x51) or
686 // MCA_FIVR_CATAS_OVERCUR_FAULT (0x52), then log it as an
687 // uncore FIVR fault
688 if (!coreFIVRErrLog0 && !coreFIVRErrLog1 &&
689 !uncoreFIVRErrLog &&
690 ((mc4Status & (0x51 << 24)) ||
691 (mc4Status & (0x52 << 24))))
692 {
693 cpuIERRLog(cpu, "Uncore FIVR Fault");
694 continue;
695 }
696 cpuIERRLog(cpu);
697 }
698 break;
699 }
700 }
701 }
702 return cpuIERRFound;
703}
704
Jason M. Billsa15c2522019-08-16 10:01:44 -0700705static void caterrAssertHandler()
706{
Jason M. Billsa15c2522019-08-16 10:01:44 -0700707 caterrAssertTimer.expires_after(std::chrono::milliseconds(caterrTimeoutMs));
708 caterrAssertTimer.async_wait([](const boost::system::error_code ec) {
709 if (ec)
710 {
711 // operation_aborted is expected if timer is canceled
712 // before completion.
713 if (ec != boost::asio::error::operation_aborted)
714 {
715 std::cerr << "caterr timeout async_wait failed: "
716 << ec.message() << "\n";
717 }
Jason M. Billsa15c2522019-08-16 10:01:44 -0700718 return;
719 }
Jason M. Billsa3397932019-08-06 11:07:21 -0700720 std::cerr << "CATERR asserted for " << std::to_string(caterrTimeoutMs)
721 << " ms\n";
Yong Li8c798c72020-04-22 15:29:07 +0800722 beep(beepCPUIERR);
Jason M. Billsa3397932019-08-06 11:07:21 -0700723 if (!checkIERRCPUs())
724 {
725 cpuIERRLog();
726 }
Jason M. Billsa15c2522019-08-16 10:01:44 -0700727 conn->async_method_call(
728 [](boost::system::error_code ec,
729 const std::variant<bool>& property) {
730 if (ec)
731 {
732 return;
733 }
734 const bool* reset = std::get_if<bool>(&property);
735 if (reset == nullptr)
736 {
737 std::cerr << "Unable to read reset on CATERR value\n";
738 return;
739 }
Jason M. Billsb61766b2019-11-26 17:02:44 -0800740 startCrashdumpAndRecovery(*reset, "IERR");
Jason M. Billsa15c2522019-08-16 10:01:44 -0700741 },
742 "xyz.openbmc_project.Settings",
743 "/xyz/openbmc_project/control/processor_error_config",
744 "org.freedesktop.DBus.Properties", "Get",
745 "xyz.openbmc_project.Control.Processor.ErrConfig", "ResetOnCATERR");
746 });
747}
748
Jason M. Bills1490b142019-07-01 15:48:43 -0700749static void caterrHandler()
750{
751 if (!hostOff)
752 {
753 gpiod::line_event gpioLineEvent = caterrLine.event_read();
754
755 bool caterr =
756 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
Yong Li1429ca82020-04-27 16:49:45 +0800757
758 std::vector<Association> associations;
Jason M. Bills1490b142019-07-01 15:48:43 -0700759 if (caterr)
760 {
Jason M. Billsa15c2522019-08-16 10:01:44 -0700761 caterrAssertHandler();
Yong Li1429ca82020-04-27 16:49:45 +0800762 associations.emplace_back(
763 "", "critical",
764 "/xyz/openbmc_project/host_error_monitor/cat_error");
765 associations.emplace_back("", "critical",
766 host_error_monitor::rootPath);
Jason M. Bills1490b142019-07-01 15:48:43 -0700767 }
768 else
769 {
770 caterrAssertTimer.cancel();
Yong Li1429ca82020-04-27 16:49:45 +0800771 associations.emplace_back("", "", "");
Jason M. Bills1490b142019-07-01 15:48:43 -0700772 }
Yong Li1429ca82020-04-27 16:49:45 +0800773 host_error_monitor::associationCATAssert->set_property("Associations",
774 associations);
Jason M. Bills1490b142019-07-01 15:48:43 -0700775 }
776 caterrEvent.async_wait(boost::asio::posix::stream_descriptor::wait_read,
777 [](const boost::system::error_code ec) {
778 if (ec)
779 {
780 std::cerr << "caterr handler error: "
781 << ec.message() << "\n";
782 return;
783 }
784 caterrHandler();
785 });
786}
Jason M. Bills78c5eed2019-08-28 14:00:40 -0700787
Jason M. Billse94f5e12019-09-13 11:11:34 -0700788static void cpu1ThermtripAssertHandler()
789{
Jason M. Bills45e87e02019-09-09 14:45:38 -0700790 if (cpu1FIVRFaultLine.get_value() == 0)
791 {
792 cpuBootFIVRFaultLog(1);
793 }
794 else
795 {
796 cpuThermTripLog(1);
797 }
Jason M. Billse94f5e12019-09-13 11:11:34 -0700798}
799
Jason M. Bills78c5eed2019-08-28 14:00:40 -0700800static void cpu1ThermtripHandler()
801{
Jason M. Bills84951142020-04-17 15:57:11 -0700802 gpiod::line_event gpioLineEvent = cpu1ThermtripLine.event_read();
Jason M. Bills78c5eed2019-08-28 14:00:40 -0700803
Jason M. Bills84951142020-04-17 15:57:11 -0700804 bool cpu1Thermtrip =
805 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
806 if (cpu1Thermtrip)
807 {
808 cpu1ThermtripAssertHandler();
Jason M. Bills78c5eed2019-08-28 14:00:40 -0700809 }
Jason M. Bills84951142020-04-17 15:57:11 -0700810
Jason M. Bills78c5eed2019-08-28 14:00:40 -0700811 cpu1ThermtripEvent.async_wait(
812 boost::asio::posix::stream_descriptor::wait_read,
813 [](const boost::system::error_code ec) {
814 if (ec)
815 {
816 std::cerr << "CPU 1 Thermtrip handler error: " << ec.message()
817 << "\n";
818 return;
819 }
820 cpu1ThermtripHandler();
821 });
822}
823
jayaprakash Mutyala009adbc2019-12-24 22:08:07 +0000824static void cpu1MemtripHandler()
825{
Jason M. Bills5287c022020-05-19 11:16:09 -0700826 gpiod::line_event gpioLineEvent = cpu1MemtripLine.event_read();
jayaprakash Mutyala009adbc2019-12-24 22:08:07 +0000827
Jason M. Bills5287c022020-05-19 11:16:09 -0700828 bool cpu1Memtrip =
829 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
830 if (cpu1Memtrip)
831 {
832 memThermTripLog(1);
jayaprakash Mutyala009adbc2019-12-24 22:08:07 +0000833 }
Jason M. Bills5287c022020-05-19 11:16:09 -0700834
jayaprakash Mutyala009adbc2019-12-24 22:08:07 +0000835 cpu1MemtripEvent.async_wait(
836 boost::asio::posix::stream_descriptor::wait_read,
837 [](const boost::system::error_code ec) {
838 if (ec)
839 {
840 std::cerr << "CPU 1 Memory Thermaltrip handler error: "
841 << ec.message() << "\n";
842 return;
843 }
844 cpu1MemtripHandler();
845 });
846}
847
Jason M. Billse94f5e12019-09-13 11:11:34 -0700848static void cpu2ThermtripAssertHandler()
849{
Jason M. Bills45e87e02019-09-09 14:45:38 -0700850 if (cpu2FIVRFaultLine.get_value() == 0)
851 {
852 cpuBootFIVRFaultLog(2);
853 }
854 else
855 {
856 cpuThermTripLog(2);
857 }
Jason M. Billse94f5e12019-09-13 11:11:34 -0700858}
859
Jason M. Bills78c5eed2019-08-28 14:00:40 -0700860static void cpu2ThermtripHandler()
861{
Jason M. Bills84951142020-04-17 15:57:11 -0700862 gpiod::line_event gpioLineEvent = cpu2ThermtripLine.event_read();
Jason M. Bills78c5eed2019-08-28 14:00:40 -0700863
Jason M. Bills84951142020-04-17 15:57:11 -0700864 bool cpu2Thermtrip =
865 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
866 if (cpu2Thermtrip)
867 {
868 cpu2ThermtripAssertHandler();
Jason M. Bills78c5eed2019-08-28 14:00:40 -0700869 }
Jason M. Bills84951142020-04-17 15:57:11 -0700870
Jason M. Bills78c5eed2019-08-28 14:00:40 -0700871 cpu2ThermtripEvent.async_wait(
872 boost::asio::posix::stream_descriptor::wait_read,
873 [](const boost::system::error_code ec) {
874 if (ec)
875 {
876 std::cerr << "CPU 2 Thermtrip handler error: " << ec.message()
877 << "\n";
878 return;
879 }
880 cpu2ThermtripHandler();
881 });
882}
883
jayaprakash Mutyala009adbc2019-12-24 22:08:07 +0000884static void cpu2MemtripHandler()
885{
Jason M. Bills5287c022020-05-19 11:16:09 -0700886 gpiod::line_event gpioLineEvent = cpu2MemtripLine.event_read();
jayaprakash Mutyala009adbc2019-12-24 22:08:07 +0000887
Jason M. Bills5287c022020-05-19 11:16:09 -0700888 bool cpu2Memtrip =
889 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
890 if (cpu2Memtrip)
891 {
892 memThermTripLog(2);
jayaprakash Mutyala009adbc2019-12-24 22:08:07 +0000893 }
Jason M. Bills5287c022020-05-19 11:16:09 -0700894
jayaprakash Mutyala009adbc2019-12-24 22:08:07 +0000895 cpu2MemtripEvent.async_wait(
896 boost::asio::posix::stream_descriptor::wait_read,
897 [](const boost::system::error_code ec) {
898 if (ec)
899 {
900 std::cerr << "CPU 2 Memory Thermaltrip handler error: "
901 << ec.message() << "\n";
902 return;
903 }
904 cpu2MemtripHandler();
905 });
906}
907
Jason M. Billse94f5e12019-09-13 11:11:34 -0700908static void cpu1VRHotAssertHandler()
909{
910 cpuVRHotLog("CPU 1");
911}
912
Jason M. Bills250fa632019-08-28 15:58:25 -0700913static void cpu1VRHotHandler()
914{
Jason M. Bills84951142020-04-17 15:57:11 -0700915 gpiod::line_event gpioLineEvent = cpu1VRHotLine.event_read();
Jason M. Bills250fa632019-08-28 15:58:25 -0700916
Jason M. Bills84951142020-04-17 15:57:11 -0700917 bool cpu1VRHot =
918 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
919 if (cpu1VRHot)
920 {
921 cpu1VRHotAssertHandler();
Jason M. Bills250fa632019-08-28 15:58:25 -0700922 }
Jason M. Bills84951142020-04-17 15:57:11 -0700923
Jason M. Bills250fa632019-08-28 15:58:25 -0700924 cpu1VRHotEvent.async_wait(boost::asio::posix::stream_descriptor::wait_read,
925 [](const boost::system::error_code ec) {
926 if (ec)
927 {
928 std::cerr << "CPU 1 VRHot handler error: "
929 << ec.message() << "\n";
930 return;
931 }
932 cpu1VRHotHandler();
933 });
934}
935
Jason M. Billse94f5e12019-09-13 11:11:34 -0700936static void cpu1MemABCDVRHotAssertHandler()
937{
938 cpuVRHotLog("CPU 1 Memory ABCD");
939}
940
Jason M. Bills9647ba72019-08-29 14:19:19 -0700941static void cpu1MemABCDVRHotHandler()
942{
Jason M. Bills84951142020-04-17 15:57:11 -0700943 gpiod::line_event gpioLineEvent = cpu1MemABCDVRHotLine.event_read();
Jason M. Bills9647ba72019-08-29 14:19:19 -0700944
Jason M. Bills84951142020-04-17 15:57:11 -0700945 bool cpu1MemABCDVRHot =
946 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
947 if (cpu1MemABCDVRHot)
948 {
949 cpu1MemABCDVRHotAssertHandler();
Jason M. Bills9647ba72019-08-29 14:19:19 -0700950 }
Jason M. Bills84951142020-04-17 15:57:11 -0700951
Jason M. Bills9647ba72019-08-29 14:19:19 -0700952 cpu1MemABCDVRHotEvent.async_wait(
953 boost::asio::posix::stream_descriptor::wait_read,
954 [](const boost::system::error_code ec) {
955 if (ec)
956 {
957 std::cerr << "CPU 1 Memory ABCD VRHot handler error: "
958 << ec.message() << "\n";
959 return;
960 }
961 cpu1MemABCDVRHotHandler();
962 });
963}
964
Jason M. Billse94f5e12019-09-13 11:11:34 -0700965static void cpu1MemEFGHVRHotAssertHandler()
966{
967 cpuVRHotLog("CPU 1 Memory EFGH");
968}
969
Jason M. Bills9647ba72019-08-29 14:19:19 -0700970static void cpu1MemEFGHVRHotHandler()
971{
Jason M. Bills84951142020-04-17 15:57:11 -0700972 gpiod::line_event gpioLineEvent = cpu1MemEFGHVRHotLine.event_read();
Jason M. Bills9647ba72019-08-29 14:19:19 -0700973
Jason M. Bills84951142020-04-17 15:57:11 -0700974 bool cpu1MemEFGHVRHot =
975 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
976 if (cpu1MemEFGHVRHot)
977 {
978 cpu1MemEFGHVRHotAssertHandler();
Jason M. Bills9647ba72019-08-29 14:19:19 -0700979 }
Jason M. Bills84951142020-04-17 15:57:11 -0700980
Jason M. Bills9647ba72019-08-29 14:19:19 -0700981 cpu1MemEFGHVRHotEvent.async_wait(
982 boost::asio::posix::stream_descriptor::wait_read,
983 [](const boost::system::error_code ec) {
984 if (ec)
985 {
986 std::cerr << "CPU 1 Memory EFGH VRHot handler error: "
987 << ec.message() << "\n";
988 return;
989 }
990 cpu1MemEFGHVRHotHandler();
991 });
992}
993
Jason M. Billse94f5e12019-09-13 11:11:34 -0700994static void cpu2VRHotAssertHandler()
995{
996 cpuVRHotLog("CPU 2");
997}
998
Jason M. Bills250fa632019-08-28 15:58:25 -0700999static void cpu2VRHotHandler()
1000{
Jason M. Bills84951142020-04-17 15:57:11 -07001001 gpiod::line_event gpioLineEvent = cpu2VRHotLine.event_read();
Jason M. Bills250fa632019-08-28 15:58:25 -07001002
Jason M. Bills84951142020-04-17 15:57:11 -07001003 bool cpu2VRHot =
1004 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
1005 if (cpu2VRHot)
1006 {
1007 cpu2VRHotAssertHandler();
Jason M. Bills250fa632019-08-28 15:58:25 -07001008 }
Jason M. Bills84951142020-04-17 15:57:11 -07001009
Jason M. Bills250fa632019-08-28 15:58:25 -07001010 cpu2VRHotEvent.async_wait(boost::asio::posix::stream_descriptor::wait_read,
1011 [](const boost::system::error_code ec) {
1012 if (ec)
1013 {
1014 std::cerr << "CPU 2 VRHot handler error: "
1015 << ec.message() << "\n";
1016 return;
1017 }
1018 cpu2VRHotHandler();
1019 });
1020}
1021
Jason M. Billse94f5e12019-09-13 11:11:34 -07001022static void cpu2MemABCDVRHotAssertHandler()
1023{
1024 cpuVRHotLog("CPU 2 Memory ABCD");
1025}
1026
Jason M. Bills9647ba72019-08-29 14:19:19 -07001027static void cpu2MemABCDVRHotHandler()
1028{
Jason M. Bills84951142020-04-17 15:57:11 -07001029 gpiod::line_event gpioLineEvent = cpu2MemABCDVRHotLine.event_read();
Jason M. Bills9647ba72019-08-29 14:19:19 -07001030
Jason M. Bills84951142020-04-17 15:57:11 -07001031 bool cpu2MemABCDVRHot =
1032 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
1033 if (cpu2MemABCDVRHot)
1034 {
1035 cpu2MemABCDVRHotAssertHandler();
Jason M. Bills9647ba72019-08-29 14:19:19 -07001036 }
Jason M. Bills84951142020-04-17 15:57:11 -07001037
Jason M. Bills9647ba72019-08-29 14:19:19 -07001038 cpu2MemABCDVRHotEvent.async_wait(
1039 boost::asio::posix::stream_descriptor::wait_read,
1040 [](const boost::system::error_code ec) {
1041 if (ec)
1042 {
1043 std::cerr << "CPU 2 Memory ABCD VRHot handler error: "
1044 << ec.message() << "\n";
1045 return;
1046 }
1047 cpu2MemABCDVRHotHandler();
1048 });
1049}
1050
Jason M. Billse94f5e12019-09-13 11:11:34 -07001051static void cpu2MemEFGHVRHotAssertHandler()
1052{
1053 cpuVRHotLog("CPU 2 Memory EFGH");
1054}
1055
Jason M. Bills9647ba72019-08-29 14:19:19 -07001056static void cpu2MemEFGHVRHotHandler()
1057{
Jason M. Bills84951142020-04-17 15:57:11 -07001058 gpiod::line_event gpioLineEvent = cpu2MemEFGHVRHotLine.event_read();
Jason M. Bills9647ba72019-08-29 14:19:19 -07001059
Jason M. Bills84951142020-04-17 15:57:11 -07001060 bool cpu2MemEFGHVRHot =
1061 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
1062 if (cpu2MemEFGHVRHot)
1063 {
1064 cpu2MemEFGHVRHotAssertHandler();
Jason M. Bills9647ba72019-08-29 14:19:19 -07001065 }
Jason M. Bills84951142020-04-17 15:57:11 -07001066
Jason M. Bills9647ba72019-08-29 14:19:19 -07001067 cpu2MemEFGHVRHotEvent.async_wait(
1068 boost::asio::posix::stream_descriptor::wait_read,
1069 [](const boost::system::error_code ec) {
1070 if (ec)
1071 {
1072 std::cerr << "CPU 2 Memory EFGH VRHot handler error: "
1073 << ec.message() << "\n";
1074 return;
1075 }
1076 cpu2MemEFGHVRHotHandler();
1077 });
1078}
1079
Chen Yugange6c0f1c2019-08-02 20:36:42 +08001080static void pchThermtripHandler()
1081{
Yong Li1429ca82020-04-27 16:49:45 +08001082 std::vector<Association> associations;
1083
Jason M. Bills84951142020-04-17 15:57:11 -07001084 gpiod::line_event gpioLineEvent = pchThermtripLine.event_read();
Chen Yugange6c0f1c2019-08-02 20:36:42 +08001085
Jason M. Bills84951142020-04-17 15:57:11 -07001086 bool pchThermtrip =
1087 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
1088 if (pchThermtrip)
1089 {
1090 ssbThermTripLog();
Yong Li1429ca82020-04-27 16:49:45 +08001091 associations.emplace_back(
1092 "", "critical",
1093 "/xyz/openbmc_project/host_error_monitor/ssb_thermal_trip");
1094 associations.emplace_back("", "critical", host_error_monitor::rootPath);
Chen Yugange6c0f1c2019-08-02 20:36:42 +08001095 }
Yong Li1429ca82020-04-27 16:49:45 +08001096 else
1097 {
1098 associations.emplace_back("", "", "");
1099 }
1100 host_error_monitor::associationSSBThermTrip->set_property("Associations",
1101 associations);
Jason M. Bills84951142020-04-17 15:57:11 -07001102
Chen Yugange6c0f1c2019-08-02 20:36:42 +08001103 pchThermtripEvent.async_wait(
1104 boost::asio::posix::stream_descriptor::wait_read,
1105 [](const boost::system::error_code ec) {
1106 if (ec)
1107 {
1108 std::cerr << "PCH Thermal trip handler error: " << ec.message()
1109 << "\n";
1110 return;
1111 }
1112 pchThermtripHandler();
1113 });
1114}
1115
Jason M. Billscbf78532019-08-16 15:32:11 -07001116static std::bitset<MAX_CPUS> checkERRPinCPUs(const int errPin)
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001117{
Jason M. Billscbf78532019-08-16 15:32:11 -07001118 int errPinSts = (1 << errPin);
1119 std::bitset<MAX_CPUS> errPinCPUs = 0;
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001120 for (int cpu = 0, addr = MIN_CLIENT_ADDR; addr <= MAX_CLIENT_ADDR;
1121 cpu++, addr++)
1122 {
1123 if (peci_Ping(addr) == PECI_CC_SUCCESS)
1124 {
1125 uint8_t cc = 0;
1126 CPUModel model{};
1127 uint8_t stepping = 0;
1128 if (peci_GetCPUID(addr, &model, &stepping, &cc) != PECI_CC_SUCCESS)
1129 {
1130 std::cerr << "Cannot get CPUID!\n";
1131 continue;
1132 }
1133
1134 switch (model)
1135 {
1136 case skx:
1137 {
1138 // Check the ERRPINSTS to see if this is the CPU that caused
Jason M. Billscbf78532019-08-16 15:32:11 -07001139 // the ERRx (B(0) D8 F0 offset 210h)
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001140 uint32_t errpinsts = 0;
1141 if (peci_RdPCIConfigLocal(
1142 addr, 0, 8, 0, 0x210, sizeof(uint32_t),
1143 (uint8_t*)&errpinsts, &cc) == PECI_CC_SUCCESS)
1144 {
Jason M. Billscbf78532019-08-16 15:32:11 -07001145 errPinCPUs[cpu] = (errpinsts & errPinSts) != 0;
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001146 }
1147 break;
1148 }
1149 case icx:
1150 {
1151 // Check the ERRPINSTS to see if this is the CPU that caused
Jason M. Billscbf78532019-08-16 15:32:11 -07001152 // the ERRx (B(30) D0 F3 offset 274h) (Note: Bus 30 is
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001153 // accessed on PECI as bus 13)
1154 uint32_t errpinsts = 0;
1155 if (peci_RdEndPointConfigPciLocal(
1156 addr, 0, 13, 0, 3, 0x274, sizeof(uint32_t),
1157 (uint8_t*)&errpinsts, &cc) == PECI_CC_SUCCESS)
1158 {
Jason M. Billscbf78532019-08-16 15:32:11 -07001159 errPinCPUs[cpu] = (errpinsts & errPinSts) != 0;
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001160 }
1161 break;
1162 }
1163 }
1164 }
1165 }
Jason M. Billscbf78532019-08-16 15:32:11 -07001166 return errPinCPUs;
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001167}
1168
Jason M. Billscbf78532019-08-16 15:32:11 -07001169static void errXAssertHandler(const int errPin,
1170 boost::asio::steady_timer& errXAssertTimer)
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001171{
Jason M. Billscbf78532019-08-16 15:32:11 -07001172 // ERRx status is not guaranteed through the timeout, so save which
1173 // CPUs have it asserted
1174 std::bitset<MAX_CPUS> errPinCPUs = checkERRPinCPUs(errPin);
1175 errXAssertTimer.expires_after(std::chrono::milliseconds(errTimeoutMs));
1176 errXAssertTimer.async_wait([errPin, errPinCPUs](
1177 const boost::system::error_code ec) {
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001178 if (ec)
1179 {
1180 // operation_aborted is expected if timer is canceled before
1181 // completion.
1182 if (ec != boost::asio::error::operation_aborted)
1183 {
1184 std::cerr << "err2 timeout async_wait failed: " << ec.message()
1185 << "\n";
1186 }
1187 return;
1188 }
Jason M. Billscbf78532019-08-16 15:32:11 -07001189 std::cerr << "ERR" << std::to_string(errPin) << " asserted for "
1190 << std::to_string(errTimeoutMs) << " ms\n";
1191 if (errPinCPUs.count())
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001192 {
Jason M. Billscbf78532019-08-16 15:32:11 -07001193 for (int i = 0; i < errPinCPUs.size(); i++)
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001194 {
Jason M. Billscbf78532019-08-16 15:32:11 -07001195 if (errPinCPUs[i])
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001196 {
Jason M. Billscbf78532019-08-16 15:32:11 -07001197 cpuERRXLog(errPin, i);
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001198 }
1199 }
1200 }
1201 else
1202 {
Jason M. Billscbf78532019-08-16 15:32:11 -07001203 cpuERRXLog(errPin);
1204 }
1205 });
1206}
1207
Jason M. Bills8c584392019-08-19 11:05:51 -07001208static void err0AssertHandler()
1209{
1210 // Handle the standard ERR0 detection and logging
1211 const static constexpr int err0 = 0;
1212 errXAssertHandler(err0, err0AssertTimer);
1213}
1214
1215static void err0Handler()
1216{
1217 if (!hostOff)
1218 {
1219 gpiod::line_event gpioLineEvent = err0Line.event_read();
1220
1221 bool err0 = gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
1222 if (err0)
1223 {
1224 err0AssertHandler();
1225 }
1226 else
1227 {
1228 err0AssertTimer.cancel();
1229 }
1230 }
1231 err0Event.async_wait(boost::asio::posix::stream_descriptor::wait_read,
1232 [](const boost::system::error_code ec) {
1233 if (ec)
1234 {
1235 std::cerr
1236 << "err0 handler error: " << ec.message()
1237 << "\n";
1238 return;
1239 }
1240 err0Handler();
1241 });
1242}
1243
Jason M. Bills75af3962019-08-19 11:07:17 -07001244static void err1AssertHandler()
1245{
1246 // Handle the standard ERR1 detection and logging
1247 const static constexpr int err1 = 1;
1248 errXAssertHandler(err1, err1AssertTimer);
1249}
1250
1251static void err1Handler()
1252{
1253 if (!hostOff)
1254 {
1255 gpiod::line_event gpioLineEvent = err1Line.event_read();
1256
1257 bool err1 = gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
1258 if (err1)
1259 {
1260 err1AssertHandler();
1261 }
1262 else
1263 {
1264 err1AssertTimer.cancel();
1265 }
1266 }
1267 err1Event.async_wait(boost::asio::posix::stream_descriptor::wait_read,
1268 [](const boost::system::error_code ec) {
1269 if (ec)
1270 {
1271 std::cerr
1272 << "err1 handler error: " << ec.message()
1273 << "\n";
1274 return;
1275 }
1276 err1Handler();
1277 });
1278}
1279
Jason M. Billscbf78532019-08-16 15:32:11 -07001280static void err2AssertHandler()
1281{
1282 // Handle the standard ERR2 detection and logging
1283 const static constexpr int err2 = 2;
1284 errXAssertHandler(err2, err2AssertTimer);
1285 // Also handle reset for ERR2
1286 err2AssertTimer.async_wait([](const boost::system::error_code ec) {
1287 if (ec)
1288 {
1289 // operation_aborted is expected if timer is canceled before
1290 // completion.
1291 if (ec != boost::asio::error::operation_aborted)
1292 {
1293 std::cerr << "err2 timeout async_wait failed: " << ec.message()
1294 << "\n";
1295 }
1296 return;
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001297 }
1298 conn->async_method_call(
1299 [](boost::system::error_code ec,
1300 const std::variant<bool>& property) {
1301 if (ec)
1302 {
1303 return;
1304 }
1305 const bool* reset = std::get_if<bool>(&property);
1306 if (reset == nullptr)
1307 {
1308 std::cerr << "Unable to read reset on ERR2 value\n";
1309 return;
1310 }
Jason M. Billsb61766b2019-11-26 17:02:44 -08001311 startCrashdumpAndRecovery(*reset, "ERR2 Timeout");
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001312 },
1313 "xyz.openbmc_project.Settings",
1314 "/xyz/openbmc_project/control/processor_error_config",
1315 "org.freedesktop.DBus.Properties", "Get",
1316 "xyz.openbmc_project.Control.Processor.ErrConfig", "ResetOnERR2");
Yong Li061eb032020-02-26 15:06:18 +08001317
1318 beep(beepCPUErr2);
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001319 });
1320}
1321
1322static void err2Handler()
1323{
1324 if (!hostOff)
1325 {
1326 gpiod::line_event gpioLineEvent = err2Line.event_read();
1327
1328 bool err2 = gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
1329 if (err2)
1330 {
1331 err2AssertHandler();
1332 }
1333 else
1334 {
1335 err2AssertTimer.cancel();
1336 }
1337 }
1338 err2Event.async_wait(boost::asio::posix::stream_descriptor::wait_read,
1339 [](const boost::system::error_code ec) {
1340 if (ec)
1341 {
1342 std::cerr
1343 << "err2 handler error: " << ec.message()
1344 << "\n";
1345 return;
1346 }
1347 err2Handler();
1348 });
1349}
1350
Jason M. Bills89922f82019-08-06 11:10:02 -07001351static void smiAssertHandler()
1352{
1353 smiAssertTimer.expires_after(std::chrono::milliseconds(smiTimeoutMs));
1354 smiAssertTimer.async_wait([](const boost::system::error_code ec) {
1355 if (ec)
1356 {
1357 // operation_aborted is expected if timer is canceled before
1358 // completion.
1359 if (ec != boost::asio::error::operation_aborted)
1360 {
1361 std::cerr << "smi timeout async_wait failed: " << ec.message()
1362 << "\n";
1363 }
1364 return;
1365 }
1366 std::cerr << "SMI asserted for " << std::to_string(smiTimeoutMs)
1367 << " ms\n";
1368 smiTimeoutLog();
1369 conn->async_method_call(
1370 [](boost::system::error_code ec,
1371 const std::variant<bool>& property) {
1372 if (ec)
1373 {
1374 return;
1375 }
1376 const bool* reset = std::get_if<bool>(&property);
1377 if (reset == nullptr)
1378 {
1379 std::cerr << "Unable to read reset on SMI value\n";
1380 return;
1381 }
Jason M. Bills94785442020-01-07 15:22:09 -08001382#ifdef HOST_ERROR_CRASHDUMP_ON_SMI_TIMEOUT
Jason M. Billsb61766b2019-11-26 17:02:44 -08001383 startCrashdumpAndRecovery(*reset, "SMI Timeout");
Jason M. Bills94785442020-01-07 15:22:09 -08001384#else
1385 if (*reset)
1386 {
1387 std::cout << "Recovering the system\n";
1388 startPowerCycle();
1389 }
1390#endif
Jason M. Bills89922f82019-08-06 11:10:02 -07001391 },
1392 "xyz.openbmc_project.Settings",
1393 "/xyz/openbmc_project/control/bmc_reset_disables",
1394 "org.freedesktop.DBus.Properties", "Get",
1395 "xyz.openbmc_project.Control.ResetDisables", "ResetOnSMI");
1396 });
1397}
1398
1399static void smiHandler()
1400{
1401 if (!hostOff)
1402 {
1403 gpiod::line_event gpioLineEvent = smiLine.event_read();
1404
1405 bool smi = gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
1406 if (smi)
1407 {
1408 smiAssertHandler();
1409 }
1410 else
1411 {
1412 smiAssertTimer.cancel();
1413 }
1414 }
1415 smiEvent.async_wait(boost::asio::posix::stream_descriptor::wait_read,
1416 [](const boost::system::error_code ec) {
1417 if (ec)
1418 {
1419 std::cerr
1420 << "smi handler error: " << ec.message()
1421 << "\n";
1422 return;
1423 }
1424 smiHandler();
1425 });
1426}
1427
Jason M. Billsa15c2522019-08-16 10:01:44 -07001428static void initializeErrorState()
1429{
jayaprakash Mutyala53099c42020-03-15 00:16:26 +00001430 // Handle CPU1_MISMATCH if it's asserted now
1431 if (cpu1MismatchLine.get_value() == 1)
1432 {
1433 cpuMismatchLog(1);
1434 }
1435
1436 // Handle CPU2_MISMATCH if it's asserted now
1437 if (cpu2MismatchLine.get_value() == 1)
1438 {
1439 cpuMismatchLog(2);
1440 }
1441
Jason M. Billsa15c2522019-08-16 10:01:44 -07001442 // Handle CPU_CATERR if it's asserted now
1443 if (caterrLine.get_value() == 0)
1444 {
1445 caterrAssertHandler();
Yong Li1429ca82020-04-27 16:49:45 +08001446 std::vector<Association> associations;
1447 associations.emplace_back(
1448 "", "critical", "/xyz/openbmc_project/host_error_monitor/cat_err");
1449 associations.emplace_back("", "critical", host_error_monitor::rootPath);
1450 host_error_monitor::associationCATAssert->set_property("Associations",
1451 associations);
Jason M. Billsa15c2522019-08-16 10:01:44 -07001452 }
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001453
Jason M. Bills8c584392019-08-19 11:05:51 -07001454 // Handle CPU_ERR0 if it's asserted now
1455 if (err0Line.get_value() == 0)
1456 {
1457 err0AssertHandler();
1458 }
1459
Jason M. Bills75af3962019-08-19 11:07:17 -07001460 // Handle CPU_ERR1 if it's asserted now
1461 if (err1Line.get_value() == 0)
1462 {
1463 err1AssertHandler();
1464 }
1465
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001466 // Handle CPU_ERR2 if it's asserted now
1467 if (err2Line.get_value() == 0)
1468 {
1469 err2AssertHandler();
1470 }
Jason M. Bills89922f82019-08-06 11:10:02 -07001471
1472 // Handle SMI if it's asserted now
1473 if (smiLine.get_value() == 0)
1474 {
1475 smiAssertHandler();
1476 }
Jason M. Bills08866542019-08-16 12:04:19 -07001477
Jason M. Billse94f5e12019-09-13 11:11:34 -07001478 // Handle CPU1_THERMTRIP if it's asserted now
1479 if (cpu1ThermtripLine.get_value() == 0)
1480 {
1481 cpu1ThermtripAssertHandler();
1482 }
1483
1484 // Handle CPU2_THERMTRIP if it's asserted now
1485 if (cpu2ThermtripLine.get_value() == 0)
1486 {
1487 cpu2ThermtripAssertHandler();
1488 }
1489
jayaprakash Mutyala009adbc2019-12-24 22:08:07 +00001490 // Handle CPU1_MEM_THERM_EVENT (CPU1 DIMM Thermal trip) if it's asserted now
1491 if (cpu1MemtripLine.get_value() == 0)
1492 {
1493 memThermTripLog(1);
1494 }
1495
1496 // Handle CPU2_MEM_THERM_EVENT (CPU2 DIMM Thermal trip) if it's asserted now
1497 if (cpu2MemtripLine.get_value() == 0)
1498 {
1499 memThermTripLog(2);
1500 }
1501
Jason M. Billse94f5e12019-09-13 11:11:34 -07001502 // Handle CPU1_VRHOT if it's asserted now
1503 if (cpu1VRHotLine.get_value() == 0)
1504 {
1505 cpu1VRHotAssertHandler();
1506 }
1507
1508 // Handle CPU1_MEM_ABCD_VRHOT if it's asserted now
1509 if (cpu1MemABCDVRHotLine.get_value() == 0)
1510 {
1511 cpu1MemABCDVRHotAssertHandler();
1512 }
1513
1514 // Handle CPU1_MEM_EFGH_VRHOT if it's asserted now
1515 if (cpu1MemEFGHVRHotLine.get_value() == 0)
1516 {
1517 cpu1MemEFGHVRHotAssertHandler();
1518 }
1519
1520 // Handle CPU2_VRHOT if it's asserted now
1521 if (cpu2VRHotLine.get_value() == 0)
1522 {
1523 cpu2VRHotAssertHandler();
1524 }
1525
1526 // Handle CPU2_MEM_ABCD_VRHOT if it's asserted now
1527 if (cpu2MemABCDVRHotLine.get_value() == 0)
1528 {
1529 cpu2MemABCDVRHotAssertHandler();
1530 }
1531
1532 // Handle CPU2_MEM_EFGH_VRHOT if it's asserted now
1533 if (cpu2MemEFGHVRHotLine.get_value() == 0)
1534 {
1535 cpu2MemEFGHVRHotAssertHandler();
1536 }
1537
Jason M. Bills08866542019-08-16 12:04:19 -07001538 // Handle PCH_BMC_THERMTRIP if it's asserted now
1539 if (pchThermtripLine.get_value() == 0)
1540 {
1541 ssbThermTripLog();
Yong Li1429ca82020-04-27 16:49:45 +08001542 std::vector<Association> associations;
1543 associations.emplace_back(
1544 "", "critical",
1545 "/xyz/openbmc_project/host_error_monitor/ssb_thermal_trip");
1546 associations.emplace_back("", "critical", host_error_monitor::rootPath);
1547 host_error_monitor::associationSSBThermTrip->set_property(
1548 "Associations", associations);
Jason M. Bills08866542019-08-16 12:04:19 -07001549 }
Jason M. Billsa15c2522019-08-16 10:01:44 -07001550}
Jason M. Bills1490b142019-07-01 15:48:43 -07001551} // namespace host_error_monitor
1552
1553int main(int argc, char* argv[])
1554{
1555 // setup connection to dbus
1556 host_error_monitor::conn =
1557 std::make_shared<sdbusplus::asio::connection>(host_error_monitor::io);
1558
Jason M. Billsc4b91f22019-11-26 17:04:50 -08001559 // Host Error Monitor Service
Jason M. Bills1490b142019-07-01 15:48:43 -07001560 host_error_monitor::conn->request_name(
1561 "xyz.openbmc_project.HostErrorMonitor");
1562 sdbusplus::asio::object_server server =
1563 sdbusplus::asio::object_server(host_error_monitor::conn);
1564
Yong Li1429ca82020-04-27 16:49:45 +08001565 // Associations interface for led status
1566 std::vector<host_error_monitor::Association> associations;
1567 associations.emplace_back("", "", "");
1568 host_error_monitor::associationSSBThermTrip = server.add_interface(
1569 "/xyz/openbmc_project/host_error_monitor/ssb_thermal_trip",
1570 "xyz.openbmc_project.Association.Definitions");
1571 host_error_monitor::associationSSBThermTrip->register_property(
1572 "Associations", associations);
1573 host_error_monitor::associationSSBThermTrip->initialize();
1574
1575 host_error_monitor::associationCATAssert = server.add_interface(
1576 "/xyz/openbmc_project/host_error_monitor/cat_assert",
1577 "xyz.openbmc_project.Association.Definitions");
1578 host_error_monitor::associationCATAssert->register_property("Associations",
1579 associations);
1580 host_error_monitor::associationCATAssert->initialize();
1581
Jason M. Billsc4b91f22019-11-26 17:04:50 -08001582 // Restart Cause Interface
1583 host_error_monitor::hostErrorTimeoutIface =
1584 server.add_interface("/xyz/openbmc_project/host_error_monitor",
1585 "xyz.openbmc_project.HostErrorMonitor.Timeout");
1586
1587 host_error_monitor::hostErrorTimeoutIface->register_property(
1588 "IERRTimeoutMs", host_error_monitor::caterrTimeoutMs,
1589 [](const std::size_t& requested, std::size_t& resp) {
1590 if (requested > host_error_monitor::caterrTimeoutMsMax)
1591 {
1592 std::cerr << "IERRTimeoutMs update to " << requested
1593 << "ms rejected. Cannot be greater than "
1594 << host_error_monitor::caterrTimeoutMsMax << "ms.\n";
1595 return 0;
1596 }
1597 std::cerr << "IERRTimeoutMs updated to " << requested << "ms\n";
1598 host_error_monitor::caterrTimeoutMs = requested;
1599 resp = requested;
1600 return 1;
1601 },
1602 [](std::size_t& resp) { return host_error_monitor::caterrTimeoutMs; });
1603 host_error_monitor::hostErrorTimeoutIface->initialize();
1604
Jason M. Bills1490b142019-07-01 15:48:43 -07001605 // Start tracking host state
1606 std::shared_ptr<sdbusplus::bus::match::match> hostStateMonitor =
1607 host_error_monitor::startHostStateMonitor();
1608
jayaprakash Mutyala53099c42020-03-15 00:16:26 +00001609 // Request CPU1_MISMATCH GPIO events
1610 if (!host_error_monitor::requestGPIOInput(
1611 "CPU1_MISMATCH", host_error_monitor::cpu1MismatchLine))
1612 {
1613 return -1;
1614 }
1615
1616 // Request CPU2_MISMATCH GPIO events
1617 if (!host_error_monitor::requestGPIOInput(
1618 "CPU2_MISMATCH", host_error_monitor::cpu2MismatchLine))
1619 {
1620 return -1;
1621 }
1622
Jason M. Bills1490b142019-07-01 15:48:43 -07001623 // Initialize the host state
1624 host_error_monitor::initializeHostState();
1625
1626 // Request CPU_CATERR GPIO events
1627 if (!host_error_monitor::requestGPIOEvents(
1628 "CPU_CATERR", host_error_monitor::caterrHandler,
1629 host_error_monitor::caterrLine, host_error_monitor::caterrEvent))
1630 {
1631 return -1;
1632 }
1633
Jason M. Bills8c584392019-08-19 11:05:51 -07001634 // Request CPU_ERR0 GPIO events
1635 if (!host_error_monitor::requestGPIOEvents(
1636 "CPU_ERR0", host_error_monitor::err0Handler,
1637 host_error_monitor::err0Line, host_error_monitor::err0Event))
1638 {
1639 return -1;
1640 }
1641
Jason M. Bills75af3962019-08-19 11:07:17 -07001642 // Request CPU_ERR1 GPIO events
1643 if (!host_error_monitor::requestGPIOEvents(
1644 "CPU_ERR1", host_error_monitor::err1Handler,
1645 host_error_monitor::err1Line, host_error_monitor::err1Event))
1646 {
1647 return -1;
1648 }
1649
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001650 // Request CPU_ERR2 GPIO events
1651 if (!host_error_monitor::requestGPIOEvents(
1652 "CPU_ERR2", host_error_monitor::err2Handler,
1653 host_error_monitor::err2Line, host_error_monitor::err2Event))
1654 {
1655 return -1;
1656 }
1657
Jason M. Bills89922f82019-08-06 11:10:02 -07001658 // Request SMI GPIO events
1659 if (!host_error_monitor::requestGPIOEvents(
1660 "SMI", host_error_monitor::smiHandler, host_error_monitor::smiLine,
1661 host_error_monitor::smiEvent))
1662 {
1663 return -1;
1664 }
1665
Jason M. Bills45e87e02019-09-09 14:45:38 -07001666 // Request CPU1_FIVR_FAULT GPIO input
1667 if (!host_error_monitor::requestGPIOInput(
1668 "CPU1_FIVR_FAULT", host_error_monitor::cpu1FIVRFaultLine))
1669 {
1670 return -1;
1671 }
1672
Jason M. Bills78c5eed2019-08-28 14:00:40 -07001673 // Request CPU1_THERMTRIP GPIO events
1674 if (!host_error_monitor::requestGPIOEvents(
1675 "CPU1_THERMTRIP", host_error_monitor::cpu1ThermtripHandler,
1676 host_error_monitor::cpu1ThermtripLine,
1677 host_error_monitor::cpu1ThermtripEvent))
1678 {
1679 return -1;
1680 }
1681
Jason M. Bills45e87e02019-09-09 14:45:38 -07001682 // Request CPU2_FIVR_FAULT GPIO input
1683 if (!host_error_monitor::requestGPIOInput(
1684 "CPU2_FIVR_FAULT", host_error_monitor::cpu2FIVRFaultLine))
1685 {
1686 return -1;
1687 }
1688
Jason M. Bills78c5eed2019-08-28 14:00:40 -07001689 // Request CPU2_THERMTRIP GPIO events
1690 if (!host_error_monitor::requestGPIOEvents(
1691 "CPU2_THERMTRIP", host_error_monitor::cpu2ThermtripHandler,
1692 host_error_monitor::cpu2ThermtripLine,
1693 host_error_monitor::cpu2ThermtripEvent))
1694 {
1695 return -1;
1696 }
1697
Jason M. Bills250fa632019-08-28 15:58:25 -07001698 // Request CPU1_VRHOT GPIO events
1699 if (!host_error_monitor::requestGPIOEvents(
1700 "CPU1_VRHOT", host_error_monitor::cpu1VRHotHandler,
1701 host_error_monitor::cpu1VRHotLine,
1702 host_error_monitor::cpu1VRHotEvent))
1703 {
1704 return -1;
1705 }
1706
Jason M. Bills9647ba72019-08-29 14:19:19 -07001707 // Request CPU1_MEM_ABCD_VRHOT GPIO events
1708 if (!host_error_monitor::requestGPIOEvents(
1709 "CPU1_MEM_ABCD_VRHOT", host_error_monitor::cpu1MemABCDVRHotHandler,
1710 host_error_monitor::cpu1MemABCDVRHotLine,
1711 host_error_monitor::cpu1MemABCDVRHotEvent))
1712 {
1713 return -1;
1714 }
1715
1716 // Request CPU1_MEM_EFGH_VRHOT GPIO events
1717 if (!host_error_monitor::requestGPIOEvents(
1718 "CPU1_MEM_EFGH_VRHOT", host_error_monitor::cpu1MemEFGHVRHotHandler,
1719 host_error_monitor::cpu1MemEFGHVRHotLine,
1720 host_error_monitor::cpu1MemEFGHVRHotEvent))
1721 {
1722 return -1;
1723 }
1724
Jason M. Bills250fa632019-08-28 15:58:25 -07001725 // Request CPU2_VRHOT GPIO events
1726 if (!host_error_monitor::requestGPIOEvents(
1727 "CPU2_VRHOT", host_error_monitor::cpu2VRHotHandler,
1728 host_error_monitor::cpu2VRHotLine,
1729 host_error_monitor::cpu2VRHotEvent))
1730 {
1731 return -1;
1732 }
1733
Jason M. Bills9647ba72019-08-29 14:19:19 -07001734 // Request CPU2_MEM_ABCD_VRHOT GPIO events
1735 if (!host_error_monitor::requestGPIOEvents(
1736 "CPU2_MEM_ABCD_VRHOT", host_error_monitor::cpu2MemABCDVRHotHandler,
1737 host_error_monitor::cpu2MemABCDVRHotLine,
1738 host_error_monitor::cpu2MemABCDVRHotEvent))
1739 {
1740 return -1;
1741 }
1742
1743 // Request CPU2_MEM_EFGH_VRHOT GPIO events
1744 if (!host_error_monitor::requestGPIOEvents(
1745 "CPU2_MEM_EFGH_VRHOT", host_error_monitor::cpu2MemEFGHVRHotHandler,
1746 host_error_monitor::cpu2MemEFGHVRHotLine,
1747 host_error_monitor::cpu2MemEFGHVRHotEvent))
1748 {
1749 return -1;
1750 }
1751
Chen Yugange6c0f1c2019-08-02 20:36:42 +08001752 // Request PCH_BMC_THERMTRIP GPIO events
1753 if (!host_error_monitor::requestGPIOEvents(
1754 "PCH_BMC_THERMTRIP", host_error_monitor::pchThermtripHandler,
1755 host_error_monitor::pchThermtripLine,
1756 host_error_monitor::pchThermtripEvent))
1757 {
1758 return -1;
1759 }
1760
jayaprakash Mutyala009adbc2019-12-24 22:08:07 +00001761 // Request CPU1_MEM_THERM_EVENT GPIO events
1762 if (!host_error_monitor::requestGPIOEvents(
1763 "CPU1_MEM_THERM_EVENT", host_error_monitor::cpu1MemtripHandler,
1764 host_error_monitor::cpu1MemtripLine,
1765 host_error_monitor::cpu1MemtripEvent))
1766 {
1767 return -1;
1768 }
1769
1770 // Request CPU2_MEM_THERM_EVENT GPIO events
1771 if (!host_error_monitor::requestGPIOEvents(
1772 "CPU2_MEM_THERM_EVENT", host_error_monitor::cpu2MemtripHandler,
1773 host_error_monitor::cpu2MemtripLine,
1774 host_error_monitor::cpu2MemtripEvent))
1775 {
1776 return -1;
1777 }
1778
Jason M. Bills1490b142019-07-01 15:48:43 -07001779 host_error_monitor::io.run();
1780
1781 return 0;
1782}