blob: 1c6a2e70d3f81787f4bfccec6a8ef249e66dc0b1 [file] [log] [blame]
Jason M. Bills1490b142019-07-01 15:48:43 -07001/*
2// Copyright (c) 2019 Intel Corporation
3//
4// Licensed under the Apache License, Version 2.0 (the "License");
5// you may not use this file except in compliance with the License.
6// You may obtain a copy of the License at
7//
8// http://www.apache.org/licenses/LICENSE-2.0
9//
10// Unless required by applicable law or agreed to in writing, software
11// distributed under the License is distributed on an "AS IS" BASIS,
12// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13// See the License for the specific language governing permissions and
14// limitations under the License.
15*/
Jason M. Bills6a2cb692019-08-06 11:03:49 -070016#include <peci.h>
Chen Yugange6c0f1c2019-08-02 20:36:42 +080017#include <systemd/sd-journal.h>
18
Jason M. Bills1490b142019-07-01 15:48:43 -070019#include <boost/asio/posix/stream_descriptor.hpp>
20#include <gpiod.hpp>
Jason M. Bills1490b142019-07-01 15:48:43 -070021#include <sdbusplus/asio/object_server.hpp>
Jason M. Bills48e5dff2020-06-10 13:47:47 -070022
23#include <bitset>
24#include <iostream>
Jason M. Billsd1a19f62019-08-06 11:52:58 -070025#include <variant>
Jason M. Bills1490b142019-07-01 15:48:43 -070026
27namespace host_error_monitor
28{
29static boost::asio::io_service io;
30static std::shared_ptr<sdbusplus::asio::connection> conn;
Jason M. Billsc4b91f22019-11-26 17:04:50 -080031static std::shared_ptr<sdbusplus::asio::dbus_interface> hostErrorTimeoutIface;
Jason M. Bills1490b142019-07-01 15:48:43 -070032
Yong Li1429ca82020-04-27 16:49:45 +080033using Association = std::tuple<std::string, std::string, std::string>;
34static std::shared_ptr<sdbusplus::asio::dbus_interface> associationSSBThermTrip;
35static std::shared_ptr<sdbusplus::asio::dbus_interface> associationCATAssert;
36
37static const constexpr char* rootPath = "/xyz/openbmc_project/CallbackManager";
38
Jason M. Bills1490b142019-07-01 15:48:43 -070039static bool hostOff = true;
40
Jason M. Billsc4b91f22019-11-26 17:04:50 -080041static size_t caterrTimeoutMs = 2000;
42const static constexpr size_t caterrTimeoutMsMax = 600000; // 10 minutes maximum
Jason M. Billscbf78532019-08-16 15:32:11 -070043const static constexpr size_t errTimeoutMs = 90000;
Jason M. Bills89922f82019-08-06 11:10:02 -070044const static constexpr size_t smiTimeoutMs = 90000;
Jason M. Bills1490b142019-07-01 15:48:43 -070045const static constexpr size_t crashdumpTimeoutS = 300;
46
47// Timers
48// Timer for CATERR asserted
49static boost::asio::steady_timer caterrAssertTimer(io);
Jason M. Bills8c584392019-08-19 11:05:51 -070050// Timer for ERR0 asserted
51static boost::asio::steady_timer err0AssertTimer(io);
Jason M. Bills75af3962019-08-19 11:07:17 -070052// Timer for ERR1 asserted
53static boost::asio::steady_timer err1AssertTimer(io);
Jason M. Bills6a2cb692019-08-06 11:03:49 -070054// Timer for ERR2 asserted
55static boost::asio::steady_timer err2AssertTimer(io);
Jason M. Bills89922f82019-08-06 11:10:02 -070056// Timer for SMI asserted
57static boost::asio::steady_timer smiAssertTimer(io);
Jason M. Bills1490b142019-07-01 15:48:43 -070058
59// GPIO Lines and Event Descriptors
60static gpiod::line caterrLine;
61static boost::asio::posix::stream_descriptor caterrEvent(io);
Jason M. Bills8c584392019-08-19 11:05:51 -070062static gpiod::line err0Line;
63static boost::asio::posix::stream_descriptor err0Event(io);
Jason M. Bills75af3962019-08-19 11:07:17 -070064static gpiod::line err1Line;
65static boost::asio::posix::stream_descriptor err1Event(io);
Jason M. Bills6a2cb692019-08-06 11:03:49 -070066static gpiod::line err2Line;
67static boost::asio::posix::stream_descriptor err2Event(io);
Jason M. Bills89922f82019-08-06 11:10:02 -070068static gpiod::line smiLine;
69static boost::asio::posix::stream_descriptor smiEvent(io);
Jason M. Bills45e87e02019-09-09 14:45:38 -070070static gpiod::line cpu1FIVRFaultLine;
Jason M. Bills78c5eed2019-08-28 14:00:40 -070071static gpiod::line cpu1ThermtripLine;
72static boost::asio::posix::stream_descriptor cpu1ThermtripEvent(io);
Jason M. Bills45e87e02019-09-09 14:45:38 -070073static gpiod::line cpu2FIVRFaultLine;
Jason M. Bills78c5eed2019-08-28 14:00:40 -070074static gpiod::line cpu2ThermtripLine;
75static boost::asio::posix::stream_descriptor cpu2ThermtripEvent(io);
Jason M. Bills250fa632019-08-28 15:58:25 -070076static gpiod::line cpu1VRHotLine;
77static boost::asio::posix::stream_descriptor cpu1VRHotEvent(io);
78static gpiod::line cpu2VRHotLine;
Jason M. Bills9647ba72019-08-29 14:19:19 -070079static boost::asio::posix::stream_descriptor cpu1MemABCDVRHotEvent(io);
80static gpiod::line cpu1MemEFGHVRHotLine;
81static boost::asio::posix::stream_descriptor cpu1MemEFGHVRHotEvent(io);
82static gpiod::line cpu2MemABCDVRHotLine;
Jason M. Bills250fa632019-08-28 15:58:25 -070083static boost::asio::posix::stream_descriptor cpu2VRHotEvent(io);
Jason M. Bills9647ba72019-08-29 14:19:19 -070084static gpiod::line cpu1MemABCDVRHotLine;
85static boost::asio::posix::stream_descriptor cpu2MemABCDVRHotEvent(io);
86static gpiod::line cpu2MemEFGHVRHotLine;
87static boost::asio::posix::stream_descriptor cpu2MemEFGHVRHotEvent(io);
Chen Yugange6c0f1c2019-08-02 20:36:42 +080088//----------------------------------
89// PCH_BMC_THERMTRIP function related definition
90//----------------------------------
Chen Yugange6c0f1c2019-08-02 20:36:42 +080091static gpiod::line pchThermtripLine;
92static boost::asio::posix::stream_descriptor pchThermtripEvent(io);
jayaprakash Mutyala009adbc2019-12-24 22:08:07 +000093//----------------------------------
94// CPU_MEM_THERM_EVENT function related definition
95//----------------------------------
96static gpiod::line cpu1MemtripLine;
97static boost::asio::posix::stream_descriptor cpu1MemtripEvent(io);
98static gpiod::line cpu2MemtripLine;
99static boost::asio::posix::stream_descriptor cpu2MemtripEvent(io);
jayaprakash Mutyala53099c42020-03-15 00:16:26 +0000100//---------------------------------
101// CPU_MISMATCH function related definition
102//---------------------------------
103static gpiod::line cpu1MismatchLine;
104static gpiod::line cpu2MismatchLine;
Jason M. Bills1490b142019-07-01 15:48:43 -0700105
Yong Li061eb032020-02-26 15:06:18 +0800106// beep function for CPU error
Yong Li8c798c72020-04-22 15:29:07 +0800107const static constexpr uint8_t beepCPUIERR = 4;
Yong Li061eb032020-02-26 15:06:18 +0800108const static constexpr uint8_t beepCPUErr2 = 5;
109
110static void beep(const uint8_t& beepPriority)
111{
112 conn->async_method_call(
113 [](boost::system::error_code ec) {
114 if (ec)
115 {
116 std::cerr << "beep returned error with "
117 "async_method_call (ec = "
118 << ec << ")\n";
119 return;
120 }
121 },
122 "xyz.openbmc_project.BeepCode", "/xyz/openbmc_project/BeepCode",
123 "xyz.openbmc_project.BeepCode", "Beep", uint8_t(beepPriority));
124}
125
Jason M. Billsa3397932019-08-06 11:07:21 -0700126static void cpuIERRLog()
127{
128 sd_journal_send("MESSAGE=HostError: IERR", "PRIORITY=%i", LOG_INFO,
129 "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError",
130 "REDFISH_MESSAGE_ARGS=%s", "IERR", NULL);
131}
132
133static void cpuIERRLog(const int cpuNum)
134{
135 std::string msg = "IERR on CPU " + std::to_string(cpuNum + 1);
136
137 sd_journal_send("MESSAGE=HostError: %s", msg.c_str(), "PRIORITY=%i",
138 LOG_INFO, "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError",
139 "REDFISH_MESSAGE_ARGS=%s", msg.c_str(), NULL);
140}
141
142static void cpuIERRLog(const int cpuNum, const std::string& type)
143{
144 std::string msg = type + " IERR on CPU " + std::to_string(cpuNum + 1);
145
146 sd_journal_send("MESSAGE=HostError: %s", msg.c_str(), "PRIORITY=%i",
147 LOG_INFO, "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError",
148 "REDFISH_MESSAGE_ARGS=%s", msg.c_str(), NULL);
149}
150
Jason M. Billscbf78532019-08-16 15:32:11 -0700151static void cpuERRXLog(const int errPin)
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700152{
Jason M. Billscbf78532019-08-16 15:32:11 -0700153 std::string msg = "ERR" + std::to_string(errPin) + " Timeout";
154
155 sd_journal_send("MESSAGE=HostError: %s", msg.c_str(), "PRIORITY=%i",
156 LOG_INFO, "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError",
157 "REDFISH_MESSAGE_ARGS=%s", msg.c_str(), NULL);
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700158}
159
Jason M. Billscbf78532019-08-16 15:32:11 -0700160static void cpuERRXLog(const int errPin, const int cpuNum)
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700161{
Jason M. Billscbf78532019-08-16 15:32:11 -0700162 std::string msg = "ERR" + std::to_string(errPin) + " Timeout on CPU " +
163 std::to_string(cpuNum + 1);
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700164
165 sd_journal_send("MESSAGE=HostError: %s", msg.c_str(), "PRIORITY=%i",
166 LOG_INFO, "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError",
167 "REDFISH_MESSAGE_ARGS=%s", msg.c_str(), NULL);
168}
169
Jason M. Bills89922f82019-08-06 11:10:02 -0700170static void smiTimeoutLog()
171{
172 sd_journal_send("MESSAGE=HostError: SMI Timeout", "PRIORITY=%i", LOG_INFO,
173 "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError",
174 "REDFISH_MESSAGE_ARGS=%s", "SMI Timeout", NULL);
175}
176
Jason M. Bills45e87e02019-09-09 14:45:38 -0700177static void cpuBootFIVRFaultLog(const int cpuNum)
178{
179 std::string msg = "Boot FIVR Fault on CPU " + std::to_string(cpuNum);
180
181 sd_journal_send("MESSAGE=HostError: %s", msg.c_str(), "PRIORITY=%i",
182 LOG_INFO, "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError",
183 "REDFISH_MESSAGE_ARGS=%s", msg.c_str(), NULL);
184}
185
Jason M. Bills78c5eed2019-08-28 14:00:40 -0700186static void cpuThermTripLog(const int cpuNum)
187{
188 std::string msg = "CPU " + std::to_string(cpuNum) + " thermal trip";
189
190 sd_journal_send("MESSAGE=HostError: %s", msg.c_str(), "PRIORITY=%i",
191 LOG_INFO, "REDFISH_MESSAGE_ID=%s",
192 "OpenBMC.0.1.CPUThermalTrip", "REDFISH_MESSAGE_ARGS=%d",
193 cpuNum, NULL);
194}
195
jayaprakash Mutyala009adbc2019-12-24 22:08:07 +0000196static void memThermTripLog(const int cpuNum)
197{
198 std::string cpuNumber = "CPU " + std::to_string(cpuNum);
199 std::string msg = cpuNumber + " Memory Thermal trip.";
200
201 sd_journal_send("MESSAGE=HostError: %s", msg.c_str(), "PRIORITY=%i",
202 LOG_ERR, "REDFISH_MESSAGE_ID=%s",
203 "OpenBMC.0.1.MemoryThermTrip", "REDFISH_MESSAGE_ARGS=%s",
204 cpuNumber.c_str(), NULL);
205}
206
jayaprakash Mutyala53099c42020-03-15 00:16:26 +0000207static void cpuMismatchLog(const int cpuNum)
208{
209 std::string msg = "CPU " + std::to_string(cpuNum) + " mismatch";
210
211 sd_journal_send("MESSAGE= %s", msg.c_str(), "PRIORITY=%i", LOG_ERR,
212 "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUMismatch",
213 "REDFISH_MESSAGE_ARGS=%d", cpuNum, NULL);
214}
215
Jason M. Bills250fa632019-08-28 15:58:25 -0700216static void cpuVRHotLog(const std::string& vr)
217{
218 std::string msg = vr + " Voltage Regulator Overheated.";
219
220 sd_journal_send("MESSAGE=HostError: %s", msg.c_str(), "PRIORITY=%i",
221 LOG_INFO, "REDFISH_MESSAGE_ID=%s",
222 "OpenBMC.0.1.VoltageRegulatorOverheated",
223 "REDFISH_MESSAGE_ARGS=%s", vr.c_str(), NULL);
224}
225
Jason M. Bills08866542019-08-16 12:04:19 -0700226static void ssbThermTripLog()
227{
228 sd_journal_send("MESSAGE=HostError: SSB thermal trip", "PRIORITY=%i",
229 LOG_INFO, "REDFISH_MESSAGE_ID=%s",
230 "OpenBMC.0.1.SsbThermalTrip", NULL);
231}
232
Jason M. Billsa15c2522019-08-16 10:01:44 -0700233static void initializeErrorState();
Jason M. Bills1490b142019-07-01 15:48:43 -0700234static void initializeHostState()
235{
236 conn->async_method_call(
237 [](boost::system::error_code ec,
238 const std::variant<std::string>& property) {
239 if (ec)
240 {
241 return;
242 }
243 const std::string* state = std::get_if<std::string>(&property);
244 if (state == nullptr)
245 {
246 std::cerr << "Unable to read host state value\n";
247 return;
248 }
249 hostOff = *state == "xyz.openbmc_project.State.Host.HostState.Off";
Jason M. Billsa15c2522019-08-16 10:01:44 -0700250 // If the system is on, initialize the error state
251 if (!hostOff)
252 {
253 initializeErrorState();
254 }
Jason M. Bills1490b142019-07-01 15:48:43 -0700255 },
256 "xyz.openbmc_project.State.Host", "/xyz/openbmc_project/state/host0",
257 "org.freedesktop.DBus.Properties", "Get",
258 "xyz.openbmc_project.State.Host", "CurrentHostState");
259}
260
261static std::shared_ptr<sdbusplus::bus::match::match> startHostStateMonitor()
262{
263 return std::make_shared<sdbusplus::bus::match::match>(
264 *conn,
265 "type='signal',interface='org.freedesktop.DBus.Properties',"
Jason M. Bills2fbb9ea2020-06-19 14:46:54 -0700266 "member='PropertiesChanged',arg0='xyz.openbmc_project.State.Host'",
Jason M. Bills1490b142019-07-01 15:48:43 -0700267 [](sdbusplus::message::message& msg) {
268 std::string interfaceName;
269 boost::container::flat_map<std::string, std::variant<std::string>>
270 propertiesChanged;
Jason M. Bills1490b142019-07-01 15:48:43 -0700271 try
272 {
273 msg.read(interfaceName, propertiesChanged);
Jason M. Bills1490b142019-07-01 15:48:43 -0700274 }
275 catch (std::exception& e)
276 {
277 std::cerr << "Unable to read host state\n";
278 return;
279 }
Jason M. Bills566ccc42020-06-18 16:38:26 -0700280 // We only want to check for CurrentHostState
281 if (propertiesChanged.begin()->first != "CurrentHostState")
282 {
283 return;
284 }
285 std::string* state =
286 std::get_if<std::string>(&(propertiesChanged.begin()->second));
287 if (state == nullptr)
288 {
289 std::cerr << propertiesChanged.begin()->first
290 << " property invalid\n";
291 return;
292 }
293
294 hostOff = *state == "xyz.openbmc_project.State.Host.HostState.Off";
Jason M. Bills1490b142019-07-01 15:48:43 -0700295
Jason M. Bills1490b142019-07-01 15:48:43 -0700296 if (hostOff)
297 {
Jason M. Billse94f5e12019-09-13 11:11:34 -0700298 // No host events should fire while off, so cancel any pending
299 // timers
Jason M. Bills1490b142019-07-01 15:48:43 -0700300 caterrAssertTimer.cancel();
Jason M. Bills8c584392019-08-19 11:05:51 -0700301 err0AssertTimer.cancel();
Jason M. Bills75af3962019-08-19 11:07:17 -0700302 err1AssertTimer.cancel();
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700303 err2AssertTimer.cancel();
Jason M. Bills89922f82019-08-06 11:10:02 -0700304 smiAssertTimer.cancel();
Jason M. Bills1490b142019-07-01 15:48:43 -0700305 }
Jason M. Billse94f5e12019-09-13 11:11:34 -0700306 else
307 {
308 // Handle any initial errors when the host turns on
309 initializeErrorState();
310 }
Jason M. Bills1490b142019-07-01 15:48:43 -0700311 });
312}
313
314static bool requestGPIOEvents(
315 const std::string& name, const std::function<void()>& handler,
316 gpiod::line& gpioLine,
317 boost::asio::posix::stream_descriptor& gpioEventDescriptor)
318{
319 // Find the GPIO line
320 gpioLine = gpiod::find_line(name);
321 if (!gpioLine)
322 {
323 std::cerr << "Failed to find the " << name << " line\n";
324 return false;
325 }
326
327 try
328 {
329 gpioLine.request(
330 {"host-error-monitor", gpiod::line_request::EVENT_BOTH_EDGES});
331 }
332 catch (std::exception&)
333 {
334 std::cerr << "Failed to request events for " << name << "\n";
335 return false;
336 }
337
338 int gpioLineFd = gpioLine.event_get_fd();
339 if (gpioLineFd < 0)
340 {
341 std::cerr << "Failed to get " << name << " fd\n";
342 return false;
343 }
344
345 gpioEventDescriptor.assign(gpioLineFd);
346
347 gpioEventDescriptor.async_wait(
348 boost::asio::posix::stream_descriptor::wait_read,
349 [&name, handler](const boost::system::error_code ec) {
350 if (ec)
351 {
352 std::cerr << name << " fd handler error: " << ec.message()
353 << "\n";
354 return;
355 }
356 handler();
357 });
358 return true;
359}
360
Jason M. Bills45e87e02019-09-09 14:45:38 -0700361static bool requestGPIOInput(const std::string& name, gpiod::line& gpioLine)
362{
363 // Find the GPIO line
364 gpioLine = gpiod::find_line(name);
365 if (!gpioLine)
366 {
367 std::cerr << "Failed to find the " << name << " line.\n";
368 return false;
369 }
370
371 // Request GPIO input
372 try
373 {
374 gpioLine.request({__FUNCTION__, gpiod::line_request::DIRECTION_INPUT});
375 }
376 catch (std::exception&)
377 {
378 std::cerr << "Failed to request " << name << " input\n";
379 return false;
380 }
381
382 return true;
383}
384
Jason M. Bills1490b142019-07-01 15:48:43 -0700385static void startPowerCycle()
386{
387 conn->async_method_call(
388 [](boost::system::error_code ec) {
389 if (ec)
390 {
391 std::cerr << "failed to set Chassis State\n";
392 }
393 },
394 "xyz.openbmc_project.State.Chassis",
395 "/xyz/openbmc_project/state/chassis0",
396 "org.freedesktop.DBus.Properties", "Set",
397 "xyz.openbmc_project.State.Chassis", "RequestedPowerTransition",
398 std::variant<std::string>{
399 "xyz.openbmc_project.State.Chassis.Transition.PowerCycle"});
400}
401
Jason M. Billsb61766b2019-11-26 17:02:44 -0800402static void startCrashdumpAndRecovery(bool recoverSystem,
403 const std::string& triggerType)
Jason M. Bills1490b142019-07-01 15:48:43 -0700404{
405 std::cout << "Starting crashdump\n";
406 static std::shared_ptr<sdbusplus::bus::match::match> crashdumpCompleteMatch;
407 static boost::asio::steady_timer crashdumpTimer(io);
408
409 crashdumpCompleteMatch = std::make_shared<sdbusplus::bus::match::match>(
410 *conn,
411 "type='signal',interface='org.freedesktop.DBus.Properties',"
412 "member='PropertiesChanged',arg0namespace='com.intel.crashdump'",
413 [recoverSystem](sdbusplus::message::message& msg) {
414 crashdumpTimer.cancel();
415 std::cout << "Crashdump completed\n";
416 if (recoverSystem)
417 {
418 std::cout << "Recovering the system\n";
419 startPowerCycle();
420 }
421 crashdumpCompleteMatch.reset();
422 });
423
424 crashdumpTimer.expires_after(std::chrono::seconds(crashdumpTimeoutS));
425 crashdumpTimer.async_wait([](const boost::system::error_code ec) {
426 if (ec)
427 {
428 // operation_aborted is expected if timer is canceled
429 if (ec != boost::asio::error::operation_aborted)
430 {
431 std::cerr << "Crashdump async_wait failed: " << ec.message()
432 << "\n";
433 }
434 std::cout << "Crashdump timer canceled\n";
435 return;
436 }
437 std::cerr << "Crashdump failed to complete before timeout\n";
438 crashdumpCompleteMatch.reset();
439 });
440
441 conn->async_method_call(
442 [](boost::system::error_code ec) {
443 if (ec)
444 {
445 std::cerr << "failed to start Crashdump\n";
446 crashdumpTimer.cancel();
447 crashdumpCompleteMatch.reset();
448 }
449 },
450 "com.intel.crashdump", "/com/intel/crashdump",
Jason M. Billsb61766b2019-11-26 17:02:44 -0800451 "com.intel.crashdump.Stored", "GenerateStoredLog", triggerType);
Jason M. Bills1490b142019-07-01 15:48:43 -0700452}
453
Jason M. Billsd1a19f62019-08-06 11:52:58 -0700454static void incrementCPUErrorCount(int cpuNum)
455{
456 std::string propertyName = "ErrorCountCPU" + std::to_string(cpuNum + 1);
457
458 // Get the current count
459 conn->async_method_call(
460 [propertyName](boost::system::error_code ec,
461 const std::variant<uint8_t>& property) {
462 if (ec)
463 {
464 std::cerr << "Failed to read " << propertyName << ": "
465 << ec.message() << "\n";
466 return;
467 }
468 const uint8_t* errorCountVariant = std::get_if<uint8_t>(&property);
469 if (errorCountVariant == nullptr)
470 {
471 std::cerr << propertyName << " invalid\n";
472 return;
473 }
474 uint8_t errorCount = *errorCountVariant;
475 if (errorCount == std::numeric_limits<uint8_t>::max())
476 {
477 std::cerr << "Maximum error count reached\n";
478 return;
479 }
480 // Increment the count
481 errorCount++;
482 conn->async_method_call(
483 [propertyName](boost::system::error_code ec) {
484 if (ec)
485 {
486 std::cerr << "Failed to set " << propertyName << ": "
487 << ec.message() << "\n";
488 }
489 },
490 "xyz.openbmc_project.Settings",
491 "/xyz/openbmc_project/control/processor_error_config",
492 "org.freedesktop.DBus.Properties", "Set",
493 "xyz.openbmc_project.Control.Processor.ErrConfig", propertyName,
494 std::variant<uint8_t>{errorCount});
495 },
496 "xyz.openbmc_project.Settings",
497 "/xyz/openbmc_project/control/processor_error_config",
498 "org.freedesktop.DBus.Properties", "Get",
499 "xyz.openbmc_project.Control.Processor.ErrConfig", propertyName);
500}
501
Jason M. Billsa3397932019-08-06 11:07:21 -0700502static bool checkIERRCPUs()
503{
504 bool cpuIERRFound = false;
505 for (int cpu = 0, addr = MIN_CLIENT_ADDR; addr <= MAX_CLIENT_ADDR;
506 cpu++, addr++)
507 {
508 uint8_t cc = 0;
509 CPUModel model{};
510 uint8_t stepping = 0;
511 if (peci_GetCPUID(addr, &model, &stepping, &cc) != PECI_CC_SUCCESS)
512 {
513 std::cerr << "Cannot get CPUID!\n";
514 continue;
515 }
516
517 switch (model)
518 {
519 case skx:
520 {
521 // First check the MCA_ERR_SRC_LOG to see if this is the CPU
522 // that caused the IERR
523 uint32_t mcaErrSrcLog = 0;
524 if (peci_RdPkgConfig(addr, 0, 5, 4, (uint8_t*)&mcaErrSrcLog,
525 &cc) != PECI_CC_SUCCESS)
526 {
527 continue;
528 }
529 // Check MSMI_INTERNAL (20) and IERR_INTERNAL (27)
530 if ((mcaErrSrcLog & (1 << 20)) || (mcaErrSrcLog & (1 << 27)))
531 {
532 // TODO: Light the CPU fault LED?
533 cpuIERRFound = true;
Jason M. Billsd1a19f62019-08-06 11:52:58 -0700534 incrementCPUErrorCount(cpu);
Jason M. Billsa3397932019-08-06 11:07:21 -0700535 // Next check if it's a CPU/VR mismatch by reading the
536 // IA32_MC4_STATUS MSR (0x411)
537 uint64_t mc4Status = 0;
538 if (peci_RdIAMSR(addr, 0, 0x411, &mc4Status, &cc) !=
539 PECI_CC_SUCCESS)
540 {
541 continue;
542 }
543 // Check MSEC bits 31:24 for
544 // MCA_SVID_VCCIN_VR_ICC_MAX_FAILURE (0x40),
545 // MCA_SVID_VCCIN_VR_VOUT_FAILURE (0x42), or
546 // MCA_SVID_CPU_VR_CAPABILITY_ERROR (0x43)
547 if ((mc4Status & (0x40 << 24)) ||
548 (mc4Status & (0x42 << 24)) ||
549 (mc4Status & (0x43 << 24)))
550 {
551 cpuIERRLog(cpu, "CPU/VR Mismatch");
552 continue;
553 }
554
555 // Next check if it's a Core FIVR fault by looking for a
556 // non-zero value of CORE_FIVR_ERR_LOG (B(1) D30 F2 offset
557 // 80h)
558 uint32_t coreFIVRErrLog = 0;
559 if (peci_RdPCIConfigLocal(
560 addr, 1, 30, 2, 0x80, sizeof(uint32_t),
561 (uint8_t*)&coreFIVRErrLog, &cc) != PECI_CC_SUCCESS)
562 {
563 continue;
564 }
565 if (coreFIVRErrLog)
566 {
567 cpuIERRLog(cpu, "Core FIVR Fault");
568 continue;
569 }
570
571 // Next check if it's an Uncore FIVR fault by looking for a
572 // non-zero value of UNCORE_FIVR_ERR_LOG (B(1) D30 F2 offset
573 // 84h)
574 uint32_t uncoreFIVRErrLog = 0;
575 if (peci_RdPCIConfigLocal(addr, 1, 30, 2, 0x84,
576 sizeof(uint32_t),
577 (uint8_t*)&uncoreFIVRErrLog,
578 &cc) != PECI_CC_SUCCESS)
579 {
580 continue;
581 }
582 if (uncoreFIVRErrLog)
583 {
584 cpuIERRLog(cpu, "Uncore FIVR Fault");
585 continue;
586 }
587
588 // Last if CORE_FIVR_ERR_LOG and UNCORE_FIVR_ERR_LOG are
589 // both zero, but MSEC bits 31:24 have either
590 // MCA_FIVR_CATAS_OVERVOL_FAULT (0x51) or
591 // MCA_FIVR_CATAS_OVERCUR_FAULT (0x52), then log it as an
592 // uncore FIVR fault
593 if (!coreFIVRErrLog && !uncoreFIVRErrLog &&
594 ((mc4Status & (0x51 << 24)) ||
595 (mc4Status & (0x52 << 24))))
596 {
597 cpuIERRLog(cpu, "Uncore FIVR Fault");
598 continue;
599 }
600 cpuIERRLog(cpu);
601 }
602 break;
603 }
604 case icx:
605 {
606 // First check the MCA_ERR_SRC_LOG to see if this is the CPU
607 // that caused the IERR
608 uint32_t mcaErrSrcLog = 0;
609 if (peci_RdPkgConfig(addr, 0, 5, 4, (uint8_t*)&mcaErrSrcLog,
610 &cc) != PECI_CC_SUCCESS)
611 {
612 continue;
613 }
614 // Check MSMI_INTERNAL (20) and IERR_INTERNAL (27)
615 if ((mcaErrSrcLog & (1 << 20)) || (mcaErrSrcLog & (1 << 27)))
616 {
617 // TODO: Light the CPU fault LED?
618 cpuIERRFound = true;
Jason M. Billsd1a19f62019-08-06 11:52:58 -0700619 incrementCPUErrorCount(cpu);
Jason M. Billsa3397932019-08-06 11:07:21 -0700620 // Next check if it's a CPU/VR mismatch by reading the
621 // IA32_MC4_STATUS MSR (0x411)
622 uint64_t mc4Status = 0;
623 if (peci_RdIAMSR(addr, 0, 0x411, &mc4Status, &cc) !=
624 PECI_CC_SUCCESS)
625 {
626 continue;
627 }
628 // TODO: Update MSEC/MSCOD_31_24 check
629 // Check MSEC bits 31:24 for
630 // MCA_SVID_VCCIN_VR_ICC_MAX_FAILURE (0x40),
631 // MCA_SVID_VCCIN_VR_VOUT_FAILURE (0x42), or
632 // MCA_SVID_CPU_VR_CAPABILITY_ERROR (0x43)
633 if ((mc4Status & (0x40 << 24)) ||
634 (mc4Status & (0x42 << 24)) ||
635 (mc4Status & (0x43 << 24)))
636 {
637 cpuIERRLog(cpu, "CPU/VR Mismatch");
638 continue;
639 }
640
641 // Next check if it's a Core FIVR fault by looking for a
642 // non-zero value of CORE_FIVR_ERR_LOG (B(31) D30 F2 offsets
643 // C0h and C4h) (Note: Bus 31 is accessed on PECI as bus 14)
644 uint32_t coreFIVRErrLog0 = 0;
645 uint32_t coreFIVRErrLog1 = 0;
646 if (peci_RdEndPointConfigPciLocal(
647 addr, 0, 14, 30, 2, 0xC0, sizeof(uint32_t),
648 (uint8_t*)&coreFIVRErrLog0, &cc) != PECI_CC_SUCCESS)
649 {
650 continue;
651 }
652 if (peci_RdEndPointConfigPciLocal(
653 addr, 0, 14, 30, 2, 0xC4, sizeof(uint32_t),
654 (uint8_t*)&coreFIVRErrLog1, &cc) != PECI_CC_SUCCESS)
655 {
656 continue;
657 }
658 if (coreFIVRErrLog0 || coreFIVRErrLog1)
659 {
660 cpuIERRLog(cpu, "Core FIVR Fault");
661 continue;
662 }
663
664 // Next check if it's an Uncore FIVR fault by looking for a
665 // non-zero value of UNCORE_FIVR_ERR_LOG (B(31) D30 F2
666 // offset 84h) (Note: Bus 31 is accessed on PECI as bus 14)
667 uint32_t uncoreFIVRErrLog = 0;
668 if (peci_RdEndPointConfigPciLocal(
669 addr, 0, 14, 30, 2, 0x84, sizeof(uint32_t),
670 (uint8_t*)&uncoreFIVRErrLog,
671 &cc) != PECI_CC_SUCCESS)
672 {
673 continue;
674 }
675 if (uncoreFIVRErrLog)
676 {
677 cpuIERRLog(cpu, "Uncore FIVR Fault");
678 continue;
679 }
680
681 // TODO: Update MSEC/MSCOD_31_24 check
682 // Last if CORE_FIVR_ERR_LOG and UNCORE_FIVR_ERR_LOG are
683 // both zero, but MSEC bits 31:24 have either
684 // MCA_FIVR_CATAS_OVERVOL_FAULT (0x51) or
685 // MCA_FIVR_CATAS_OVERCUR_FAULT (0x52), then log it as an
686 // uncore FIVR fault
687 if (!coreFIVRErrLog0 && !coreFIVRErrLog1 &&
688 !uncoreFIVRErrLog &&
689 ((mc4Status & (0x51 << 24)) ||
690 (mc4Status & (0x52 << 24))))
691 {
692 cpuIERRLog(cpu, "Uncore FIVR Fault");
693 continue;
694 }
695 cpuIERRLog(cpu);
696 }
697 break;
698 }
699 }
700 }
701 return cpuIERRFound;
702}
703
Jason M. Billsa15c2522019-08-16 10:01:44 -0700704static void caterrAssertHandler()
705{
Jason M. Billsa15c2522019-08-16 10:01:44 -0700706 caterrAssertTimer.expires_after(std::chrono::milliseconds(caterrTimeoutMs));
707 caterrAssertTimer.async_wait([](const boost::system::error_code ec) {
708 if (ec)
709 {
710 // operation_aborted is expected if timer is canceled
711 // before completion.
712 if (ec != boost::asio::error::operation_aborted)
713 {
714 std::cerr << "caterr timeout async_wait failed: "
715 << ec.message() << "\n";
716 }
Jason M. Billsa15c2522019-08-16 10:01:44 -0700717 return;
718 }
Jason M. Billsa3397932019-08-06 11:07:21 -0700719 std::cerr << "CATERR asserted for " << std::to_string(caterrTimeoutMs)
720 << " ms\n";
Yong Li8c798c72020-04-22 15:29:07 +0800721 beep(beepCPUIERR);
Jason M. Billsa3397932019-08-06 11:07:21 -0700722 if (!checkIERRCPUs())
723 {
724 cpuIERRLog();
725 }
Jason M. Billsa15c2522019-08-16 10:01:44 -0700726 conn->async_method_call(
727 [](boost::system::error_code ec,
728 const std::variant<bool>& property) {
729 if (ec)
730 {
731 return;
732 }
733 const bool* reset = std::get_if<bool>(&property);
734 if (reset == nullptr)
735 {
736 std::cerr << "Unable to read reset on CATERR value\n";
737 return;
738 }
Jason M. Billsb61766b2019-11-26 17:02:44 -0800739 startCrashdumpAndRecovery(*reset, "IERR");
Jason M. Billsa15c2522019-08-16 10:01:44 -0700740 },
741 "xyz.openbmc_project.Settings",
742 "/xyz/openbmc_project/control/processor_error_config",
743 "org.freedesktop.DBus.Properties", "Get",
744 "xyz.openbmc_project.Control.Processor.ErrConfig", "ResetOnCATERR");
745 });
746}
747
Jason M. Bills1490b142019-07-01 15:48:43 -0700748static void caterrHandler()
749{
750 if (!hostOff)
751 {
752 gpiod::line_event gpioLineEvent = caterrLine.event_read();
753
754 bool caterr =
755 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
Yong Li1429ca82020-04-27 16:49:45 +0800756
757 std::vector<Association> associations;
Jason M. Bills1490b142019-07-01 15:48:43 -0700758 if (caterr)
759 {
Jason M. Billsa15c2522019-08-16 10:01:44 -0700760 caterrAssertHandler();
Yong Li1429ca82020-04-27 16:49:45 +0800761 associations.emplace_back(
762 "", "critical",
763 "/xyz/openbmc_project/host_error_monitor/cat_error");
764 associations.emplace_back("", "critical",
765 host_error_monitor::rootPath);
Jason M. Bills1490b142019-07-01 15:48:43 -0700766 }
767 else
768 {
769 caterrAssertTimer.cancel();
Yong Li1429ca82020-04-27 16:49:45 +0800770 associations.emplace_back("", "", "");
Jason M. Bills1490b142019-07-01 15:48:43 -0700771 }
Yong Li1429ca82020-04-27 16:49:45 +0800772 host_error_monitor::associationCATAssert->set_property("Associations",
773 associations);
Jason M. Bills1490b142019-07-01 15:48:43 -0700774 }
775 caterrEvent.async_wait(boost::asio::posix::stream_descriptor::wait_read,
776 [](const boost::system::error_code ec) {
777 if (ec)
778 {
779 std::cerr << "caterr handler error: "
780 << ec.message() << "\n";
781 return;
782 }
783 caterrHandler();
784 });
785}
Jason M. Bills78c5eed2019-08-28 14:00:40 -0700786
Jason M. Billse94f5e12019-09-13 11:11:34 -0700787static void cpu1ThermtripAssertHandler()
788{
Jason M. Bills45e87e02019-09-09 14:45:38 -0700789 if (cpu1FIVRFaultLine.get_value() == 0)
790 {
791 cpuBootFIVRFaultLog(1);
792 }
793 else
794 {
795 cpuThermTripLog(1);
796 }
Jason M. Billse94f5e12019-09-13 11:11:34 -0700797}
798
Jason M. Bills78c5eed2019-08-28 14:00:40 -0700799static void cpu1ThermtripHandler()
800{
Jason M. Bills84951142020-04-17 15:57:11 -0700801 gpiod::line_event gpioLineEvent = cpu1ThermtripLine.event_read();
Jason M. Bills78c5eed2019-08-28 14:00:40 -0700802
Jason M. Bills84951142020-04-17 15:57:11 -0700803 bool cpu1Thermtrip =
804 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
805 if (cpu1Thermtrip)
806 {
807 cpu1ThermtripAssertHandler();
Jason M. Bills78c5eed2019-08-28 14:00:40 -0700808 }
Jason M. Bills84951142020-04-17 15:57:11 -0700809
Jason M. Bills78c5eed2019-08-28 14:00:40 -0700810 cpu1ThermtripEvent.async_wait(
811 boost::asio::posix::stream_descriptor::wait_read,
812 [](const boost::system::error_code ec) {
813 if (ec)
814 {
815 std::cerr << "CPU 1 Thermtrip handler error: " << ec.message()
816 << "\n";
817 return;
818 }
819 cpu1ThermtripHandler();
820 });
821}
822
jayaprakash Mutyala009adbc2019-12-24 22:08:07 +0000823static void cpu1MemtripHandler()
824{
Jason M. Bills5287c022020-05-19 11:16:09 -0700825 gpiod::line_event gpioLineEvent = cpu1MemtripLine.event_read();
jayaprakash Mutyala009adbc2019-12-24 22:08:07 +0000826
Jason M. Bills5287c022020-05-19 11:16:09 -0700827 bool cpu1Memtrip =
828 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
829 if (cpu1Memtrip)
830 {
831 memThermTripLog(1);
jayaprakash Mutyala009adbc2019-12-24 22:08:07 +0000832 }
Jason M. Bills5287c022020-05-19 11:16:09 -0700833
jayaprakash Mutyala009adbc2019-12-24 22:08:07 +0000834 cpu1MemtripEvent.async_wait(
835 boost::asio::posix::stream_descriptor::wait_read,
836 [](const boost::system::error_code ec) {
837 if (ec)
838 {
839 std::cerr << "CPU 1 Memory Thermaltrip handler error: "
840 << ec.message() << "\n";
841 return;
842 }
843 cpu1MemtripHandler();
844 });
845}
846
Jason M. Billse94f5e12019-09-13 11:11:34 -0700847static void cpu2ThermtripAssertHandler()
848{
Jason M. Bills45e87e02019-09-09 14:45:38 -0700849 if (cpu2FIVRFaultLine.get_value() == 0)
850 {
851 cpuBootFIVRFaultLog(2);
852 }
853 else
854 {
855 cpuThermTripLog(2);
856 }
Jason M. Billse94f5e12019-09-13 11:11:34 -0700857}
858
Jason M. Bills78c5eed2019-08-28 14:00:40 -0700859static void cpu2ThermtripHandler()
860{
Jason M. Bills84951142020-04-17 15:57:11 -0700861 gpiod::line_event gpioLineEvent = cpu2ThermtripLine.event_read();
Jason M. Bills78c5eed2019-08-28 14:00:40 -0700862
Jason M. Bills84951142020-04-17 15:57:11 -0700863 bool cpu2Thermtrip =
864 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
865 if (cpu2Thermtrip)
866 {
867 cpu2ThermtripAssertHandler();
Jason M. Bills78c5eed2019-08-28 14:00:40 -0700868 }
Jason M. Bills84951142020-04-17 15:57:11 -0700869
Jason M. Bills78c5eed2019-08-28 14:00:40 -0700870 cpu2ThermtripEvent.async_wait(
871 boost::asio::posix::stream_descriptor::wait_read,
872 [](const boost::system::error_code ec) {
873 if (ec)
874 {
875 std::cerr << "CPU 2 Thermtrip handler error: " << ec.message()
876 << "\n";
877 return;
878 }
879 cpu2ThermtripHandler();
880 });
881}
882
jayaprakash Mutyala009adbc2019-12-24 22:08:07 +0000883static void cpu2MemtripHandler()
884{
Jason M. Bills5287c022020-05-19 11:16:09 -0700885 gpiod::line_event gpioLineEvent = cpu2MemtripLine.event_read();
jayaprakash Mutyala009adbc2019-12-24 22:08:07 +0000886
Jason M. Bills5287c022020-05-19 11:16:09 -0700887 bool cpu2Memtrip =
888 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
889 if (cpu2Memtrip)
890 {
891 memThermTripLog(2);
jayaprakash Mutyala009adbc2019-12-24 22:08:07 +0000892 }
Jason M. Bills5287c022020-05-19 11:16:09 -0700893
jayaprakash Mutyala009adbc2019-12-24 22:08:07 +0000894 cpu2MemtripEvent.async_wait(
895 boost::asio::posix::stream_descriptor::wait_read,
896 [](const boost::system::error_code ec) {
897 if (ec)
898 {
899 std::cerr << "CPU 2 Memory Thermaltrip handler error: "
900 << ec.message() << "\n";
901 return;
902 }
903 cpu2MemtripHandler();
904 });
905}
906
Jason M. Billse94f5e12019-09-13 11:11:34 -0700907static void cpu1VRHotAssertHandler()
908{
909 cpuVRHotLog("CPU 1");
910}
911
Jason M. Bills250fa632019-08-28 15:58:25 -0700912static void cpu1VRHotHandler()
913{
Jason M. Bills84951142020-04-17 15:57:11 -0700914 gpiod::line_event gpioLineEvent = cpu1VRHotLine.event_read();
Jason M. Bills250fa632019-08-28 15:58:25 -0700915
Jason M. Bills84951142020-04-17 15:57:11 -0700916 bool cpu1VRHot =
917 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
918 if (cpu1VRHot)
919 {
920 cpu1VRHotAssertHandler();
Jason M. Bills250fa632019-08-28 15:58:25 -0700921 }
Jason M. Bills84951142020-04-17 15:57:11 -0700922
Jason M. Bills250fa632019-08-28 15:58:25 -0700923 cpu1VRHotEvent.async_wait(boost::asio::posix::stream_descriptor::wait_read,
924 [](const boost::system::error_code ec) {
925 if (ec)
926 {
927 std::cerr << "CPU 1 VRHot handler error: "
928 << ec.message() << "\n";
929 return;
930 }
931 cpu1VRHotHandler();
932 });
933}
934
Jason M. Billse94f5e12019-09-13 11:11:34 -0700935static void cpu1MemABCDVRHotAssertHandler()
936{
937 cpuVRHotLog("CPU 1 Memory ABCD");
938}
939
Jason M. Bills9647ba72019-08-29 14:19:19 -0700940static void cpu1MemABCDVRHotHandler()
941{
Jason M. Bills84951142020-04-17 15:57:11 -0700942 gpiod::line_event gpioLineEvent = cpu1MemABCDVRHotLine.event_read();
Jason M. Bills9647ba72019-08-29 14:19:19 -0700943
Jason M. Bills84951142020-04-17 15:57:11 -0700944 bool cpu1MemABCDVRHot =
945 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
946 if (cpu1MemABCDVRHot)
947 {
948 cpu1MemABCDVRHotAssertHandler();
Jason M. Bills9647ba72019-08-29 14:19:19 -0700949 }
Jason M. Bills84951142020-04-17 15:57:11 -0700950
Jason M. Bills9647ba72019-08-29 14:19:19 -0700951 cpu1MemABCDVRHotEvent.async_wait(
952 boost::asio::posix::stream_descriptor::wait_read,
953 [](const boost::system::error_code ec) {
954 if (ec)
955 {
956 std::cerr << "CPU 1 Memory ABCD VRHot handler error: "
957 << ec.message() << "\n";
958 return;
959 }
960 cpu1MemABCDVRHotHandler();
961 });
962}
963
Jason M. Billse94f5e12019-09-13 11:11:34 -0700964static void cpu1MemEFGHVRHotAssertHandler()
965{
966 cpuVRHotLog("CPU 1 Memory EFGH");
967}
968
Jason M. Bills9647ba72019-08-29 14:19:19 -0700969static void cpu1MemEFGHVRHotHandler()
970{
Jason M. Bills84951142020-04-17 15:57:11 -0700971 gpiod::line_event gpioLineEvent = cpu1MemEFGHVRHotLine.event_read();
Jason M. Bills9647ba72019-08-29 14:19:19 -0700972
Jason M. Bills84951142020-04-17 15:57:11 -0700973 bool cpu1MemEFGHVRHot =
974 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
975 if (cpu1MemEFGHVRHot)
976 {
977 cpu1MemEFGHVRHotAssertHandler();
Jason M. Bills9647ba72019-08-29 14:19:19 -0700978 }
Jason M. Bills84951142020-04-17 15:57:11 -0700979
Jason M. Bills9647ba72019-08-29 14:19:19 -0700980 cpu1MemEFGHVRHotEvent.async_wait(
981 boost::asio::posix::stream_descriptor::wait_read,
982 [](const boost::system::error_code ec) {
983 if (ec)
984 {
985 std::cerr << "CPU 1 Memory EFGH VRHot handler error: "
986 << ec.message() << "\n";
987 return;
988 }
989 cpu1MemEFGHVRHotHandler();
990 });
991}
992
Jason M. Billse94f5e12019-09-13 11:11:34 -0700993static void cpu2VRHotAssertHandler()
994{
995 cpuVRHotLog("CPU 2");
996}
997
Jason M. Bills250fa632019-08-28 15:58:25 -0700998static void cpu2VRHotHandler()
999{
Jason M. Bills84951142020-04-17 15:57:11 -07001000 gpiod::line_event gpioLineEvent = cpu2VRHotLine.event_read();
Jason M. Bills250fa632019-08-28 15:58:25 -07001001
Jason M. Bills84951142020-04-17 15:57:11 -07001002 bool cpu2VRHot =
1003 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
1004 if (cpu2VRHot)
1005 {
1006 cpu2VRHotAssertHandler();
Jason M. Bills250fa632019-08-28 15:58:25 -07001007 }
Jason M. Bills84951142020-04-17 15:57:11 -07001008
Jason M. Bills250fa632019-08-28 15:58:25 -07001009 cpu2VRHotEvent.async_wait(boost::asio::posix::stream_descriptor::wait_read,
1010 [](const boost::system::error_code ec) {
1011 if (ec)
1012 {
1013 std::cerr << "CPU 2 VRHot handler error: "
1014 << ec.message() << "\n";
1015 return;
1016 }
1017 cpu2VRHotHandler();
1018 });
1019}
1020
Jason M. Billse94f5e12019-09-13 11:11:34 -07001021static void cpu2MemABCDVRHotAssertHandler()
1022{
1023 cpuVRHotLog("CPU 2 Memory ABCD");
1024}
1025
Jason M. Bills9647ba72019-08-29 14:19:19 -07001026static void cpu2MemABCDVRHotHandler()
1027{
Jason M. Bills84951142020-04-17 15:57:11 -07001028 gpiod::line_event gpioLineEvent = cpu2MemABCDVRHotLine.event_read();
Jason M. Bills9647ba72019-08-29 14:19:19 -07001029
Jason M. Bills84951142020-04-17 15:57:11 -07001030 bool cpu2MemABCDVRHot =
1031 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
1032 if (cpu2MemABCDVRHot)
1033 {
1034 cpu2MemABCDVRHotAssertHandler();
Jason M. Bills9647ba72019-08-29 14:19:19 -07001035 }
Jason M. Bills84951142020-04-17 15:57:11 -07001036
Jason M. Bills9647ba72019-08-29 14:19:19 -07001037 cpu2MemABCDVRHotEvent.async_wait(
1038 boost::asio::posix::stream_descriptor::wait_read,
1039 [](const boost::system::error_code ec) {
1040 if (ec)
1041 {
1042 std::cerr << "CPU 2 Memory ABCD VRHot handler error: "
1043 << ec.message() << "\n";
1044 return;
1045 }
1046 cpu2MemABCDVRHotHandler();
1047 });
1048}
1049
Jason M. Billse94f5e12019-09-13 11:11:34 -07001050static void cpu2MemEFGHVRHotAssertHandler()
1051{
1052 cpuVRHotLog("CPU 2 Memory EFGH");
1053}
1054
Jason M. Bills9647ba72019-08-29 14:19:19 -07001055static void cpu2MemEFGHVRHotHandler()
1056{
Jason M. Bills84951142020-04-17 15:57:11 -07001057 gpiod::line_event gpioLineEvent = cpu2MemEFGHVRHotLine.event_read();
Jason M. Bills9647ba72019-08-29 14:19:19 -07001058
Jason M. Bills84951142020-04-17 15:57:11 -07001059 bool cpu2MemEFGHVRHot =
1060 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
1061 if (cpu2MemEFGHVRHot)
1062 {
1063 cpu2MemEFGHVRHotAssertHandler();
Jason M. Bills9647ba72019-08-29 14:19:19 -07001064 }
Jason M. Bills84951142020-04-17 15:57:11 -07001065
Jason M. Bills9647ba72019-08-29 14:19:19 -07001066 cpu2MemEFGHVRHotEvent.async_wait(
1067 boost::asio::posix::stream_descriptor::wait_read,
1068 [](const boost::system::error_code ec) {
1069 if (ec)
1070 {
1071 std::cerr << "CPU 2 Memory EFGH VRHot handler error: "
1072 << ec.message() << "\n";
1073 return;
1074 }
1075 cpu2MemEFGHVRHotHandler();
1076 });
1077}
1078
Chen Yugange6c0f1c2019-08-02 20:36:42 +08001079static void pchThermtripHandler()
1080{
Yong Li1429ca82020-04-27 16:49:45 +08001081 std::vector<Association> associations;
1082
Jason M. Bills84951142020-04-17 15:57:11 -07001083 gpiod::line_event gpioLineEvent = pchThermtripLine.event_read();
Chen Yugange6c0f1c2019-08-02 20:36:42 +08001084
Jason M. Bills84951142020-04-17 15:57:11 -07001085 bool pchThermtrip =
1086 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
1087 if (pchThermtrip)
1088 {
1089 ssbThermTripLog();
Yong Li1429ca82020-04-27 16:49:45 +08001090 associations.emplace_back(
1091 "", "critical",
1092 "/xyz/openbmc_project/host_error_monitor/ssb_thermal_trip");
1093 associations.emplace_back("", "critical", host_error_monitor::rootPath);
Chen Yugange6c0f1c2019-08-02 20:36:42 +08001094 }
Yong Li1429ca82020-04-27 16:49:45 +08001095 else
1096 {
1097 associations.emplace_back("", "", "");
1098 }
1099 host_error_monitor::associationSSBThermTrip->set_property("Associations",
1100 associations);
Jason M. Bills84951142020-04-17 15:57:11 -07001101
Chen Yugange6c0f1c2019-08-02 20:36:42 +08001102 pchThermtripEvent.async_wait(
1103 boost::asio::posix::stream_descriptor::wait_read,
1104 [](const boost::system::error_code ec) {
1105 if (ec)
1106 {
1107 std::cerr << "PCH Thermal trip handler error: " << ec.message()
1108 << "\n";
1109 return;
1110 }
1111 pchThermtripHandler();
1112 });
1113}
1114
Jason M. Billscbf78532019-08-16 15:32:11 -07001115static std::bitset<MAX_CPUS> checkERRPinCPUs(const int errPin)
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001116{
Jason M. Billscbf78532019-08-16 15:32:11 -07001117 int errPinSts = (1 << errPin);
1118 std::bitset<MAX_CPUS> errPinCPUs = 0;
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001119 for (int cpu = 0, addr = MIN_CLIENT_ADDR; addr <= MAX_CLIENT_ADDR;
1120 cpu++, addr++)
1121 {
1122 if (peci_Ping(addr) == PECI_CC_SUCCESS)
1123 {
1124 uint8_t cc = 0;
1125 CPUModel model{};
1126 uint8_t stepping = 0;
1127 if (peci_GetCPUID(addr, &model, &stepping, &cc) != PECI_CC_SUCCESS)
1128 {
1129 std::cerr << "Cannot get CPUID!\n";
1130 continue;
1131 }
1132
1133 switch (model)
1134 {
1135 case skx:
1136 {
1137 // Check the ERRPINSTS to see if this is the CPU that caused
Jason M. Billscbf78532019-08-16 15:32:11 -07001138 // the ERRx (B(0) D8 F0 offset 210h)
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001139 uint32_t errpinsts = 0;
1140 if (peci_RdPCIConfigLocal(
1141 addr, 0, 8, 0, 0x210, sizeof(uint32_t),
1142 (uint8_t*)&errpinsts, &cc) == PECI_CC_SUCCESS)
1143 {
Jason M. Billscbf78532019-08-16 15:32:11 -07001144 errPinCPUs[cpu] = (errpinsts & errPinSts) != 0;
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001145 }
1146 break;
1147 }
1148 case icx:
1149 {
1150 // Check the ERRPINSTS to see if this is the CPU that caused
Jason M. Billscbf78532019-08-16 15:32:11 -07001151 // the ERRx (B(30) D0 F3 offset 274h) (Note: Bus 30 is
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001152 // accessed on PECI as bus 13)
1153 uint32_t errpinsts = 0;
1154 if (peci_RdEndPointConfigPciLocal(
1155 addr, 0, 13, 0, 3, 0x274, sizeof(uint32_t),
1156 (uint8_t*)&errpinsts, &cc) == PECI_CC_SUCCESS)
1157 {
Jason M. Billscbf78532019-08-16 15:32:11 -07001158 errPinCPUs[cpu] = (errpinsts & errPinSts) != 0;
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001159 }
1160 break;
1161 }
1162 }
1163 }
1164 }
Jason M. Billscbf78532019-08-16 15:32:11 -07001165 return errPinCPUs;
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001166}
1167
Jason M. Billscbf78532019-08-16 15:32:11 -07001168static void errXAssertHandler(const int errPin,
1169 boost::asio::steady_timer& errXAssertTimer)
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001170{
Jason M. Billscbf78532019-08-16 15:32:11 -07001171 // ERRx status is not guaranteed through the timeout, so save which
1172 // CPUs have it asserted
1173 std::bitset<MAX_CPUS> errPinCPUs = checkERRPinCPUs(errPin);
1174 errXAssertTimer.expires_after(std::chrono::milliseconds(errTimeoutMs));
1175 errXAssertTimer.async_wait([errPin, errPinCPUs](
1176 const boost::system::error_code ec) {
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001177 if (ec)
1178 {
1179 // operation_aborted is expected if timer is canceled before
1180 // completion.
1181 if (ec != boost::asio::error::operation_aborted)
1182 {
1183 std::cerr << "err2 timeout async_wait failed: " << ec.message()
1184 << "\n";
1185 }
1186 return;
1187 }
Jason M. Billscbf78532019-08-16 15:32:11 -07001188 std::cerr << "ERR" << std::to_string(errPin) << " asserted for "
1189 << std::to_string(errTimeoutMs) << " ms\n";
1190 if (errPinCPUs.count())
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001191 {
Jason M. Billscbf78532019-08-16 15:32:11 -07001192 for (int i = 0; i < errPinCPUs.size(); i++)
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001193 {
Jason M. Billscbf78532019-08-16 15:32:11 -07001194 if (errPinCPUs[i])
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001195 {
Jason M. Billscbf78532019-08-16 15:32:11 -07001196 cpuERRXLog(errPin, i);
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001197 }
1198 }
1199 }
1200 else
1201 {
Jason M. Billscbf78532019-08-16 15:32:11 -07001202 cpuERRXLog(errPin);
1203 }
1204 });
1205}
1206
Jason M. Bills8c584392019-08-19 11:05:51 -07001207static void err0AssertHandler()
1208{
1209 // Handle the standard ERR0 detection and logging
1210 const static constexpr int err0 = 0;
1211 errXAssertHandler(err0, err0AssertTimer);
1212}
1213
1214static void err0Handler()
1215{
1216 if (!hostOff)
1217 {
1218 gpiod::line_event gpioLineEvent = err0Line.event_read();
1219
1220 bool err0 = gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
1221 if (err0)
1222 {
1223 err0AssertHandler();
1224 }
1225 else
1226 {
1227 err0AssertTimer.cancel();
1228 }
1229 }
1230 err0Event.async_wait(boost::asio::posix::stream_descriptor::wait_read,
1231 [](const boost::system::error_code ec) {
1232 if (ec)
1233 {
1234 std::cerr
1235 << "err0 handler error: " << ec.message()
1236 << "\n";
1237 return;
1238 }
1239 err0Handler();
1240 });
1241}
1242
Jason M. Bills75af3962019-08-19 11:07:17 -07001243static void err1AssertHandler()
1244{
1245 // Handle the standard ERR1 detection and logging
1246 const static constexpr int err1 = 1;
1247 errXAssertHandler(err1, err1AssertTimer);
1248}
1249
1250static void err1Handler()
1251{
1252 if (!hostOff)
1253 {
1254 gpiod::line_event gpioLineEvent = err1Line.event_read();
1255
1256 bool err1 = gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
1257 if (err1)
1258 {
1259 err1AssertHandler();
1260 }
1261 else
1262 {
1263 err1AssertTimer.cancel();
1264 }
1265 }
1266 err1Event.async_wait(boost::asio::posix::stream_descriptor::wait_read,
1267 [](const boost::system::error_code ec) {
1268 if (ec)
1269 {
1270 std::cerr
1271 << "err1 handler error: " << ec.message()
1272 << "\n";
1273 return;
1274 }
1275 err1Handler();
1276 });
1277}
1278
Jason M. Billscbf78532019-08-16 15:32:11 -07001279static void err2AssertHandler()
1280{
1281 // Handle the standard ERR2 detection and logging
1282 const static constexpr int err2 = 2;
1283 errXAssertHandler(err2, err2AssertTimer);
1284 // Also handle reset for ERR2
1285 err2AssertTimer.async_wait([](const boost::system::error_code ec) {
1286 if (ec)
1287 {
1288 // operation_aborted is expected if timer is canceled before
1289 // completion.
1290 if (ec != boost::asio::error::operation_aborted)
1291 {
1292 std::cerr << "err2 timeout async_wait failed: " << ec.message()
1293 << "\n";
1294 }
1295 return;
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001296 }
1297 conn->async_method_call(
1298 [](boost::system::error_code ec,
1299 const std::variant<bool>& property) {
1300 if (ec)
1301 {
1302 return;
1303 }
1304 const bool* reset = std::get_if<bool>(&property);
1305 if (reset == nullptr)
1306 {
1307 std::cerr << "Unable to read reset on ERR2 value\n";
1308 return;
1309 }
Jason M. Billsb61766b2019-11-26 17:02:44 -08001310 startCrashdumpAndRecovery(*reset, "ERR2 Timeout");
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001311 },
1312 "xyz.openbmc_project.Settings",
1313 "/xyz/openbmc_project/control/processor_error_config",
1314 "org.freedesktop.DBus.Properties", "Get",
1315 "xyz.openbmc_project.Control.Processor.ErrConfig", "ResetOnERR2");
Yong Li061eb032020-02-26 15:06:18 +08001316
1317 beep(beepCPUErr2);
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001318 });
1319}
1320
1321static void err2Handler()
1322{
1323 if (!hostOff)
1324 {
1325 gpiod::line_event gpioLineEvent = err2Line.event_read();
1326
1327 bool err2 = gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
1328 if (err2)
1329 {
1330 err2AssertHandler();
1331 }
1332 else
1333 {
1334 err2AssertTimer.cancel();
1335 }
1336 }
1337 err2Event.async_wait(boost::asio::posix::stream_descriptor::wait_read,
1338 [](const boost::system::error_code ec) {
1339 if (ec)
1340 {
1341 std::cerr
1342 << "err2 handler error: " << ec.message()
1343 << "\n";
1344 return;
1345 }
1346 err2Handler();
1347 });
1348}
1349
Jason M. Bills89922f82019-08-06 11:10:02 -07001350static void smiAssertHandler()
1351{
1352 smiAssertTimer.expires_after(std::chrono::milliseconds(smiTimeoutMs));
1353 smiAssertTimer.async_wait([](const boost::system::error_code ec) {
1354 if (ec)
1355 {
1356 // operation_aborted is expected if timer is canceled before
1357 // completion.
1358 if (ec != boost::asio::error::operation_aborted)
1359 {
1360 std::cerr << "smi timeout async_wait failed: " << ec.message()
1361 << "\n";
1362 }
1363 return;
1364 }
1365 std::cerr << "SMI asserted for " << std::to_string(smiTimeoutMs)
1366 << " ms\n";
1367 smiTimeoutLog();
1368 conn->async_method_call(
1369 [](boost::system::error_code ec,
1370 const std::variant<bool>& property) {
1371 if (ec)
1372 {
1373 return;
1374 }
1375 const bool* reset = std::get_if<bool>(&property);
1376 if (reset == nullptr)
1377 {
1378 std::cerr << "Unable to read reset on SMI value\n";
1379 return;
1380 }
Jason M. Bills94785442020-01-07 15:22:09 -08001381#ifdef HOST_ERROR_CRASHDUMP_ON_SMI_TIMEOUT
Jason M. Billsb61766b2019-11-26 17:02:44 -08001382 startCrashdumpAndRecovery(*reset, "SMI Timeout");
Jason M. Bills94785442020-01-07 15:22:09 -08001383#else
1384 if (*reset)
1385 {
1386 std::cout << "Recovering the system\n";
1387 startPowerCycle();
1388 }
1389#endif
Jason M. Bills89922f82019-08-06 11:10:02 -07001390 },
1391 "xyz.openbmc_project.Settings",
1392 "/xyz/openbmc_project/control/bmc_reset_disables",
1393 "org.freedesktop.DBus.Properties", "Get",
1394 "xyz.openbmc_project.Control.ResetDisables", "ResetOnSMI");
1395 });
1396}
1397
1398static void smiHandler()
1399{
1400 if (!hostOff)
1401 {
1402 gpiod::line_event gpioLineEvent = smiLine.event_read();
1403
1404 bool smi = gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
1405 if (smi)
1406 {
1407 smiAssertHandler();
1408 }
1409 else
1410 {
1411 smiAssertTimer.cancel();
1412 }
1413 }
1414 smiEvent.async_wait(boost::asio::posix::stream_descriptor::wait_read,
1415 [](const boost::system::error_code ec) {
1416 if (ec)
1417 {
1418 std::cerr
1419 << "smi handler error: " << ec.message()
1420 << "\n";
1421 return;
1422 }
1423 smiHandler();
1424 });
1425}
1426
Jason M. Billsa15c2522019-08-16 10:01:44 -07001427static void initializeErrorState()
1428{
jayaprakash Mutyala53099c42020-03-15 00:16:26 +00001429 // Handle CPU1_MISMATCH if it's asserted now
1430 if (cpu1MismatchLine.get_value() == 1)
1431 {
1432 cpuMismatchLog(1);
1433 }
1434
1435 // Handle CPU2_MISMATCH if it's asserted now
1436 if (cpu2MismatchLine.get_value() == 1)
1437 {
1438 cpuMismatchLog(2);
1439 }
1440
Jason M. Billsa15c2522019-08-16 10:01:44 -07001441 // Handle CPU_CATERR if it's asserted now
1442 if (caterrLine.get_value() == 0)
1443 {
1444 caterrAssertHandler();
Yong Li1429ca82020-04-27 16:49:45 +08001445 std::vector<Association> associations;
1446 associations.emplace_back(
1447 "", "critical", "/xyz/openbmc_project/host_error_monitor/cat_err");
1448 associations.emplace_back("", "critical", host_error_monitor::rootPath);
1449 host_error_monitor::associationCATAssert->set_property("Associations",
1450 associations);
Jason M. Billsa15c2522019-08-16 10:01:44 -07001451 }
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001452
Jason M. Bills8c584392019-08-19 11:05:51 -07001453 // Handle CPU_ERR0 if it's asserted now
1454 if (err0Line.get_value() == 0)
1455 {
1456 err0AssertHandler();
1457 }
1458
Jason M. Bills75af3962019-08-19 11:07:17 -07001459 // Handle CPU_ERR1 if it's asserted now
1460 if (err1Line.get_value() == 0)
1461 {
1462 err1AssertHandler();
1463 }
1464
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001465 // Handle CPU_ERR2 if it's asserted now
1466 if (err2Line.get_value() == 0)
1467 {
1468 err2AssertHandler();
1469 }
Jason M. Bills89922f82019-08-06 11:10:02 -07001470
1471 // Handle SMI if it's asserted now
1472 if (smiLine.get_value() == 0)
1473 {
1474 smiAssertHandler();
1475 }
Jason M. Bills08866542019-08-16 12:04:19 -07001476
Jason M. Billse94f5e12019-09-13 11:11:34 -07001477 // Handle CPU1_THERMTRIP if it's asserted now
1478 if (cpu1ThermtripLine.get_value() == 0)
1479 {
1480 cpu1ThermtripAssertHandler();
1481 }
1482
1483 // Handle CPU2_THERMTRIP if it's asserted now
1484 if (cpu2ThermtripLine.get_value() == 0)
1485 {
1486 cpu2ThermtripAssertHandler();
1487 }
1488
jayaprakash Mutyala009adbc2019-12-24 22:08:07 +00001489 // Handle CPU1_MEM_THERM_EVENT (CPU1 DIMM Thermal trip) if it's asserted now
1490 if (cpu1MemtripLine.get_value() == 0)
1491 {
1492 memThermTripLog(1);
1493 }
1494
1495 // Handle CPU2_MEM_THERM_EVENT (CPU2 DIMM Thermal trip) if it's asserted now
1496 if (cpu2MemtripLine.get_value() == 0)
1497 {
1498 memThermTripLog(2);
1499 }
1500
Jason M. Billse94f5e12019-09-13 11:11:34 -07001501 // Handle CPU1_VRHOT if it's asserted now
1502 if (cpu1VRHotLine.get_value() == 0)
1503 {
1504 cpu1VRHotAssertHandler();
1505 }
1506
1507 // Handle CPU1_MEM_ABCD_VRHOT if it's asserted now
1508 if (cpu1MemABCDVRHotLine.get_value() == 0)
1509 {
1510 cpu1MemABCDVRHotAssertHandler();
1511 }
1512
1513 // Handle CPU1_MEM_EFGH_VRHOT if it's asserted now
1514 if (cpu1MemEFGHVRHotLine.get_value() == 0)
1515 {
1516 cpu1MemEFGHVRHotAssertHandler();
1517 }
1518
1519 // Handle CPU2_VRHOT if it's asserted now
1520 if (cpu2VRHotLine.get_value() == 0)
1521 {
1522 cpu2VRHotAssertHandler();
1523 }
1524
1525 // Handle CPU2_MEM_ABCD_VRHOT if it's asserted now
1526 if (cpu2MemABCDVRHotLine.get_value() == 0)
1527 {
1528 cpu2MemABCDVRHotAssertHandler();
1529 }
1530
1531 // Handle CPU2_MEM_EFGH_VRHOT if it's asserted now
1532 if (cpu2MemEFGHVRHotLine.get_value() == 0)
1533 {
1534 cpu2MemEFGHVRHotAssertHandler();
1535 }
1536
Jason M. Bills08866542019-08-16 12:04:19 -07001537 // Handle PCH_BMC_THERMTRIP if it's asserted now
1538 if (pchThermtripLine.get_value() == 0)
1539 {
1540 ssbThermTripLog();
Yong Li1429ca82020-04-27 16:49:45 +08001541 std::vector<Association> associations;
1542 associations.emplace_back(
1543 "", "critical",
1544 "/xyz/openbmc_project/host_error_monitor/ssb_thermal_trip");
1545 associations.emplace_back("", "critical", host_error_monitor::rootPath);
1546 host_error_monitor::associationSSBThermTrip->set_property(
1547 "Associations", associations);
Jason M. Bills08866542019-08-16 12:04:19 -07001548 }
Jason M. Billsa15c2522019-08-16 10:01:44 -07001549}
Jason M. Bills1490b142019-07-01 15:48:43 -07001550} // namespace host_error_monitor
1551
1552int main(int argc, char* argv[])
1553{
1554 // setup connection to dbus
1555 host_error_monitor::conn =
1556 std::make_shared<sdbusplus::asio::connection>(host_error_monitor::io);
1557
Jason M. Billsc4b91f22019-11-26 17:04:50 -08001558 // Host Error Monitor Service
Jason M. Bills1490b142019-07-01 15:48:43 -07001559 host_error_monitor::conn->request_name(
1560 "xyz.openbmc_project.HostErrorMonitor");
1561 sdbusplus::asio::object_server server =
1562 sdbusplus::asio::object_server(host_error_monitor::conn);
1563
Yong Li1429ca82020-04-27 16:49:45 +08001564 // Associations interface for led status
1565 std::vector<host_error_monitor::Association> associations;
1566 associations.emplace_back("", "", "");
1567 host_error_monitor::associationSSBThermTrip = server.add_interface(
1568 "/xyz/openbmc_project/host_error_monitor/ssb_thermal_trip",
1569 "xyz.openbmc_project.Association.Definitions");
1570 host_error_monitor::associationSSBThermTrip->register_property(
1571 "Associations", associations);
1572 host_error_monitor::associationSSBThermTrip->initialize();
1573
1574 host_error_monitor::associationCATAssert = server.add_interface(
1575 "/xyz/openbmc_project/host_error_monitor/cat_assert",
1576 "xyz.openbmc_project.Association.Definitions");
1577 host_error_monitor::associationCATAssert->register_property("Associations",
1578 associations);
1579 host_error_monitor::associationCATAssert->initialize();
1580
Jason M. Billsc4b91f22019-11-26 17:04:50 -08001581 // Restart Cause Interface
1582 host_error_monitor::hostErrorTimeoutIface =
1583 server.add_interface("/xyz/openbmc_project/host_error_monitor",
1584 "xyz.openbmc_project.HostErrorMonitor.Timeout");
1585
1586 host_error_monitor::hostErrorTimeoutIface->register_property(
1587 "IERRTimeoutMs", host_error_monitor::caterrTimeoutMs,
1588 [](const std::size_t& requested, std::size_t& resp) {
1589 if (requested > host_error_monitor::caterrTimeoutMsMax)
1590 {
1591 std::cerr << "IERRTimeoutMs update to " << requested
1592 << "ms rejected. Cannot be greater than "
1593 << host_error_monitor::caterrTimeoutMsMax << "ms.\n";
1594 return 0;
1595 }
1596 std::cerr << "IERRTimeoutMs updated to " << requested << "ms\n";
1597 host_error_monitor::caterrTimeoutMs = requested;
1598 resp = requested;
1599 return 1;
1600 },
1601 [](std::size_t& resp) { return host_error_monitor::caterrTimeoutMs; });
1602 host_error_monitor::hostErrorTimeoutIface->initialize();
1603
Jason M. Bills1490b142019-07-01 15:48:43 -07001604 // Start tracking host state
1605 std::shared_ptr<sdbusplus::bus::match::match> hostStateMonitor =
1606 host_error_monitor::startHostStateMonitor();
1607
jayaprakash Mutyala53099c42020-03-15 00:16:26 +00001608 // Request CPU1_MISMATCH GPIO events
1609 if (!host_error_monitor::requestGPIOInput(
1610 "CPU1_MISMATCH", host_error_monitor::cpu1MismatchLine))
1611 {
1612 return -1;
1613 }
1614
1615 // Request CPU2_MISMATCH GPIO events
1616 if (!host_error_monitor::requestGPIOInput(
1617 "CPU2_MISMATCH", host_error_monitor::cpu2MismatchLine))
1618 {
1619 return -1;
1620 }
1621
Jason M. Bills1490b142019-07-01 15:48:43 -07001622 // Initialize the host state
1623 host_error_monitor::initializeHostState();
1624
1625 // Request CPU_CATERR GPIO events
1626 if (!host_error_monitor::requestGPIOEvents(
1627 "CPU_CATERR", host_error_monitor::caterrHandler,
1628 host_error_monitor::caterrLine, host_error_monitor::caterrEvent))
1629 {
1630 return -1;
1631 }
1632
Jason M. Bills8c584392019-08-19 11:05:51 -07001633 // Request CPU_ERR0 GPIO events
1634 if (!host_error_monitor::requestGPIOEvents(
1635 "CPU_ERR0", host_error_monitor::err0Handler,
1636 host_error_monitor::err0Line, host_error_monitor::err0Event))
1637 {
1638 return -1;
1639 }
1640
Jason M. Bills75af3962019-08-19 11:07:17 -07001641 // Request CPU_ERR1 GPIO events
1642 if (!host_error_monitor::requestGPIOEvents(
1643 "CPU_ERR1", host_error_monitor::err1Handler,
1644 host_error_monitor::err1Line, host_error_monitor::err1Event))
1645 {
1646 return -1;
1647 }
1648
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001649 // Request CPU_ERR2 GPIO events
1650 if (!host_error_monitor::requestGPIOEvents(
1651 "CPU_ERR2", host_error_monitor::err2Handler,
1652 host_error_monitor::err2Line, host_error_monitor::err2Event))
1653 {
1654 return -1;
1655 }
1656
Jason M. Bills89922f82019-08-06 11:10:02 -07001657 // Request SMI GPIO events
1658 if (!host_error_monitor::requestGPIOEvents(
1659 "SMI", host_error_monitor::smiHandler, host_error_monitor::smiLine,
1660 host_error_monitor::smiEvent))
1661 {
1662 return -1;
1663 }
1664
Jason M. Bills45e87e02019-09-09 14:45:38 -07001665 // Request CPU1_FIVR_FAULT GPIO input
1666 if (!host_error_monitor::requestGPIOInput(
1667 "CPU1_FIVR_FAULT", host_error_monitor::cpu1FIVRFaultLine))
1668 {
1669 return -1;
1670 }
1671
Jason M. Bills78c5eed2019-08-28 14:00:40 -07001672 // Request CPU1_THERMTRIP GPIO events
1673 if (!host_error_monitor::requestGPIOEvents(
1674 "CPU1_THERMTRIP", host_error_monitor::cpu1ThermtripHandler,
1675 host_error_monitor::cpu1ThermtripLine,
1676 host_error_monitor::cpu1ThermtripEvent))
1677 {
1678 return -1;
1679 }
1680
Jason M. Bills45e87e02019-09-09 14:45:38 -07001681 // Request CPU2_FIVR_FAULT GPIO input
1682 if (!host_error_monitor::requestGPIOInput(
1683 "CPU2_FIVR_FAULT", host_error_monitor::cpu2FIVRFaultLine))
1684 {
1685 return -1;
1686 }
1687
Jason M. Bills78c5eed2019-08-28 14:00:40 -07001688 // Request CPU2_THERMTRIP GPIO events
1689 if (!host_error_monitor::requestGPIOEvents(
1690 "CPU2_THERMTRIP", host_error_monitor::cpu2ThermtripHandler,
1691 host_error_monitor::cpu2ThermtripLine,
1692 host_error_monitor::cpu2ThermtripEvent))
1693 {
1694 return -1;
1695 }
1696
Jason M. Bills250fa632019-08-28 15:58:25 -07001697 // Request CPU1_VRHOT GPIO events
1698 if (!host_error_monitor::requestGPIOEvents(
1699 "CPU1_VRHOT", host_error_monitor::cpu1VRHotHandler,
1700 host_error_monitor::cpu1VRHotLine,
1701 host_error_monitor::cpu1VRHotEvent))
1702 {
1703 return -1;
1704 }
1705
Jason M. Bills9647ba72019-08-29 14:19:19 -07001706 // Request CPU1_MEM_ABCD_VRHOT GPIO events
1707 if (!host_error_monitor::requestGPIOEvents(
1708 "CPU1_MEM_ABCD_VRHOT", host_error_monitor::cpu1MemABCDVRHotHandler,
1709 host_error_monitor::cpu1MemABCDVRHotLine,
1710 host_error_monitor::cpu1MemABCDVRHotEvent))
1711 {
1712 return -1;
1713 }
1714
1715 // Request CPU1_MEM_EFGH_VRHOT GPIO events
1716 if (!host_error_monitor::requestGPIOEvents(
1717 "CPU1_MEM_EFGH_VRHOT", host_error_monitor::cpu1MemEFGHVRHotHandler,
1718 host_error_monitor::cpu1MemEFGHVRHotLine,
1719 host_error_monitor::cpu1MemEFGHVRHotEvent))
1720 {
1721 return -1;
1722 }
1723
Jason M. Bills250fa632019-08-28 15:58:25 -07001724 // Request CPU2_VRHOT GPIO events
1725 if (!host_error_monitor::requestGPIOEvents(
1726 "CPU2_VRHOT", host_error_monitor::cpu2VRHotHandler,
1727 host_error_monitor::cpu2VRHotLine,
1728 host_error_monitor::cpu2VRHotEvent))
1729 {
1730 return -1;
1731 }
1732
Jason M. Bills9647ba72019-08-29 14:19:19 -07001733 // Request CPU2_MEM_ABCD_VRHOT GPIO events
1734 if (!host_error_monitor::requestGPIOEvents(
1735 "CPU2_MEM_ABCD_VRHOT", host_error_monitor::cpu2MemABCDVRHotHandler,
1736 host_error_monitor::cpu2MemABCDVRHotLine,
1737 host_error_monitor::cpu2MemABCDVRHotEvent))
1738 {
1739 return -1;
1740 }
1741
1742 // Request CPU2_MEM_EFGH_VRHOT GPIO events
1743 if (!host_error_monitor::requestGPIOEvents(
1744 "CPU2_MEM_EFGH_VRHOT", host_error_monitor::cpu2MemEFGHVRHotHandler,
1745 host_error_monitor::cpu2MemEFGHVRHotLine,
1746 host_error_monitor::cpu2MemEFGHVRHotEvent))
1747 {
1748 return -1;
1749 }
1750
Chen Yugange6c0f1c2019-08-02 20:36:42 +08001751 // Request PCH_BMC_THERMTRIP GPIO events
1752 if (!host_error_monitor::requestGPIOEvents(
1753 "PCH_BMC_THERMTRIP", host_error_monitor::pchThermtripHandler,
1754 host_error_monitor::pchThermtripLine,
1755 host_error_monitor::pchThermtripEvent))
1756 {
1757 return -1;
1758 }
1759
jayaprakash Mutyala009adbc2019-12-24 22:08:07 +00001760 // Request CPU1_MEM_THERM_EVENT GPIO events
1761 if (!host_error_monitor::requestGPIOEvents(
1762 "CPU1_MEM_THERM_EVENT", host_error_monitor::cpu1MemtripHandler,
1763 host_error_monitor::cpu1MemtripLine,
1764 host_error_monitor::cpu1MemtripEvent))
1765 {
1766 return -1;
1767 }
1768
1769 // Request CPU2_MEM_THERM_EVENT GPIO events
1770 if (!host_error_monitor::requestGPIOEvents(
1771 "CPU2_MEM_THERM_EVENT", host_error_monitor::cpu2MemtripHandler,
1772 host_error_monitor::cpu2MemtripLine,
1773 host_error_monitor::cpu2MemtripEvent))
1774 {
1775 return -1;
1776 }
1777
Jason M. Bills1490b142019-07-01 15:48:43 -07001778 host_error_monitor::io.run();
1779
1780 return 0;
1781}