/*
// Copyright (c) 2019 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
*/
Jason M. Bills6a2cb692019-08-06 11:03:49 -070016#include <peci.h>
Chen Yugange6c0f1c2019-08-02 20:36:42 +080017#include <systemd/sd-journal.h>
18
Jason M. Bills08b2c7a2020-08-28 15:39:14 -070019#include <boost/asio/io_service.hpp>
Jason M. Bills1490b142019-07-01 15:48:43 -070020#include <boost/asio/posix/stream_descriptor.hpp>
Jason M. Bills08b2c7a2020-08-28 15:39:14 -070021#include <boost/asio/steady_timer.hpp>
Jason M. Bills5245ed62020-12-04 16:50:21 -080022#include <error_monitors.hpp>
Jason M. Bills1490b142019-07-01 15:48:43 -070023#include <gpiod.hpp>
Jason M. Billsd711cc82020-12-04 16:46:39 -080024#include <host_error_monitor.hpp>
Jason M. Bills1490b142019-07-01 15:48:43 -070025#include <sdbusplus/asio/object_server.hpp>
Jason M. Bills48e5dff2020-06-10 13:47:47 -070026
27#include <bitset>
28#include <iostream>
Jason M. Billsd1a19f62019-08-06 11:52:58 -070029#include <variant>
Jason M. Bills1490b142019-07-01 15:48:43 -070030
31namespace host_error_monitor
32{
33static boost::asio::io_service io;
34static std::shared_ptr<sdbusplus::asio::connection> conn;
Jason M. Billsc4b91f22019-11-26 17:04:50 -080035static std::shared_ptr<sdbusplus::asio::dbus_interface> hostErrorTimeoutIface;
Jason M. Bills1490b142019-07-01 15:48:43 -070036
Yong Li1429ca82020-04-27 16:49:45 +080037using Association = std::tuple<std::string, std::string, std::string>;
38static std::shared_ptr<sdbusplus::asio::dbus_interface> associationSSBThermTrip;
39static std::shared_ptr<sdbusplus::asio::dbus_interface> associationCATAssert;
40
41static const constexpr char* rootPath = "/xyz/openbmc_project/CallbackManager";
42
// Cached host power state. Starts out as "off" and is refreshed from the
// CurrentHostState D-Bus property.
static bool hostOff{true};

/**
 * @brief Report the cached host power state.
 * @return true when the host is currently considered to be off.
 */
bool hostIsOff()
{
    return hostOff;
}
Jason M. Bills1490b142019-07-01 15:48:43 -070048
// Time (ms) CATERR must stay asserted before it is handled as an IERR
static size_t caterrTimeoutMs = 2000;
// Upper bound for caterrTimeoutMs: 10 minutes maximum
static constexpr size_t caterrTimeoutMsMax = 600000;
// NOTE(review): presumably the ERR0/ERR1/ERR2 assert timeout (ms); it is not
// referenced in the visible part of the file - confirm against its users
static constexpr size_t errTimeoutMs = 90000;
Jason M. Bills1490b142019-07-01 15:48:43 -070052
53// Timers
54// Timer for CATERR asserted
55static boost::asio::steady_timer caterrAssertTimer(io);
Jason M. Bills8c584392019-08-19 11:05:51 -070056// Timer for ERR0 asserted
57static boost::asio::steady_timer err0AssertTimer(io);
Jason M. Bills75af3962019-08-19 11:07:17 -070058// Timer for ERR1 asserted
59static boost::asio::steady_timer err1AssertTimer(io);
Jason M. Bills6a2cb692019-08-06 11:03:49 -070060// Timer for ERR2 asserted
61static boost::asio::steady_timer err2AssertTimer(io);
Jason M. Bills1490b142019-07-01 15:48:43 -070062
63// GPIO Lines and Event Descriptors
64static gpiod::line caterrLine;
65static boost::asio::posix::stream_descriptor caterrEvent(io);
Jason M. Bills8c584392019-08-19 11:05:51 -070066static gpiod::line err0Line;
67static boost::asio::posix::stream_descriptor err0Event(io);
Jason M. Bills75af3962019-08-19 11:07:17 -070068static gpiod::line err1Line;
69static boost::asio::posix::stream_descriptor err1Event(io);
Jason M. Bills6a2cb692019-08-06 11:03:49 -070070static gpiod::line err2Line;
71static boost::asio::posix::stream_descriptor err2Event(io);
Jason M. Bills45e87e02019-09-09 14:45:38 -070072static gpiod::line cpu1FIVRFaultLine;
Jason M. Bills78c5eed2019-08-28 14:00:40 -070073static gpiod::line cpu1ThermtripLine;
74static boost::asio::posix::stream_descriptor cpu1ThermtripEvent(io);
Jason M. Bills45e87e02019-09-09 14:45:38 -070075static gpiod::line cpu2FIVRFaultLine;
Jason M. Bills78c5eed2019-08-28 14:00:40 -070076static gpiod::line cpu2ThermtripLine;
77static boost::asio::posix::stream_descriptor cpu2ThermtripEvent(io);
Jason M. Bills250fa632019-08-28 15:58:25 -070078static gpiod::line cpu1VRHotLine;
79static boost::asio::posix::stream_descriptor cpu1VRHotEvent(io);
80static gpiod::line cpu2VRHotLine;
Jason M. Bills9647ba72019-08-29 14:19:19 -070081static boost::asio::posix::stream_descriptor cpu1MemABCDVRHotEvent(io);
82static gpiod::line cpu1MemEFGHVRHotLine;
83static boost::asio::posix::stream_descriptor cpu1MemEFGHVRHotEvent(io);
84static gpiod::line cpu2MemABCDVRHotLine;
Jason M. Bills250fa632019-08-28 15:58:25 -070085static boost::asio::posix::stream_descriptor cpu2VRHotEvent(io);
Jason M. Bills9647ba72019-08-29 14:19:19 -070086static gpiod::line cpu1MemABCDVRHotLine;
87static boost::asio::posix::stream_descriptor cpu2MemABCDVRHotEvent(io);
88static gpiod::line cpu2MemEFGHVRHotLine;
89static boost::asio::posix::stream_descriptor cpu2MemEFGHVRHotEvent(io);
Chen Yugange6c0f1c2019-08-02 20:36:42 +080090//----------------------------------
91// PCH_BMC_THERMTRIP function related definition
92//----------------------------------
Chen Yugange6c0f1c2019-08-02 20:36:42 +080093static gpiod::line pchThermtripLine;
94static boost::asio::posix::stream_descriptor pchThermtripEvent(io);
jayaprakash Mutyala009adbc2019-12-24 22:08:07 +000095//----------------------------------
96// CPU_MEM_THERM_EVENT function related definition
97//----------------------------------
98static gpiod::line cpu1MemtripLine;
99static boost::asio::posix::stream_descriptor cpu1MemtripEvent(io);
100static gpiod::line cpu2MemtripLine;
101static boost::asio::posix::stream_descriptor cpu2MemtripEvent(io);
jayaprakash Mutyala53099c42020-03-15 00:16:26 +0000102//---------------------------------
103// CPU_MISMATCH function related definition
104//---------------------------------
105static gpiod::line cpu1MismatchLine;
106static gpiod::line cpu2MismatchLine;
Jason M. Bills1490b142019-07-01 15:48:43 -0700107
Yong Li061eb032020-02-26 15:06:18 +0800108// beep function for CPU error
Yong Li8c798c72020-04-22 15:29:07 +0800109const static constexpr uint8_t beepCPUIERR = 4;
Yong Li061eb032020-02-26 15:06:18 +0800110const static constexpr uint8_t beepCPUErr2 = 5;
111
112static void beep(const uint8_t& beepPriority)
113{
114 conn->async_method_call(
115 [](boost::system::error_code ec) {
116 if (ec)
117 {
118 std::cerr << "beep returned error with "
119 "async_method_call (ec = "
120 << ec << ")\n";
121 return;
122 }
123 },
124 "xyz.openbmc_project.BeepCode", "/xyz/openbmc_project/BeepCode",
125 "xyz.openbmc_project.BeepCode", "Beep", uint8_t(beepPriority));
126}
127
Jason M. Billsa3397932019-08-06 11:07:21 -0700128static void cpuIERRLog()
129{
130 sd_journal_send("MESSAGE=HostError: IERR", "PRIORITY=%i", LOG_INFO,
131 "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError",
132 "REDFISH_MESSAGE_ARGS=%s", "IERR", NULL);
133}
134
135static void cpuIERRLog(const int cpuNum)
136{
137 std::string msg = "IERR on CPU " + std::to_string(cpuNum + 1);
138
139 sd_journal_send("MESSAGE=HostError: %s", msg.c_str(), "PRIORITY=%i",
140 LOG_INFO, "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError",
141 "REDFISH_MESSAGE_ARGS=%s", msg.c_str(), NULL);
142}
143
144static void cpuIERRLog(const int cpuNum, const std::string& type)
145{
146 std::string msg = type + " IERR on CPU " + std::to_string(cpuNum + 1);
147
148 sd_journal_send("MESSAGE=HostError: %s", msg.c_str(), "PRIORITY=%i",
149 LOG_INFO, "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError",
150 "REDFISH_MESSAGE_ARGS=%s", msg.c_str(), NULL);
151}
152
Jason M. Billscbf78532019-08-16 15:32:11 -0700153static void cpuERRXLog(const int errPin)
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700154{
Jason M. Billscbf78532019-08-16 15:32:11 -0700155 std::string msg = "ERR" + std::to_string(errPin) + " Timeout";
156
157 sd_journal_send("MESSAGE=HostError: %s", msg.c_str(), "PRIORITY=%i",
158 LOG_INFO, "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError",
159 "REDFISH_MESSAGE_ARGS=%s", msg.c_str(), NULL);
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700160}
161
Jason M. Billscbf78532019-08-16 15:32:11 -0700162static void cpuERRXLog(const int errPin, const int cpuNum)
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700163{
Jason M. Billscbf78532019-08-16 15:32:11 -0700164 std::string msg = "ERR" + std::to_string(errPin) + " Timeout on CPU " +
165 std::to_string(cpuNum + 1);
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700166
167 sd_journal_send("MESSAGE=HostError: %s", msg.c_str(), "PRIORITY=%i",
168 LOG_INFO, "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError",
169 "REDFISH_MESSAGE_ARGS=%s", msg.c_str(), NULL);
170}
171
Jason M. Bills45e87e02019-09-09 14:45:38 -0700172static void cpuBootFIVRFaultLog(const int cpuNum)
173{
174 std::string msg = "Boot FIVR Fault on CPU " + std::to_string(cpuNum);
175
176 sd_journal_send("MESSAGE=HostError: %s", msg.c_str(), "PRIORITY=%i",
177 LOG_INFO, "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError",
178 "REDFISH_MESSAGE_ARGS=%s", msg.c_str(), NULL);
179}
180
Jason M. Bills78c5eed2019-08-28 14:00:40 -0700181static void cpuThermTripLog(const int cpuNum)
182{
183 std::string msg = "CPU " + std::to_string(cpuNum) + " thermal trip";
184
185 sd_journal_send("MESSAGE=HostError: %s", msg.c_str(), "PRIORITY=%i",
186 LOG_INFO, "REDFISH_MESSAGE_ID=%s",
187 "OpenBMC.0.1.CPUThermalTrip", "REDFISH_MESSAGE_ARGS=%d",
188 cpuNum, NULL);
189}
190
jayaprakash Mutyala009adbc2019-12-24 22:08:07 +0000191static void memThermTripLog(const int cpuNum)
192{
193 std::string cpuNumber = "CPU " + std::to_string(cpuNum);
194 std::string msg = cpuNumber + " Memory Thermal trip.";
195
196 sd_journal_send("MESSAGE=HostError: %s", msg.c_str(), "PRIORITY=%i",
197 LOG_ERR, "REDFISH_MESSAGE_ID=%s",
198 "OpenBMC.0.1.MemoryThermTrip", "REDFISH_MESSAGE_ARGS=%s",
199 cpuNumber.c_str(), NULL);
200}
201
jayaprakash Mutyala53099c42020-03-15 00:16:26 +0000202static void cpuMismatchLog(const int cpuNum)
203{
204 std::string msg = "CPU " + std::to_string(cpuNum) + " mismatch";
205
206 sd_journal_send("MESSAGE= %s", msg.c_str(), "PRIORITY=%i", LOG_ERR,
207 "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUMismatch",
208 "REDFISH_MESSAGE_ARGS=%d", cpuNum, NULL);
209}
210
Jason M. Bills250fa632019-08-28 15:58:25 -0700211static void cpuVRHotLog(const std::string& vr)
212{
213 std::string msg = vr + " Voltage Regulator Overheated.";
214
215 sd_journal_send("MESSAGE=HostError: %s", msg.c_str(), "PRIORITY=%i",
216 LOG_INFO, "REDFISH_MESSAGE_ID=%s",
217 "OpenBMC.0.1.VoltageRegulatorOverheated",
218 "REDFISH_MESSAGE_ARGS=%s", vr.c_str(), NULL);
219}
220
Jason M. Bills08866542019-08-16 12:04:19 -0700221static void ssbThermTripLog()
222{
223 sd_journal_send("MESSAGE=HostError: SSB thermal trip", "PRIORITY=%i",
224 LOG_INFO, "REDFISH_MESSAGE_ID=%s",
225 "OpenBMC.0.1.SsbThermalTrip", NULL);
226}
227
Jason M. Bills8064f8b2020-09-22 15:09:49 -0700228static inline bool peciError(EPECIStatus peciStatus, uint8_t cc)
229{
230 return (
231 peciStatus != PECI_CC_SUCCESS ||
232 (cc != PECI_DEV_CC_SUCCESS && cc != PECI_DEV_CC_FATAL_MCA_DETECTED));
233}
234
235static void printPECIError(const std::string& reg, const size_t addr,
236 const EPECIStatus peciStatus, const size_t cc)
237{
238 std::cerr << "Failed to read " << reg << " on CPU address " << addr
239 << ". Error: " << peciStatus << ": cc: 0x" << std::hex << cc
240 << "\n";
241}
242
Jason M. Billsa15c2522019-08-16 10:01:44 -0700243static void initializeErrorState();
Jason M. Bills5245ed62020-12-04 16:50:21 -0800244static void init()
Jason M. Bills1490b142019-07-01 15:48:43 -0700245{
Jason M. Bills5245ed62020-12-04 16:50:21 -0800246 // Get the current host state to prepare to start the signal monitors
Jason M. Bills1490b142019-07-01 15:48:43 -0700247 conn->async_method_call(
248 [](boost::system::error_code ec,
249 const std::variant<std::string>& property) {
250 if (ec)
251 {
252 return;
253 }
254 const std::string* state = std::get_if<std::string>(&property);
255 if (state == nullptr)
256 {
257 std::cerr << "Unable to read host state value\n";
258 return;
259 }
260 hostOff = *state == "xyz.openbmc_project.State.Host.HostState.Off";
Jason M. Billsa15c2522019-08-16 10:01:44 -0700261 // If the system is on, initialize the error state
262 if (!hostOff)
263 {
264 initializeErrorState();
265 }
Jason M. Bills5245ed62020-12-04 16:50:21 -0800266
267 // Now we have the host state, start the signal monitors
268 if (!error_monitors::startMonitors(io, conn))
269 {
270 throw std::runtime_error("Failed to start signal monitors");
271 }
Jason M. Bills1490b142019-07-01 15:48:43 -0700272 },
273 "xyz.openbmc_project.State.Host", "/xyz/openbmc_project/state/host0",
274 "org.freedesktop.DBus.Properties", "Get",
275 "xyz.openbmc_project.State.Host", "CurrentHostState");
276}
277
278static std::shared_ptr<sdbusplus::bus::match::match> startHostStateMonitor()
279{
280 return std::make_shared<sdbusplus::bus::match::match>(
281 *conn,
282 "type='signal',interface='org.freedesktop.DBus.Properties',"
Jason M. Bills2fbb9ea2020-06-19 14:46:54 -0700283 "member='PropertiesChanged',arg0='xyz.openbmc_project.State.Host'",
Jason M. Bills1490b142019-07-01 15:48:43 -0700284 [](sdbusplus::message::message& msg) {
285 std::string interfaceName;
286 boost::container::flat_map<std::string, std::variant<std::string>>
287 propertiesChanged;
Jason M. Bills1490b142019-07-01 15:48:43 -0700288 try
289 {
290 msg.read(interfaceName, propertiesChanged);
Jason M. Bills1490b142019-07-01 15:48:43 -0700291 }
292 catch (std::exception& e)
293 {
294 std::cerr << "Unable to read host state\n";
295 return;
296 }
Jason M. Bills566ccc42020-06-18 16:38:26 -0700297 // We only want to check for CurrentHostState
298 if (propertiesChanged.begin()->first != "CurrentHostState")
299 {
300 return;
301 }
302 std::string* state =
303 std::get_if<std::string>(&(propertiesChanged.begin()->second));
304 if (state == nullptr)
305 {
306 std::cerr << propertiesChanged.begin()->first
307 << " property invalid\n";
308 return;
309 }
310
311 hostOff = *state == "xyz.openbmc_project.State.Host.HostState.Off";
Jason M. Bills1490b142019-07-01 15:48:43 -0700312
Jason M. Bills1490b142019-07-01 15:48:43 -0700313 if (hostOff)
314 {
Jason M. Billse94f5e12019-09-13 11:11:34 -0700315 // No host events should fire while off, so cancel any pending
316 // timers
Jason M. Bills1490b142019-07-01 15:48:43 -0700317 caterrAssertTimer.cancel();
Jason M. Bills8c584392019-08-19 11:05:51 -0700318 err0AssertTimer.cancel();
Jason M. Bills75af3962019-08-19 11:07:17 -0700319 err1AssertTimer.cancel();
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700320 err2AssertTimer.cancel();
Jason M. Bills1490b142019-07-01 15:48:43 -0700321 }
Jason M. Billse94f5e12019-09-13 11:11:34 -0700322 else
323 {
324 // Handle any initial errors when the host turns on
325 initializeErrorState();
Jason M. Bills5245ed62020-12-04 16:50:21 -0800326 error_monitors::sendHostOn();
Jason M. Billse94f5e12019-09-13 11:11:34 -0700327 }
Jason M. Bills1490b142019-07-01 15:48:43 -0700328 });
329}
330
331static bool requestGPIOEvents(
332 const std::string& name, const std::function<void()>& handler,
333 gpiod::line& gpioLine,
334 boost::asio::posix::stream_descriptor& gpioEventDescriptor)
335{
336 // Find the GPIO line
337 gpioLine = gpiod::find_line(name);
338 if (!gpioLine)
339 {
340 std::cerr << "Failed to find the " << name << " line\n";
341 return false;
342 }
343
344 try
345 {
346 gpioLine.request(
347 {"host-error-monitor", gpiod::line_request::EVENT_BOTH_EDGES});
348 }
349 catch (std::exception&)
350 {
351 std::cerr << "Failed to request events for " << name << "\n";
352 return false;
353 }
354
355 int gpioLineFd = gpioLine.event_get_fd();
356 if (gpioLineFd < 0)
357 {
358 std::cerr << "Failed to get " << name << " fd\n";
359 return false;
360 }
361
362 gpioEventDescriptor.assign(gpioLineFd);
363
364 gpioEventDescriptor.async_wait(
365 boost::asio::posix::stream_descriptor::wait_read,
366 [&name, handler](const boost::system::error_code ec) {
367 if (ec)
368 {
369 std::cerr << name << " fd handler error: " << ec.message()
370 << "\n";
371 return;
372 }
373 handler();
374 });
375 return true;
376}
377
Jason M. Bills45e87e02019-09-09 14:45:38 -0700378static bool requestGPIOInput(const std::string& name, gpiod::line& gpioLine)
379{
380 // Find the GPIO line
381 gpioLine = gpiod::find_line(name);
382 if (!gpioLine)
383 {
384 std::cerr << "Failed to find the " << name << " line.\n";
385 return false;
386 }
387
388 // Request GPIO input
389 try
390 {
391 gpioLine.request({__FUNCTION__, gpiod::line_request::DIRECTION_INPUT});
392 }
393 catch (std::exception&)
394 {
395 std::cerr << "Failed to request " << name << " input\n";
396 return false;
397 }
398
399 return true;
400}
401
Jason M. Billsd1a19f62019-08-06 11:52:58 -0700402static void incrementCPUErrorCount(int cpuNum)
403{
404 std::string propertyName = "ErrorCountCPU" + std::to_string(cpuNum + 1);
405
406 // Get the current count
407 conn->async_method_call(
408 [propertyName](boost::system::error_code ec,
409 const std::variant<uint8_t>& property) {
410 if (ec)
411 {
412 std::cerr << "Failed to read " << propertyName << ": "
413 << ec.message() << "\n";
414 return;
415 }
416 const uint8_t* errorCountVariant = std::get_if<uint8_t>(&property);
417 if (errorCountVariant == nullptr)
418 {
419 std::cerr << propertyName << " invalid\n";
420 return;
421 }
422 uint8_t errorCount = *errorCountVariant;
423 if (errorCount == std::numeric_limits<uint8_t>::max())
424 {
425 std::cerr << "Maximum error count reached\n";
426 return;
427 }
428 // Increment the count
429 errorCount++;
430 conn->async_method_call(
431 [propertyName](boost::system::error_code ec) {
432 if (ec)
433 {
434 std::cerr << "Failed to set " << propertyName << ": "
435 << ec.message() << "\n";
436 }
437 },
438 "xyz.openbmc_project.Settings",
439 "/xyz/openbmc_project/control/processor_error_config",
440 "org.freedesktop.DBus.Properties", "Set",
441 "xyz.openbmc_project.Control.Processor.ErrConfig", propertyName,
442 std::variant<uint8_t>{errorCount});
443 },
444 "xyz.openbmc_project.Settings",
445 "/xyz/openbmc_project/control/processor_error_config",
446 "org.freedesktop.DBus.Properties", "Get",
447 "xyz.openbmc_project.Control.Processor.ErrConfig", propertyName);
448}
449
Jason M. Billsa3397932019-08-06 11:07:21 -0700450static bool checkIERRCPUs()
451{
452 bool cpuIERRFound = false;
Jason M. Bills8064f8b2020-09-22 15:09:49 -0700453 for (size_t cpu = 0, addr = MIN_CLIENT_ADDR; addr <= MAX_CLIENT_ADDR;
Jason M. Billsa3397932019-08-06 11:07:21 -0700454 cpu++, addr++)
455 {
Jason M. Bills8064f8b2020-09-22 15:09:49 -0700456 EPECIStatus peciStatus = PECI_CC_SUCCESS;
Jason M. Billsa3397932019-08-06 11:07:21 -0700457 uint8_t cc = 0;
458 CPUModel model{};
459 uint8_t stepping = 0;
460 if (peci_GetCPUID(addr, &model, &stepping, &cc) != PECI_CC_SUCCESS)
461 {
462 std::cerr << "Cannot get CPUID!\n";
463 continue;
464 }
465
466 switch (model)
467 {
468 case skx:
469 {
470 // First check the MCA_ERR_SRC_LOG to see if this is the CPU
471 // that caused the IERR
472 uint32_t mcaErrSrcLog = 0;
Jason M. Bills8064f8b2020-09-22 15:09:49 -0700473 peciStatus = peci_RdPkgConfig(addr, 0, 5, 4,
474 (uint8_t*)&mcaErrSrcLog, &cc);
475 if (peciError(peciStatus, cc))
Jason M. Billsa3397932019-08-06 11:07:21 -0700476 {
Jason M. Bills8064f8b2020-09-22 15:09:49 -0700477 printPECIError("MCA_ERR_SRC_LOG", addr, peciStatus, cc);
Jason M. Billsa3397932019-08-06 11:07:21 -0700478 continue;
479 }
480 // Check MSMI_INTERNAL (20) and IERR_INTERNAL (27)
481 if ((mcaErrSrcLog & (1 << 20)) || (mcaErrSrcLog & (1 << 27)))
482 {
483 // TODO: Light the CPU fault LED?
484 cpuIERRFound = true;
Jason M. Billsd1a19f62019-08-06 11:52:58 -0700485 incrementCPUErrorCount(cpu);
Jason M. Billsa3397932019-08-06 11:07:21 -0700486 // Next check if it's a CPU/VR mismatch by reading the
487 // IA32_MC4_STATUS MSR (0x411)
488 uint64_t mc4Status = 0;
Jason M. Bills8064f8b2020-09-22 15:09:49 -0700489 peciStatus = peci_RdIAMSR(addr, 0, 0x411, &mc4Status, &cc);
490 if (peciError(peciStatus, cc))
Jason M. Billsa3397932019-08-06 11:07:21 -0700491 {
Jason M. Bills8064f8b2020-09-22 15:09:49 -0700492 printPECIError("IA32_MC4_STATUS", addr, peciStatus, cc);
Jason M. Billsa3397932019-08-06 11:07:21 -0700493 continue;
494 }
495 // Check MSEC bits 31:24 for
496 // MCA_SVID_VCCIN_VR_ICC_MAX_FAILURE (0x40),
497 // MCA_SVID_VCCIN_VR_VOUT_FAILURE (0x42), or
498 // MCA_SVID_CPU_VR_CAPABILITY_ERROR (0x43)
Jason M. Billsc90570a2020-09-22 15:24:58 -0700499 uint64_t msec = (mc4Status >> 24) & 0xFF;
500 if (msec == 0x40 || msec == 0x42 || msec == 0x43)
Jason M. Billsa3397932019-08-06 11:07:21 -0700501 {
502 cpuIERRLog(cpu, "CPU/VR Mismatch");
503 continue;
504 }
505
506 // Next check if it's a Core FIVR fault by looking for a
507 // non-zero value of CORE_FIVR_ERR_LOG (B(1) D30 F2 offset
508 // 80h)
509 uint32_t coreFIVRErrLog = 0;
Jason M. Bills8064f8b2020-09-22 15:09:49 -0700510 peciStatus = peci_RdPCIConfigLocal(
511 addr, 1, 30, 2, 0x80, sizeof(uint32_t),
512 (uint8_t*)&coreFIVRErrLog, &cc);
513 if (peciError(peciStatus, cc))
Jason M. Billsa3397932019-08-06 11:07:21 -0700514 {
Jason M. Bills8064f8b2020-09-22 15:09:49 -0700515 printPECIError("CORE_FIVR_ERR_LOG", addr, peciStatus,
516 cc);
Jason M. Billsa3397932019-08-06 11:07:21 -0700517 continue;
518 }
519 if (coreFIVRErrLog)
520 {
521 cpuIERRLog(cpu, "Core FIVR Fault");
522 continue;
523 }
524
525 // Next check if it's an Uncore FIVR fault by looking for a
526 // non-zero value of UNCORE_FIVR_ERR_LOG (B(1) D30 F2 offset
527 // 84h)
528 uint32_t uncoreFIVRErrLog = 0;
Jason M. Bills8064f8b2020-09-22 15:09:49 -0700529 peciStatus = peci_RdPCIConfigLocal(
530 addr, 1, 30, 2, 0x84, sizeof(uint32_t),
531 (uint8_t*)&uncoreFIVRErrLog, &cc);
532 if (peciError(peciStatus, cc))
Jason M. Billsa3397932019-08-06 11:07:21 -0700533 {
Jason M. Bills8064f8b2020-09-22 15:09:49 -0700534 printPECIError("UNCORE_FIVR_ERR_LOG", addr, peciStatus,
535 cc);
Jason M. Billsa3397932019-08-06 11:07:21 -0700536 continue;
537 }
538 if (uncoreFIVRErrLog)
539 {
540 cpuIERRLog(cpu, "Uncore FIVR Fault");
541 continue;
542 }
543
544 // Last if CORE_FIVR_ERR_LOG and UNCORE_FIVR_ERR_LOG are
545 // both zero, but MSEC bits 31:24 have either
546 // MCA_FIVR_CATAS_OVERVOL_FAULT (0x51) or
547 // MCA_FIVR_CATAS_OVERCUR_FAULT (0x52), then log it as an
548 // uncore FIVR fault
549 if (!coreFIVRErrLog && !uncoreFIVRErrLog &&
Jason M. Billsc90570a2020-09-22 15:24:58 -0700550 (msec == 0x51 || msec == 0x52))
Jason M. Billsa3397932019-08-06 11:07:21 -0700551 {
552 cpuIERRLog(cpu, "Uncore FIVR Fault");
553 continue;
554 }
555 cpuIERRLog(cpu);
556 }
557 break;
558 }
559 case icx:
560 {
561 // First check the MCA_ERR_SRC_LOG to see if this is the CPU
562 // that caused the IERR
563 uint32_t mcaErrSrcLog = 0;
Jason M. Bills8064f8b2020-09-22 15:09:49 -0700564 peciStatus = peci_RdPkgConfig(addr, 0, 5, 4,
565 (uint8_t*)&mcaErrSrcLog, &cc);
566 if (peciError(peciStatus, cc))
Jason M. Billsa3397932019-08-06 11:07:21 -0700567 {
Jason M. Bills8064f8b2020-09-22 15:09:49 -0700568 printPECIError("MCA_ERR_SRC_LOG", addr, peciStatus, cc);
Jason M. Billsa3397932019-08-06 11:07:21 -0700569 continue;
570 }
571 // Check MSMI_INTERNAL (20) and IERR_INTERNAL (27)
572 if ((mcaErrSrcLog & (1 << 20)) || (mcaErrSrcLog & (1 << 27)))
573 {
574 // TODO: Light the CPU fault LED?
575 cpuIERRFound = true;
Jason M. Billsd1a19f62019-08-06 11:52:58 -0700576 incrementCPUErrorCount(cpu);
Jason M. Billsa3397932019-08-06 11:07:21 -0700577 // Next check if it's a CPU/VR mismatch by reading the
578 // IA32_MC4_STATUS MSR (0x411)
579 uint64_t mc4Status = 0;
Jason M. Bills8064f8b2020-09-22 15:09:49 -0700580 peciStatus = peci_RdIAMSR(addr, 0, 0x411, &mc4Status, &cc);
581 if (peciError(peciStatus, cc))
Jason M. Billsa3397932019-08-06 11:07:21 -0700582 {
Jason M. Bills8064f8b2020-09-22 15:09:49 -0700583 printPECIError("IA32_MC4_STATUS", addr, peciStatus, cc);
Jason M. Billsa3397932019-08-06 11:07:21 -0700584 continue;
585 }
Jason M. Billsa3397932019-08-06 11:07:21 -0700586 // Check MSEC bits 31:24 for
587 // MCA_SVID_VCCIN_VR_ICC_MAX_FAILURE (0x40),
588 // MCA_SVID_VCCIN_VR_VOUT_FAILURE (0x42), or
589 // MCA_SVID_CPU_VR_CAPABILITY_ERROR (0x43)
Jason M. Billsc90570a2020-09-22 15:24:58 -0700590 uint64_t msec = (mc4Status >> 24) & 0xFF;
591 if (msec == 0x40 || msec == 0x42 || msec == 0x43)
Jason M. Billsa3397932019-08-06 11:07:21 -0700592 {
593 cpuIERRLog(cpu, "CPU/VR Mismatch");
594 continue;
595 }
596
597 // Next check if it's a Core FIVR fault by looking for a
598 // non-zero value of CORE_FIVR_ERR_LOG (B(31) D30 F2 offsets
599 // C0h and C4h) (Note: Bus 31 is accessed on PECI as bus 14)
600 uint32_t coreFIVRErrLog0 = 0;
601 uint32_t coreFIVRErrLog1 = 0;
Jason M. Bills8064f8b2020-09-22 15:09:49 -0700602 peciStatus = peci_RdEndPointConfigPciLocal(
603 addr, 0, 14, 30, 2, 0xC0, sizeof(uint32_t),
604 (uint8_t*)&coreFIVRErrLog0, &cc);
605 if (peciError(peciStatus, cc))
Jason M. Billsa3397932019-08-06 11:07:21 -0700606 {
Jason M. Bills8064f8b2020-09-22 15:09:49 -0700607 printPECIError("CORE_FIVR_ERR_LOG_0", addr, peciStatus,
608 cc);
Jason M. Billsa3397932019-08-06 11:07:21 -0700609 continue;
610 }
Jason M. Bills8064f8b2020-09-22 15:09:49 -0700611 peciStatus = peci_RdEndPointConfigPciLocal(
612 addr, 0, 14, 30, 2, 0xC4, sizeof(uint32_t),
613 (uint8_t*)&coreFIVRErrLog1, &cc);
614 if (peciError(peciStatus, cc))
Jason M. Billsa3397932019-08-06 11:07:21 -0700615 {
Jason M. Bills8064f8b2020-09-22 15:09:49 -0700616 printPECIError("CORE_FIVR_ERR_LOG_1", addr, peciStatus,
617 cc);
Jason M. Billsa3397932019-08-06 11:07:21 -0700618 continue;
619 }
620 if (coreFIVRErrLog0 || coreFIVRErrLog1)
621 {
622 cpuIERRLog(cpu, "Core FIVR Fault");
623 continue;
624 }
625
626 // Next check if it's an Uncore FIVR fault by looking for a
627 // non-zero value of UNCORE_FIVR_ERR_LOG (B(31) D30 F2
628 // offset 84h) (Note: Bus 31 is accessed on PECI as bus 14)
629 uint32_t uncoreFIVRErrLog = 0;
Jason M. Bills8064f8b2020-09-22 15:09:49 -0700630 peciStatus = peci_RdEndPointConfigPciLocal(
631 addr, 0, 14, 30, 2, 0x84, sizeof(uint32_t),
632 (uint8_t*)&uncoreFIVRErrLog, &cc);
633 if (peciError(peciStatus, cc))
Jason M. Billsa3397932019-08-06 11:07:21 -0700634 {
Jason M. Bills8064f8b2020-09-22 15:09:49 -0700635 printPECIError("UNCORE_FIVR_ERR_LOG", addr, peciStatus,
636 cc);
Jason M. Billsa3397932019-08-06 11:07:21 -0700637 continue;
638 }
639 if (uncoreFIVRErrLog)
640 {
641 cpuIERRLog(cpu, "Uncore FIVR Fault");
642 continue;
643 }
644
645 // TODO: Update MSEC/MSCOD_31_24 check
646 // Last if CORE_FIVR_ERR_LOG and UNCORE_FIVR_ERR_LOG are
647 // both zero, but MSEC bits 31:24 have either
648 // MCA_FIVR_CATAS_OVERVOL_FAULT (0x51) or
649 // MCA_FIVR_CATAS_OVERCUR_FAULT (0x52), then log it as an
650 // uncore FIVR fault
651 if (!coreFIVRErrLog0 && !coreFIVRErrLog1 &&
Jason M. Billsc90570a2020-09-22 15:24:58 -0700652 !uncoreFIVRErrLog && (msec == 0x51 || msec == 0x52))
Jason M. Billsa3397932019-08-06 11:07:21 -0700653 {
654 cpuIERRLog(cpu, "Uncore FIVR Fault");
655 continue;
656 }
657 cpuIERRLog(cpu);
658 }
659 break;
660 }
661 }
662 }
663 return cpuIERRFound;
664}
665
Jason M. Billsa15c2522019-08-16 10:01:44 -0700666static void caterrAssertHandler()
667{
Jason M. Billsa15c2522019-08-16 10:01:44 -0700668 caterrAssertTimer.expires_after(std::chrono::milliseconds(caterrTimeoutMs));
669 caterrAssertTimer.async_wait([](const boost::system::error_code ec) {
670 if (ec)
671 {
672 // operation_aborted is expected if timer is canceled
673 // before completion.
674 if (ec != boost::asio::error::operation_aborted)
675 {
676 std::cerr << "caterr timeout async_wait failed: "
677 << ec.message() << "\n";
678 }
Jason M. Billsa15c2522019-08-16 10:01:44 -0700679 return;
680 }
Jason M. Billsa3397932019-08-06 11:07:21 -0700681 std::cerr << "CATERR asserted for " << std::to_string(caterrTimeoutMs)
682 << " ms\n";
Yong Li8c798c72020-04-22 15:29:07 +0800683 beep(beepCPUIERR);
Jason M. Billsa3397932019-08-06 11:07:21 -0700684 if (!checkIERRCPUs())
685 {
686 cpuIERRLog();
687 }
Jason M. Billsa15c2522019-08-16 10:01:44 -0700688 conn->async_method_call(
689 [](boost::system::error_code ec,
690 const std::variant<bool>& property) {
691 if (ec)
692 {
693 return;
694 }
695 const bool* reset = std::get_if<bool>(&property);
696 if (reset == nullptr)
697 {
698 std::cerr << "Unable to read reset on CATERR value\n";
699 return;
700 }
Jason M. Billsd711cc82020-12-04 16:46:39 -0800701 startCrashdumpAndRecovery(conn, *reset, "IERR");
Jason M. Billsa15c2522019-08-16 10:01:44 -0700702 },
703 "xyz.openbmc_project.Settings",
704 "/xyz/openbmc_project/control/processor_error_config",
705 "org.freedesktop.DBus.Properties", "Get",
706 "xyz.openbmc_project.Control.Processor.ErrConfig", "ResetOnCATERR");
707 });
708}
709
Jason M. Bills1490b142019-07-01 15:48:43 -0700710static void caterrHandler()
711{
712 if (!hostOff)
713 {
714 gpiod::line_event gpioLineEvent = caterrLine.event_read();
715
716 bool caterr =
717 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
Yong Li1429ca82020-04-27 16:49:45 +0800718
719 std::vector<Association> associations;
Jason M. Bills1490b142019-07-01 15:48:43 -0700720 if (caterr)
721 {
Jason M. Billsa15c2522019-08-16 10:01:44 -0700722 caterrAssertHandler();
Yong Li1429ca82020-04-27 16:49:45 +0800723 associations.emplace_back(
724 "", "critical",
725 "/xyz/openbmc_project/host_error_monitor/cat_error");
726 associations.emplace_back("", "critical",
727 host_error_monitor::rootPath);
Jason M. Bills1490b142019-07-01 15:48:43 -0700728 }
729 else
730 {
731 caterrAssertTimer.cancel();
Yong Li1429ca82020-04-27 16:49:45 +0800732 associations.emplace_back("", "", "");
Jason M. Bills1490b142019-07-01 15:48:43 -0700733 }
Yong Li1429ca82020-04-27 16:49:45 +0800734 host_error_monitor::associationCATAssert->set_property("Associations",
735 associations);
Jason M. Bills1490b142019-07-01 15:48:43 -0700736 }
737 caterrEvent.async_wait(boost::asio::posix::stream_descriptor::wait_read,
738 [](const boost::system::error_code ec) {
739 if (ec)
740 {
741 std::cerr << "caterr handler error: "
742 << ec.message() << "\n";
743 return;
744 }
745 caterrHandler();
746 });
747}
Jason M. Bills78c5eed2019-08-28 14:00:40 -0700748
Jason M. Billse94f5e12019-09-13 11:11:34 -0700749static void cpu1ThermtripAssertHandler()
750{
Jason M. Bills45e87e02019-09-09 14:45:38 -0700751 if (cpu1FIVRFaultLine.get_value() == 0)
752 {
753 cpuBootFIVRFaultLog(1);
754 }
755 else
756 {
757 cpuThermTripLog(1);
758 }
Jason M. Billse94f5e12019-09-13 11:11:34 -0700759}
760
Jason M. Bills78c5eed2019-08-28 14:00:40 -0700761static void cpu1ThermtripHandler()
762{
Jason M. Bills84951142020-04-17 15:57:11 -0700763 gpiod::line_event gpioLineEvent = cpu1ThermtripLine.event_read();
Jason M. Bills78c5eed2019-08-28 14:00:40 -0700764
Jason M. Bills84951142020-04-17 15:57:11 -0700765 bool cpu1Thermtrip =
766 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
767 if (cpu1Thermtrip)
768 {
769 cpu1ThermtripAssertHandler();
Jason M. Bills78c5eed2019-08-28 14:00:40 -0700770 }
Jason M. Bills84951142020-04-17 15:57:11 -0700771
Jason M. Bills78c5eed2019-08-28 14:00:40 -0700772 cpu1ThermtripEvent.async_wait(
773 boost::asio::posix::stream_descriptor::wait_read,
774 [](const boost::system::error_code ec) {
775 if (ec)
776 {
777 std::cerr << "CPU 1 Thermtrip handler error: " << ec.message()
778 << "\n";
779 return;
780 }
781 cpu1ThermtripHandler();
782 });
783}
784
jayaprakash Mutyala009adbc2019-12-24 22:08:07 +0000785static void cpu1MemtripHandler()
786{
Jason M. Bills5287c022020-05-19 11:16:09 -0700787 gpiod::line_event gpioLineEvent = cpu1MemtripLine.event_read();
jayaprakash Mutyala009adbc2019-12-24 22:08:07 +0000788
Jason M. Bills5287c022020-05-19 11:16:09 -0700789 bool cpu1Memtrip =
790 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
791 if (cpu1Memtrip)
792 {
793 memThermTripLog(1);
jayaprakash Mutyala009adbc2019-12-24 22:08:07 +0000794 }
Jason M. Bills5287c022020-05-19 11:16:09 -0700795
jayaprakash Mutyala009adbc2019-12-24 22:08:07 +0000796 cpu1MemtripEvent.async_wait(
797 boost::asio::posix::stream_descriptor::wait_read,
798 [](const boost::system::error_code ec) {
799 if (ec)
800 {
801 std::cerr << "CPU 1 Memory Thermaltrip handler error: "
802 << ec.message() << "\n";
803 return;
804 }
805 cpu1MemtripHandler();
806 });
807}
808
Jason M. Billse94f5e12019-09-13 11:11:34 -0700809static void cpu2ThermtripAssertHandler()
810{
Jason M. Bills45e87e02019-09-09 14:45:38 -0700811 if (cpu2FIVRFaultLine.get_value() == 0)
812 {
813 cpuBootFIVRFaultLog(2);
814 }
815 else
816 {
817 cpuThermTripLog(2);
818 }
Jason M. Billse94f5e12019-09-13 11:11:34 -0700819}
820
Jason M. Bills78c5eed2019-08-28 14:00:40 -0700821static void cpu2ThermtripHandler()
822{
Jason M. Bills84951142020-04-17 15:57:11 -0700823 gpiod::line_event gpioLineEvent = cpu2ThermtripLine.event_read();
Jason M. Bills78c5eed2019-08-28 14:00:40 -0700824
Jason M. Bills84951142020-04-17 15:57:11 -0700825 bool cpu2Thermtrip =
826 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
827 if (cpu2Thermtrip)
828 {
829 cpu2ThermtripAssertHandler();
Jason M. Bills78c5eed2019-08-28 14:00:40 -0700830 }
Jason M. Bills84951142020-04-17 15:57:11 -0700831
Jason M. Bills78c5eed2019-08-28 14:00:40 -0700832 cpu2ThermtripEvent.async_wait(
833 boost::asio::posix::stream_descriptor::wait_read,
834 [](const boost::system::error_code ec) {
835 if (ec)
836 {
837 std::cerr << "CPU 2 Thermtrip handler error: " << ec.message()
838 << "\n";
839 return;
840 }
841 cpu2ThermtripHandler();
842 });
843}
844
jayaprakash Mutyala009adbc2019-12-24 22:08:07 +0000845static void cpu2MemtripHandler()
846{
Jason M. Bills5287c022020-05-19 11:16:09 -0700847 gpiod::line_event gpioLineEvent = cpu2MemtripLine.event_read();
jayaprakash Mutyala009adbc2019-12-24 22:08:07 +0000848
Jason M. Bills5287c022020-05-19 11:16:09 -0700849 bool cpu2Memtrip =
850 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
851 if (cpu2Memtrip)
852 {
853 memThermTripLog(2);
jayaprakash Mutyala009adbc2019-12-24 22:08:07 +0000854 }
Jason M. Bills5287c022020-05-19 11:16:09 -0700855
jayaprakash Mutyala009adbc2019-12-24 22:08:07 +0000856 cpu2MemtripEvent.async_wait(
857 boost::asio::posix::stream_descriptor::wait_read,
858 [](const boost::system::error_code ec) {
859 if (ec)
860 {
861 std::cerr << "CPU 2 Memory Thermaltrip handler error: "
862 << ec.message() << "\n";
863 return;
864 }
865 cpu2MemtripHandler();
866 });
867}
868
Jason M. Billse94f5e12019-09-13 11:11:34 -0700869static void cpu1VRHotAssertHandler()
870{
871 cpuVRHotLog("CPU 1");
872}
873
Jason M. Bills250fa632019-08-28 15:58:25 -0700874static void cpu1VRHotHandler()
875{
Jason M. Bills84951142020-04-17 15:57:11 -0700876 gpiod::line_event gpioLineEvent = cpu1VRHotLine.event_read();
Jason M. Bills250fa632019-08-28 15:58:25 -0700877
Jason M. Bills84951142020-04-17 15:57:11 -0700878 bool cpu1VRHot =
879 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
880 if (cpu1VRHot)
881 {
882 cpu1VRHotAssertHandler();
Jason M. Bills250fa632019-08-28 15:58:25 -0700883 }
Jason M. Bills84951142020-04-17 15:57:11 -0700884
Jason M. Bills250fa632019-08-28 15:58:25 -0700885 cpu1VRHotEvent.async_wait(boost::asio::posix::stream_descriptor::wait_read,
886 [](const boost::system::error_code ec) {
887 if (ec)
888 {
889 std::cerr << "CPU 1 VRHot handler error: "
890 << ec.message() << "\n";
891 return;
892 }
893 cpu1VRHotHandler();
894 });
895}
896
Jason M. Billse94f5e12019-09-13 11:11:34 -0700897static void cpu1MemABCDVRHotAssertHandler()
898{
899 cpuVRHotLog("CPU 1 Memory ABCD");
900}
901
Jason M. Bills9647ba72019-08-29 14:19:19 -0700902static void cpu1MemABCDVRHotHandler()
903{
Jason M. Bills84951142020-04-17 15:57:11 -0700904 gpiod::line_event gpioLineEvent = cpu1MemABCDVRHotLine.event_read();
Jason M. Bills9647ba72019-08-29 14:19:19 -0700905
Jason M. Bills84951142020-04-17 15:57:11 -0700906 bool cpu1MemABCDVRHot =
907 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
908 if (cpu1MemABCDVRHot)
909 {
910 cpu1MemABCDVRHotAssertHandler();
Jason M. Bills9647ba72019-08-29 14:19:19 -0700911 }
Jason M. Bills84951142020-04-17 15:57:11 -0700912
Jason M. Bills9647ba72019-08-29 14:19:19 -0700913 cpu1MemABCDVRHotEvent.async_wait(
914 boost::asio::posix::stream_descriptor::wait_read,
915 [](const boost::system::error_code ec) {
916 if (ec)
917 {
918 std::cerr << "CPU 1 Memory ABCD VRHot handler error: "
919 << ec.message() << "\n";
920 return;
921 }
922 cpu1MemABCDVRHotHandler();
923 });
924}
925
Jason M. Billse94f5e12019-09-13 11:11:34 -0700926static void cpu1MemEFGHVRHotAssertHandler()
927{
928 cpuVRHotLog("CPU 1 Memory EFGH");
929}
930
Jason M. Bills9647ba72019-08-29 14:19:19 -0700931static void cpu1MemEFGHVRHotHandler()
932{
Jason M. Bills84951142020-04-17 15:57:11 -0700933 gpiod::line_event gpioLineEvent = cpu1MemEFGHVRHotLine.event_read();
Jason M. Bills9647ba72019-08-29 14:19:19 -0700934
Jason M. Bills84951142020-04-17 15:57:11 -0700935 bool cpu1MemEFGHVRHot =
936 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
937 if (cpu1MemEFGHVRHot)
938 {
939 cpu1MemEFGHVRHotAssertHandler();
Jason M. Bills9647ba72019-08-29 14:19:19 -0700940 }
Jason M. Bills84951142020-04-17 15:57:11 -0700941
Jason M. Bills9647ba72019-08-29 14:19:19 -0700942 cpu1MemEFGHVRHotEvent.async_wait(
943 boost::asio::posix::stream_descriptor::wait_read,
944 [](const boost::system::error_code ec) {
945 if (ec)
946 {
947 std::cerr << "CPU 1 Memory EFGH VRHot handler error: "
948 << ec.message() << "\n";
949 return;
950 }
951 cpu1MemEFGHVRHotHandler();
952 });
953}
954
Jason M. Billse94f5e12019-09-13 11:11:34 -0700955static void cpu2VRHotAssertHandler()
956{
957 cpuVRHotLog("CPU 2");
958}
959
Jason M. Bills250fa632019-08-28 15:58:25 -0700960static void cpu2VRHotHandler()
961{
Jason M. Bills84951142020-04-17 15:57:11 -0700962 gpiod::line_event gpioLineEvent = cpu2VRHotLine.event_read();
Jason M. Bills250fa632019-08-28 15:58:25 -0700963
Jason M. Bills84951142020-04-17 15:57:11 -0700964 bool cpu2VRHot =
965 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
966 if (cpu2VRHot)
967 {
968 cpu2VRHotAssertHandler();
Jason M. Bills250fa632019-08-28 15:58:25 -0700969 }
Jason M. Bills84951142020-04-17 15:57:11 -0700970
Jason M. Bills250fa632019-08-28 15:58:25 -0700971 cpu2VRHotEvent.async_wait(boost::asio::posix::stream_descriptor::wait_read,
972 [](const boost::system::error_code ec) {
973 if (ec)
974 {
975 std::cerr << "CPU 2 VRHot handler error: "
976 << ec.message() << "\n";
977 return;
978 }
979 cpu2VRHotHandler();
980 });
981}
982
Jason M. Billse94f5e12019-09-13 11:11:34 -0700983static void cpu2MemABCDVRHotAssertHandler()
984{
985 cpuVRHotLog("CPU 2 Memory ABCD");
986}
987
Jason M. Bills9647ba72019-08-29 14:19:19 -0700988static void cpu2MemABCDVRHotHandler()
989{
Jason M. Bills84951142020-04-17 15:57:11 -0700990 gpiod::line_event gpioLineEvent = cpu2MemABCDVRHotLine.event_read();
Jason M. Bills9647ba72019-08-29 14:19:19 -0700991
Jason M. Bills84951142020-04-17 15:57:11 -0700992 bool cpu2MemABCDVRHot =
993 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
994 if (cpu2MemABCDVRHot)
995 {
996 cpu2MemABCDVRHotAssertHandler();
Jason M. Bills9647ba72019-08-29 14:19:19 -0700997 }
Jason M. Bills84951142020-04-17 15:57:11 -0700998
Jason M. Bills9647ba72019-08-29 14:19:19 -0700999 cpu2MemABCDVRHotEvent.async_wait(
1000 boost::asio::posix::stream_descriptor::wait_read,
1001 [](const boost::system::error_code ec) {
1002 if (ec)
1003 {
1004 std::cerr << "CPU 2 Memory ABCD VRHot handler error: "
1005 << ec.message() << "\n";
1006 return;
1007 }
1008 cpu2MemABCDVRHotHandler();
1009 });
1010}
1011
Jason M. Billse94f5e12019-09-13 11:11:34 -07001012static void cpu2MemEFGHVRHotAssertHandler()
1013{
1014 cpuVRHotLog("CPU 2 Memory EFGH");
1015}
1016
Jason M. Bills9647ba72019-08-29 14:19:19 -07001017static void cpu2MemEFGHVRHotHandler()
1018{
Jason M. Bills84951142020-04-17 15:57:11 -07001019 gpiod::line_event gpioLineEvent = cpu2MemEFGHVRHotLine.event_read();
Jason M. Bills9647ba72019-08-29 14:19:19 -07001020
Jason M. Bills84951142020-04-17 15:57:11 -07001021 bool cpu2MemEFGHVRHot =
1022 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
1023 if (cpu2MemEFGHVRHot)
1024 {
1025 cpu2MemEFGHVRHotAssertHandler();
Jason M. Bills9647ba72019-08-29 14:19:19 -07001026 }
Jason M. Bills84951142020-04-17 15:57:11 -07001027
Jason M. Bills9647ba72019-08-29 14:19:19 -07001028 cpu2MemEFGHVRHotEvent.async_wait(
1029 boost::asio::posix::stream_descriptor::wait_read,
1030 [](const boost::system::error_code ec) {
1031 if (ec)
1032 {
1033 std::cerr << "CPU 2 Memory EFGH VRHot handler error: "
1034 << ec.message() << "\n";
1035 return;
1036 }
1037 cpu2MemEFGHVRHotHandler();
1038 });
1039}
1040
Chen Yugange6c0f1c2019-08-02 20:36:42 +08001041static void pchThermtripHandler()
1042{
Yong Li1429ca82020-04-27 16:49:45 +08001043 std::vector<Association> associations;
1044
Jason M. Bills84951142020-04-17 15:57:11 -07001045 gpiod::line_event gpioLineEvent = pchThermtripLine.event_read();
Chen Yugange6c0f1c2019-08-02 20:36:42 +08001046
Jason M. Bills84951142020-04-17 15:57:11 -07001047 bool pchThermtrip =
1048 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
1049 if (pchThermtrip)
1050 {
1051 ssbThermTripLog();
Yong Li1429ca82020-04-27 16:49:45 +08001052 associations.emplace_back(
1053 "", "critical",
1054 "/xyz/openbmc_project/host_error_monitor/ssb_thermal_trip");
1055 associations.emplace_back("", "critical", host_error_monitor::rootPath);
Chen Yugange6c0f1c2019-08-02 20:36:42 +08001056 }
Yong Li1429ca82020-04-27 16:49:45 +08001057 else
1058 {
1059 associations.emplace_back("", "", "");
1060 }
1061 host_error_monitor::associationSSBThermTrip->set_property("Associations",
1062 associations);
Jason M. Bills84951142020-04-17 15:57:11 -07001063
Chen Yugange6c0f1c2019-08-02 20:36:42 +08001064 pchThermtripEvent.async_wait(
1065 boost::asio::posix::stream_descriptor::wait_read,
1066 [](const boost::system::error_code ec) {
1067 if (ec)
1068 {
1069 std::cerr << "PCH Thermal trip handler error: " << ec.message()
1070 << "\n";
1071 return;
1072 }
1073 pchThermtripHandler();
1074 });
1075}
1076
Jason M. Billscbf78532019-08-16 15:32:11 -07001077static std::bitset<MAX_CPUS> checkERRPinCPUs(const int errPin)
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001078{
Jason M. Billscbf78532019-08-16 15:32:11 -07001079 int errPinSts = (1 << errPin);
1080 std::bitset<MAX_CPUS> errPinCPUs = 0;
Jason M. Bills8064f8b2020-09-22 15:09:49 -07001081 for (size_t cpu = 0, addr = MIN_CLIENT_ADDR; addr <= MAX_CLIENT_ADDR;
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001082 cpu++, addr++)
1083 {
1084 if (peci_Ping(addr) == PECI_CC_SUCCESS)
1085 {
Jason M. Bills8064f8b2020-09-22 15:09:49 -07001086 EPECIStatus peciStatus = PECI_CC_SUCCESS;
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001087 uint8_t cc = 0;
1088 CPUModel model{};
1089 uint8_t stepping = 0;
1090 if (peci_GetCPUID(addr, &model, &stepping, &cc) != PECI_CC_SUCCESS)
1091 {
1092 std::cerr << "Cannot get CPUID!\n";
1093 continue;
1094 }
1095
1096 switch (model)
1097 {
1098 case skx:
1099 {
1100 // Check the ERRPINSTS to see if this is the CPU that caused
Jason M. Billscbf78532019-08-16 15:32:11 -07001101 // the ERRx (B(0) D8 F0 offset 210h)
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001102 uint32_t errpinsts = 0;
Jason M. Bills8064f8b2020-09-22 15:09:49 -07001103 peciStatus = peci_RdPCIConfigLocal(
1104 addr, 0, 8, 0, 0x210, sizeof(uint32_t),
1105 (uint8_t*)&errpinsts, &cc);
1106 if (peciError(peciStatus, cc))
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001107 {
Jason M. Bills8064f8b2020-09-22 15:09:49 -07001108 printPECIError("ERRPINSTS", addr, peciStatus, cc);
1109 continue;
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001110 }
Jason M. Bills8064f8b2020-09-22 15:09:49 -07001111
1112 errPinCPUs[cpu] = (errpinsts & errPinSts) != 0;
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001113 break;
1114 }
1115 case icx:
1116 {
1117 // Check the ERRPINSTS to see if this is the CPU that caused
Jason M. Billscbf78532019-08-16 15:32:11 -07001118 // the ERRx (B(30) D0 F3 offset 274h) (Note: Bus 30 is
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001119 // accessed on PECI as bus 13)
1120 uint32_t errpinsts = 0;
Jason M. Bills8064f8b2020-09-22 15:09:49 -07001121 peciStatus = peci_RdEndPointConfigPciLocal(
1122 addr, 0, 13, 0, 3, 0x274, sizeof(uint32_t),
1123 (uint8_t*)&errpinsts, &cc);
1124 if (peciError(peciStatus, cc))
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001125 {
Jason M. Bills8064f8b2020-09-22 15:09:49 -07001126 printPECIError("ERRPINSTS", addr, peciStatus, cc);
1127 continue;
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001128 }
Jason M. Bills8064f8b2020-09-22 15:09:49 -07001129
1130 errPinCPUs[cpu] = (errpinsts & errPinSts) != 0;
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001131 break;
1132 }
1133 }
1134 }
1135 }
Jason M. Billscbf78532019-08-16 15:32:11 -07001136 return errPinCPUs;
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001137}
1138
Jason M. Billscbf78532019-08-16 15:32:11 -07001139static void errXAssertHandler(const int errPin,
1140 boost::asio::steady_timer& errXAssertTimer)
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001141{
Jason M. Billscbf78532019-08-16 15:32:11 -07001142 // ERRx status is not guaranteed through the timeout, so save which
1143 // CPUs have it asserted
1144 std::bitset<MAX_CPUS> errPinCPUs = checkERRPinCPUs(errPin);
1145 errXAssertTimer.expires_after(std::chrono::milliseconds(errTimeoutMs));
1146 errXAssertTimer.async_wait([errPin, errPinCPUs](
1147 const boost::system::error_code ec) {
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001148 if (ec)
1149 {
1150 // operation_aborted is expected if timer is canceled before
1151 // completion.
1152 if (ec != boost::asio::error::operation_aborted)
1153 {
1154 std::cerr << "err2 timeout async_wait failed: " << ec.message()
1155 << "\n";
1156 }
1157 return;
1158 }
Jason M. Billscbf78532019-08-16 15:32:11 -07001159 std::cerr << "ERR" << std::to_string(errPin) << " asserted for "
1160 << std::to_string(errTimeoutMs) << " ms\n";
1161 if (errPinCPUs.count())
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001162 {
Jason M. Billscbf78532019-08-16 15:32:11 -07001163 for (int i = 0; i < errPinCPUs.size(); i++)
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001164 {
Jason M. Billscbf78532019-08-16 15:32:11 -07001165 if (errPinCPUs[i])
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001166 {
Jason M. Billscbf78532019-08-16 15:32:11 -07001167 cpuERRXLog(errPin, i);
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001168 }
1169 }
1170 }
1171 else
1172 {
Jason M. Billscbf78532019-08-16 15:32:11 -07001173 cpuERRXLog(errPin);
1174 }
1175 });
1176}
1177
Jason M. Bills8c584392019-08-19 11:05:51 -07001178static void err0AssertHandler()
1179{
1180 // Handle the standard ERR0 detection and logging
1181 const static constexpr int err0 = 0;
1182 errXAssertHandler(err0, err0AssertTimer);
1183}
1184
1185static void err0Handler()
1186{
1187 if (!hostOff)
1188 {
1189 gpiod::line_event gpioLineEvent = err0Line.event_read();
1190
1191 bool err0 = gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
1192 if (err0)
1193 {
1194 err0AssertHandler();
1195 }
1196 else
1197 {
1198 err0AssertTimer.cancel();
1199 }
1200 }
1201 err0Event.async_wait(boost::asio::posix::stream_descriptor::wait_read,
1202 [](const boost::system::error_code ec) {
1203 if (ec)
1204 {
1205 std::cerr
1206 << "err0 handler error: " << ec.message()
1207 << "\n";
1208 return;
1209 }
1210 err0Handler();
1211 });
1212}
1213
Jason M. Bills75af3962019-08-19 11:07:17 -07001214static void err1AssertHandler()
1215{
1216 // Handle the standard ERR1 detection and logging
1217 const static constexpr int err1 = 1;
1218 errXAssertHandler(err1, err1AssertTimer);
1219}
1220
1221static void err1Handler()
1222{
1223 if (!hostOff)
1224 {
1225 gpiod::line_event gpioLineEvent = err1Line.event_read();
1226
1227 bool err1 = gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
1228 if (err1)
1229 {
1230 err1AssertHandler();
1231 }
1232 else
1233 {
1234 err1AssertTimer.cancel();
1235 }
1236 }
1237 err1Event.async_wait(boost::asio::posix::stream_descriptor::wait_read,
1238 [](const boost::system::error_code ec) {
1239 if (ec)
1240 {
1241 std::cerr
1242 << "err1 handler error: " << ec.message()
1243 << "\n";
1244 return;
1245 }
1246 err1Handler();
1247 });
1248}
1249
Jason M. Billscbf78532019-08-16 15:32:11 -07001250static void err2AssertHandler()
1251{
1252 // Handle the standard ERR2 detection and logging
1253 const static constexpr int err2 = 2;
1254 errXAssertHandler(err2, err2AssertTimer);
1255 // Also handle reset for ERR2
1256 err2AssertTimer.async_wait([](const boost::system::error_code ec) {
1257 if (ec)
1258 {
1259 // operation_aborted is expected if timer is canceled before
1260 // completion.
1261 if (ec != boost::asio::error::operation_aborted)
1262 {
1263 std::cerr << "err2 timeout async_wait failed: " << ec.message()
1264 << "\n";
1265 }
1266 return;
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001267 }
1268 conn->async_method_call(
1269 [](boost::system::error_code ec,
1270 const std::variant<bool>& property) {
1271 if (ec)
1272 {
1273 return;
1274 }
1275 const bool* reset = std::get_if<bool>(&property);
1276 if (reset == nullptr)
1277 {
1278 std::cerr << "Unable to read reset on ERR2 value\n";
1279 return;
1280 }
Jason M. Billsd711cc82020-12-04 16:46:39 -08001281 startCrashdumpAndRecovery(conn, *reset, "ERR2 Timeout");
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001282 },
1283 "xyz.openbmc_project.Settings",
1284 "/xyz/openbmc_project/control/processor_error_config",
1285 "org.freedesktop.DBus.Properties", "Get",
1286 "xyz.openbmc_project.Control.Processor.ErrConfig", "ResetOnERR2");
Yong Li061eb032020-02-26 15:06:18 +08001287
1288 beep(beepCPUErr2);
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001289 });
1290}
1291
1292static void err2Handler()
1293{
1294 if (!hostOff)
1295 {
1296 gpiod::line_event gpioLineEvent = err2Line.event_read();
1297
1298 bool err2 = gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
1299 if (err2)
1300 {
1301 err2AssertHandler();
1302 }
1303 else
1304 {
1305 err2AssertTimer.cancel();
1306 }
1307 }
1308 err2Event.async_wait(boost::asio::posix::stream_descriptor::wait_read,
1309 [](const boost::system::error_code ec) {
1310 if (ec)
1311 {
1312 std::cerr
1313 << "err2 handler error: " << ec.message()
1314 << "\n";
1315 return;
1316 }
1317 err2Handler();
1318 });
1319}
1320
Jason M. Billsa15c2522019-08-16 10:01:44 -07001321static void initializeErrorState()
1322{
jayaprakash Mutyala53099c42020-03-15 00:16:26 +00001323 // Handle CPU1_MISMATCH if it's asserted now
1324 if (cpu1MismatchLine.get_value() == 1)
1325 {
1326 cpuMismatchLog(1);
1327 }
1328
1329 // Handle CPU2_MISMATCH if it's asserted now
1330 if (cpu2MismatchLine.get_value() == 1)
1331 {
1332 cpuMismatchLog(2);
1333 }
1334
Jason M. Billsa15c2522019-08-16 10:01:44 -07001335 // Handle CPU_CATERR if it's asserted now
1336 if (caterrLine.get_value() == 0)
1337 {
1338 caterrAssertHandler();
Yong Li1429ca82020-04-27 16:49:45 +08001339 std::vector<Association> associations;
1340 associations.emplace_back(
1341 "", "critical", "/xyz/openbmc_project/host_error_monitor/cat_err");
1342 associations.emplace_back("", "critical", host_error_monitor::rootPath);
1343 host_error_monitor::associationCATAssert->set_property("Associations",
1344 associations);
Jason M. Billsa15c2522019-08-16 10:01:44 -07001345 }
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001346
Jason M. Bills8c584392019-08-19 11:05:51 -07001347 // Handle CPU_ERR0 if it's asserted now
1348 if (err0Line.get_value() == 0)
1349 {
1350 err0AssertHandler();
1351 }
1352
Jason M. Bills75af3962019-08-19 11:07:17 -07001353 // Handle CPU_ERR1 if it's asserted now
1354 if (err1Line.get_value() == 0)
1355 {
1356 err1AssertHandler();
1357 }
1358
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001359 // Handle CPU_ERR2 if it's asserted now
1360 if (err2Line.get_value() == 0)
1361 {
1362 err2AssertHandler();
1363 }
Jason M. Bills89922f82019-08-06 11:10:02 -07001364
Jason M. Billse94f5e12019-09-13 11:11:34 -07001365 // Handle CPU1_THERMTRIP if it's asserted now
1366 if (cpu1ThermtripLine.get_value() == 0)
1367 {
1368 cpu1ThermtripAssertHandler();
1369 }
1370
1371 // Handle CPU2_THERMTRIP if it's asserted now
1372 if (cpu2ThermtripLine.get_value() == 0)
1373 {
1374 cpu2ThermtripAssertHandler();
1375 }
1376
jayaprakash Mutyala009adbc2019-12-24 22:08:07 +00001377 // Handle CPU1_MEM_THERM_EVENT (CPU1 DIMM Thermal trip) if it's asserted now
1378 if (cpu1MemtripLine.get_value() == 0)
1379 {
1380 memThermTripLog(1);
1381 }
1382
1383 // Handle CPU2_MEM_THERM_EVENT (CPU2 DIMM Thermal trip) if it's asserted now
1384 if (cpu2MemtripLine.get_value() == 0)
1385 {
1386 memThermTripLog(2);
1387 }
1388
Jason M. Billse94f5e12019-09-13 11:11:34 -07001389 // Handle CPU1_VRHOT if it's asserted now
1390 if (cpu1VRHotLine.get_value() == 0)
1391 {
1392 cpu1VRHotAssertHandler();
1393 }
1394
1395 // Handle CPU1_MEM_ABCD_VRHOT if it's asserted now
1396 if (cpu1MemABCDVRHotLine.get_value() == 0)
1397 {
1398 cpu1MemABCDVRHotAssertHandler();
1399 }
1400
1401 // Handle CPU1_MEM_EFGH_VRHOT if it's asserted now
1402 if (cpu1MemEFGHVRHotLine.get_value() == 0)
1403 {
1404 cpu1MemEFGHVRHotAssertHandler();
1405 }
1406
1407 // Handle CPU2_VRHOT if it's asserted now
1408 if (cpu2VRHotLine.get_value() == 0)
1409 {
1410 cpu2VRHotAssertHandler();
1411 }
1412
1413 // Handle CPU2_MEM_ABCD_VRHOT if it's asserted now
1414 if (cpu2MemABCDVRHotLine.get_value() == 0)
1415 {
1416 cpu2MemABCDVRHotAssertHandler();
1417 }
1418
1419 // Handle CPU2_MEM_EFGH_VRHOT if it's asserted now
1420 if (cpu2MemEFGHVRHotLine.get_value() == 0)
1421 {
1422 cpu2MemEFGHVRHotAssertHandler();
1423 }
1424
Jason M. Bills08866542019-08-16 12:04:19 -07001425 // Handle PCH_BMC_THERMTRIP if it's asserted now
1426 if (pchThermtripLine.get_value() == 0)
1427 {
1428 ssbThermTripLog();
Yong Li1429ca82020-04-27 16:49:45 +08001429 std::vector<Association> associations;
1430 associations.emplace_back(
1431 "", "critical",
1432 "/xyz/openbmc_project/host_error_monitor/ssb_thermal_trip");
1433 associations.emplace_back("", "critical", host_error_monitor::rootPath);
1434 host_error_monitor::associationSSBThermTrip->set_property(
1435 "Associations", associations);
Jason M. Bills08866542019-08-16 12:04:19 -07001436 }
Jason M. Billsa15c2522019-08-16 10:01:44 -07001437}
Jason M. Bills1490b142019-07-01 15:48:43 -07001438} // namespace host_error_monitor
1439
1440int main(int argc, char* argv[])
1441{
1442 // setup connection to dbus
1443 host_error_monitor::conn =
1444 std::make_shared<sdbusplus::asio::connection>(host_error_monitor::io);
1445
Jason M. Billsc4b91f22019-11-26 17:04:50 -08001446 // Host Error Monitor Service
Jason M. Bills1490b142019-07-01 15:48:43 -07001447 host_error_monitor::conn->request_name(
1448 "xyz.openbmc_project.HostErrorMonitor");
1449 sdbusplus::asio::object_server server =
1450 sdbusplus::asio::object_server(host_error_monitor::conn);
1451
Yong Li1429ca82020-04-27 16:49:45 +08001452 // Associations interface for led status
1453 std::vector<host_error_monitor::Association> associations;
1454 associations.emplace_back("", "", "");
1455 host_error_monitor::associationSSBThermTrip = server.add_interface(
1456 "/xyz/openbmc_project/host_error_monitor/ssb_thermal_trip",
1457 "xyz.openbmc_project.Association.Definitions");
1458 host_error_monitor::associationSSBThermTrip->register_property(
1459 "Associations", associations);
1460 host_error_monitor::associationSSBThermTrip->initialize();
1461
1462 host_error_monitor::associationCATAssert = server.add_interface(
1463 "/xyz/openbmc_project/host_error_monitor/cat_assert",
1464 "xyz.openbmc_project.Association.Definitions");
1465 host_error_monitor::associationCATAssert->register_property("Associations",
1466 associations);
1467 host_error_monitor::associationCATAssert->initialize();
1468
Jason M. Billsc4b91f22019-11-26 17:04:50 -08001469 // Restart Cause Interface
1470 host_error_monitor::hostErrorTimeoutIface =
1471 server.add_interface("/xyz/openbmc_project/host_error_monitor",
1472 "xyz.openbmc_project.HostErrorMonitor.Timeout");
1473
1474 host_error_monitor::hostErrorTimeoutIface->register_property(
1475 "IERRTimeoutMs", host_error_monitor::caterrTimeoutMs,
1476 [](const std::size_t& requested, std::size_t& resp) {
1477 if (requested > host_error_monitor::caterrTimeoutMsMax)
1478 {
1479 std::cerr << "IERRTimeoutMs update to " << requested
1480 << "ms rejected. Cannot be greater than "
1481 << host_error_monitor::caterrTimeoutMsMax << "ms.\n";
1482 return 0;
1483 }
1484 std::cerr << "IERRTimeoutMs updated to " << requested << "ms\n";
1485 host_error_monitor::caterrTimeoutMs = requested;
1486 resp = requested;
1487 return 1;
1488 },
1489 [](std::size_t& resp) { return host_error_monitor::caterrTimeoutMs; });
1490 host_error_monitor::hostErrorTimeoutIface->initialize();
1491
Jason M. Bills1490b142019-07-01 15:48:43 -07001492 // Start tracking host state
1493 std::shared_ptr<sdbusplus::bus::match::match> hostStateMonitor =
1494 host_error_monitor::startHostStateMonitor();
1495
jayaprakash Mutyala53099c42020-03-15 00:16:26 +00001496 // Request CPU1_MISMATCH GPIO events
1497 if (!host_error_monitor::requestGPIOInput(
1498 "CPU1_MISMATCH", host_error_monitor::cpu1MismatchLine))
1499 {
1500 return -1;
1501 }
1502
1503 // Request CPU2_MISMATCH GPIO events
1504 if (!host_error_monitor::requestGPIOInput(
1505 "CPU2_MISMATCH", host_error_monitor::cpu2MismatchLine))
1506 {
1507 return -1;
1508 }
1509
Jason M. Bills1490b142019-07-01 15:48:43 -07001510 // Request CPU_CATERR GPIO events
1511 if (!host_error_monitor::requestGPIOEvents(
1512 "CPU_CATERR", host_error_monitor::caterrHandler,
1513 host_error_monitor::caterrLine, host_error_monitor::caterrEvent))
1514 {
1515 return -1;
1516 }
1517
Jason M. Bills8c584392019-08-19 11:05:51 -07001518 // Request CPU_ERR0 GPIO events
1519 if (!host_error_monitor::requestGPIOEvents(
1520 "CPU_ERR0", host_error_monitor::err0Handler,
1521 host_error_monitor::err0Line, host_error_monitor::err0Event))
1522 {
1523 return -1;
1524 }
1525
Jason M. Bills75af3962019-08-19 11:07:17 -07001526 // Request CPU_ERR1 GPIO events
1527 if (!host_error_monitor::requestGPIOEvents(
1528 "CPU_ERR1", host_error_monitor::err1Handler,
1529 host_error_monitor::err1Line, host_error_monitor::err1Event))
1530 {
1531 return -1;
1532 }
1533
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001534 // Request CPU_ERR2 GPIO events
1535 if (!host_error_monitor::requestGPIOEvents(
1536 "CPU_ERR2", host_error_monitor::err2Handler,
1537 host_error_monitor::err2Line, host_error_monitor::err2Event))
1538 {
1539 return -1;
1540 }
1541
Jason M. Bills45e87e02019-09-09 14:45:38 -07001542 // Request CPU1_FIVR_FAULT GPIO input
1543 if (!host_error_monitor::requestGPIOInput(
1544 "CPU1_FIVR_FAULT", host_error_monitor::cpu1FIVRFaultLine))
1545 {
1546 return -1;
1547 }
1548
Jason M. Bills78c5eed2019-08-28 14:00:40 -07001549 // Request CPU1_THERMTRIP GPIO events
1550 if (!host_error_monitor::requestGPIOEvents(
1551 "CPU1_THERMTRIP", host_error_monitor::cpu1ThermtripHandler,
1552 host_error_monitor::cpu1ThermtripLine,
1553 host_error_monitor::cpu1ThermtripEvent))
1554 {
1555 return -1;
1556 }
1557
Jason M. Bills45e87e02019-09-09 14:45:38 -07001558 // Request CPU2_FIVR_FAULT GPIO input
1559 if (!host_error_monitor::requestGPIOInput(
1560 "CPU2_FIVR_FAULT", host_error_monitor::cpu2FIVRFaultLine))
1561 {
1562 return -1;
1563 }
1564
Jason M. Bills78c5eed2019-08-28 14:00:40 -07001565 // Request CPU2_THERMTRIP GPIO events
1566 if (!host_error_monitor::requestGPIOEvents(
1567 "CPU2_THERMTRIP", host_error_monitor::cpu2ThermtripHandler,
1568 host_error_monitor::cpu2ThermtripLine,
1569 host_error_monitor::cpu2ThermtripEvent))
1570 {
1571 return -1;
1572 }
1573
Jason M. Bills250fa632019-08-28 15:58:25 -07001574 // Request CPU1_VRHOT GPIO events
1575 if (!host_error_monitor::requestGPIOEvents(
1576 "CPU1_VRHOT", host_error_monitor::cpu1VRHotHandler,
1577 host_error_monitor::cpu1VRHotLine,
1578 host_error_monitor::cpu1VRHotEvent))
1579 {
1580 return -1;
1581 }
1582
Jason M. Bills9647ba72019-08-29 14:19:19 -07001583 // Request CPU1_MEM_ABCD_VRHOT GPIO events
1584 if (!host_error_monitor::requestGPIOEvents(
1585 "CPU1_MEM_ABCD_VRHOT", host_error_monitor::cpu1MemABCDVRHotHandler,
1586 host_error_monitor::cpu1MemABCDVRHotLine,
1587 host_error_monitor::cpu1MemABCDVRHotEvent))
1588 {
1589 return -1;
1590 }
1591
1592 // Request CPU1_MEM_EFGH_VRHOT GPIO events
1593 if (!host_error_monitor::requestGPIOEvents(
1594 "CPU1_MEM_EFGH_VRHOT", host_error_monitor::cpu1MemEFGHVRHotHandler,
1595 host_error_monitor::cpu1MemEFGHVRHotLine,
1596 host_error_monitor::cpu1MemEFGHVRHotEvent))
1597 {
1598 return -1;
1599 }
1600
Jason M. Bills250fa632019-08-28 15:58:25 -07001601 // Request CPU2_VRHOT GPIO events
1602 if (!host_error_monitor::requestGPIOEvents(
1603 "CPU2_VRHOT", host_error_monitor::cpu2VRHotHandler,
1604 host_error_monitor::cpu2VRHotLine,
1605 host_error_monitor::cpu2VRHotEvent))
1606 {
1607 return -1;
1608 }
1609
Jason M. Bills9647ba72019-08-29 14:19:19 -07001610 // Request CPU2_MEM_ABCD_VRHOT GPIO events
1611 if (!host_error_monitor::requestGPIOEvents(
1612 "CPU2_MEM_ABCD_VRHOT", host_error_monitor::cpu2MemABCDVRHotHandler,
1613 host_error_monitor::cpu2MemABCDVRHotLine,
1614 host_error_monitor::cpu2MemABCDVRHotEvent))
1615 {
1616 return -1;
1617 }
1618
1619 // Request CPU2_MEM_EFGH_VRHOT GPIO events
1620 if (!host_error_monitor::requestGPIOEvents(
1621 "CPU2_MEM_EFGH_VRHOT", host_error_monitor::cpu2MemEFGHVRHotHandler,
1622 host_error_monitor::cpu2MemEFGHVRHotLine,
1623 host_error_monitor::cpu2MemEFGHVRHotEvent))
1624 {
1625 return -1;
1626 }
1627
Chen Yugange6c0f1c2019-08-02 20:36:42 +08001628 // Request PCH_BMC_THERMTRIP GPIO events
1629 if (!host_error_monitor::requestGPIOEvents(
1630 "PCH_BMC_THERMTRIP", host_error_monitor::pchThermtripHandler,
1631 host_error_monitor::pchThermtripLine,
1632 host_error_monitor::pchThermtripEvent))
1633 {
1634 return -1;
1635 }
1636
jayaprakash Mutyala009adbc2019-12-24 22:08:07 +00001637 // Request CPU1_MEM_THERM_EVENT GPIO events
1638 if (!host_error_monitor::requestGPIOEvents(
1639 "CPU1_MEM_THERM_EVENT", host_error_monitor::cpu1MemtripHandler,
1640 host_error_monitor::cpu1MemtripLine,
1641 host_error_monitor::cpu1MemtripEvent))
1642 {
1643 return -1;
1644 }
1645
1646 // Request CPU2_MEM_THERM_EVENT GPIO events
1647 if (!host_error_monitor::requestGPIOEvents(
1648 "CPU2_MEM_THERM_EVENT", host_error_monitor::cpu2MemtripHandler,
1649 host_error_monitor::cpu2MemtripLine,
1650 host_error_monitor::cpu2MemtripEvent))
1651 {
1652 return -1;
1653 }
1654
Jason M. Bills5245ed62020-12-04 16:50:21 -08001655 // Initialize the signal monitors
1656 host_error_monitor::init();
1657
Jason M. Bills1490b142019-07-01 15:48:43 -07001658 host_error_monitor::io.run();
1659
1660 return 0;
1661}