blob: 7937d9b64f2a4e6d5798ce79f9ea122c789a3321 [file] [log] [blame]
Jason M. Bills1490b142019-07-01 15:48:43 -07001/*
2// Copyright (c) 2019 Intel Corporation
3//
4// Licensed under the Apache License, Version 2.0 (the "License");
5// you may not use this file except in compliance with the License.
6// You may obtain a copy of the License at
7//
8// http://www.apache.org/licenses/LICENSE-2.0
9//
10// Unless required by applicable law or agreed to in writing, software
11// distributed under the License is distributed on an "AS IS" BASIS,
12// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13// See the License for the specific language governing permissions and
14// limitations under the License.
15*/
Jason M. Bills6a2cb692019-08-06 11:03:49 -070016#include <peci.h>
Chen Yugange6c0f1c2019-08-02 20:36:42 +080017#include <systemd/sd-journal.h>
18
Jason M. Bills08b2c7a2020-08-28 15:39:14 -070019#include <boost/asio/io_service.hpp>
Jason M. Bills1490b142019-07-01 15:48:43 -070020#include <boost/asio/posix/stream_descriptor.hpp>
Jason M. Bills08b2c7a2020-08-28 15:39:14 -070021#include <boost/asio/steady_timer.hpp>
Jason M. Bills1490b142019-07-01 15:48:43 -070022#include <gpiod.hpp>
Jason M. Billsd711cc82020-12-04 16:46:39 -080023#include <host_error_monitor.hpp>
Jason M. Bills1490b142019-07-01 15:48:43 -070024#include <sdbusplus/asio/object_server.hpp>
Jason M. Bills48e5dff2020-06-10 13:47:47 -070025
26#include <bitset>
27#include <iostream>
Jason M. Billsd1a19f62019-08-06 11:52:58 -070028#include <variant>
Jason M. Bills1490b142019-07-01 15:48:43 -070029
30namespace host_error_monitor
31{
32static boost::asio::io_service io;
33static std::shared_ptr<sdbusplus::asio::connection> conn;
Jason M. Billsc4b91f22019-11-26 17:04:50 -080034static std::shared_ptr<sdbusplus::asio::dbus_interface> hostErrorTimeoutIface;
Jason M. Bills1490b142019-07-01 15:48:43 -070035
Yong Li1429ca82020-04-27 16:49:45 +080036using Association = std::tuple<std::string, std::string, std::string>;
37static std::shared_ptr<sdbusplus::asio::dbus_interface> associationSSBThermTrip;
38static std::shared_ptr<sdbusplus::asio::dbus_interface> associationCATAssert;
39
// Object path added to the association list when an error asserts.
static constexpr const char* rootPath = "/xyz/openbmc_project/CallbackManager";
41
// Cached host power state. Starts out as "off" and is refreshed from the
// CurrentHostState D-Bus property.
static bool hostOff{true};

// Returns true while the cached host state is "off".
bool hostIsOff()
{
    return hostOff;
}
Jason M. Bills1490b142019-07-01 15:48:43 -070047
// How long CATERR must stay asserted before it is handled (milliseconds).
// Not constexpr: this one is mutable at run time.
static size_t caterrTimeoutMs = 2000;
// Upper bound for caterrTimeoutMs: 10 minutes.
static constexpr size_t caterrTimeoutMsMax = 10 * 60 * 1000;
// Assertion timeout for the ERRx pins (milliseconds).
static constexpr size_t errTimeoutMs = 90 * 1000;
// Assertion timeout for SMI (milliseconds).
static constexpr size_t smiTimeoutMs = 90 * 1000;
Jason M. Bills1490b142019-07-01 15:48:43 -070052
53// Timers
54// Timer for CATERR asserted
55static boost::asio::steady_timer caterrAssertTimer(io);
Jason M. Bills8c584392019-08-19 11:05:51 -070056// Timer for ERR0 asserted
57static boost::asio::steady_timer err0AssertTimer(io);
Jason M. Bills75af3962019-08-19 11:07:17 -070058// Timer for ERR1 asserted
59static boost::asio::steady_timer err1AssertTimer(io);
Jason M. Bills6a2cb692019-08-06 11:03:49 -070060// Timer for ERR2 asserted
61static boost::asio::steady_timer err2AssertTimer(io);
Jason M. Bills89922f82019-08-06 11:10:02 -070062// Timer for SMI asserted
63static boost::asio::steady_timer smiAssertTimer(io);
Jason M. Bills1490b142019-07-01 15:48:43 -070064
65// GPIO Lines and Event Descriptors
66static gpiod::line caterrLine;
67static boost::asio::posix::stream_descriptor caterrEvent(io);
Jason M. Bills8c584392019-08-19 11:05:51 -070068static gpiod::line err0Line;
69static boost::asio::posix::stream_descriptor err0Event(io);
Jason M. Bills75af3962019-08-19 11:07:17 -070070static gpiod::line err1Line;
71static boost::asio::posix::stream_descriptor err1Event(io);
Jason M. Bills6a2cb692019-08-06 11:03:49 -070072static gpiod::line err2Line;
73static boost::asio::posix::stream_descriptor err2Event(io);
Jason M. Bills89922f82019-08-06 11:10:02 -070074static gpiod::line smiLine;
75static boost::asio::posix::stream_descriptor smiEvent(io);
Jason M. Bills45e87e02019-09-09 14:45:38 -070076static gpiod::line cpu1FIVRFaultLine;
Jason M. Bills78c5eed2019-08-28 14:00:40 -070077static gpiod::line cpu1ThermtripLine;
78static boost::asio::posix::stream_descriptor cpu1ThermtripEvent(io);
Jason M. Bills45e87e02019-09-09 14:45:38 -070079static gpiod::line cpu2FIVRFaultLine;
Jason M. Bills78c5eed2019-08-28 14:00:40 -070080static gpiod::line cpu2ThermtripLine;
81static boost::asio::posix::stream_descriptor cpu2ThermtripEvent(io);
Jason M. Bills250fa632019-08-28 15:58:25 -070082static gpiod::line cpu1VRHotLine;
83static boost::asio::posix::stream_descriptor cpu1VRHotEvent(io);
84static gpiod::line cpu2VRHotLine;
Jason M. Bills9647ba72019-08-29 14:19:19 -070085static boost::asio::posix::stream_descriptor cpu1MemABCDVRHotEvent(io);
86static gpiod::line cpu1MemEFGHVRHotLine;
87static boost::asio::posix::stream_descriptor cpu1MemEFGHVRHotEvent(io);
88static gpiod::line cpu2MemABCDVRHotLine;
Jason M. Bills250fa632019-08-28 15:58:25 -070089static boost::asio::posix::stream_descriptor cpu2VRHotEvent(io);
Jason M. Bills9647ba72019-08-29 14:19:19 -070090static gpiod::line cpu1MemABCDVRHotLine;
91static boost::asio::posix::stream_descriptor cpu2MemABCDVRHotEvent(io);
92static gpiod::line cpu2MemEFGHVRHotLine;
93static boost::asio::posix::stream_descriptor cpu2MemEFGHVRHotEvent(io);
Chen Yugange6c0f1c2019-08-02 20:36:42 +080094//----------------------------------
95// PCH_BMC_THERMTRIP function related definition
96//----------------------------------
Chen Yugange6c0f1c2019-08-02 20:36:42 +080097static gpiod::line pchThermtripLine;
98static boost::asio::posix::stream_descriptor pchThermtripEvent(io);
jayaprakash Mutyala009adbc2019-12-24 22:08:07 +000099//----------------------------------
100// CPU_MEM_THERM_EVENT function related definition
101//----------------------------------
102static gpiod::line cpu1MemtripLine;
103static boost::asio::posix::stream_descriptor cpu1MemtripEvent(io);
104static gpiod::line cpu2MemtripLine;
105static boost::asio::posix::stream_descriptor cpu2MemtripEvent(io);
jayaprakash Mutyala53099c42020-03-15 00:16:26 +0000106//---------------------------------
107// CPU_MISMATCH function related definition
108//---------------------------------
109static gpiod::line cpu1MismatchLine;
110static gpiod::line cpu2MismatchLine;
Jason M. Bills1490b142019-07-01 15:48:43 -0700111
// Beep priorities passed to beep() for CPU errors.
static constexpr uint8_t beepCPUIERR = 4; // CPU IERR
static constexpr uint8_t beepCPUErr2 = 5; // CPU ERR2
115
116static void beep(const uint8_t& beepPriority)
117{
118 conn->async_method_call(
119 [](boost::system::error_code ec) {
120 if (ec)
121 {
122 std::cerr << "beep returned error with "
123 "async_method_call (ec = "
124 << ec << ")\n";
125 return;
126 }
127 },
128 "xyz.openbmc_project.BeepCode", "/xyz/openbmc_project/BeepCode",
129 "xyz.openbmc_project.BeepCode", "Beep", uint8_t(beepPriority));
130}
131
Jason M. Billsa3397932019-08-06 11:07:21 -0700132static void cpuIERRLog()
133{
134 sd_journal_send("MESSAGE=HostError: IERR", "PRIORITY=%i", LOG_INFO,
135 "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError",
136 "REDFISH_MESSAGE_ARGS=%s", "IERR", NULL);
137}
138
139static void cpuIERRLog(const int cpuNum)
140{
141 std::string msg = "IERR on CPU " + std::to_string(cpuNum + 1);
142
143 sd_journal_send("MESSAGE=HostError: %s", msg.c_str(), "PRIORITY=%i",
144 LOG_INFO, "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError",
145 "REDFISH_MESSAGE_ARGS=%s", msg.c_str(), NULL);
146}
147
148static void cpuIERRLog(const int cpuNum, const std::string& type)
149{
150 std::string msg = type + " IERR on CPU " + std::to_string(cpuNum + 1);
151
152 sd_journal_send("MESSAGE=HostError: %s", msg.c_str(), "PRIORITY=%i",
153 LOG_INFO, "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError",
154 "REDFISH_MESSAGE_ARGS=%s", msg.c_str(), NULL);
155}
156
Jason M. Billscbf78532019-08-16 15:32:11 -0700157static void cpuERRXLog(const int errPin)
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700158{
Jason M. Billscbf78532019-08-16 15:32:11 -0700159 std::string msg = "ERR" + std::to_string(errPin) + " Timeout";
160
161 sd_journal_send("MESSAGE=HostError: %s", msg.c_str(), "PRIORITY=%i",
162 LOG_INFO, "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError",
163 "REDFISH_MESSAGE_ARGS=%s", msg.c_str(), NULL);
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700164}
165
Jason M. Billscbf78532019-08-16 15:32:11 -0700166static void cpuERRXLog(const int errPin, const int cpuNum)
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700167{
Jason M. Billscbf78532019-08-16 15:32:11 -0700168 std::string msg = "ERR" + std::to_string(errPin) + " Timeout on CPU " +
169 std::to_string(cpuNum + 1);
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700170
171 sd_journal_send("MESSAGE=HostError: %s", msg.c_str(), "PRIORITY=%i",
172 LOG_INFO, "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError",
173 "REDFISH_MESSAGE_ARGS=%s", msg.c_str(), NULL);
174}
175
Jason M. Bills89922f82019-08-06 11:10:02 -0700176static void smiTimeoutLog()
177{
178 sd_journal_send("MESSAGE=HostError: SMI Timeout", "PRIORITY=%i", LOG_INFO,
179 "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError",
180 "REDFISH_MESSAGE_ARGS=%s", "SMI Timeout", NULL);
181}
182
Jason M. Bills45e87e02019-09-09 14:45:38 -0700183static void cpuBootFIVRFaultLog(const int cpuNum)
184{
185 std::string msg = "Boot FIVR Fault on CPU " + std::to_string(cpuNum);
186
187 sd_journal_send("MESSAGE=HostError: %s", msg.c_str(), "PRIORITY=%i",
188 LOG_INFO, "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError",
189 "REDFISH_MESSAGE_ARGS=%s", msg.c_str(), NULL);
190}
191
Jason M. Bills78c5eed2019-08-28 14:00:40 -0700192static void cpuThermTripLog(const int cpuNum)
193{
194 std::string msg = "CPU " + std::to_string(cpuNum) + " thermal trip";
195
196 sd_journal_send("MESSAGE=HostError: %s", msg.c_str(), "PRIORITY=%i",
197 LOG_INFO, "REDFISH_MESSAGE_ID=%s",
198 "OpenBMC.0.1.CPUThermalTrip", "REDFISH_MESSAGE_ARGS=%d",
199 cpuNum, NULL);
200}
201
jayaprakash Mutyala009adbc2019-12-24 22:08:07 +0000202static void memThermTripLog(const int cpuNum)
203{
204 std::string cpuNumber = "CPU " + std::to_string(cpuNum);
205 std::string msg = cpuNumber + " Memory Thermal trip.";
206
207 sd_journal_send("MESSAGE=HostError: %s", msg.c_str(), "PRIORITY=%i",
208 LOG_ERR, "REDFISH_MESSAGE_ID=%s",
209 "OpenBMC.0.1.MemoryThermTrip", "REDFISH_MESSAGE_ARGS=%s",
210 cpuNumber.c_str(), NULL);
211}
212
jayaprakash Mutyala53099c42020-03-15 00:16:26 +0000213static void cpuMismatchLog(const int cpuNum)
214{
215 std::string msg = "CPU " + std::to_string(cpuNum) + " mismatch";
216
217 sd_journal_send("MESSAGE= %s", msg.c_str(), "PRIORITY=%i", LOG_ERR,
218 "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUMismatch",
219 "REDFISH_MESSAGE_ARGS=%d", cpuNum, NULL);
220}
221
Jason M. Bills250fa632019-08-28 15:58:25 -0700222static void cpuVRHotLog(const std::string& vr)
223{
224 std::string msg = vr + " Voltage Regulator Overheated.";
225
226 sd_journal_send("MESSAGE=HostError: %s", msg.c_str(), "PRIORITY=%i",
227 LOG_INFO, "REDFISH_MESSAGE_ID=%s",
228 "OpenBMC.0.1.VoltageRegulatorOverheated",
229 "REDFISH_MESSAGE_ARGS=%s", vr.c_str(), NULL);
230}
231
Jason M. Bills08866542019-08-16 12:04:19 -0700232static void ssbThermTripLog()
233{
234 sd_journal_send("MESSAGE=HostError: SSB thermal trip", "PRIORITY=%i",
235 LOG_INFO, "REDFISH_MESSAGE_ID=%s",
236 "OpenBMC.0.1.SsbThermalTrip", NULL);
237}
238
Jason M. Bills8064f8b2020-09-22 15:09:49 -0700239static inline bool peciError(EPECIStatus peciStatus, uint8_t cc)
240{
241 return (
242 peciStatus != PECI_CC_SUCCESS ||
243 (cc != PECI_DEV_CC_SUCCESS && cc != PECI_DEV_CC_FATAL_MCA_DETECTED));
244}
245
246static void printPECIError(const std::string& reg, const size_t addr,
247 const EPECIStatus peciStatus, const size_t cc)
248{
249 std::cerr << "Failed to read " << reg << " on CPU address " << addr
250 << ". Error: " << peciStatus << ": cc: 0x" << std::hex << cc
251 << "\n";
252}
253
Jason M. Billsa15c2522019-08-16 10:01:44 -0700254static void initializeErrorState();
Jason M. Bills1490b142019-07-01 15:48:43 -0700255static void initializeHostState()
256{
257 conn->async_method_call(
258 [](boost::system::error_code ec,
259 const std::variant<std::string>& property) {
260 if (ec)
261 {
262 return;
263 }
264 const std::string* state = std::get_if<std::string>(&property);
265 if (state == nullptr)
266 {
267 std::cerr << "Unable to read host state value\n";
268 return;
269 }
270 hostOff = *state == "xyz.openbmc_project.State.Host.HostState.Off";
Jason M. Billsa15c2522019-08-16 10:01:44 -0700271 // If the system is on, initialize the error state
272 if (!hostOff)
273 {
274 initializeErrorState();
275 }
Jason M. Bills1490b142019-07-01 15:48:43 -0700276 },
277 "xyz.openbmc_project.State.Host", "/xyz/openbmc_project/state/host0",
278 "org.freedesktop.DBus.Properties", "Get",
279 "xyz.openbmc_project.State.Host", "CurrentHostState");
280}
281
282static std::shared_ptr<sdbusplus::bus::match::match> startHostStateMonitor()
283{
284 return std::make_shared<sdbusplus::bus::match::match>(
285 *conn,
286 "type='signal',interface='org.freedesktop.DBus.Properties',"
Jason M. Bills2fbb9ea2020-06-19 14:46:54 -0700287 "member='PropertiesChanged',arg0='xyz.openbmc_project.State.Host'",
Jason M. Bills1490b142019-07-01 15:48:43 -0700288 [](sdbusplus::message::message& msg) {
289 std::string interfaceName;
290 boost::container::flat_map<std::string, std::variant<std::string>>
291 propertiesChanged;
Jason M. Bills1490b142019-07-01 15:48:43 -0700292 try
293 {
294 msg.read(interfaceName, propertiesChanged);
Jason M. Bills1490b142019-07-01 15:48:43 -0700295 }
296 catch (std::exception& e)
297 {
298 std::cerr << "Unable to read host state\n";
299 return;
300 }
Jason M. Bills566ccc42020-06-18 16:38:26 -0700301 // We only want to check for CurrentHostState
302 if (propertiesChanged.begin()->first != "CurrentHostState")
303 {
304 return;
305 }
306 std::string* state =
307 std::get_if<std::string>(&(propertiesChanged.begin()->second));
308 if (state == nullptr)
309 {
310 std::cerr << propertiesChanged.begin()->first
311 << " property invalid\n";
312 return;
313 }
314
315 hostOff = *state == "xyz.openbmc_project.State.Host.HostState.Off";
Jason M. Bills1490b142019-07-01 15:48:43 -0700316
Jason M. Bills1490b142019-07-01 15:48:43 -0700317 if (hostOff)
318 {
Jason M. Billse94f5e12019-09-13 11:11:34 -0700319 // No host events should fire while off, so cancel any pending
320 // timers
Jason M. Bills1490b142019-07-01 15:48:43 -0700321 caterrAssertTimer.cancel();
Jason M. Bills8c584392019-08-19 11:05:51 -0700322 err0AssertTimer.cancel();
Jason M. Bills75af3962019-08-19 11:07:17 -0700323 err1AssertTimer.cancel();
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700324 err2AssertTimer.cancel();
Jason M. Bills89922f82019-08-06 11:10:02 -0700325 smiAssertTimer.cancel();
Jason M. Bills1490b142019-07-01 15:48:43 -0700326 }
Jason M. Billse94f5e12019-09-13 11:11:34 -0700327 else
328 {
329 // Handle any initial errors when the host turns on
330 initializeErrorState();
331 }
Jason M. Bills1490b142019-07-01 15:48:43 -0700332 });
333}
334
335static bool requestGPIOEvents(
336 const std::string& name, const std::function<void()>& handler,
337 gpiod::line& gpioLine,
338 boost::asio::posix::stream_descriptor& gpioEventDescriptor)
339{
340 // Find the GPIO line
341 gpioLine = gpiod::find_line(name);
342 if (!gpioLine)
343 {
344 std::cerr << "Failed to find the " << name << " line\n";
345 return false;
346 }
347
348 try
349 {
350 gpioLine.request(
351 {"host-error-monitor", gpiod::line_request::EVENT_BOTH_EDGES});
352 }
353 catch (std::exception&)
354 {
355 std::cerr << "Failed to request events for " << name << "\n";
356 return false;
357 }
358
359 int gpioLineFd = gpioLine.event_get_fd();
360 if (gpioLineFd < 0)
361 {
362 std::cerr << "Failed to get " << name << " fd\n";
363 return false;
364 }
365
366 gpioEventDescriptor.assign(gpioLineFd);
367
368 gpioEventDescriptor.async_wait(
369 boost::asio::posix::stream_descriptor::wait_read,
370 [&name, handler](const boost::system::error_code ec) {
371 if (ec)
372 {
373 std::cerr << name << " fd handler error: " << ec.message()
374 << "\n";
375 return;
376 }
377 handler();
378 });
379 return true;
380}
381
Jason M. Bills45e87e02019-09-09 14:45:38 -0700382static bool requestGPIOInput(const std::string& name, gpiod::line& gpioLine)
383{
384 // Find the GPIO line
385 gpioLine = gpiod::find_line(name);
386 if (!gpioLine)
387 {
388 std::cerr << "Failed to find the " << name << " line.\n";
389 return false;
390 }
391
392 // Request GPIO input
393 try
394 {
395 gpioLine.request({__FUNCTION__, gpiod::line_request::DIRECTION_INPUT});
396 }
397 catch (std::exception&)
398 {
399 std::cerr << "Failed to request " << name << " input\n";
400 return false;
401 }
402
403 return true;
404}
405
Jason M. Billsd1a19f62019-08-06 11:52:58 -0700406static void incrementCPUErrorCount(int cpuNum)
407{
408 std::string propertyName = "ErrorCountCPU" + std::to_string(cpuNum + 1);
409
410 // Get the current count
411 conn->async_method_call(
412 [propertyName](boost::system::error_code ec,
413 const std::variant<uint8_t>& property) {
414 if (ec)
415 {
416 std::cerr << "Failed to read " << propertyName << ": "
417 << ec.message() << "\n";
418 return;
419 }
420 const uint8_t* errorCountVariant = std::get_if<uint8_t>(&property);
421 if (errorCountVariant == nullptr)
422 {
423 std::cerr << propertyName << " invalid\n";
424 return;
425 }
426 uint8_t errorCount = *errorCountVariant;
427 if (errorCount == std::numeric_limits<uint8_t>::max())
428 {
429 std::cerr << "Maximum error count reached\n";
430 return;
431 }
432 // Increment the count
433 errorCount++;
434 conn->async_method_call(
435 [propertyName](boost::system::error_code ec) {
436 if (ec)
437 {
438 std::cerr << "Failed to set " << propertyName << ": "
439 << ec.message() << "\n";
440 }
441 },
442 "xyz.openbmc_project.Settings",
443 "/xyz/openbmc_project/control/processor_error_config",
444 "org.freedesktop.DBus.Properties", "Set",
445 "xyz.openbmc_project.Control.Processor.ErrConfig", propertyName,
446 std::variant<uint8_t>{errorCount});
447 },
448 "xyz.openbmc_project.Settings",
449 "/xyz/openbmc_project/control/processor_error_config",
450 "org.freedesktop.DBus.Properties", "Get",
451 "xyz.openbmc_project.Control.Processor.ErrConfig", propertyName);
452}
453
Jason M. Billsa3397932019-08-06 11:07:21 -0700454static bool checkIERRCPUs()
455{
456 bool cpuIERRFound = false;
Jason M. Bills8064f8b2020-09-22 15:09:49 -0700457 for (size_t cpu = 0, addr = MIN_CLIENT_ADDR; addr <= MAX_CLIENT_ADDR;
Jason M. Billsa3397932019-08-06 11:07:21 -0700458 cpu++, addr++)
459 {
Jason M. Bills8064f8b2020-09-22 15:09:49 -0700460 EPECIStatus peciStatus = PECI_CC_SUCCESS;
Jason M. Billsa3397932019-08-06 11:07:21 -0700461 uint8_t cc = 0;
462 CPUModel model{};
463 uint8_t stepping = 0;
464 if (peci_GetCPUID(addr, &model, &stepping, &cc) != PECI_CC_SUCCESS)
465 {
466 std::cerr << "Cannot get CPUID!\n";
467 continue;
468 }
469
470 switch (model)
471 {
472 case skx:
473 {
474 // First check the MCA_ERR_SRC_LOG to see if this is the CPU
475 // that caused the IERR
476 uint32_t mcaErrSrcLog = 0;
Jason M. Bills8064f8b2020-09-22 15:09:49 -0700477 peciStatus = peci_RdPkgConfig(addr, 0, 5, 4,
478 (uint8_t*)&mcaErrSrcLog, &cc);
479 if (peciError(peciStatus, cc))
Jason M. Billsa3397932019-08-06 11:07:21 -0700480 {
Jason M. Bills8064f8b2020-09-22 15:09:49 -0700481 printPECIError("MCA_ERR_SRC_LOG", addr, peciStatus, cc);
Jason M. Billsa3397932019-08-06 11:07:21 -0700482 continue;
483 }
484 // Check MSMI_INTERNAL (20) and IERR_INTERNAL (27)
485 if ((mcaErrSrcLog & (1 << 20)) || (mcaErrSrcLog & (1 << 27)))
486 {
487 // TODO: Light the CPU fault LED?
488 cpuIERRFound = true;
Jason M. Billsd1a19f62019-08-06 11:52:58 -0700489 incrementCPUErrorCount(cpu);
Jason M. Billsa3397932019-08-06 11:07:21 -0700490 // Next check if it's a CPU/VR mismatch by reading the
491 // IA32_MC4_STATUS MSR (0x411)
492 uint64_t mc4Status = 0;
Jason M. Bills8064f8b2020-09-22 15:09:49 -0700493 peciStatus = peci_RdIAMSR(addr, 0, 0x411, &mc4Status, &cc);
494 if (peciError(peciStatus, cc))
Jason M. Billsa3397932019-08-06 11:07:21 -0700495 {
Jason M. Bills8064f8b2020-09-22 15:09:49 -0700496 printPECIError("IA32_MC4_STATUS", addr, peciStatus, cc);
Jason M. Billsa3397932019-08-06 11:07:21 -0700497 continue;
498 }
499 // Check MSEC bits 31:24 for
500 // MCA_SVID_VCCIN_VR_ICC_MAX_FAILURE (0x40),
501 // MCA_SVID_VCCIN_VR_VOUT_FAILURE (0x42), or
502 // MCA_SVID_CPU_VR_CAPABILITY_ERROR (0x43)
Jason M. Billsc90570a2020-09-22 15:24:58 -0700503 uint64_t msec = (mc4Status >> 24) & 0xFF;
504 if (msec == 0x40 || msec == 0x42 || msec == 0x43)
Jason M. Billsa3397932019-08-06 11:07:21 -0700505 {
506 cpuIERRLog(cpu, "CPU/VR Mismatch");
507 continue;
508 }
509
510 // Next check if it's a Core FIVR fault by looking for a
511 // non-zero value of CORE_FIVR_ERR_LOG (B(1) D30 F2 offset
512 // 80h)
513 uint32_t coreFIVRErrLog = 0;
Jason M. Bills8064f8b2020-09-22 15:09:49 -0700514 peciStatus = peci_RdPCIConfigLocal(
515 addr, 1, 30, 2, 0x80, sizeof(uint32_t),
516 (uint8_t*)&coreFIVRErrLog, &cc);
517 if (peciError(peciStatus, cc))
Jason M. Billsa3397932019-08-06 11:07:21 -0700518 {
Jason M. Bills8064f8b2020-09-22 15:09:49 -0700519 printPECIError("CORE_FIVR_ERR_LOG", addr, peciStatus,
520 cc);
Jason M. Billsa3397932019-08-06 11:07:21 -0700521 continue;
522 }
523 if (coreFIVRErrLog)
524 {
525 cpuIERRLog(cpu, "Core FIVR Fault");
526 continue;
527 }
528
529 // Next check if it's an Uncore FIVR fault by looking for a
530 // non-zero value of UNCORE_FIVR_ERR_LOG (B(1) D30 F2 offset
531 // 84h)
532 uint32_t uncoreFIVRErrLog = 0;
Jason M. Bills8064f8b2020-09-22 15:09:49 -0700533 peciStatus = peci_RdPCIConfigLocal(
534 addr, 1, 30, 2, 0x84, sizeof(uint32_t),
535 (uint8_t*)&uncoreFIVRErrLog, &cc);
536 if (peciError(peciStatus, cc))
Jason M. Billsa3397932019-08-06 11:07:21 -0700537 {
Jason M. Bills8064f8b2020-09-22 15:09:49 -0700538 printPECIError("UNCORE_FIVR_ERR_LOG", addr, peciStatus,
539 cc);
Jason M. Billsa3397932019-08-06 11:07:21 -0700540 continue;
541 }
542 if (uncoreFIVRErrLog)
543 {
544 cpuIERRLog(cpu, "Uncore FIVR Fault");
545 continue;
546 }
547
548 // Last if CORE_FIVR_ERR_LOG and UNCORE_FIVR_ERR_LOG are
549 // both zero, but MSEC bits 31:24 have either
550 // MCA_FIVR_CATAS_OVERVOL_FAULT (0x51) or
551 // MCA_FIVR_CATAS_OVERCUR_FAULT (0x52), then log it as an
552 // uncore FIVR fault
553 if (!coreFIVRErrLog && !uncoreFIVRErrLog &&
Jason M. Billsc90570a2020-09-22 15:24:58 -0700554 (msec == 0x51 || msec == 0x52))
Jason M. Billsa3397932019-08-06 11:07:21 -0700555 {
556 cpuIERRLog(cpu, "Uncore FIVR Fault");
557 continue;
558 }
559 cpuIERRLog(cpu);
560 }
561 break;
562 }
563 case icx:
564 {
565 // First check the MCA_ERR_SRC_LOG to see if this is the CPU
566 // that caused the IERR
567 uint32_t mcaErrSrcLog = 0;
Jason M. Bills8064f8b2020-09-22 15:09:49 -0700568 peciStatus = peci_RdPkgConfig(addr, 0, 5, 4,
569 (uint8_t*)&mcaErrSrcLog, &cc);
570 if (peciError(peciStatus, cc))
Jason M. Billsa3397932019-08-06 11:07:21 -0700571 {
Jason M. Bills8064f8b2020-09-22 15:09:49 -0700572 printPECIError("MCA_ERR_SRC_LOG", addr, peciStatus, cc);
Jason M. Billsa3397932019-08-06 11:07:21 -0700573 continue;
574 }
575 // Check MSMI_INTERNAL (20) and IERR_INTERNAL (27)
576 if ((mcaErrSrcLog & (1 << 20)) || (mcaErrSrcLog & (1 << 27)))
577 {
578 // TODO: Light the CPU fault LED?
579 cpuIERRFound = true;
Jason M. Billsd1a19f62019-08-06 11:52:58 -0700580 incrementCPUErrorCount(cpu);
Jason M. Billsa3397932019-08-06 11:07:21 -0700581 // Next check if it's a CPU/VR mismatch by reading the
582 // IA32_MC4_STATUS MSR (0x411)
583 uint64_t mc4Status = 0;
Jason M. Bills8064f8b2020-09-22 15:09:49 -0700584 peciStatus = peci_RdIAMSR(addr, 0, 0x411, &mc4Status, &cc);
585 if (peciError(peciStatus, cc))
Jason M. Billsa3397932019-08-06 11:07:21 -0700586 {
Jason M. Bills8064f8b2020-09-22 15:09:49 -0700587 printPECIError("IA32_MC4_STATUS", addr, peciStatus, cc);
Jason M. Billsa3397932019-08-06 11:07:21 -0700588 continue;
589 }
Jason M. Billsa3397932019-08-06 11:07:21 -0700590 // Check MSEC bits 31:24 for
591 // MCA_SVID_VCCIN_VR_ICC_MAX_FAILURE (0x40),
592 // MCA_SVID_VCCIN_VR_VOUT_FAILURE (0x42), or
593 // MCA_SVID_CPU_VR_CAPABILITY_ERROR (0x43)
Jason M. Billsc90570a2020-09-22 15:24:58 -0700594 uint64_t msec = (mc4Status >> 24) & 0xFF;
595 if (msec == 0x40 || msec == 0x42 || msec == 0x43)
Jason M. Billsa3397932019-08-06 11:07:21 -0700596 {
597 cpuIERRLog(cpu, "CPU/VR Mismatch");
598 continue;
599 }
600
601 // Next check if it's a Core FIVR fault by looking for a
602 // non-zero value of CORE_FIVR_ERR_LOG (B(31) D30 F2 offsets
603 // C0h and C4h) (Note: Bus 31 is accessed on PECI as bus 14)
604 uint32_t coreFIVRErrLog0 = 0;
605 uint32_t coreFIVRErrLog1 = 0;
Jason M. Bills8064f8b2020-09-22 15:09:49 -0700606 peciStatus = peci_RdEndPointConfigPciLocal(
607 addr, 0, 14, 30, 2, 0xC0, sizeof(uint32_t),
608 (uint8_t*)&coreFIVRErrLog0, &cc);
609 if (peciError(peciStatus, cc))
Jason M. Billsa3397932019-08-06 11:07:21 -0700610 {
Jason M. Bills8064f8b2020-09-22 15:09:49 -0700611 printPECIError("CORE_FIVR_ERR_LOG_0", addr, peciStatus,
612 cc);
Jason M. Billsa3397932019-08-06 11:07:21 -0700613 continue;
614 }
Jason M. Bills8064f8b2020-09-22 15:09:49 -0700615 peciStatus = peci_RdEndPointConfigPciLocal(
616 addr, 0, 14, 30, 2, 0xC4, sizeof(uint32_t),
617 (uint8_t*)&coreFIVRErrLog1, &cc);
618 if (peciError(peciStatus, cc))
Jason M. Billsa3397932019-08-06 11:07:21 -0700619 {
Jason M. Bills8064f8b2020-09-22 15:09:49 -0700620 printPECIError("CORE_FIVR_ERR_LOG_1", addr, peciStatus,
621 cc);
Jason M. Billsa3397932019-08-06 11:07:21 -0700622 continue;
623 }
624 if (coreFIVRErrLog0 || coreFIVRErrLog1)
625 {
626 cpuIERRLog(cpu, "Core FIVR Fault");
627 continue;
628 }
629
630 // Next check if it's an Uncore FIVR fault by looking for a
631 // non-zero value of UNCORE_FIVR_ERR_LOG (B(31) D30 F2
632 // offset 84h) (Note: Bus 31 is accessed on PECI as bus 14)
633 uint32_t uncoreFIVRErrLog = 0;
Jason M. Bills8064f8b2020-09-22 15:09:49 -0700634 peciStatus = peci_RdEndPointConfigPciLocal(
635 addr, 0, 14, 30, 2, 0x84, sizeof(uint32_t),
636 (uint8_t*)&uncoreFIVRErrLog, &cc);
637 if (peciError(peciStatus, cc))
Jason M. Billsa3397932019-08-06 11:07:21 -0700638 {
Jason M. Bills8064f8b2020-09-22 15:09:49 -0700639 printPECIError("UNCORE_FIVR_ERR_LOG", addr, peciStatus,
640 cc);
Jason M. Billsa3397932019-08-06 11:07:21 -0700641 continue;
642 }
643 if (uncoreFIVRErrLog)
644 {
645 cpuIERRLog(cpu, "Uncore FIVR Fault");
646 continue;
647 }
648
649 // TODO: Update MSEC/MSCOD_31_24 check
650 // Last if CORE_FIVR_ERR_LOG and UNCORE_FIVR_ERR_LOG are
651 // both zero, but MSEC bits 31:24 have either
652 // MCA_FIVR_CATAS_OVERVOL_FAULT (0x51) or
653 // MCA_FIVR_CATAS_OVERCUR_FAULT (0x52), then log it as an
654 // uncore FIVR fault
655 if (!coreFIVRErrLog0 && !coreFIVRErrLog1 &&
Jason M. Billsc90570a2020-09-22 15:24:58 -0700656 !uncoreFIVRErrLog && (msec == 0x51 || msec == 0x52))
Jason M. Billsa3397932019-08-06 11:07:21 -0700657 {
658 cpuIERRLog(cpu, "Uncore FIVR Fault");
659 continue;
660 }
661 cpuIERRLog(cpu);
662 }
663 break;
664 }
665 }
666 }
667 return cpuIERRFound;
668}
669
Jason M. Billsa15c2522019-08-16 10:01:44 -0700670static void caterrAssertHandler()
671{
Jason M. Billsa15c2522019-08-16 10:01:44 -0700672 caterrAssertTimer.expires_after(std::chrono::milliseconds(caterrTimeoutMs));
673 caterrAssertTimer.async_wait([](const boost::system::error_code ec) {
674 if (ec)
675 {
676 // operation_aborted is expected if timer is canceled
677 // before completion.
678 if (ec != boost::asio::error::operation_aborted)
679 {
680 std::cerr << "caterr timeout async_wait failed: "
681 << ec.message() << "\n";
682 }
Jason M. Billsa15c2522019-08-16 10:01:44 -0700683 return;
684 }
Jason M. Billsa3397932019-08-06 11:07:21 -0700685 std::cerr << "CATERR asserted for " << std::to_string(caterrTimeoutMs)
686 << " ms\n";
Yong Li8c798c72020-04-22 15:29:07 +0800687 beep(beepCPUIERR);
Jason M. Billsa3397932019-08-06 11:07:21 -0700688 if (!checkIERRCPUs())
689 {
690 cpuIERRLog();
691 }
Jason M. Billsa15c2522019-08-16 10:01:44 -0700692 conn->async_method_call(
693 [](boost::system::error_code ec,
694 const std::variant<bool>& property) {
695 if (ec)
696 {
697 return;
698 }
699 const bool* reset = std::get_if<bool>(&property);
700 if (reset == nullptr)
701 {
702 std::cerr << "Unable to read reset on CATERR value\n";
703 return;
704 }
Jason M. Billsd711cc82020-12-04 16:46:39 -0800705 startCrashdumpAndRecovery(conn, *reset, "IERR");
Jason M. Billsa15c2522019-08-16 10:01:44 -0700706 },
707 "xyz.openbmc_project.Settings",
708 "/xyz/openbmc_project/control/processor_error_config",
709 "org.freedesktop.DBus.Properties", "Get",
710 "xyz.openbmc_project.Control.Processor.ErrConfig", "ResetOnCATERR");
711 });
712}
713
Jason M. Bills1490b142019-07-01 15:48:43 -0700714static void caterrHandler()
715{
716 if (!hostOff)
717 {
718 gpiod::line_event gpioLineEvent = caterrLine.event_read();
719
720 bool caterr =
721 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
Yong Li1429ca82020-04-27 16:49:45 +0800722
723 std::vector<Association> associations;
Jason M. Bills1490b142019-07-01 15:48:43 -0700724 if (caterr)
725 {
Jason M. Billsa15c2522019-08-16 10:01:44 -0700726 caterrAssertHandler();
Yong Li1429ca82020-04-27 16:49:45 +0800727 associations.emplace_back(
728 "", "critical",
729 "/xyz/openbmc_project/host_error_monitor/cat_error");
730 associations.emplace_back("", "critical",
731 host_error_monitor::rootPath);
Jason M. Bills1490b142019-07-01 15:48:43 -0700732 }
733 else
734 {
735 caterrAssertTimer.cancel();
Yong Li1429ca82020-04-27 16:49:45 +0800736 associations.emplace_back("", "", "");
Jason M. Bills1490b142019-07-01 15:48:43 -0700737 }
Yong Li1429ca82020-04-27 16:49:45 +0800738 host_error_monitor::associationCATAssert->set_property("Associations",
739 associations);
Jason M. Bills1490b142019-07-01 15:48:43 -0700740 }
741 caterrEvent.async_wait(boost::asio::posix::stream_descriptor::wait_read,
742 [](const boost::system::error_code ec) {
743 if (ec)
744 {
745 std::cerr << "caterr handler error: "
746 << ec.message() << "\n";
747 return;
748 }
749 caterrHandler();
750 });
751}
Jason M. Bills78c5eed2019-08-28 14:00:40 -0700752
Jason M. Billse94f5e12019-09-13 11:11:34 -0700753static void cpu1ThermtripAssertHandler()
754{
Jason M. Bills45e87e02019-09-09 14:45:38 -0700755 if (cpu1FIVRFaultLine.get_value() == 0)
756 {
757 cpuBootFIVRFaultLog(1);
758 }
759 else
760 {
761 cpuThermTripLog(1);
762 }
Jason M. Billse94f5e12019-09-13 11:11:34 -0700763}
764
Jason M. Bills78c5eed2019-08-28 14:00:40 -0700765static void cpu1ThermtripHandler()
766{
Jason M. Bills84951142020-04-17 15:57:11 -0700767 gpiod::line_event gpioLineEvent = cpu1ThermtripLine.event_read();
Jason M. Bills78c5eed2019-08-28 14:00:40 -0700768
Jason M. Bills84951142020-04-17 15:57:11 -0700769 bool cpu1Thermtrip =
770 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
771 if (cpu1Thermtrip)
772 {
773 cpu1ThermtripAssertHandler();
Jason M. Bills78c5eed2019-08-28 14:00:40 -0700774 }
Jason M. Bills84951142020-04-17 15:57:11 -0700775
Jason M. Bills78c5eed2019-08-28 14:00:40 -0700776 cpu1ThermtripEvent.async_wait(
777 boost::asio::posix::stream_descriptor::wait_read,
778 [](const boost::system::error_code ec) {
779 if (ec)
780 {
781 std::cerr << "CPU 1 Thermtrip handler error: " << ec.message()
782 << "\n";
783 return;
784 }
785 cpu1ThermtripHandler();
786 });
787}
788
jayaprakash Mutyala009adbc2019-12-24 22:08:07 +0000789static void cpu1MemtripHandler()
790{
Jason M. Bills5287c022020-05-19 11:16:09 -0700791 gpiod::line_event gpioLineEvent = cpu1MemtripLine.event_read();
jayaprakash Mutyala009adbc2019-12-24 22:08:07 +0000792
Jason M. Bills5287c022020-05-19 11:16:09 -0700793 bool cpu1Memtrip =
794 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
795 if (cpu1Memtrip)
796 {
797 memThermTripLog(1);
jayaprakash Mutyala009adbc2019-12-24 22:08:07 +0000798 }
Jason M. Bills5287c022020-05-19 11:16:09 -0700799
jayaprakash Mutyala009adbc2019-12-24 22:08:07 +0000800 cpu1MemtripEvent.async_wait(
801 boost::asio::posix::stream_descriptor::wait_read,
802 [](const boost::system::error_code ec) {
803 if (ec)
804 {
805 std::cerr << "CPU 1 Memory Thermaltrip handler error: "
806 << ec.message() << "\n";
807 return;
808 }
809 cpu1MemtripHandler();
810 });
811}
812
Jason M. Billse94f5e12019-09-13 11:11:34 -0700813static void cpu2ThermtripAssertHandler()
814{
Jason M. Bills45e87e02019-09-09 14:45:38 -0700815 if (cpu2FIVRFaultLine.get_value() == 0)
816 {
817 cpuBootFIVRFaultLog(2);
818 }
819 else
820 {
821 cpuThermTripLog(2);
822 }
Jason M. Billse94f5e12019-09-13 11:11:34 -0700823}
824
Jason M. Bills78c5eed2019-08-28 14:00:40 -0700825static void cpu2ThermtripHandler()
826{
Jason M. Bills84951142020-04-17 15:57:11 -0700827 gpiod::line_event gpioLineEvent = cpu2ThermtripLine.event_read();
Jason M. Bills78c5eed2019-08-28 14:00:40 -0700828
Jason M. Bills84951142020-04-17 15:57:11 -0700829 bool cpu2Thermtrip =
830 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
831 if (cpu2Thermtrip)
832 {
833 cpu2ThermtripAssertHandler();
Jason M. Bills78c5eed2019-08-28 14:00:40 -0700834 }
Jason M. Bills84951142020-04-17 15:57:11 -0700835
Jason M. Bills78c5eed2019-08-28 14:00:40 -0700836 cpu2ThermtripEvent.async_wait(
837 boost::asio::posix::stream_descriptor::wait_read,
838 [](const boost::system::error_code ec) {
839 if (ec)
840 {
841 std::cerr << "CPU 2 Thermtrip handler error: " << ec.message()
842 << "\n";
843 return;
844 }
845 cpu2ThermtripHandler();
846 });
847}
848
jayaprakash Mutyala009adbc2019-12-24 22:08:07 +0000849static void cpu2MemtripHandler()
850{
Jason M. Bills5287c022020-05-19 11:16:09 -0700851 gpiod::line_event gpioLineEvent = cpu2MemtripLine.event_read();
jayaprakash Mutyala009adbc2019-12-24 22:08:07 +0000852
Jason M. Bills5287c022020-05-19 11:16:09 -0700853 bool cpu2Memtrip =
854 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
855 if (cpu2Memtrip)
856 {
857 memThermTripLog(2);
jayaprakash Mutyala009adbc2019-12-24 22:08:07 +0000858 }
Jason M. Bills5287c022020-05-19 11:16:09 -0700859
jayaprakash Mutyala009adbc2019-12-24 22:08:07 +0000860 cpu2MemtripEvent.async_wait(
861 boost::asio::posix::stream_descriptor::wait_read,
862 [](const boost::system::error_code ec) {
863 if (ec)
864 {
865 std::cerr << "CPU 2 Memory Thermaltrip handler error: "
866 << ec.message() << "\n";
867 return;
868 }
869 cpu2MemtripHandler();
870 });
871}
872
Jason M. Billse94f5e12019-09-13 11:11:34 -0700873static void cpu1VRHotAssertHandler()
874{
875 cpuVRHotLog("CPU 1");
876}
877
Jason M. Bills250fa632019-08-28 15:58:25 -0700878static void cpu1VRHotHandler()
879{
Jason M. Bills84951142020-04-17 15:57:11 -0700880 gpiod::line_event gpioLineEvent = cpu1VRHotLine.event_read();
Jason M. Bills250fa632019-08-28 15:58:25 -0700881
Jason M. Bills84951142020-04-17 15:57:11 -0700882 bool cpu1VRHot =
883 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
884 if (cpu1VRHot)
885 {
886 cpu1VRHotAssertHandler();
Jason M. Bills250fa632019-08-28 15:58:25 -0700887 }
Jason M. Bills84951142020-04-17 15:57:11 -0700888
Jason M. Bills250fa632019-08-28 15:58:25 -0700889 cpu1VRHotEvent.async_wait(boost::asio::posix::stream_descriptor::wait_read,
890 [](const boost::system::error_code ec) {
891 if (ec)
892 {
893 std::cerr << "CPU 1 VRHot handler error: "
894 << ec.message() << "\n";
895 return;
896 }
897 cpu1VRHotHandler();
898 });
899}
900
Jason M. Billse94f5e12019-09-13 11:11:34 -0700901static void cpu1MemABCDVRHotAssertHandler()
902{
903 cpuVRHotLog("CPU 1 Memory ABCD");
904}
905
Jason M. Bills9647ba72019-08-29 14:19:19 -0700906static void cpu1MemABCDVRHotHandler()
907{
Jason M. Bills84951142020-04-17 15:57:11 -0700908 gpiod::line_event gpioLineEvent = cpu1MemABCDVRHotLine.event_read();
Jason M. Bills9647ba72019-08-29 14:19:19 -0700909
Jason M. Bills84951142020-04-17 15:57:11 -0700910 bool cpu1MemABCDVRHot =
911 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
912 if (cpu1MemABCDVRHot)
913 {
914 cpu1MemABCDVRHotAssertHandler();
Jason M. Bills9647ba72019-08-29 14:19:19 -0700915 }
Jason M. Bills84951142020-04-17 15:57:11 -0700916
Jason M. Bills9647ba72019-08-29 14:19:19 -0700917 cpu1MemABCDVRHotEvent.async_wait(
918 boost::asio::posix::stream_descriptor::wait_read,
919 [](const boost::system::error_code ec) {
920 if (ec)
921 {
922 std::cerr << "CPU 1 Memory ABCD VRHot handler error: "
923 << ec.message() << "\n";
924 return;
925 }
926 cpu1MemABCDVRHotHandler();
927 });
928}
929
Jason M. Billse94f5e12019-09-13 11:11:34 -0700930static void cpu1MemEFGHVRHotAssertHandler()
931{
932 cpuVRHotLog("CPU 1 Memory EFGH");
933}
934
Jason M. Bills9647ba72019-08-29 14:19:19 -0700935static void cpu1MemEFGHVRHotHandler()
936{
Jason M. Bills84951142020-04-17 15:57:11 -0700937 gpiod::line_event gpioLineEvent = cpu1MemEFGHVRHotLine.event_read();
Jason M. Bills9647ba72019-08-29 14:19:19 -0700938
Jason M. Bills84951142020-04-17 15:57:11 -0700939 bool cpu1MemEFGHVRHot =
940 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
941 if (cpu1MemEFGHVRHot)
942 {
943 cpu1MemEFGHVRHotAssertHandler();
Jason M. Bills9647ba72019-08-29 14:19:19 -0700944 }
Jason M. Bills84951142020-04-17 15:57:11 -0700945
Jason M. Bills9647ba72019-08-29 14:19:19 -0700946 cpu1MemEFGHVRHotEvent.async_wait(
947 boost::asio::posix::stream_descriptor::wait_read,
948 [](const boost::system::error_code ec) {
949 if (ec)
950 {
951 std::cerr << "CPU 1 Memory EFGH VRHot handler error: "
952 << ec.message() << "\n";
953 return;
954 }
955 cpu1MemEFGHVRHotHandler();
956 });
957}
958
Jason M. Billse94f5e12019-09-13 11:11:34 -0700959static void cpu2VRHotAssertHandler()
960{
961 cpuVRHotLog("CPU 2");
962}
963
Jason M. Bills250fa632019-08-28 15:58:25 -0700964static void cpu2VRHotHandler()
965{
Jason M. Bills84951142020-04-17 15:57:11 -0700966 gpiod::line_event gpioLineEvent = cpu2VRHotLine.event_read();
Jason M. Bills250fa632019-08-28 15:58:25 -0700967
Jason M. Bills84951142020-04-17 15:57:11 -0700968 bool cpu2VRHot =
969 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
970 if (cpu2VRHot)
971 {
972 cpu2VRHotAssertHandler();
Jason M. Bills250fa632019-08-28 15:58:25 -0700973 }
Jason M. Bills84951142020-04-17 15:57:11 -0700974
Jason M. Bills250fa632019-08-28 15:58:25 -0700975 cpu2VRHotEvent.async_wait(boost::asio::posix::stream_descriptor::wait_read,
976 [](const boost::system::error_code ec) {
977 if (ec)
978 {
979 std::cerr << "CPU 2 VRHot handler error: "
980 << ec.message() << "\n";
981 return;
982 }
983 cpu2VRHotHandler();
984 });
985}
986
Jason M. Billse94f5e12019-09-13 11:11:34 -0700987static void cpu2MemABCDVRHotAssertHandler()
988{
989 cpuVRHotLog("CPU 2 Memory ABCD");
990}
991
Jason M. Bills9647ba72019-08-29 14:19:19 -0700992static void cpu2MemABCDVRHotHandler()
993{
Jason M. Bills84951142020-04-17 15:57:11 -0700994 gpiod::line_event gpioLineEvent = cpu2MemABCDVRHotLine.event_read();
Jason M. Bills9647ba72019-08-29 14:19:19 -0700995
Jason M. Bills84951142020-04-17 15:57:11 -0700996 bool cpu2MemABCDVRHot =
997 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
998 if (cpu2MemABCDVRHot)
999 {
1000 cpu2MemABCDVRHotAssertHandler();
Jason M. Bills9647ba72019-08-29 14:19:19 -07001001 }
Jason M. Bills84951142020-04-17 15:57:11 -07001002
Jason M. Bills9647ba72019-08-29 14:19:19 -07001003 cpu2MemABCDVRHotEvent.async_wait(
1004 boost::asio::posix::stream_descriptor::wait_read,
1005 [](const boost::system::error_code ec) {
1006 if (ec)
1007 {
1008 std::cerr << "CPU 2 Memory ABCD VRHot handler error: "
1009 << ec.message() << "\n";
1010 return;
1011 }
1012 cpu2MemABCDVRHotHandler();
1013 });
1014}
1015
Jason M. Billse94f5e12019-09-13 11:11:34 -07001016static void cpu2MemEFGHVRHotAssertHandler()
1017{
1018 cpuVRHotLog("CPU 2 Memory EFGH");
1019}
1020
Jason M. Bills9647ba72019-08-29 14:19:19 -07001021static void cpu2MemEFGHVRHotHandler()
1022{
Jason M. Bills84951142020-04-17 15:57:11 -07001023 gpiod::line_event gpioLineEvent = cpu2MemEFGHVRHotLine.event_read();
Jason M. Bills9647ba72019-08-29 14:19:19 -07001024
Jason M. Bills84951142020-04-17 15:57:11 -07001025 bool cpu2MemEFGHVRHot =
1026 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
1027 if (cpu2MemEFGHVRHot)
1028 {
1029 cpu2MemEFGHVRHotAssertHandler();
Jason M. Bills9647ba72019-08-29 14:19:19 -07001030 }
Jason M. Bills84951142020-04-17 15:57:11 -07001031
Jason M. Bills9647ba72019-08-29 14:19:19 -07001032 cpu2MemEFGHVRHotEvent.async_wait(
1033 boost::asio::posix::stream_descriptor::wait_read,
1034 [](const boost::system::error_code ec) {
1035 if (ec)
1036 {
1037 std::cerr << "CPU 2 Memory EFGH VRHot handler error: "
1038 << ec.message() << "\n";
1039 return;
1040 }
1041 cpu2MemEFGHVRHotHandler();
1042 });
1043}
1044
Chen Yugange6c0f1c2019-08-02 20:36:42 +08001045static void pchThermtripHandler()
1046{
Yong Li1429ca82020-04-27 16:49:45 +08001047 std::vector<Association> associations;
1048
Jason M. Bills84951142020-04-17 15:57:11 -07001049 gpiod::line_event gpioLineEvent = pchThermtripLine.event_read();
Chen Yugange6c0f1c2019-08-02 20:36:42 +08001050
Jason M. Bills84951142020-04-17 15:57:11 -07001051 bool pchThermtrip =
1052 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
1053 if (pchThermtrip)
1054 {
1055 ssbThermTripLog();
Yong Li1429ca82020-04-27 16:49:45 +08001056 associations.emplace_back(
1057 "", "critical",
1058 "/xyz/openbmc_project/host_error_monitor/ssb_thermal_trip");
1059 associations.emplace_back("", "critical", host_error_monitor::rootPath);
Chen Yugange6c0f1c2019-08-02 20:36:42 +08001060 }
Yong Li1429ca82020-04-27 16:49:45 +08001061 else
1062 {
1063 associations.emplace_back("", "", "");
1064 }
1065 host_error_monitor::associationSSBThermTrip->set_property("Associations",
1066 associations);
Jason M. Bills84951142020-04-17 15:57:11 -07001067
Chen Yugange6c0f1c2019-08-02 20:36:42 +08001068 pchThermtripEvent.async_wait(
1069 boost::asio::posix::stream_descriptor::wait_read,
1070 [](const boost::system::error_code ec) {
1071 if (ec)
1072 {
1073 std::cerr << "PCH Thermal trip handler error: " << ec.message()
1074 << "\n";
1075 return;
1076 }
1077 pchThermtripHandler();
1078 });
1079}
1080
Jason M. Billscbf78532019-08-16 15:32:11 -07001081static std::bitset<MAX_CPUS> checkERRPinCPUs(const int errPin)
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001082{
Jason M. Billscbf78532019-08-16 15:32:11 -07001083 int errPinSts = (1 << errPin);
1084 std::bitset<MAX_CPUS> errPinCPUs = 0;
Jason M. Bills8064f8b2020-09-22 15:09:49 -07001085 for (size_t cpu = 0, addr = MIN_CLIENT_ADDR; addr <= MAX_CLIENT_ADDR;
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001086 cpu++, addr++)
1087 {
1088 if (peci_Ping(addr) == PECI_CC_SUCCESS)
1089 {
Jason M. Bills8064f8b2020-09-22 15:09:49 -07001090 EPECIStatus peciStatus = PECI_CC_SUCCESS;
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001091 uint8_t cc = 0;
1092 CPUModel model{};
1093 uint8_t stepping = 0;
1094 if (peci_GetCPUID(addr, &model, &stepping, &cc) != PECI_CC_SUCCESS)
1095 {
1096 std::cerr << "Cannot get CPUID!\n";
1097 continue;
1098 }
1099
1100 switch (model)
1101 {
1102 case skx:
1103 {
1104 // Check the ERRPINSTS to see if this is the CPU that caused
Jason M. Billscbf78532019-08-16 15:32:11 -07001105 // the ERRx (B(0) D8 F0 offset 210h)
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001106 uint32_t errpinsts = 0;
Jason M. Bills8064f8b2020-09-22 15:09:49 -07001107 peciStatus = peci_RdPCIConfigLocal(
1108 addr, 0, 8, 0, 0x210, sizeof(uint32_t),
1109 (uint8_t*)&errpinsts, &cc);
1110 if (peciError(peciStatus, cc))
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001111 {
Jason M. Bills8064f8b2020-09-22 15:09:49 -07001112 printPECIError("ERRPINSTS", addr, peciStatus, cc);
1113 continue;
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001114 }
Jason M. Bills8064f8b2020-09-22 15:09:49 -07001115
1116 errPinCPUs[cpu] = (errpinsts & errPinSts) != 0;
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001117 break;
1118 }
1119 case icx:
1120 {
1121 // Check the ERRPINSTS to see if this is the CPU that caused
Jason M. Billscbf78532019-08-16 15:32:11 -07001122 // the ERRx (B(30) D0 F3 offset 274h) (Note: Bus 30 is
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001123 // accessed on PECI as bus 13)
1124 uint32_t errpinsts = 0;
Jason M. Bills8064f8b2020-09-22 15:09:49 -07001125 peciStatus = peci_RdEndPointConfigPciLocal(
1126 addr, 0, 13, 0, 3, 0x274, sizeof(uint32_t),
1127 (uint8_t*)&errpinsts, &cc);
1128 if (peciError(peciStatus, cc))
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001129 {
Jason M. Bills8064f8b2020-09-22 15:09:49 -07001130 printPECIError("ERRPINSTS", addr, peciStatus, cc);
1131 continue;
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001132 }
Jason M. Bills8064f8b2020-09-22 15:09:49 -07001133
1134 errPinCPUs[cpu] = (errpinsts & errPinSts) != 0;
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001135 break;
1136 }
1137 }
1138 }
1139 }
Jason M. Billscbf78532019-08-16 15:32:11 -07001140 return errPinCPUs;
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001141}
1142
Jason M. Billscbf78532019-08-16 15:32:11 -07001143static void errXAssertHandler(const int errPin,
1144 boost::asio::steady_timer& errXAssertTimer)
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001145{
Jason M. Billscbf78532019-08-16 15:32:11 -07001146 // ERRx status is not guaranteed through the timeout, so save which
1147 // CPUs have it asserted
1148 std::bitset<MAX_CPUS> errPinCPUs = checkERRPinCPUs(errPin);
1149 errXAssertTimer.expires_after(std::chrono::milliseconds(errTimeoutMs));
1150 errXAssertTimer.async_wait([errPin, errPinCPUs](
1151 const boost::system::error_code ec) {
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001152 if (ec)
1153 {
1154 // operation_aborted is expected if timer is canceled before
1155 // completion.
1156 if (ec != boost::asio::error::operation_aborted)
1157 {
1158 std::cerr << "err2 timeout async_wait failed: " << ec.message()
1159 << "\n";
1160 }
1161 return;
1162 }
Jason M. Billscbf78532019-08-16 15:32:11 -07001163 std::cerr << "ERR" << std::to_string(errPin) << " asserted for "
1164 << std::to_string(errTimeoutMs) << " ms\n";
1165 if (errPinCPUs.count())
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001166 {
Jason M. Billscbf78532019-08-16 15:32:11 -07001167 for (int i = 0; i < errPinCPUs.size(); i++)
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001168 {
Jason M. Billscbf78532019-08-16 15:32:11 -07001169 if (errPinCPUs[i])
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001170 {
Jason M. Billscbf78532019-08-16 15:32:11 -07001171 cpuERRXLog(errPin, i);
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001172 }
1173 }
1174 }
1175 else
1176 {
Jason M. Billscbf78532019-08-16 15:32:11 -07001177 cpuERRXLog(errPin);
1178 }
1179 });
1180}
1181
Jason M. Bills8c584392019-08-19 11:05:51 -07001182static void err0AssertHandler()
1183{
1184 // Handle the standard ERR0 detection and logging
1185 const static constexpr int err0 = 0;
1186 errXAssertHandler(err0, err0AssertTimer);
1187}
1188
1189static void err0Handler()
1190{
1191 if (!hostOff)
1192 {
1193 gpiod::line_event gpioLineEvent = err0Line.event_read();
1194
1195 bool err0 = gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
1196 if (err0)
1197 {
1198 err0AssertHandler();
1199 }
1200 else
1201 {
1202 err0AssertTimer.cancel();
1203 }
1204 }
1205 err0Event.async_wait(boost::asio::posix::stream_descriptor::wait_read,
1206 [](const boost::system::error_code ec) {
1207 if (ec)
1208 {
1209 std::cerr
1210 << "err0 handler error: " << ec.message()
1211 << "\n";
1212 return;
1213 }
1214 err0Handler();
1215 });
1216}
1217
Jason M. Bills75af3962019-08-19 11:07:17 -07001218static void err1AssertHandler()
1219{
1220 // Handle the standard ERR1 detection and logging
1221 const static constexpr int err1 = 1;
1222 errXAssertHandler(err1, err1AssertTimer);
1223}
1224
1225static void err1Handler()
1226{
1227 if (!hostOff)
1228 {
1229 gpiod::line_event gpioLineEvent = err1Line.event_read();
1230
1231 bool err1 = gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
1232 if (err1)
1233 {
1234 err1AssertHandler();
1235 }
1236 else
1237 {
1238 err1AssertTimer.cancel();
1239 }
1240 }
1241 err1Event.async_wait(boost::asio::posix::stream_descriptor::wait_read,
1242 [](const boost::system::error_code ec) {
1243 if (ec)
1244 {
1245 std::cerr
1246 << "err1 handler error: " << ec.message()
1247 << "\n";
1248 return;
1249 }
1250 err1Handler();
1251 });
1252}
1253
Jason M. Billscbf78532019-08-16 15:32:11 -07001254static void err2AssertHandler()
1255{
1256 // Handle the standard ERR2 detection and logging
1257 const static constexpr int err2 = 2;
1258 errXAssertHandler(err2, err2AssertTimer);
1259 // Also handle reset for ERR2
1260 err2AssertTimer.async_wait([](const boost::system::error_code ec) {
1261 if (ec)
1262 {
1263 // operation_aborted is expected if timer is canceled before
1264 // completion.
1265 if (ec != boost::asio::error::operation_aborted)
1266 {
1267 std::cerr << "err2 timeout async_wait failed: " << ec.message()
1268 << "\n";
1269 }
1270 return;
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001271 }
1272 conn->async_method_call(
1273 [](boost::system::error_code ec,
1274 const std::variant<bool>& property) {
1275 if (ec)
1276 {
1277 return;
1278 }
1279 const bool* reset = std::get_if<bool>(&property);
1280 if (reset == nullptr)
1281 {
1282 std::cerr << "Unable to read reset on ERR2 value\n";
1283 return;
1284 }
Jason M. Billsd711cc82020-12-04 16:46:39 -08001285 startCrashdumpAndRecovery(conn, *reset, "ERR2 Timeout");
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001286 },
1287 "xyz.openbmc_project.Settings",
1288 "/xyz/openbmc_project/control/processor_error_config",
1289 "org.freedesktop.DBus.Properties", "Get",
1290 "xyz.openbmc_project.Control.Processor.ErrConfig", "ResetOnERR2");
Yong Li061eb032020-02-26 15:06:18 +08001291
1292 beep(beepCPUErr2);
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001293 });
1294}
1295
1296static void err2Handler()
1297{
1298 if (!hostOff)
1299 {
1300 gpiod::line_event gpioLineEvent = err2Line.event_read();
1301
1302 bool err2 = gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
1303 if (err2)
1304 {
1305 err2AssertHandler();
1306 }
1307 else
1308 {
1309 err2AssertTimer.cancel();
1310 }
1311 }
1312 err2Event.async_wait(boost::asio::posix::stream_descriptor::wait_read,
1313 [](const boost::system::error_code ec) {
1314 if (ec)
1315 {
1316 std::cerr
1317 << "err2 handler error: " << ec.message()
1318 << "\n";
1319 return;
1320 }
1321 err2Handler();
1322 });
1323}
1324
Jason M. Bills89922f82019-08-06 11:10:02 -07001325static void smiAssertHandler()
1326{
1327 smiAssertTimer.expires_after(std::chrono::milliseconds(smiTimeoutMs));
1328 smiAssertTimer.async_wait([](const boost::system::error_code ec) {
1329 if (ec)
1330 {
1331 // operation_aborted is expected if timer is canceled before
1332 // completion.
1333 if (ec != boost::asio::error::operation_aborted)
1334 {
1335 std::cerr << "smi timeout async_wait failed: " << ec.message()
1336 << "\n";
1337 }
1338 return;
1339 }
1340 std::cerr << "SMI asserted for " << std::to_string(smiTimeoutMs)
1341 << " ms\n";
1342 smiTimeoutLog();
1343 conn->async_method_call(
1344 [](boost::system::error_code ec,
1345 const std::variant<bool>& property) {
1346 if (ec)
1347 {
1348 return;
1349 }
1350 const bool* reset = std::get_if<bool>(&property);
1351 if (reset == nullptr)
1352 {
1353 std::cerr << "Unable to read reset on SMI value\n";
1354 return;
1355 }
Jason M. Bills94785442020-01-07 15:22:09 -08001356#ifdef HOST_ERROR_CRASHDUMP_ON_SMI_TIMEOUT
Jason M. Billsb61766b2019-11-26 17:02:44 -08001357 startCrashdumpAndRecovery(*reset, "SMI Timeout");
Jason M. Bills94785442020-01-07 15:22:09 -08001358#else
1359 if (*reset)
1360 {
Jason M. Billsd69549b2020-08-27 11:42:43 -07001361 std::cerr << "Recovering the system\n";
Jason M. Billsd711cc82020-12-04 16:46:39 -08001362 startWarmReset(conn);
Jason M. Bills94785442020-01-07 15:22:09 -08001363 }
1364#endif
Jason M. Bills89922f82019-08-06 11:10:02 -07001365 },
1366 "xyz.openbmc_project.Settings",
1367 "/xyz/openbmc_project/control/bmc_reset_disables",
1368 "org.freedesktop.DBus.Properties", "Get",
1369 "xyz.openbmc_project.Control.ResetDisables", "ResetOnSMI");
1370 });
1371}
1372
1373static void smiHandler()
1374{
1375 if (!hostOff)
1376 {
1377 gpiod::line_event gpioLineEvent = smiLine.event_read();
1378
1379 bool smi = gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
1380 if (smi)
1381 {
1382 smiAssertHandler();
1383 }
1384 else
1385 {
1386 smiAssertTimer.cancel();
1387 }
1388 }
1389 smiEvent.async_wait(boost::asio::posix::stream_descriptor::wait_read,
1390 [](const boost::system::error_code ec) {
1391 if (ec)
1392 {
1393 std::cerr
1394 << "smi handler error: " << ec.message()
1395 << "\n";
1396 return;
1397 }
1398 smiHandler();
1399 });
1400}
1401
Jason M. Billsa15c2522019-08-16 10:01:44 -07001402static void initializeErrorState()
1403{
jayaprakash Mutyala53099c42020-03-15 00:16:26 +00001404 // Handle CPU1_MISMATCH if it's asserted now
1405 if (cpu1MismatchLine.get_value() == 1)
1406 {
1407 cpuMismatchLog(1);
1408 }
1409
1410 // Handle CPU2_MISMATCH if it's asserted now
1411 if (cpu2MismatchLine.get_value() == 1)
1412 {
1413 cpuMismatchLog(2);
1414 }
1415
Jason M. Billsa15c2522019-08-16 10:01:44 -07001416 // Handle CPU_CATERR if it's asserted now
1417 if (caterrLine.get_value() == 0)
1418 {
1419 caterrAssertHandler();
Yong Li1429ca82020-04-27 16:49:45 +08001420 std::vector<Association> associations;
1421 associations.emplace_back(
1422 "", "critical", "/xyz/openbmc_project/host_error_monitor/cat_err");
1423 associations.emplace_back("", "critical", host_error_monitor::rootPath);
1424 host_error_monitor::associationCATAssert->set_property("Associations",
1425 associations);
Jason M. Billsa15c2522019-08-16 10:01:44 -07001426 }
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001427
Jason M. Bills8c584392019-08-19 11:05:51 -07001428 // Handle CPU_ERR0 if it's asserted now
1429 if (err0Line.get_value() == 0)
1430 {
1431 err0AssertHandler();
1432 }
1433
Jason M. Bills75af3962019-08-19 11:07:17 -07001434 // Handle CPU_ERR1 if it's asserted now
1435 if (err1Line.get_value() == 0)
1436 {
1437 err1AssertHandler();
1438 }
1439
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001440 // Handle CPU_ERR2 if it's asserted now
1441 if (err2Line.get_value() == 0)
1442 {
1443 err2AssertHandler();
1444 }
Jason M. Bills89922f82019-08-06 11:10:02 -07001445
1446 // Handle SMI if it's asserted now
1447 if (smiLine.get_value() == 0)
1448 {
1449 smiAssertHandler();
1450 }
Jason M. Bills08866542019-08-16 12:04:19 -07001451
Jason M. Billse94f5e12019-09-13 11:11:34 -07001452 // Handle CPU1_THERMTRIP if it's asserted now
1453 if (cpu1ThermtripLine.get_value() == 0)
1454 {
1455 cpu1ThermtripAssertHandler();
1456 }
1457
1458 // Handle CPU2_THERMTRIP if it's asserted now
1459 if (cpu2ThermtripLine.get_value() == 0)
1460 {
1461 cpu2ThermtripAssertHandler();
1462 }
1463
jayaprakash Mutyala009adbc2019-12-24 22:08:07 +00001464 // Handle CPU1_MEM_THERM_EVENT (CPU1 DIMM Thermal trip) if it's asserted now
1465 if (cpu1MemtripLine.get_value() == 0)
1466 {
1467 memThermTripLog(1);
1468 }
1469
1470 // Handle CPU2_MEM_THERM_EVENT (CPU2 DIMM Thermal trip) if it's asserted now
1471 if (cpu2MemtripLine.get_value() == 0)
1472 {
1473 memThermTripLog(2);
1474 }
1475
Jason M. Billse94f5e12019-09-13 11:11:34 -07001476 // Handle CPU1_VRHOT if it's asserted now
1477 if (cpu1VRHotLine.get_value() == 0)
1478 {
1479 cpu1VRHotAssertHandler();
1480 }
1481
1482 // Handle CPU1_MEM_ABCD_VRHOT if it's asserted now
1483 if (cpu1MemABCDVRHotLine.get_value() == 0)
1484 {
1485 cpu1MemABCDVRHotAssertHandler();
1486 }
1487
1488 // Handle CPU1_MEM_EFGH_VRHOT if it's asserted now
1489 if (cpu1MemEFGHVRHotLine.get_value() == 0)
1490 {
1491 cpu1MemEFGHVRHotAssertHandler();
1492 }
1493
1494 // Handle CPU2_VRHOT if it's asserted now
1495 if (cpu2VRHotLine.get_value() == 0)
1496 {
1497 cpu2VRHotAssertHandler();
1498 }
1499
1500 // Handle CPU2_MEM_ABCD_VRHOT if it's asserted now
1501 if (cpu2MemABCDVRHotLine.get_value() == 0)
1502 {
1503 cpu2MemABCDVRHotAssertHandler();
1504 }
1505
1506 // Handle CPU2_MEM_EFGH_VRHOT if it's asserted now
1507 if (cpu2MemEFGHVRHotLine.get_value() == 0)
1508 {
1509 cpu2MemEFGHVRHotAssertHandler();
1510 }
1511
Jason M. Bills08866542019-08-16 12:04:19 -07001512 // Handle PCH_BMC_THERMTRIP if it's asserted now
1513 if (pchThermtripLine.get_value() == 0)
1514 {
1515 ssbThermTripLog();
Yong Li1429ca82020-04-27 16:49:45 +08001516 std::vector<Association> associations;
1517 associations.emplace_back(
1518 "", "critical",
1519 "/xyz/openbmc_project/host_error_monitor/ssb_thermal_trip");
1520 associations.emplace_back("", "critical", host_error_monitor::rootPath);
1521 host_error_monitor::associationSSBThermTrip->set_property(
1522 "Associations", associations);
Jason M. Bills08866542019-08-16 12:04:19 -07001523 }
Jason M. Billsa15c2522019-08-16 10:01:44 -07001524}
Jason M. Bills1490b142019-07-01 15:48:43 -07001525} // namespace host_error_monitor
1526
1527int main(int argc, char* argv[])
1528{
1529 // setup connection to dbus
1530 host_error_monitor::conn =
1531 std::make_shared<sdbusplus::asio::connection>(host_error_monitor::io);
1532
Jason M. Billsc4b91f22019-11-26 17:04:50 -08001533 // Host Error Monitor Service
Jason M. Bills1490b142019-07-01 15:48:43 -07001534 host_error_monitor::conn->request_name(
1535 "xyz.openbmc_project.HostErrorMonitor");
1536 sdbusplus::asio::object_server server =
1537 sdbusplus::asio::object_server(host_error_monitor::conn);
1538
Yong Li1429ca82020-04-27 16:49:45 +08001539 // Associations interface for led status
1540 std::vector<host_error_monitor::Association> associations;
1541 associations.emplace_back("", "", "");
1542 host_error_monitor::associationSSBThermTrip = server.add_interface(
1543 "/xyz/openbmc_project/host_error_monitor/ssb_thermal_trip",
1544 "xyz.openbmc_project.Association.Definitions");
1545 host_error_monitor::associationSSBThermTrip->register_property(
1546 "Associations", associations);
1547 host_error_monitor::associationSSBThermTrip->initialize();
1548
1549 host_error_monitor::associationCATAssert = server.add_interface(
1550 "/xyz/openbmc_project/host_error_monitor/cat_assert",
1551 "xyz.openbmc_project.Association.Definitions");
1552 host_error_monitor::associationCATAssert->register_property("Associations",
1553 associations);
1554 host_error_monitor::associationCATAssert->initialize();
1555
Jason M. Billsc4b91f22019-11-26 17:04:50 -08001556 // Restart Cause Interface
1557 host_error_monitor::hostErrorTimeoutIface =
1558 server.add_interface("/xyz/openbmc_project/host_error_monitor",
1559 "xyz.openbmc_project.HostErrorMonitor.Timeout");
1560
1561 host_error_monitor::hostErrorTimeoutIface->register_property(
1562 "IERRTimeoutMs", host_error_monitor::caterrTimeoutMs,
1563 [](const std::size_t& requested, std::size_t& resp) {
1564 if (requested > host_error_monitor::caterrTimeoutMsMax)
1565 {
1566 std::cerr << "IERRTimeoutMs update to " << requested
1567 << "ms rejected. Cannot be greater than "
1568 << host_error_monitor::caterrTimeoutMsMax << "ms.\n";
1569 return 0;
1570 }
1571 std::cerr << "IERRTimeoutMs updated to " << requested << "ms\n";
1572 host_error_monitor::caterrTimeoutMs = requested;
1573 resp = requested;
1574 return 1;
1575 },
1576 [](std::size_t& resp) { return host_error_monitor::caterrTimeoutMs; });
1577 host_error_monitor::hostErrorTimeoutIface->initialize();
1578
Jason M. Bills1490b142019-07-01 15:48:43 -07001579 // Start tracking host state
1580 std::shared_ptr<sdbusplus::bus::match::match> hostStateMonitor =
1581 host_error_monitor::startHostStateMonitor();
1582
jayaprakash Mutyala53099c42020-03-15 00:16:26 +00001583 // Request CPU1_MISMATCH GPIO events
1584 if (!host_error_monitor::requestGPIOInput(
1585 "CPU1_MISMATCH", host_error_monitor::cpu1MismatchLine))
1586 {
1587 return -1;
1588 }
1589
1590 // Request CPU2_MISMATCH GPIO events
1591 if (!host_error_monitor::requestGPIOInput(
1592 "CPU2_MISMATCH", host_error_monitor::cpu2MismatchLine))
1593 {
1594 return -1;
1595 }
1596
Jason M. Bills1490b142019-07-01 15:48:43 -07001597 // Initialize the host state
1598 host_error_monitor::initializeHostState();
1599
1600 // Request CPU_CATERR GPIO events
1601 if (!host_error_monitor::requestGPIOEvents(
1602 "CPU_CATERR", host_error_monitor::caterrHandler,
1603 host_error_monitor::caterrLine, host_error_monitor::caterrEvent))
1604 {
1605 return -1;
1606 }
1607
Jason M. Bills8c584392019-08-19 11:05:51 -07001608 // Request CPU_ERR0 GPIO events
1609 if (!host_error_monitor::requestGPIOEvents(
1610 "CPU_ERR0", host_error_monitor::err0Handler,
1611 host_error_monitor::err0Line, host_error_monitor::err0Event))
1612 {
1613 return -1;
1614 }
1615
Jason M. Bills75af3962019-08-19 11:07:17 -07001616 // Request CPU_ERR1 GPIO events
1617 if (!host_error_monitor::requestGPIOEvents(
1618 "CPU_ERR1", host_error_monitor::err1Handler,
1619 host_error_monitor::err1Line, host_error_monitor::err1Event))
1620 {
1621 return -1;
1622 }
1623
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001624 // Request CPU_ERR2 GPIO events
1625 if (!host_error_monitor::requestGPIOEvents(
1626 "CPU_ERR2", host_error_monitor::err2Handler,
1627 host_error_monitor::err2Line, host_error_monitor::err2Event))
1628 {
1629 return -1;
1630 }
1631
Jason M. Bills89922f82019-08-06 11:10:02 -07001632 // Request SMI GPIO events
1633 if (!host_error_monitor::requestGPIOEvents(
1634 "SMI", host_error_monitor::smiHandler, host_error_monitor::smiLine,
1635 host_error_monitor::smiEvent))
1636 {
1637 return -1;
1638 }
1639
Jason M. Bills45e87e02019-09-09 14:45:38 -07001640 // Request CPU1_FIVR_FAULT GPIO input
1641 if (!host_error_monitor::requestGPIOInput(
1642 "CPU1_FIVR_FAULT", host_error_monitor::cpu1FIVRFaultLine))
1643 {
1644 return -1;
1645 }
1646
Jason M. Bills78c5eed2019-08-28 14:00:40 -07001647 // Request CPU1_THERMTRIP GPIO events
1648 if (!host_error_monitor::requestGPIOEvents(
1649 "CPU1_THERMTRIP", host_error_monitor::cpu1ThermtripHandler,
1650 host_error_monitor::cpu1ThermtripLine,
1651 host_error_monitor::cpu1ThermtripEvent))
1652 {
1653 return -1;
1654 }
1655
Jason M. Bills45e87e02019-09-09 14:45:38 -07001656 // Request CPU2_FIVR_FAULT GPIO input
1657 if (!host_error_monitor::requestGPIOInput(
1658 "CPU2_FIVR_FAULT", host_error_monitor::cpu2FIVRFaultLine))
1659 {
1660 return -1;
1661 }
1662
Jason M. Bills78c5eed2019-08-28 14:00:40 -07001663 // Request CPU2_THERMTRIP GPIO events
1664 if (!host_error_monitor::requestGPIOEvents(
1665 "CPU2_THERMTRIP", host_error_monitor::cpu2ThermtripHandler,
1666 host_error_monitor::cpu2ThermtripLine,
1667 host_error_monitor::cpu2ThermtripEvent))
1668 {
1669 return -1;
1670 }
1671
Jason M. Bills250fa632019-08-28 15:58:25 -07001672 // Request CPU1_VRHOT GPIO events
1673 if (!host_error_monitor::requestGPIOEvents(
1674 "CPU1_VRHOT", host_error_monitor::cpu1VRHotHandler,
1675 host_error_monitor::cpu1VRHotLine,
1676 host_error_monitor::cpu1VRHotEvent))
1677 {
1678 return -1;
1679 }
1680
Jason M. Bills9647ba72019-08-29 14:19:19 -07001681 // Request CPU1_MEM_ABCD_VRHOT GPIO events
1682 if (!host_error_monitor::requestGPIOEvents(
1683 "CPU1_MEM_ABCD_VRHOT", host_error_monitor::cpu1MemABCDVRHotHandler,
1684 host_error_monitor::cpu1MemABCDVRHotLine,
1685 host_error_monitor::cpu1MemABCDVRHotEvent))
1686 {
1687 return -1;
1688 }
1689
1690 // Request CPU1_MEM_EFGH_VRHOT GPIO events
1691 if (!host_error_monitor::requestGPIOEvents(
1692 "CPU1_MEM_EFGH_VRHOT", host_error_monitor::cpu1MemEFGHVRHotHandler,
1693 host_error_monitor::cpu1MemEFGHVRHotLine,
1694 host_error_monitor::cpu1MemEFGHVRHotEvent))
1695 {
1696 return -1;
1697 }
1698
Jason M. Bills250fa632019-08-28 15:58:25 -07001699 // Request CPU2_VRHOT GPIO events
1700 if (!host_error_monitor::requestGPIOEvents(
1701 "CPU2_VRHOT", host_error_monitor::cpu2VRHotHandler,
1702 host_error_monitor::cpu2VRHotLine,
1703 host_error_monitor::cpu2VRHotEvent))
1704 {
1705 return -1;
1706 }
1707
Jason M. Bills9647ba72019-08-29 14:19:19 -07001708 // Request CPU2_MEM_ABCD_VRHOT GPIO events
1709 if (!host_error_monitor::requestGPIOEvents(
1710 "CPU2_MEM_ABCD_VRHOT", host_error_monitor::cpu2MemABCDVRHotHandler,
1711 host_error_monitor::cpu2MemABCDVRHotLine,
1712 host_error_monitor::cpu2MemABCDVRHotEvent))
1713 {
1714 return -1;
1715 }
1716
1717 // Request CPU2_MEM_EFGH_VRHOT GPIO events
1718 if (!host_error_monitor::requestGPIOEvents(
1719 "CPU2_MEM_EFGH_VRHOT", host_error_monitor::cpu2MemEFGHVRHotHandler,
1720 host_error_monitor::cpu2MemEFGHVRHotLine,
1721 host_error_monitor::cpu2MemEFGHVRHotEvent))
1722 {
1723 return -1;
1724 }
1725
Chen Yugange6c0f1c2019-08-02 20:36:42 +08001726 // Request PCH_BMC_THERMTRIP GPIO events
1727 if (!host_error_monitor::requestGPIOEvents(
1728 "PCH_BMC_THERMTRIP", host_error_monitor::pchThermtripHandler,
1729 host_error_monitor::pchThermtripLine,
1730 host_error_monitor::pchThermtripEvent))
1731 {
1732 return -1;
1733 }
1734
jayaprakash Mutyala009adbc2019-12-24 22:08:07 +00001735 // Request CPU1_MEM_THERM_EVENT GPIO events
1736 if (!host_error_monitor::requestGPIOEvents(
1737 "CPU1_MEM_THERM_EVENT", host_error_monitor::cpu1MemtripHandler,
1738 host_error_monitor::cpu1MemtripLine,
1739 host_error_monitor::cpu1MemtripEvent))
1740 {
1741 return -1;
1742 }
1743
1744 // Request CPU2_MEM_THERM_EVENT GPIO events
1745 if (!host_error_monitor::requestGPIOEvents(
1746 "CPU2_MEM_THERM_EVENT", host_error_monitor::cpu2MemtripHandler,
1747 host_error_monitor::cpu2MemtripLine,
1748 host_error_monitor::cpu2MemtripEvent))
1749 {
1750 return -1;
1751 }
1752
Jason M. Bills1490b142019-07-01 15:48:43 -07001753 host_error_monitor::io.run();
1754
1755 return 0;
1756}