blob: 4f88b889aa1faf264272b93e5931d6c84613b4d0 [file] [log] [blame]
Jason M. Bills1490b142019-07-01 15:48:43 -07001/*
2// Copyright (c) 2019 Intel Corporation
3//
4// Licensed under the Apache License, Version 2.0 (the "License");
5// you may not use this file except in compliance with the License.
6// You may obtain a copy of the License at
7//
8// http://www.apache.org/licenses/LICENSE-2.0
9//
10// Unless required by applicable law or agreed to in writing, software
11// distributed under the License is distributed on an "AS IS" BASIS,
12// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13// See the License for the specific language governing permissions and
14// limitations under the License.
15*/
Jason M. Bills6a2cb692019-08-06 11:03:49 -070016#include <peci.h>
Chen Yugange6c0f1c2019-08-02 20:36:42 +080017#include <systemd/sd-journal.h>
18
Jason M. Bills08b2c7a2020-08-28 15:39:14 -070019#include <boost/asio/io_service.hpp>
Jason M. Bills1490b142019-07-01 15:48:43 -070020#include <boost/asio/posix/stream_descriptor.hpp>
Jason M. Bills08b2c7a2020-08-28 15:39:14 -070021#include <boost/asio/steady_timer.hpp>
Jason M. Bills5245ed62020-12-04 16:50:21 -080022#include <error_monitors.hpp>
Jason M. Bills1490b142019-07-01 15:48:43 -070023#include <gpiod.hpp>
Jason M. Billsd711cc82020-12-04 16:46:39 -080024#include <host_error_monitor.hpp>
Jason M. Bills1490b142019-07-01 15:48:43 -070025#include <sdbusplus/asio/object_server.hpp>
Jason M. Bills48e5dff2020-06-10 13:47:47 -070026
27#include <bitset>
28#include <iostream>
Jason M. Billsd1a19f62019-08-06 11:52:58 -070029#include <variant>
Jason M. Bills1490b142019-07-01 15:48:43 -070030
31namespace host_error_monitor
32{
33static boost::asio::io_service io;
34static std::shared_ptr<sdbusplus::asio::connection> conn;
Jason M. Billsc4b91f22019-11-26 17:04:50 -080035static std::shared_ptr<sdbusplus::asio::dbus_interface> hostErrorTimeoutIface;
Jason M. Bills1490b142019-07-01 15:48:43 -070036
Yong Li1429ca82020-04-27 16:49:45 +080037using Association = std::tuple<std::string, std::string, std::string>;
38static std::shared_ptr<sdbusplus::asio::dbus_interface> associationSSBThermTrip;
39static std::shared_ptr<sdbusplus::asio::dbus_interface> associationCATAssert;
40
41static const constexpr char* rootPath = "/xyz/openbmc_project/CallbackManager";
42
// Cached host power state; stays true until a host state other than "Off"
// has been read from D-Bus.
static bool hostOff = true;

// Returns true while the cached host state is "Off".
bool hostIsOff()
{
    return hostOff;
}
Jason M. Bills1490b142019-07-01 15:48:43 -070048
// How long CATERR must stay asserted before it is handled (mutable at runtime)
static size_t caterrTimeoutMs = 2000;
// Upper bound for caterrTimeoutMs: 10 minutes
static constexpr size_t caterrTimeoutMsMax = 600000;
// Assertion timeouts for the ERRx and SMI signals
static constexpr size_t errTimeoutMs = 90000;
static constexpr size_t smiTimeoutMs = 90000;
Jason M. Bills1490b142019-07-01 15:48:43 -070053
54// Timers
55// Timer for CATERR asserted
56static boost::asio::steady_timer caterrAssertTimer(io);
Jason M. Bills8c584392019-08-19 11:05:51 -070057// Timer for ERR0 asserted
58static boost::asio::steady_timer err0AssertTimer(io);
Jason M. Bills75af3962019-08-19 11:07:17 -070059// Timer for ERR1 asserted
60static boost::asio::steady_timer err1AssertTimer(io);
Jason M. Bills6a2cb692019-08-06 11:03:49 -070061// Timer for ERR2 asserted
62static boost::asio::steady_timer err2AssertTimer(io);
Jason M. Bills89922f82019-08-06 11:10:02 -070063// Timer for SMI asserted
64static boost::asio::steady_timer smiAssertTimer(io);
Jason M. Bills1490b142019-07-01 15:48:43 -070065
66// GPIO Lines and Event Descriptors
67static gpiod::line caterrLine;
68static boost::asio::posix::stream_descriptor caterrEvent(io);
Jason M. Bills8c584392019-08-19 11:05:51 -070069static gpiod::line err0Line;
70static boost::asio::posix::stream_descriptor err0Event(io);
Jason M. Bills75af3962019-08-19 11:07:17 -070071static gpiod::line err1Line;
72static boost::asio::posix::stream_descriptor err1Event(io);
Jason M. Bills6a2cb692019-08-06 11:03:49 -070073static gpiod::line err2Line;
74static boost::asio::posix::stream_descriptor err2Event(io);
Jason M. Bills89922f82019-08-06 11:10:02 -070075static gpiod::line smiLine;
76static boost::asio::posix::stream_descriptor smiEvent(io);
Jason M. Bills45e87e02019-09-09 14:45:38 -070077static gpiod::line cpu1FIVRFaultLine;
Jason M. Bills78c5eed2019-08-28 14:00:40 -070078static gpiod::line cpu1ThermtripLine;
79static boost::asio::posix::stream_descriptor cpu1ThermtripEvent(io);
Jason M. Bills45e87e02019-09-09 14:45:38 -070080static gpiod::line cpu2FIVRFaultLine;
Jason M. Bills78c5eed2019-08-28 14:00:40 -070081static gpiod::line cpu2ThermtripLine;
82static boost::asio::posix::stream_descriptor cpu2ThermtripEvent(io);
Jason M. Bills250fa632019-08-28 15:58:25 -070083static gpiod::line cpu1VRHotLine;
84static boost::asio::posix::stream_descriptor cpu1VRHotEvent(io);
85static gpiod::line cpu2VRHotLine;
Jason M. Bills9647ba72019-08-29 14:19:19 -070086static boost::asio::posix::stream_descriptor cpu1MemABCDVRHotEvent(io);
87static gpiod::line cpu1MemEFGHVRHotLine;
88static boost::asio::posix::stream_descriptor cpu1MemEFGHVRHotEvent(io);
89static gpiod::line cpu2MemABCDVRHotLine;
Jason M. Bills250fa632019-08-28 15:58:25 -070090static boost::asio::posix::stream_descriptor cpu2VRHotEvent(io);
Jason M. Bills9647ba72019-08-29 14:19:19 -070091static gpiod::line cpu1MemABCDVRHotLine;
92static boost::asio::posix::stream_descriptor cpu2MemABCDVRHotEvent(io);
93static gpiod::line cpu2MemEFGHVRHotLine;
94static boost::asio::posix::stream_descriptor cpu2MemEFGHVRHotEvent(io);
Chen Yugange6c0f1c2019-08-02 20:36:42 +080095//----------------------------------
96// PCH_BMC_THERMTRIP function related definition
97//----------------------------------
Chen Yugange6c0f1c2019-08-02 20:36:42 +080098static gpiod::line pchThermtripLine;
99static boost::asio::posix::stream_descriptor pchThermtripEvent(io);
jayaprakash Mutyala009adbc2019-12-24 22:08:07 +0000100//----------------------------------
101// CPU_MEM_THERM_EVENT function related definition
102//----------------------------------
103static gpiod::line cpu1MemtripLine;
104static boost::asio::posix::stream_descriptor cpu1MemtripEvent(io);
105static gpiod::line cpu2MemtripLine;
106static boost::asio::posix::stream_descriptor cpu2MemtripEvent(io);
jayaprakash Mutyala53099c42020-03-15 00:16:26 +0000107//---------------------------------
108// CPU_MISMATCH function related definition
109//---------------------------------
110static gpiod::line cpu1MismatchLine;
111static gpiod::line cpu2MismatchLine;
Jason M. Bills1490b142019-07-01 15:48:43 -0700112
Yong Li061eb032020-02-26 15:06:18 +0800113// beep function for CPU error
Yong Li8c798c72020-04-22 15:29:07 +0800114const static constexpr uint8_t beepCPUIERR = 4;
Yong Li061eb032020-02-26 15:06:18 +0800115const static constexpr uint8_t beepCPUErr2 = 5;
116
117static void beep(const uint8_t& beepPriority)
118{
119 conn->async_method_call(
120 [](boost::system::error_code ec) {
121 if (ec)
122 {
123 std::cerr << "beep returned error with "
124 "async_method_call (ec = "
125 << ec << ")\n";
126 return;
127 }
128 },
129 "xyz.openbmc_project.BeepCode", "/xyz/openbmc_project/BeepCode",
130 "xyz.openbmc_project.BeepCode", "Beep", uint8_t(beepPriority));
131}
132
Jason M. Billsa3397932019-08-06 11:07:21 -0700133static void cpuIERRLog()
134{
135 sd_journal_send("MESSAGE=HostError: IERR", "PRIORITY=%i", LOG_INFO,
136 "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError",
137 "REDFISH_MESSAGE_ARGS=%s", "IERR", NULL);
138}
139
140static void cpuIERRLog(const int cpuNum)
141{
142 std::string msg = "IERR on CPU " + std::to_string(cpuNum + 1);
143
144 sd_journal_send("MESSAGE=HostError: %s", msg.c_str(), "PRIORITY=%i",
145 LOG_INFO, "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError",
146 "REDFISH_MESSAGE_ARGS=%s", msg.c_str(), NULL);
147}
148
149static void cpuIERRLog(const int cpuNum, const std::string& type)
150{
151 std::string msg = type + " IERR on CPU " + std::to_string(cpuNum + 1);
152
153 sd_journal_send("MESSAGE=HostError: %s", msg.c_str(), "PRIORITY=%i",
154 LOG_INFO, "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError",
155 "REDFISH_MESSAGE_ARGS=%s", msg.c_str(), NULL);
156}
157
Jason M. Billscbf78532019-08-16 15:32:11 -0700158static void cpuERRXLog(const int errPin)
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700159{
Jason M. Billscbf78532019-08-16 15:32:11 -0700160 std::string msg = "ERR" + std::to_string(errPin) + " Timeout";
161
162 sd_journal_send("MESSAGE=HostError: %s", msg.c_str(), "PRIORITY=%i",
163 LOG_INFO, "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError",
164 "REDFISH_MESSAGE_ARGS=%s", msg.c_str(), NULL);
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700165}
166
Jason M. Billscbf78532019-08-16 15:32:11 -0700167static void cpuERRXLog(const int errPin, const int cpuNum)
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700168{
Jason M. Billscbf78532019-08-16 15:32:11 -0700169 std::string msg = "ERR" + std::to_string(errPin) + " Timeout on CPU " +
170 std::to_string(cpuNum + 1);
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700171
172 sd_journal_send("MESSAGE=HostError: %s", msg.c_str(), "PRIORITY=%i",
173 LOG_INFO, "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError",
174 "REDFISH_MESSAGE_ARGS=%s", msg.c_str(), NULL);
175}
176
Jason M. Bills89922f82019-08-06 11:10:02 -0700177static void smiTimeoutLog()
178{
179 sd_journal_send("MESSAGE=HostError: SMI Timeout", "PRIORITY=%i", LOG_INFO,
180 "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError",
181 "REDFISH_MESSAGE_ARGS=%s", "SMI Timeout", NULL);
182}
183
Jason M. Bills45e87e02019-09-09 14:45:38 -0700184static void cpuBootFIVRFaultLog(const int cpuNum)
185{
186 std::string msg = "Boot FIVR Fault on CPU " + std::to_string(cpuNum);
187
188 sd_journal_send("MESSAGE=HostError: %s", msg.c_str(), "PRIORITY=%i",
189 LOG_INFO, "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError",
190 "REDFISH_MESSAGE_ARGS=%s", msg.c_str(), NULL);
191}
192
Jason M. Bills78c5eed2019-08-28 14:00:40 -0700193static void cpuThermTripLog(const int cpuNum)
194{
195 std::string msg = "CPU " + std::to_string(cpuNum) + " thermal trip";
196
197 sd_journal_send("MESSAGE=HostError: %s", msg.c_str(), "PRIORITY=%i",
198 LOG_INFO, "REDFISH_MESSAGE_ID=%s",
199 "OpenBMC.0.1.CPUThermalTrip", "REDFISH_MESSAGE_ARGS=%d",
200 cpuNum, NULL);
201}
202
jayaprakash Mutyala009adbc2019-12-24 22:08:07 +0000203static void memThermTripLog(const int cpuNum)
204{
205 std::string cpuNumber = "CPU " + std::to_string(cpuNum);
206 std::string msg = cpuNumber + " Memory Thermal trip.";
207
208 sd_journal_send("MESSAGE=HostError: %s", msg.c_str(), "PRIORITY=%i",
209 LOG_ERR, "REDFISH_MESSAGE_ID=%s",
210 "OpenBMC.0.1.MemoryThermTrip", "REDFISH_MESSAGE_ARGS=%s",
211 cpuNumber.c_str(), NULL);
212}
213
jayaprakash Mutyala53099c42020-03-15 00:16:26 +0000214static void cpuMismatchLog(const int cpuNum)
215{
216 std::string msg = "CPU " + std::to_string(cpuNum) + " mismatch";
217
218 sd_journal_send("MESSAGE= %s", msg.c_str(), "PRIORITY=%i", LOG_ERR,
219 "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUMismatch",
220 "REDFISH_MESSAGE_ARGS=%d", cpuNum, NULL);
221}
222
Jason M. Bills250fa632019-08-28 15:58:25 -0700223static void cpuVRHotLog(const std::string& vr)
224{
225 std::string msg = vr + " Voltage Regulator Overheated.";
226
227 sd_journal_send("MESSAGE=HostError: %s", msg.c_str(), "PRIORITY=%i",
228 LOG_INFO, "REDFISH_MESSAGE_ID=%s",
229 "OpenBMC.0.1.VoltageRegulatorOverheated",
230 "REDFISH_MESSAGE_ARGS=%s", vr.c_str(), NULL);
231}
232
Jason M. Bills08866542019-08-16 12:04:19 -0700233static void ssbThermTripLog()
234{
235 sd_journal_send("MESSAGE=HostError: SSB thermal trip", "PRIORITY=%i",
236 LOG_INFO, "REDFISH_MESSAGE_ID=%s",
237 "OpenBMC.0.1.SsbThermalTrip", NULL);
238}
239
Jason M. Bills8064f8b2020-09-22 15:09:49 -0700240static inline bool peciError(EPECIStatus peciStatus, uint8_t cc)
241{
242 return (
243 peciStatus != PECI_CC_SUCCESS ||
244 (cc != PECI_DEV_CC_SUCCESS && cc != PECI_DEV_CC_FATAL_MCA_DETECTED));
245}
246
247static void printPECIError(const std::string& reg, const size_t addr,
248 const EPECIStatus peciStatus, const size_t cc)
249{
250 std::cerr << "Failed to read " << reg << " on CPU address " << addr
251 << ". Error: " << peciStatus << ": cc: 0x" << std::hex << cc
252 << "\n";
253}
254
Jason M. Billsa15c2522019-08-16 10:01:44 -0700255static void initializeErrorState();
Jason M. Bills5245ed62020-12-04 16:50:21 -0800256static void init()
Jason M. Bills1490b142019-07-01 15:48:43 -0700257{
Jason M. Bills5245ed62020-12-04 16:50:21 -0800258 // Get the current host state to prepare to start the signal monitors
Jason M. Bills1490b142019-07-01 15:48:43 -0700259 conn->async_method_call(
260 [](boost::system::error_code ec,
261 const std::variant<std::string>& property) {
262 if (ec)
263 {
264 return;
265 }
266 const std::string* state = std::get_if<std::string>(&property);
267 if (state == nullptr)
268 {
269 std::cerr << "Unable to read host state value\n";
270 return;
271 }
272 hostOff = *state == "xyz.openbmc_project.State.Host.HostState.Off";
Jason M. Billsa15c2522019-08-16 10:01:44 -0700273 // If the system is on, initialize the error state
274 if (!hostOff)
275 {
276 initializeErrorState();
277 }
Jason M. Bills5245ed62020-12-04 16:50:21 -0800278
279 // Now we have the host state, start the signal monitors
280 if (!error_monitors::startMonitors(io, conn))
281 {
282 throw std::runtime_error("Failed to start signal monitors");
283 }
Jason M. Bills1490b142019-07-01 15:48:43 -0700284 },
285 "xyz.openbmc_project.State.Host", "/xyz/openbmc_project/state/host0",
286 "org.freedesktop.DBus.Properties", "Get",
287 "xyz.openbmc_project.State.Host", "CurrentHostState");
288}
289
290static std::shared_ptr<sdbusplus::bus::match::match> startHostStateMonitor()
291{
292 return std::make_shared<sdbusplus::bus::match::match>(
293 *conn,
294 "type='signal',interface='org.freedesktop.DBus.Properties',"
Jason M. Bills2fbb9ea2020-06-19 14:46:54 -0700295 "member='PropertiesChanged',arg0='xyz.openbmc_project.State.Host'",
Jason M. Bills1490b142019-07-01 15:48:43 -0700296 [](sdbusplus::message::message& msg) {
297 std::string interfaceName;
298 boost::container::flat_map<std::string, std::variant<std::string>>
299 propertiesChanged;
Jason M. Bills1490b142019-07-01 15:48:43 -0700300 try
301 {
302 msg.read(interfaceName, propertiesChanged);
Jason M. Bills1490b142019-07-01 15:48:43 -0700303 }
304 catch (std::exception& e)
305 {
306 std::cerr << "Unable to read host state\n";
307 return;
308 }
Jason M. Bills566ccc42020-06-18 16:38:26 -0700309 // We only want to check for CurrentHostState
310 if (propertiesChanged.begin()->first != "CurrentHostState")
311 {
312 return;
313 }
314 std::string* state =
315 std::get_if<std::string>(&(propertiesChanged.begin()->second));
316 if (state == nullptr)
317 {
318 std::cerr << propertiesChanged.begin()->first
319 << " property invalid\n";
320 return;
321 }
322
323 hostOff = *state == "xyz.openbmc_project.State.Host.HostState.Off";
Jason M. Bills1490b142019-07-01 15:48:43 -0700324
Jason M. Bills1490b142019-07-01 15:48:43 -0700325 if (hostOff)
326 {
Jason M. Billse94f5e12019-09-13 11:11:34 -0700327 // No host events should fire while off, so cancel any pending
328 // timers
Jason M. Bills1490b142019-07-01 15:48:43 -0700329 caterrAssertTimer.cancel();
Jason M. Bills8c584392019-08-19 11:05:51 -0700330 err0AssertTimer.cancel();
Jason M. Bills75af3962019-08-19 11:07:17 -0700331 err1AssertTimer.cancel();
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700332 err2AssertTimer.cancel();
Jason M. Bills89922f82019-08-06 11:10:02 -0700333 smiAssertTimer.cancel();
Jason M. Bills1490b142019-07-01 15:48:43 -0700334 }
Jason M. Billse94f5e12019-09-13 11:11:34 -0700335 else
336 {
337 // Handle any initial errors when the host turns on
338 initializeErrorState();
Jason M. Bills5245ed62020-12-04 16:50:21 -0800339 error_monitors::sendHostOn();
Jason M. Billse94f5e12019-09-13 11:11:34 -0700340 }
Jason M. Bills1490b142019-07-01 15:48:43 -0700341 });
342}
343
344static bool requestGPIOEvents(
345 const std::string& name, const std::function<void()>& handler,
346 gpiod::line& gpioLine,
347 boost::asio::posix::stream_descriptor& gpioEventDescriptor)
348{
349 // Find the GPIO line
350 gpioLine = gpiod::find_line(name);
351 if (!gpioLine)
352 {
353 std::cerr << "Failed to find the " << name << " line\n";
354 return false;
355 }
356
357 try
358 {
359 gpioLine.request(
360 {"host-error-monitor", gpiod::line_request::EVENT_BOTH_EDGES});
361 }
362 catch (std::exception&)
363 {
364 std::cerr << "Failed to request events for " << name << "\n";
365 return false;
366 }
367
368 int gpioLineFd = gpioLine.event_get_fd();
369 if (gpioLineFd < 0)
370 {
371 std::cerr << "Failed to get " << name << " fd\n";
372 return false;
373 }
374
375 gpioEventDescriptor.assign(gpioLineFd);
376
377 gpioEventDescriptor.async_wait(
378 boost::asio::posix::stream_descriptor::wait_read,
379 [&name, handler](const boost::system::error_code ec) {
380 if (ec)
381 {
382 std::cerr << name << " fd handler error: " << ec.message()
383 << "\n";
384 return;
385 }
386 handler();
387 });
388 return true;
389}
390
Jason M. Bills45e87e02019-09-09 14:45:38 -0700391static bool requestGPIOInput(const std::string& name, gpiod::line& gpioLine)
392{
393 // Find the GPIO line
394 gpioLine = gpiod::find_line(name);
395 if (!gpioLine)
396 {
397 std::cerr << "Failed to find the " << name << " line.\n";
398 return false;
399 }
400
401 // Request GPIO input
402 try
403 {
404 gpioLine.request({__FUNCTION__, gpiod::line_request::DIRECTION_INPUT});
405 }
406 catch (std::exception&)
407 {
408 std::cerr << "Failed to request " << name << " input\n";
409 return false;
410 }
411
412 return true;
413}
414
Jason M. Billsd1a19f62019-08-06 11:52:58 -0700415static void incrementCPUErrorCount(int cpuNum)
416{
417 std::string propertyName = "ErrorCountCPU" + std::to_string(cpuNum + 1);
418
419 // Get the current count
420 conn->async_method_call(
421 [propertyName](boost::system::error_code ec,
422 const std::variant<uint8_t>& property) {
423 if (ec)
424 {
425 std::cerr << "Failed to read " << propertyName << ": "
426 << ec.message() << "\n";
427 return;
428 }
429 const uint8_t* errorCountVariant = std::get_if<uint8_t>(&property);
430 if (errorCountVariant == nullptr)
431 {
432 std::cerr << propertyName << " invalid\n";
433 return;
434 }
435 uint8_t errorCount = *errorCountVariant;
436 if (errorCount == std::numeric_limits<uint8_t>::max())
437 {
438 std::cerr << "Maximum error count reached\n";
439 return;
440 }
441 // Increment the count
442 errorCount++;
443 conn->async_method_call(
444 [propertyName](boost::system::error_code ec) {
445 if (ec)
446 {
447 std::cerr << "Failed to set " << propertyName << ": "
448 << ec.message() << "\n";
449 }
450 },
451 "xyz.openbmc_project.Settings",
452 "/xyz/openbmc_project/control/processor_error_config",
453 "org.freedesktop.DBus.Properties", "Set",
454 "xyz.openbmc_project.Control.Processor.ErrConfig", propertyName,
455 std::variant<uint8_t>{errorCount});
456 },
457 "xyz.openbmc_project.Settings",
458 "/xyz/openbmc_project/control/processor_error_config",
459 "org.freedesktop.DBus.Properties", "Get",
460 "xyz.openbmc_project.Control.Processor.ErrConfig", propertyName);
461}
462
Jason M. Billsa3397932019-08-06 11:07:21 -0700463static bool checkIERRCPUs()
464{
465 bool cpuIERRFound = false;
Jason M. Bills8064f8b2020-09-22 15:09:49 -0700466 for (size_t cpu = 0, addr = MIN_CLIENT_ADDR; addr <= MAX_CLIENT_ADDR;
Jason M. Billsa3397932019-08-06 11:07:21 -0700467 cpu++, addr++)
468 {
Jason M. Bills8064f8b2020-09-22 15:09:49 -0700469 EPECIStatus peciStatus = PECI_CC_SUCCESS;
Jason M. Billsa3397932019-08-06 11:07:21 -0700470 uint8_t cc = 0;
471 CPUModel model{};
472 uint8_t stepping = 0;
473 if (peci_GetCPUID(addr, &model, &stepping, &cc) != PECI_CC_SUCCESS)
474 {
475 std::cerr << "Cannot get CPUID!\n";
476 continue;
477 }
478
479 switch (model)
480 {
481 case skx:
482 {
483 // First check the MCA_ERR_SRC_LOG to see if this is the CPU
484 // that caused the IERR
485 uint32_t mcaErrSrcLog = 0;
Jason M. Bills8064f8b2020-09-22 15:09:49 -0700486 peciStatus = peci_RdPkgConfig(addr, 0, 5, 4,
487 (uint8_t*)&mcaErrSrcLog, &cc);
488 if (peciError(peciStatus, cc))
Jason M. Billsa3397932019-08-06 11:07:21 -0700489 {
Jason M. Bills8064f8b2020-09-22 15:09:49 -0700490 printPECIError("MCA_ERR_SRC_LOG", addr, peciStatus, cc);
Jason M. Billsa3397932019-08-06 11:07:21 -0700491 continue;
492 }
493 // Check MSMI_INTERNAL (20) and IERR_INTERNAL (27)
494 if ((mcaErrSrcLog & (1 << 20)) || (mcaErrSrcLog & (1 << 27)))
495 {
496 // TODO: Light the CPU fault LED?
497 cpuIERRFound = true;
Jason M. Billsd1a19f62019-08-06 11:52:58 -0700498 incrementCPUErrorCount(cpu);
Jason M. Billsa3397932019-08-06 11:07:21 -0700499 // Next check if it's a CPU/VR mismatch by reading the
500 // IA32_MC4_STATUS MSR (0x411)
501 uint64_t mc4Status = 0;
Jason M. Bills8064f8b2020-09-22 15:09:49 -0700502 peciStatus = peci_RdIAMSR(addr, 0, 0x411, &mc4Status, &cc);
503 if (peciError(peciStatus, cc))
Jason M. Billsa3397932019-08-06 11:07:21 -0700504 {
Jason M. Bills8064f8b2020-09-22 15:09:49 -0700505 printPECIError("IA32_MC4_STATUS", addr, peciStatus, cc);
Jason M. Billsa3397932019-08-06 11:07:21 -0700506 continue;
507 }
508 // Check MSEC bits 31:24 for
509 // MCA_SVID_VCCIN_VR_ICC_MAX_FAILURE (0x40),
510 // MCA_SVID_VCCIN_VR_VOUT_FAILURE (0x42), or
511 // MCA_SVID_CPU_VR_CAPABILITY_ERROR (0x43)
Jason M. Billsc90570a2020-09-22 15:24:58 -0700512 uint64_t msec = (mc4Status >> 24) & 0xFF;
513 if (msec == 0x40 || msec == 0x42 || msec == 0x43)
Jason M. Billsa3397932019-08-06 11:07:21 -0700514 {
515 cpuIERRLog(cpu, "CPU/VR Mismatch");
516 continue;
517 }
518
519 // Next check if it's a Core FIVR fault by looking for a
520 // non-zero value of CORE_FIVR_ERR_LOG (B(1) D30 F2 offset
521 // 80h)
522 uint32_t coreFIVRErrLog = 0;
Jason M. Bills8064f8b2020-09-22 15:09:49 -0700523 peciStatus = peci_RdPCIConfigLocal(
524 addr, 1, 30, 2, 0x80, sizeof(uint32_t),
525 (uint8_t*)&coreFIVRErrLog, &cc);
526 if (peciError(peciStatus, cc))
Jason M. Billsa3397932019-08-06 11:07:21 -0700527 {
Jason M. Bills8064f8b2020-09-22 15:09:49 -0700528 printPECIError("CORE_FIVR_ERR_LOG", addr, peciStatus,
529 cc);
Jason M. Billsa3397932019-08-06 11:07:21 -0700530 continue;
531 }
532 if (coreFIVRErrLog)
533 {
534 cpuIERRLog(cpu, "Core FIVR Fault");
535 continue;
536 }
537
538 // Next check if it's an Uncore FIVR fault by looking for a
539 // non-zero value of UNCORE_FIVR_ERR_LOG (B(1) D30 F2 offset
540 // 84h)
541 uint32_t uncoreFIVRErrLog = 0;
Jason M. Bills8064f8b2020-09-22 15:09:49 -0700542 peciStatus = peci_RdPCIConfigLocal(
543 addr, 1, 30, 2, 0x84, sizeof(uint32_t),
544 (uint8_t*)&uncoreFIVRErrLog, &cc);
545 if (peciError(peciStatus, cc))
Jason M. Billsa3397932019-08-06 11:07:21 -0700546 {
Jason M. Bills8064f8b2020-09-22 15:09:49 -0700547 printPECIError("UNCORE_FIVR_ERR_LOG", addr, peciStatus,
548 cc);
Jason M. Billsa3397932019-08-06 11:07:21 -0700549 continue;
550 }
551 if (uncoreFIVRErrLog)
552 {
553 cpuIERRLog(cpu, "Uncore FIVR Fault");
554 continue;
555 }
556
557 // Last if CORE_FIVR_ERR_LOG and UNCORE_FIVR_ERR_LOG are
558 // both zero, but MSEC bits 31:24 have either
559 // MCA_FIVR_CATAS_OVERVOL_FAULT (0x51) or
560 // MCA_FIVR_CATAS_OVERCUR_FAULT (0x52), then log it as an
561 // uncore FIVR fault
562 if (!coreFIVRErrLog && !uncoreFIVRErrLog &&
Jason M. Billsc90570a2020-09-22 15:24:58 -0700563 (msec == 0x51 || msec == 0x52))
Jason M. Billsa3397932019-08-06 11:07:21 -0700564 {
565 cpuIERRLog(cpu, "Uncore FIVR Fault");
566 continue;
567 }
568 cpuIERRLog(cpu);
569 }
570 break;
571 }
572 case icx:
573 {
574 // First check the MCA_ERR_SRC_LOG to see if this is the CPU
575 // that caused the IERR
576 uint32_t mcaErrSrcLog = 0;
Jason M. Bills8064f8b2020-09-22 15:09:49 -0700577 peciStatus = peci_RdPkgConfig(addr, 0, 5, 4,
578 (uint8_t*)&mcaErrSrcLog, &cc);
579 if (peciError(peciStatus, cc))
Jason M. Billsa3397932019-08-06 11:07:21 -0700580 {
Jason M. Bills8064f8b2020-09-22 15:09:49 -0700581 printPECIError("MCA_ERR_SRC_LOG", addr, peciStatus, cc);
Jason M. Billsa3397932019-08-06 11:07:21 -0700582 continue;
583 }
584 // Check MSMI_INTERNAL (20) and IERR_INTERNAL (27)
585 if ((mcaErrSrcLog & (1 << 20)) || (mcaErrSrcLog & (1 << 27)))
586 {
587 // TODO: Light the CPU fault LED?
588 cpuIERRFound = true;
Jason M. Billsd1a19f62019-08-06 11:52:58 -0700589 incrementCPUErrorCount(cpu);
Jason M. Billsa3397932019-08-06 11:07:21 -0700590 // Next check if it's a CPU/VR mismatch by reading the
591 // IA32_MC4_STATUS MSR (0x411)
592 uint64_t mc4Status = 0;
Jason M. Bills8064f8b2020-09-22 15:09:49 -0700593 peciStatus = peci_RdIAMSR(addr, 0, 0x411, &mc4Status, &cc);
594 if (peciError(peciStatus, cc))
Jason M. Billsa3397932019-08-06 11:07:21 -0700595 {
Jason M. Bills8064f8b2020-09-22 15:09:49 -0700596 printPECIError("IA32_MC4_STATUS", addr, peciStatus, cc);
Jason M. Billsa3397932019-08-06 11:07:21 -0700597 continue;
598 }
Jason M. Billsa3397932019-08-06 11:07:21 -0700599 // Check MSEC bits 31:24 for
600 // MCA_SVID_VCCIN_VR_ICC_MAX_FAILURE (0x40),
601 // MCA_SVID_VCCIN_VR_VOUT_FAILURE (0x42), or
602 // MCA_SVID_CPU_VR_CAPABILITY_ERROR (0x43)
Jason M. Billsc90570a2020-09-22 15:24:58 -0700603 uint64_t msec = (mc4Status >> 24) & 0xFF;
604 if (msec == 0x40 || msec == 0x42 || msec == 0x43)
Jason M. Billsa3397932019-08-06 11:07:21 -0700605 {
606 cpuIERRLog(cpu, "CPU/VR Mismatch");
607 continue;
608 }
609
610 // Next check if it's a Core FIVR fault by looking for a
611 // non-zero value of CORE_FIVR_ERR_LOG (B(31) D30 F2 offsets
612 // C0h and C4h) (Note: Bus 31 is accessed on PECI as bus 14)
613 uint32_t coreFIVRErrLog0 = 0;
614 uint32_t coreFIVRErrLog1 = 0;
Jason M. Bills8064f8b2020-09-22 15:09:49 -0700615 peciStatus = peci_RdEndPointConfigPciLocal(
616 addr, 0, 14, 30, 2, 0xC0, sizeof(uint32_t),
617 (uint8_t*)&coreFIVRErrLog0, &cc);
618 if (peciError(peciStatus, cc))
Jason M. Billsa3397932019-08-06 11:07:21 -0700619 {
Jason M. Bills8064f8b2020-09-22 15:09:49 -0700620 printPECIError("CORE_FIVR_ERR_LOG_0", addr, peciStatus,
621 cc);
Jason M. Billsa3397932019-08-06 11:07:21 -0700622 continue;
623 }
Jason M. Bills8064f8b2020-09-22 15:09:49 -0700624 peciStatus = peci_RdEndPointConfigPciLocal(
625 addr, 0, 14, 30, 2, 0xC4, sizeof(uint32_t),
626 (uint8_t*)&coreFIVRErrLog1, &cc);
627 if (peciError(peciStatus, cc))
Jason M. Billsa3397932019-08-06 11:07:21 -0700628 {
Jason M. Bills8064f8b2020-09-22 15:09:49 -0700629 printPECIError("CORE_FIVR_ERR_LOG_1", addr, peciStatus,
630 cc);
Jason M. Billsa3397932019-08-06 11:07:21 -0700631 continue;
632 }
633 if (coreFIVRErrLog0 || coreFIVRErrLog1)
634 {
635 cpuIERRLog(cpu, "Core FIVR Fault");
636 continue;
637 }
638
639 // Next check if it's an Uncore FIVR fault by looking for a
640 // non-zero value of UNCORE_FIVR_ERR_LOG (B(31) D30 F2
641 // offset 84h) (Note: Bus 31 is accessed on PECI as bus 14)
642 uint32_t uncoreFIVRErrLog = 0;
Jason M. Bills8064f8b2020-09-22 15:09:49 -0700643 peciStatus = peci_RdEndPointConfigPciLocal(
644 addr, 0, 14, 30, 2, 0x84, sizeof(uint32_t),
645 (uint8_t*)&uncoreFIVRErrLog, &cc);
646 if (peciError(peciStatus, cc))
Jason M. Billsa3397932019-08-06 11:07:21 -0700647 {
Jason M. Bills8064f8b2020-09-22 15:09:49 -0700648 printPECIError("UNCORE_FIVR_ERR_LOG", addr, peciStatus,
649 cc);
Jason M. Billsa3397932019-08-06 11:07:21 -0700650 continue;
651 }
652 if (uncoreFIVRErrLog)
653 {
654 cpuIERRLog(cpu, "Uncore FIVR Fault");
655 continue;
656 }
657
658 // TODO: Update MSEC/MSCOD_31_24 check
659 // Last if CORE_FIVR_ERR_LOG and UNCORE_FIVR_ERR_LOG are
660 // both zero, but MSEC bits 31:24 have either
661 // MCA_FIVR_CATAS_OVERVOL_FAULT (0x51) or
662 // MCA_FIVR_CATAS_OVERCUR_FAULT (0x52), then log it as an
663 // uncore FIVR fault
664 if (!coreFIVRErrLog0 && !coreFIVRErrLog1 &&
Jason M. Billsc90570a2020-09-22 15:24:58 -0700665 !uncoreFIVRErrLog && (msec == 0x51 || msec == 0x52))
Jason M. Billsa3397932019-08-06 11:07:21 -0700666 {
667 cpuIERRLog(cpu, "Uncore FIVR Fault");
668 continue;
669 }
670 cpuIERRLog(cpu);
671 }
672 break;
673 }
674 }
675 }
676 return cpuIERRFound;
677}
678
Jason M. Billsa15c2522019-08-16 10:01:44 -0700679static void caterrAssertHandler()
680{
Jason M. Billsa15c2522019-08-16 10:01:44 -0700681 caterrAssertTimer.expires_after(std::chrono::milliseconds(caterrTimeoutMs));
682 caterrAssertTimer.async_wait([](const boost::system::error_code ec) {
683 if (ec)
684 {
685 // operation_aborted is expected if timer is canceled
686 // before completion.
687 if (ec != boost::asio::error::operation_aborted)
688 {
689 std::cerr << "caterr timeout async_wait failed: "
690 << ec.message() << "\n";
691 }
Jason M. Billsa15c2522019-08-16 10:01:44 -0700692 return;
693 }
Jason M. Billsa3397932019-08-06 11:07:21 -0700694 std::cerr << "CATERR asserted for " << std::to_string(caterrTimeoutMs)
695 << " ms\n";
Yong Li8c798c72020-04-22 15:29:07 +0800696 beep(beepCPUIERR);
Jason M. Billsa3397932019-08-06 11:07:21 -0700697 if (!checkIERRCPUs())
698 {
699 cpuIERRLog();
700 }
Jason M. Billsa15c2522019-08-16 10:01:44 -0700701 conn->async_method_call(
702 [](boost::system::error_code ec,
703 const std::variant<bool>& property) {
704 if (ec)
705 {
706 return;
707 }
708 const bool* reset = std::get_if<bool>(&property);
709 if (reset == nullptr)
710 {
711 std::cerr << "Unable to read reset on CATERR value\n";
712 return;
713 }
Jason M. Billsd711cc82020-12-04 16:46:39 -0800714 startCrashdumpAndRecovery(conn, *reset, "IERR");
Jason M. Billsa15c2522019-08-16 10:01:44 -0700715 },
716 "xyz.openbmc_project.Settings",
717 "/xyz/openbmc_project/control/processor_error_config",
718 "org.freedesktop.DBus.Properties", "Get",
719 "xyz.openbmc_project.Control.Processor.ErrConfig", "ResetOnCATERR");
720 });
721}
722
Jason M. Bills1490b142019-07-01 15:48:43 -0700723static void caterrHandler()
724{
725 if (!hostOff)
726 {
727 gpiod::line_event gpioLineEvent = caterrLine.event_read();
728
729 bool caterr =
730 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
Yong Li1429ca82020-04-27 16:49:45 +0800731
732 std::vector<Association> associations;
Jason M. Bills1490b142019-07-01 15:48:43 -0700733 if (caterr)
734 {
Jason M. Billsa15c2522019-08-16 10:01:44 -0700735 caterrAssertHandler();
Yong Li1429ca82020-04-27 16:49:45 +0800736 associations.emplace_back(
737 "", "critical",
738 "/xyz/openbmc_project/host_error_monitor/cat_error");
739 associations.emplace_back("", "critical",
740 host_error_monitor::rootPath);
Jason M. Bills1490b142019-07-01 15:48:43 -0700741 }
742 else
743 {
744 caterrAssertTimer.cancel();
Yong Li1429ca82020-04-27 16:49:45 +0800745 associations.emplace_back("", "", "");
Jason M. Bills1490b142019-07-01 15:48:43 -0700746 }
Yong Li1429ca82020-04-27 16:49:45 +0800747 host_error_monitor::associationCATAssert->set_property("Associations",
748 associations);
Jason M. Bills1490b142019-07-01 15:48:43 -0700749 }
750 caterrEvent.async_wait(boost::asio::posix::stream_descriptor::wait_read,
751 [](const boost::system::error_code ec) {
752 if (ec)
753 {
754 std::cerr << "caterr handler error: "
755 << ec.message() << "\n";
756 return;
757 }
758 caterrHandler();
759 });
760}
Jason M. Bills78c5eed2019-08-28 14:00:40 -0700761
Jason M. Billse94f5e12019-09-13 11:11:34 -0700762static void cpu1ThermtripAssertHandler()
763{
Jason M. Bills45e87e02019-09-09 14:45:38 -0700764 if (cpu1FIVRFaultLine.get_value() == 0)
765 {
766 cpuBootFIVRFaultLog(1);
767 }
768 else
769 {
770 cpuThermTripLog(1);
771 }
Jason M. Billse94f5e12019-09-13 11:11:34 -0700772}
773
Jason M. Bills78c5eed2019-08-28 14:00:40 -0700774static void cpu1ThermtripHandler()
775{
Jason M. Bills84951142020-04-17 15:57:11 -0700776 gpiod::line_event gpioLineEvent = cpu1ThermtripLine.event_read();
Jason M. Bills78c5eed2019-08-28 14:00:40 -0700777
Jason M. Bills84951142020-04-17 15:57:11 -0700778 bool cpu1Thermtrip =
779 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
780 if (cpu1Thermtrip)
781 {
782 cpu1ThermtripAssertHandler();
Jason M. Bills78c5eed2019-08-28 14:00:40 -0700783 }
Jason M. Bills84951142020-04-17 15:57:11 -0700784
Jason M. Bills78c5eed2019-08-28 14:00:40 -0700785 cpu1ThermtripEvent.async_wait(
786 boost::asio::posix::stream_descriptor::wait_read,
787 [](const boost::system::error_code ec) {
788 if (ec)
789 {
790 std::cerr << "CPU 1 Thermtrip handler error: " << ec.message()
791 << "\n";
792 return;
793 }
794 cpu1ThermtripHandler();
795 });
796}
797
jayaprakash Mutyala009adbc2019-12-24 22:08:07 +0000798static void cpu1MemtripHandler()
799{
Jason M. Bills5287c022020-05-19 11:16:09 -0700800 gpiod::line_event gpioLineEvent = cpu1MemtripLine.event_read();
jayaprakash Mutyala009adbc2019-12-24 22:08:07 +0000801
Jason M. Bills5287c022020-05-19 11:16:09 -0700802 bool cpu1Memtrip =
803 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
804 if (cpu1Memtrip)
805 {
806 memThermTripLog(1);
jayaprakash Mutyala009adbc2019-12-24 22:08:07 +0000807 }
Jason M. Bills5287c022020-05-19 11:16:09 -0700808
jayaprakash Mutyala009adbc2019-12-24 22:08:07 +0000809 cpu1MemtripEvent.async_wait(
810 boost::asio::posix::stream_descriptor::wait_read,
811 [](const boost::system::error_code ec) {
812 if (ec)
813 {
814 std::cerr << "CPU 1 Memory Thermaltrip handler error: "
815 << ec.message() << "\n";
816 return;
817 }
818 cpu1MemtripHandler();
819 });
820}
821
Jason M. Billse94f5e12019-09-13 11:11:34 -0700822static void cpu2ThermtripAssertHandler()
823{
Jason M. Bills45e87e02019-09-09 14:45:38 -0700824 if (cpu2FIVRFaultLine.get_value() == 0)
825 {
826 cpuBootFIVRFaultLog(2);
827 }
828 else
829 {
830 cpuThermTripLog(2);
831 }
Jason M. Billse94f5e12019-09-13 11:11:34 -0700832}
833
Jason M. Bills78c5eed2019-08-28 14:00:40 -0700834static void cpu2ThermtripHandler()
835{
Jason M. Bills84951142020-04-17 15:57:11 -0700836 gpiod::line_event gpioLineEvent = cpu2ThermtripLine.event_read();
Jason M. Bills78c5eed2019-08-28 14:00:40 -0700837
Jason M. Bills84951142020-04-17 15:57:11 -0700838 bool cpu2Thermtrip =
839 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
840 if (cpu2Thermtrip)
841 {
842 cpu2ThermtripAssertHandler();
Jason M. Bills78c5eed2019-08-28 14:00:40 -0700843 }
Jason M. Bills84951142020-04-17 15:57:11 -0700844
Jason M. Bills78c5eed2019-08-28 14:00:40 -0700845 cpu2ThermtripEvent.async_wait(
846 boost::asio::posix::stream_descriptor::wait_read,
847 [](const boost::system::error_code ec) {
848 if (ec)
849 {
850 std::cerr << "CPU 2 Thermtrip handler error: " << ec.message()
851 << "\n";
852 return;
853 }
854 cpu2ThermtripHandler();
855 });
856}
857
jayaprakash Mutyala009adbc2019-12-24 22:08:07 +0000858static void cpu2MemtripHandler()
859{
Jason M. Bills5287c022020-05-19 11:16:09 -0700860 gpiod::line_event gpioLineEvent = cpu2MemtripLine.event_read();
jayaprakash Mutyala009adbc2019-12-24 22:08:07 +0000861
Jason M. Bills5287c022020-05-19 11:16:09 -0700862 bool cpu2Memtrip =
863 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
864 if (cpu2Memtrip)
865 {
866 memThermTripLog(2);
jayaprakash Mutyala009adbc2019-12-24 22:08:07 +0000867 }
Jason M. Bills5287c022020-05-19 11:16:09 -0700868
jayaprakash Mutyala009adbc2019-12-24 22:08:07 +0000869 cpu2MemtripEvent.async_wait(
870 boost::asio::posix::stream_descriptor::wait_read,
871 [](const boost::system::error_code ec) {
872 if (ec)
873 {
874 std::cerr << "CPU 2 Memory Thermaltrip handler error: "
875 << ec.message() << "\n";
876 return;
877 }
878 cpu2MemtripHandler();
879 });
880}
881
Jason M. Billse94f5e12019-09-13 11:11:34 -0700882static void cpu1VRHotAssertHandler()
883{
884 cpuVRHotLog("CPU 1");
885}
886
Jason M. Bills250fa632019-08-28 15:58:25 -0700887static void cpu1VRHotHandler()
888{
Jason M. Bills84951142020-04-17 15:57:11 -0700889 gpiod::line_event gpioLineEvent = cpu1VRHotLine.event_read();
Jason M. Bills250fa632019-08-28 15:58:25 -0700890
Jason M. Bills84951142020-04-17 15:57:11 -0700891 bool cpu1VRHot =
892 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
893 if (cpu1VRHot)
894 {
895 cpu1VRHotAssertHandler();
Jason M. Bills250fa632019-08-28 15:58:25 -0700896 }
Jason M. Bills84951142020-04-17 15:57:11 -0700897
Jason M. Bills250fa632019-08-28 15:58:25 -0700898 cpu1VRHotEvent.async_wait(boost::asio::posix::stream_descriptor::wait_read,
899 [](const boost::system::error_code ec) {
900 if (ec)
901 {
902 std::cerr << "CPU 1 VRHot handler error: "
903 << ec.message() << "\n";
904 return;
905 }
906 cpu1VRHotHandler();
907 });
908}
909
Jason M. Billse94f5e12019-09-13 11:11:34 -0700910static void cpu1MemABCDVRHotAssertHandler()
911{
912 cpuVRHotLog("CPU 1 Memory ABCD");
913}
914
Jason M. Bills9647ba72019-08-29 14:19:19 -0700915static void cpu1MemABCDVRHotHandler()
916{
Jason M. Bills84951142020-04-17 15:57:11 -0700917 gpiod::line_event gpioLineEvent = cpu1MemABCDVRHotLine.event_read();
Jason M. Bills9647ba72019-08-29 14:19:19 -0700918
Jason M. Bills84951142020-04-17 15:57:11 -0700919 bool cpu1MemABCDVRHot =
920 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
921 if (cpu1MemABCDVRHot)
922 {
923 cpu1MemABCDVRHotAssertHandler();
Jason M. Bills9647ba72019-08-29 14:19:19 -0700924 }
Jason M. Bills84951142020-04-17 15:57:11 -0700925
Jason M. Bills9647ba72019-08-29 14:19:19 -0700926 cpu1MemABCDVRHotEvent.async_wait(
927 boost::asio::posix::stream_descriptor::wait_read,
928 [](const boost::system::error_code ec) {
929 if (ec)
930 {
931 std::cerr << "CPU 1 Memory ABCD VRHot handler error: "
932 << ec.message() << "\n";
933 return;
934 }
935 cpu1MemABCDVRHotHandler();
936 });
937}
938
Jason M. Billse94f5e12019-09-13 11:11:34 -0700939static void cpu1MemEFGHVRHotAssertHandler()
940{
941 cpuVRHotLog("CPU 1 Memory EFGH");
942}
943
Jason M. Bills9647ba72019-08-29 14:19:19 -0700944static void cpu1MemEFGHVRHotHandler()
945{
Jason M. Bills84951142020-04-17 15:57:11 -0700946 gpiod::line_event gpioLineEvent = cpu1MemEFGHVRHotLine.event_read();
Jason M. Bills9647ba72019-08-29 14:19:19 -0700947
Jason M. Bills84951142020-04-17 15:57:11 -0700948 bool cpu1MemEFGHVRHot =
949 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
950 if (cpu1MemEFGHVRHot)
951 {
952 cpu1MemEFGHVRHotAssertHandler();
Jason M. Bills9647ba72019-08-29 14:19:19 -0700953 }
Jason M. Bills84951142020-04-17 15:57:11 -0700954
Jason M. Bills9647ba72019-08-29 14:19:19 -0700955 cpu1MemEFGHVRHotEvent.async_wait(
956 boost::asio::posix::stream_descriptor::wait_read,
957 [](const boost::system::error_code ec) {
958 if (ec)
959 {
960 std::cerr << "CPU 1 Memory EFGH VRHot handler error: "
961 << ec.message() << "\n";
962 return;
963 }
964 cpu1MemEFGHVRHotHandler();
965 });
966}
967
Jason M. Billse94f5e12019-09-13 11:11:34 -0700968static void cpu2VRHotAssertHandler()
969{
970 cpuVRHotLog("CPU 2");
971}
972
Jason M. Bills250fa632019-08-28 15:58:25 -0700973static void cpu2VRHotHandler()
974{
Jason M. Bills84951142020-04-17 15:57:11 -0700975 gpiod::line_event gpioLineEvent = cpu2VRHotLine.event_read();
Jason M. Bills250fa632019-08-28 15:58:25 -0700976
Jason M. Bills84951142020-04-17 15:57:11 -0700977 bool cpu2VRHot =
978 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
979 if (cpu2VRHot)
980 {
981 cpu2VRHotAssertHandler();
Jason M. Bills250fa632019-08-28 15:58:25 -0700982 }
Jason M. Bills84951142020-04-17 15:57:11 -0700983
Jason M. Bills250fa632019-08-28 15:58:25 -0700984 cpu2VRHotEvent.async_wait(boost::asio::posix::stream_descriptor::wait_read,
985 [](const boost::system::error_code ec) {
986 if (ec)
987 {
988 std::cerr << "CPU 2 VRHot handler error: "
989 << ec.message() << "\n";
990 return;
991 }
992 cpu2VRHotHandler();
993 });
994}
995
Jason M. Billse94f5e12019-09-13 11:11:34 -0700996static void cpu2MemABCDVRHotAssertHandler()
997{
998 cpuVRHotLog("CPU 2 Memory ABCD");
999}
1000
Jason M. Bills9647ba72019-08-29 14:19:19 -07001001static void cpu2MemABCDVRHotHandler()
1002{
Jason M. Bills84951142020-04-17 15:57:11 -07001003 gpiod::line_event gpioLineEvent = cpu2MemABCDVRHotLine.event_read();
Jason M. Bills9647ba72019-08-29 14:19:19 -07001004
Jason M. Bills84951142020-04-17 15:57:11 -07001005 bool cpu2MemABCDVRHot =
1006 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
1007 if (cpu2MemABCDVRHot)
1008 {
1009 cpu2MemABCDVRHotAssertHandler();
Jason M. Bills9647ba72019-08-29 14:19:19 -07001010 }
Jason M. Bills84951142020-04-17 15:57:11 -07001011
Jason M. Bills9647ba72019-08-29 14:19:19 -07001012 cpu2MemABCDVRHotEvent.async_wait(
1013 boost::asio::posix::stream_descriptor::wait_read,
1014 [](const boost::system::error_code ec) {
1015 if (ec)
1016 {
1017 std::cerr << "CPU 2 Memory ABCD VRHot handler error: "
1018 << ec.message() << "\n";
1019 return;
1020 }
1021 cpu2MemABCDVRHotHandler();
1022 });
1023}
1024
Jason M. Billse94f5e12019-09-13 11:11:34 -07001025static void cpu2MemEFGHVRHotAssertHandler()
1026{
1027 cpuVRHotLog("CPU 2 Memory EFGH");
1028}
1029
Jason M. Bills9647ba72019-08-29 14:19:19 -07001030static void cpu2MemEFGHVRHotHandler()
1031{
Jason M. Bills84951142020-04-17 15:57:11 -07001032 gpiod::line_event gpioLineEvent = cpu2MemEFGHVRHotLine.event_read();
Jason M. Bills9647ba72019-08-29 14:19:19 -07001033
Jason M. Bills84951142020-04-17 15:57:11 -07001034 bool cpu2MemEFGHVRHot =
1035 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
1036 if (cpu2MemEFGHVRHot)
1037 {
1038 cpu2MemEFGHVRHotAssertHandler();
Jason M. Bills9647ba72019-08-29 14:19:19 -07001039 }
Jason M. Bills84951142020-04-17 15:57:11 -07001040
Jason M. Bills9647ba72019-08-29 14:19:19 -07001041 cpu2MemEFGHVRHotEvent.async_wait(
1042 boost::asio::posix::stream_descriptor::wait_read,
1043 [](const boost::system::error_code ec) {
1044 if (ec)
1045 {
1046 std::cerr << "CPU 2 Memory EFGH VRHot handler error: "
1047 << ec.message() << "\n";
1048 return;
1049 }
1050 cpu2MemEFGHVRHotHandler();
1051 });
1052}
1053
Chen Yugange6c0f1c2019-08-02 20:36:42 +08001054static void pchThermtripHandler()
1055{
Yong Li1429ca82020-04-27 16:49:45 +08001056 std::vector<Association> associations;
1057
Jason M. Bills84951142020-04-17 15:57:11 -07001058 gpiod::line_event gpioLineEvent = pchThermtripLine.event_read();
Chen Yugange6c0f1c2019-08-02 20:36:42 +08001059
Jason M. Bills84951142020-04-17 15:57:11 -07001060 bool pchThermtrip =
1061 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
1062 if (pchThermtrip)
1063 {
1064 ssbThermTripLog();
Yong Li1429ca82020-04-27 16:49:45 +08001065 associations.emplace_back(
1066 "", "critical",
1067 "/xyz/openbmc_project/host_error_monitor/ssb_thermal_trip");
1068 associations.emplace_back("", "critical", host_error_monitor::rootPath);
Chen Yugange6c0f1c2019-08-02 20:36:42 +08001069 }
Yong Li1429ca82020-04-27 16:49:45 +08001070 else
1071 {
1072 associations.emplace_back("", "", "");
1073 }
1074 host_error_monitor::associationSSBThermTrip->set_property("Associations",
1075 associations);
Jason M. Bills84951142020-04-17 15:57:11 -07001076
Chen Yugange6c0f1c2019-08-02 20:36:42 +08001077 pchThermtripEvent.async_wait(
1078 boost::asio::posix::stream_descriptor::wait_read,
1079 [](const boost::system::error_code ec) {
1080 if (ec)
1081 {
1082 std::cerr << "PCH Thermal trip handler error: " << ec.message()
1083 << "\n";
1084 return;
1085 }
1086 pchThermtripHandler();
1087 });
1088}
1089
Jason M. Billscbf78532019-08-16 15:32:11 -07001090static std::bitset<MAX_CPUS> checkERRPinCPUs(const int errPin)
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001091{
Jason M. Billscbf78532019-08-16 15:32:11 -07001092 int errPinSts = (1 << errPin);
1093 std::bitset<MAX_CPUS> errPinCPUs = 0;
Jason M. Bills8064f8b2020-09-22 15:09:49 -07001094 for (size_t cpu = 0, addr = MIN_CLIENT_ADDR; addr <= MAX_CLIENT_ADDR;
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001095 cpu++, addr++)
1096 {
1097 if (peci_Ping(addr) == PECI_CC_SUCCESS)
1098 {
Jason M. Bills8064f8b2020-09-22 15:09:49 -07001099 EPECIStatus peciStatus = PECI_CC_SUCCESS;
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001100 uint8_t cc = 0;
1101 CPUModel model{};
1102 uint8_t stepping = 0;
1103 if (peci_GetCPUID(addr, &model, &stepping, &cc) != PECI_CC_SUCCESS)
1104 {
1105 std::cerr << "Cannot get CPUID!\n";
1106 continue;
1107 }
1108
1109 switch (model)
1110 {
1111 case skx:
1112 {
1113 // Check the ERRPINSTS to see if this is the CPU that caused
Jason M. Billscbf78532019-08-16 15:32:11 -07001114 // the ERRx (B(0) D8 F0 offset 210h)
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001115 uint32_t errpinsts = 0;
Jason M. Bills8064f8b2020-09-22 15:09:49 -07001116 peciStatus = peci_RdPCIConfigLocal(
1117 addr, 0, 8, 0, 0x210, sizeof(uint32_t),
1118 (uint8_t*)&errpinsts, &cc);
1119 if (peciError(peciStatus, cc))
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001120 {
Jason M. Bills8064f8b2020-09-22 15:09:49 -07001121 printPECIError("ERRPINSTS", addr, peciStatus, cc);
1122 continue;
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001123 }
Jason M. Bills8064f8b2020-09-22 15:09:49 -07001124
1125 errPinCPUs[cpu] = (errpinsts & errPinSts) != 0;
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001126 break;
1127 }
1128 case icx:
1129 {
1130 // Check the ERRPINSTS to see if this is the CPU that caused
Jason M. Billscbf78532019-08-16 15:32:11 -07001131 // the ERRx (B(30) D0 F3 offset 274h) (Note: Bus 30 is
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001132 // accessed on PECI as bus 13)
1133 uint32_t errpinsts = 0;
Jason M. Bills8064f8b2020-09-22 15:09:49 -07001134 peciStatus = peci_RdEndPointConfigPciLocal(
1135 addr, 0, 13, 0, 3, 0x274, sizeof(uint32_t),
1136 (uint8_t*)&errpinsts, &cc);
1137 if (peciError(peciStatus, cc))
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001138 {
Jason M. Bills8064f8b2020-09-22 15:09:49 -07001139 printPECIError("ERRPINSTS", addr, peciStatus, cc);
1140 continue;
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001141 }
Jason M. Bills8064f8b2020-09-22 15:09:49 -07001142
1143 errPinCPUs[cpu] = (errpinsts & errPinSts) != 0;
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001144 break;
1145 }
1146 }
1147 }
1148 }
Jason M. Billscbf78532019-08-16 15:32:11 -07001149 return errPinCPUs;
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001150}
1151
Jason M. Billscbf78532019-08-16 15:32:11 -07001152static void errXAssertHandler(const int errPin,
1153 boost::asio::steady_timer& errXAssertTimer)
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001154{
Jason M. Billscbf78532019-08-16 15:32:11 -07001155 // ERRx status is not guaranteed through the timeout, so save which
1156 // CPUs have it asserted
1157 std::bitset<MAX_CPUS> errPinCPUs = checkERRPinCPUs(errPin);
1158 errXAssertTimer.expires_after(std::chrono::milliseconds(errTimeoutMs));
1159 errXAssertTimer.async_wait([errPin, errPinCPUs](
1160 const boost::system::error_code ec) {
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001161 if (ec)
1162 {
1163 // operation_aborted is expected if timer is canceled before
1164 // completion.
1165 if (ec != boost::asio::error::operation_aborted)
1166 {
1167 std::cerr << "err2 timeout async_wait failed: " << ec.message()
1168 << "\n";
1169 }
1170 return;
1171 }
Jason M. Billscbf78532019-08-16 15:32:11 -07001172 std::cerr << "ERR" << std::to_string(errPin) << " asserted for "
1173 << std::to_string(errTimeoutMs) << " ms\n";
1174 if (errPinCPUs.count())
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001175 {
Jason M. Billscbf78532019-08-16 15:32:11 -07001176 for (int i = 0; i < errPinCPUs.size(); i++)
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001177 {
Jason M. Billscbf78532019-08-16 15:32:11 -07001178 if (errPinCPUs[i])
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001179 {
Jason M. Billscbf78532019-08-16 15:32:11 -07001180 cpuERRXLog(errPin, i);
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001181 }
1182 }
1183 }
1184 else
1185 {
Jason M. Billscbf78532019-08-16 15:32:11 -07001186 cpuERRXLog(errPin);
1187 }
1188 });
1189}
1190
Jason M. Bills8c584392019-08-19 11:05:51 -07001191static void err0AssertHandler()
1192{
1193 // Handle the standard ERR0 detection and logging
1194 const static constexpr int err0 = 0;
1195 errXAssertHandler(err0, err0AssertTimer);
1196}
1197
1198static void err0Handler()
1199{
1200 if (!hostOff)
1201 {
1202 gpiod::line_event gpioLineEvent = err0Line.event_read();
1203
1204 bool err0 = gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
1205 if (err0)
1206 {
1207 err0AssertHandler();
1208 }
1209 else
1210 {
1211 err0AssertTimer.cancel();
1212 }
1213 }
1214 err0Event.async_wait(boost::asio::posix::stream_descriptor::wait_read,
1215 [](const boost::system::error_code ec) {
1216 if (ec)
1217 {
1218 std::cerr
1219 << "err0 handler error: " << ec.message()
1220 << "\n";
1221 return;
1222 }
1223 err0Handler();
1224 });
1225}
1226
Jason M. Bills75af3962019-08-19 11:07:17 -07001227static void err1AssertHandler()
1228{
1229 // Handle the standard ERR1 detection and logging
1230 const static constexpr int err1 = 1;
1231 errXAssertHandler(err1, err1AssertTimer);
1232}
1233
1234static void err1Handler()
1235{
1236 if (!hostOff)
1237 {
1238 gpiod::line_event gpioLineEvent = err1Line.event_read();
1239
1240 bool err1 = gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
1241 if (err1)
1242 {
1243 err1AssertHandler();
1244 }
1245 else
1246 {
1247 err1AssertTimer.cancel();
1248 }
1249 }
1250 err1Event.async_wait(boost::asio::posix::stream_descriptor::wait_read,
1251 [](const boost::system::error_code ec) {
1252 if (ec)
1253 {
1254 std::cerr
1255 << "err1 handler error: " << ec.message()
1256 << "\n";
1257 return;
1258 }
1259 err1Handler();
1260 });
1261}
1262
Jason M. Billscbf78532019-08-16 15:32:11 -07001263static void err2AssertHandler()
1264{
1265 // Handle the standard ERR2 detection and logging
1266 const static constexpr int err2 = 2;
1267 errXAssertHandler(err2, err2AssertTimer);
1268 // Also handle reset for ERR2
1269 err2AssertTimer.async_wait([](const boost::system::error_code ec) {
1270 if (ec)
1271 {
1272 // operation_aborted is expected if timer is canceled before
1273 // completion.
1274 if (ec != boost::asio::error::operation_aborted)
1275 {
1276 std::cerr << "err2 timeout async_wait failed: " << ec.message()
1277 << "\n";
1278 }
1279 return;
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001280 }
1281 conn->async_method_call(
1282 [](boost::system::error_code ec,
1283 const std::variant<bool>& property) {
1284 if (ec)
1285 {
1286 return;
1287 }
1288 const bool* reset = std::get_if<bool>(&property);
1289 if (reset == nullptr)
1290 {
1291 std::cerr << "Unable to read reset on ERR2 value\n";
1292 return;
1293 }
Jason M. Billsd711cc82020-12-04 16:46:39 -08001294 startCrashdumpAndRecovery(conn, *reset, "ERR2 Timeout");
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001295 },
1296 "xyz.openbmc_project.Settings",
1297 "/xyz/openbmc_project/control/processor_error_config",
1298 "org.freedesktop.DBus.Properties", "Get",
1299 "xyz.openbmc_project.Control.Processor.ErrConfig", "ResetOnERR2");
Yong Li061eb032020-02-26 15:06:18 +08001300
1301 beep(beepCPUErr2);
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001302 });
1303}
1304
1305static void err2Handler()
1306{
1307 if (!hostOff)
1308 {
1309 gpiod::line_event gpioLineEvent = err2Line.event_read();
1310
1311 bool err2 = gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
1312 if (err2)
1313 {
1314 err2AssertHandler();
1315 }
1316 else
1317 {
1318 err2AssertTimer.cancel();
1319 }
1320 }
1321 err2Event.async_wait(boost::asio::posix::stream_descriptor::wait_read,
1322 [](const boost::system::error_code ec) {
1323 if (ec)
1324 {
1325 std::cerr
1326 << "err2 handler error: " << ec.message()
1327 << "\n";
1328 return;
1329 }
1330 err2Handler();
1331 });
1332}
1333
Jason M. Bills89922f82019-08-06 11:10:02 -07001334static void smiAssertHandler()
1335{
1336 smiAssertTimer.expires_after(std::chrono::milliseconds(smiTimeoutMs));
1337 smiAssertTimer.async_wait([](const boost::system::error_code ec) {
1338 if (ec)
1339 {
1340 // operation_aborted is expected if timer is canceled before
1341 // completion.
1342 if (ec != boost::asio::error::operation_aborted)
1343 {
1344 std::cerr << "smi timeout async_wait failed: " << ec.message()
1345 << "\n";
1346 }
1347 return;
1348 }
1349 std::cerr << "SMI asserted for " << std::to_string(smiTimeoutMs)
1350 << " ms\n";
1351 smiTimeoutLog();
1352 conn->async_method_call(
1353 [](boost::system::error_code ec,
1354 const std::variant<bool>& property) {
1355 if (ec)
1356 {
1357 return;
1358 }
1359 const bool* reset = std::get_if<bool>(&property);
1360 if (reset == nullptr)
1361 {
1362 std::cerr << "Unable to read reset on SMI value\n";
1363 return;
1364 }
Jason M. Bills94785442020-01-07 15:22:09 -08001365#ifdef HOST_ERROR_CRASHDUMP_ON_SMI_TIMEOUT
Jason M. Billsb61766b2019-11-26 17:02:44 -08001366 startCrashdumpAndRecovery(*reset, "SMI Timeout");
Jason M. Bills94785442020-01-07 15:22:09 -08001367#else
1368 if (*reset)
1369 {
Jason M. Billsd69549b2020-08-27 11:42:43 -07001370 std::cerr << "Recovering the system\n";
Jason M. Billsd711cc82020-12-04 16:46:39 -08001371 startWarmReset(conn);
Jason M. Bills94785442020-01-07 15:22:09 -08001372 }
1373#endif
Jason M. Bills89922f82019-08-06 11:10:02 -07001374 },
1375 "xyz.openbmc_project.Settings",
1376 "/xyz/openbmc_project/control/bmc_reset_disables",
1377 "org.freedesktop.DBus.Properties", "Get",
1378 "xyz.openbmc_project.Control.ResetDisables", "ResetOnSMI");
1379 });
1380}
1381
1382static void smiHandler()
1383{
1384 if (!hostOff)
1385 {
1386 gpiod::line_event gpioLineEvent = smiLine.event_read();
1387
1388 bool smi = gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
1389 if (smi)
1390 {
1391 smiAssertHandler();
1392 }
1393 else
1394 {
1395 smiAssertTimer.cancel();
1396 }
1397 }
1398 smiEvent.async_wait(boost::asio::posix::stream_descriptor::wait_read,
1399 [](const boost::system::error_code ec) {
1400 if (ec)
1401 {
1402 std::cerr
1403 << "smi handler error: " << ec.message()
1404 << "\n";
1405 return;
1406 }
1407 smiHandler();
1408 });
1409}
1410
Jason M. Billsa15c2522019-08-16 10:01:44 -07001411static void initializeErrorState()
1412{
jayaprakash Mutyala53099c42020-03-15 00:16:26 +00001413 // Handle CPU1_MISMATCH if it's asserted now
1414 if (cpu1MismatchLine.get_value() == 1)
1415 {
1416 cpuMismatchLog(1);
1417 }
1418
1419 // Handle CPU2_MISMATCH if it's asserted now
1420 if (cpu2MismatchLine.get_value() == 1)
1421 {
1422 cpuMismatchLog(2);
1423 }
1424
Jason M. Billsa15c2522019-08-16 10:01:44 -07001425 // Handle CPU_CATERR if it's asserted now
1426 if (caterrLine.get_value() == 0)
1427 {
1428 caterrAssertHandler();
Yong Li1429ca82020-04-27 16:49:45 +08001429 std::vector<Association> associations;
1430 associations.emplace_back(
1431 "", "critical", "/xyz/openbmc_project/host_error_monitor/cat_err");
1432 associations.emplace_back("", "critical", host_error_monitor::rootPath);
1433 host_error_monitor::associationCATAssert->set_property("Associations",
1434 associations);
Jason M. Billsa15c2522019-08-16 10:01:44 -07001435 }
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001436
Jason M. Bills8c584392019-08-19 11:05:51 -07001437 // Handle CPU_ERR0 if it's asserted now
1438 if (err0Line.get_value() == 0)
1439 {
1440 err0AssertHandler();
1441 }
1442
Jason M. Bills75af3962019-08-19 11:07:17 -07001443 // Handle CPU_ERR1 if it's asserted now
1444 if (err1Line.get_value() == 0)
1445 {
1446 err1AssertHandler();
1447 }
1448
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001449 // Handle CPU_ERR2 if it's asserted now
1450 if (err2Line.get_value() == 0)
1451 {
1452 err2AssertHandler();
1453 }
Jason M. Bills89922f82019-08-06 11:10:02 -07001454
1455 // Handle SMI if it's asserted now
1456 if (smiLine.get_value() == 0)
1457 {
1458 smiAssertHandler();
1459 }
Jason M. Bills08866542019-08-16 12:04:19 -07001460
Jason M. Billse94f5e12019-09-13 11:11:34 -07001461 // Handle CPU1_THERMTRIP if it's asserted now
1462 if (cpu1ThermtripLine.get_value() == 0)
1463 {
1464 cpu1ThermtripAssertHandler();
1465 }
1466
1467 // Handle CPU2_THERMTRIP if it's asserted now
1468 if (cpu2ThermtripLine.get_value() == 0)
1469 {
1470 cpu2ThermtripAssertHandler();
1471 }
1472
jayaprakash Mutyala009adbc2019-12-24 22:08:07 +00001473 // Handle CPU1_MEM_THERM_EVENT (CPU1 DIMM Thermal trip) if it's asserted now
1474 if (cpu1MemtripLine.get_value() == 0)
1475 {
1476 memThermTripLog(1);
1477 }
1478
1479 // Handle CPU2_MEM_THERM_EVENT (CPU2 DIMM Thermal trip) if it's asserted now
1480 if (cpu2MemtripLine.get_value() == 0)
1481 {
1482 memThermTripLog(2);
1483 }
1484
Jason M. Billse94f5e12019-09-13 11:11:34 -07001485 // Handle CPU1_VRHOT if it's asserted now
1486 if (cpu1VRHotLine.get_value() == 0)
1487 {
1488 cpu1VRHotAssertHandler();
1489 }
1490
1491 // Handle CPU1_MEM_ABCD_VRHOT if it's asserted now
1492 if (cpu1MemABCDVRHotLine.get_value() == 0)
1493 {
1494 cpu1MemABCDVRHotAssertHandler();
1495 }
1496
1497 // Handle CPU1_MEM_EFGH_VRHOT if it's asserted now
1498 if (cpu1MemEFGHVRHotLine.get_value() == 0)
1499 {
1500 cpu1MemEFGHVRHotAssertHandler();
1501 }
1502
1503 // Handle CPU2_VRHOT if it's asserted now
1504 if (cpu2VRHotLine.get_value() == 0)
1505 {
1506 cpu2VRHotAssertHandler();
1507 }
1508
1509 // Handle CPU2_MEM_ABCD_VRHOT if it's asserted now
1510 if (cpu2MemABCDVRHotLine.get_value() == 0)
1511 {
1512 cpu2MemABCDVRHotAssertHandler();
1513 }
1514
1515 // Handle CPU2_MEM_EFGH_VRHOT if it's asserted now
1516 if (cpu2MemEFGHVRHotLine.get_value() == 0)
1517 {
1518 cpu2MemEFGHVRHotAssertHandler();
1519 }
1520
Jason M. Bills08866542019-08-16 12:04:19 -07001521 // Handle PCH_BMC_THERMTRIP if it's asserted now
1522 if (pchThermtripLine.get_value() == 0)
1523 {
1524 ssbThermTripLog();
Yong Li1429ca82020-04-27 16:49:45 +08001525 std::vector<Association> associations;
1526 associations.emplace_back(
1527 "", "critical",
1528 "/xyz/openbmc_project/host_error_monitor/ssb_thermal_trip");
1529 associations.emplace_back("", "critical", host_error_monitor::rootPath);
1530 host_error_monitor::associationSSBThermTrip->set_property(
1531 "Associations", associations);
Jason M. Bills08866542019-08-16 12:04:19 -07001532 }
Jason M. Billsa15c2522019-08-16 10:01:44 -07001533}
Jason M. Bills1490b142019-07-01 15:48:43 -07001534} // namespace host_error_monitor
1535
1536int main(int argc, char* argv[])
1537{
1538 // setup connection to dbus
1539 host_error_monitor::conn =
1540 std::make_shared<sdbusplus::asio::connection>(host_error_monitor::io);
1541
Jason M. Billsc4b91f22019-11-26 17:04:50 -08001542 // Host Error Monitor Service
Jason M. Bills1490b142019-07-01 15:48:43 -07001543 host_error_monitor::conn->request_name(
1544 "xyz.openbmc_project.HostErrorMonitor");
1545 sdbusplus::asio::object_server server =
1546 sdbusplus::asio::object_server(host_error_monitor::conn);
1547
Yong Li1429ca82020-04-27 16:49:45 +08001548 // Associations interface for led status
1549 std::vector<host_error_monitor::Association> associations;
1550 associations.emplace_back("", "", "");
1551 host_error_monitor::associationSSBThermTrip = server.add_interface(
1552 "/xyz/openbmc_project/host_error_monitor/ssb_thermal_trip",
1553 "xyz.openbmc_project.Association.Definitions");
1554 host_error_monitor::associationSSBThermTrip->register_property(
1555 "Associations", associations);
1556 host_error_monitor::associationSSBThermTrip->initialize();
1557
1558 host_error_monitor::associationCATAssert = server.add_interface(
1559 "/xyz/openbmc_project/host_error_monitor/cat_assert",
1560 "xyz.openbmc_project.Association.Definitions");
1561 host_error_monitor::associationCATAssert->register_property("Associations",
1562 associations);
1563 host_error_monitor::associationCATAssert->initialize();
1564
Jason M. Billsc4b91f22019-11-26 17:04:50 -08001565 // Restart Cause Interface
1566 host_error_monitor::hostErrorTimeoutIface =
1567 server.add_interface("/xyz/openbmc_project/host_error_monitor",
1568 "xyz.openbmc_project.HostErrorMonitor.Timeout");
1569
1570 host_error_monitor::hostErrorTimeoutIface->register_property(
1571 "IERRTimeoutMs", host_error_monitor::caterrTimeoutMs,
1572 [](const std::size_t& requested, std::size_t& resp) {
1573 if (requested > host_error_monitor::caterrTimeoutMsMax)
1574 {
1575 std::cerr << "IERRTimeoutMs update to " << requested
1576 << "ms rejected. Cannot be greater than "
1577 << host_error_monitor::caterrTimeoutMsMax << "ms.\n";
1578 return 0;
1579 }
1580 std::cerr << "IERRTimeoutMs updated to " << requested << "ms\n";
1581 host_error_monitor::caterrTimeoutMs = requested;
1582 resp = requested;
1583 return 1;
1584 },
1585 [](std::size_t& resp) { return host_error_monitor::caterrTimeoutMs; });
1586 host_error_monitor::hostErrorTimeoutIface->initialize();
1587
Jason M. Bills1490b142019-07-01 15:48:43 -07001588 // Start tracking host state
1589 std::shared_ptr<sdbusplus::bus::match::match> hostStateMonitor =
1590 host_error_monitor::startHostStateMonitor();
1591
jayaprakash Mutyala53099c42020-03-15 00:16:26 +00001592 // Request CPU1_MISMATCH GPIO events
1593 if (!host_error_monitor::requestGPIOInput(
1594 "CPU1_MISMATCH", host_error_monitor::cpu1MismatchLine))
1595 {
1596 return -1;
1597 }
1598
1599 // Request CPU2_MISMATCH GPIO events
1600 if (!host_error_monitor::requestGPIOInput(
1601 "CPU2_MISMATCH", host_error_monitor::cpu2MismatchLine))
1602 {
1603 return -1;
1604 }
1605
Jason M. Bills1490b142019-07-01 15:48:43 -07001606 // Request CPU_CATERR GPIO events
1607 if (!host_error_monitor::requestGPIOEvents(
1608 "CPU_CATERR", host_error_monitor::caterrHandler,
1609 host_error_monitor::caterrLine, host_error_monitor::caterrEvent))
1610 {
1611 return -1;
1612 }
1613
Jason M. Bills8c584392019-08-19 11:05:51 -07001614 // Request CPU_ERR0 GPIO events
1615 if (!host_error_monitor::requestGPIOEvents(
1616 "CPU_ERR0", host_error_monitor::err0Handler,
1617 host_error_monitor::err0Line, host_error_monitor::err0Event))
1618 {
1619 return -1;
1620 }
1621
Jason M. Bills75af3962019-08-19 11:07:17 -07001622 // Request CPU_ERR1 GPIO events
1623 if (!host_error_monitor::requestGPIOEvents(
1624 "CPU_ERR1", host_error_monitor::err1Handler,
1625 host_error_monitor::err1Line, host_error_monitor::err1Event))
1626 {
1627 return -1;
1628 }
1629
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001630 // Request CPU_ERR2 GPIO events
1631 if (!host_error_monitor::requestGPIOEvents(
1632 "CPU_ERR2", host_error_monitor::err2Handler,
1633 host_error_monitor::err2Line, host_error_monitor::err2Event))
1634 {
1635 return -1;
1636 }
1637
Jason M. Bills89922f82019-08-06 11:10:02 -07001638 // Request SMI GPIO events
1639 if (!host_error_monitor::requestGPIOEvents(
1640 "SMI", host_error_monitor::smiHandler, host_error_monitor::smiLine,
1641 host_error_monitor::smiEvent))
1642 {
1643 return -1;
1644 }
1645
Jason M. Bills45e87e02019-09-09 14:45:38 -07001646 // Request CPU1_FIVR_FAULT GPIO input
1647 if (!host_error_monitor::requestGPIOInput(
1648 "CPU1_FIVR_FAULT", host_error_monitor::cpu1FIVRFaultLine))
1649 {
1650 return -1;
1651 }
1652
Jason M. Bills78c5eed2019-08-28 14:00:40 -07001653 // Request CPU1_THERMTRIP GPIO events
1654 if (!host_error_monitor::requestGPIOEvents(
1655 "CPU1_THERMTRIP", host_error_monitor::cpu1ThermtripHandler,
1656 host_error_monitor::cpu1ThermtripLine,
1657 host_error_monitor::cpu1ThermtripEvent))
1658 {
1659 return -1;
1660 }
1661
Jason M. Bills45e87e02019-09-09 14:45:38 -07001662 // Request CPU2_FIVR_FAULT GPIO input
1663 if (!host_error_monitor::requestGPIOInput(
1664 "CPU2_FIVR_FAULT", host_error_monitor::cpu2FIVRFaultLine))
1665 {
1666 return -1;
1667 }
1668
Jason M. Bills78c5eed2019-08-28 14:00:40 -07001669 // Request CPU2_THERMTRIP GPIO events
1670 if (!host_error_monitor::requestGPIOEvents(
1671 "CPU2_THERMTRIP", host_error_monitor::cpu2ThermtripHandler,
1672 host_error_monitor::cpu2ThermtripLine,
1673 host_error_monitor::cpu2ThermtripEvent))
1674 {
1675 return -1;
1676 }
1677
Jason M. Bills250fa632019-08-28 15:58:25 -07001678 // Request CPU1_VRHOT GPIO events
1679 if (!host_error_monitor::requestGPIOEvents(
1680 "CPU1_VRHOT", host_error_monitor::cpu1VRHotHandler,
1681 host_error_monitor::cpu1VRHotLine,
1682 host_error_monitor::cpu1VRHotEvent))
1683 {
1684 return -1;
1685 }
1686
Jason M. Bills9647ba72019-08-29 14:19:19 -07001687 // Request CPU1_MEM_ABCD_VRHOT GPIO events
1688 if (!host_error_monitor::requestGPIOEvents(
1689 "CPU1_MEM_ABCD_VRHOT", host_error_monitor::cpu1MemABCDVRHotHandler,
1690 host_error_monitor::cpu1MemABCDVRHotLine,
1691 host_error_monitor::cpu1MemABCDVRHotEvent))
1692 {
1693 return -1;
1694 }
1695
1696 // Request CPU1_MEM_EFGH_VRHOT GPIO events
1697 if (!host_error_monitor::requestGPIOEvents(
1698 "CPU1_MEM_EFGH_VRHOT", host_error_monitor::cpu1MemEFGHVRHotHandler,
1699 host_error_monitor::cpu1MemEFGHVRHotLine,
1700 host_error_monitor::cpu1MemEFGHVRHotEvent))
1701 {
1702 return -1;
1703 }
1704
Jason M. Bills250fa632019-08-28 15:58:25 -07001705 // Request CPU2_VRHOT GPIO events
1706 if (!host_error_monitor::requestGPIOEvents(
1707 "CPU2_VRHOT", host_error_monitor::cpu2VRHotHandler,
1708 host_error_monitor::cpu2VRHotLine,
1709 host_error_monitor::cpu2VRHotEvent))
1710 {
1711 return -1;
1712 }
1713
Jason M. Bills9647ba72019-08-29 14:19:19 -07001714 // Request CPU2_MEM_ABCD_VRHOT GPIO events
1715 if (!host_error_monitor::requestGPIOEvents(
1716 "CPU2_MEM_ABCD_VRHOT", host_error_monitor::cpu2MemABCDVRHotHandler,
1717 host_error_monitor::cpu2MemABCDVRHotLine,
1718 host_error_monitor::cpu2MemABCDVRHotEvent))
1719 {
1720 return -1;
1721 }
1722
1723 // Request CPU2_MEM_EFGH_VRHOT GPIO events
1724 if (!host_error_monitor::requestGPIOEvents(
1725 "CPU2_MEM_EFGH_VRHOT", host_error_monitor::cpu2MemEFGHVRHotHandler,
1726 host_error_monitor::cpu2MemEFGHVRHotLine,
1727 host_error_monitor::cpu2MemEFGHVRHotEvent))
1728 {
1729 return -1;
1730 }
1731
Chen Yugange6c0f1c2019-08-02 20:36:42 +08001732 // Request PCH_BMC_THERMTRIP GPIO events
1733 if (!host_error_monitor::requestGPIOEvents(
1734 "PCH_BMC_THERMTRIP", host_error_monitor::pchThermtripHandler,
1735 host_error_monitor::pchThermtripLine,
1736 host_error_monitor::pchThermtripEvent))
1737 {
1738 return -1;
1739 }
1740
jayaprakash Mutyala009adbc2019-12-24 22:08:07 +00001741 // Request CPU1_MEM_THERM_EVENT GPIO events
1742 if (!host_error_monitor::requestGPIOEvents(
1743 "CPU1_MEM_THERM_EVENT", host_error_monitor::cpu1MemtripHandler,
1744 host_error_monitor::cpu1MemtripLine,
1745 host_error_monitor::cpu1MemtripEvent))
1746 {
1747 return -1;
1748 }
1749
1750 // Request CPU2_MEM_THERM_EVENT GPIO events
1751 if (!host_error_monitor::requestGPIOEvents(
1752 "CPU2_MEM_THERM_EVENT", host_error_monitor::cpu2MemtripHandler,
1753 host_error_monitor::cpu2MemtripLine,
1754 host_error_monitor::cpu2MemtripEvent))
1755 {
1756 return -1;
1757 }
1758
Jason M. Bills5245ed62020-12-04 16:50:21 -08001759 // Initialize the signal monitors
1760 host_error_monitor::init();
1761
Jason M. Bills1490b142019-07-01 15:48:43 -07001762 host_error_monitor::io.run();
1763
1764 return 0;
1765}