blob: c637188e30893df81fdbe078c7d230fa9855b904 [file] [log] [blame]
/*
// Copyright (c) 2019 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
*/
Jason M. Bills6a2cb692019-08-06 11:03:49 -070016#include <peci.h>
Chen Yugange6c0f1c2019-08-02 20:36:42 +080017#include <systemd/sd-journal.h>
18
Jason M. Bills08b2c7a2020-08-28 15:39:14 -070019#include <boost/asio/io_service.hpp>
Jason M. Bills1490b142019-07-01 15:48:43 -070020#include <boost/asio/posix/stream_descriptor.hpp>
Jason M. Bills08b2c7a2020-08-28 15:39:14 -070021#include <boost/asio/steady_timer.hpp>
Jason M. Bills1490b142019-07-01 15:48:43 -070022#include <gpiod.hpp>
Jason M. Bills1490b142019-07-01 15:48:43 -070023#include <sdbusplus/asio/object_server.hpp>
Jason M. Bills48e5dff2020-06-10 13:47:47 -070024
25#include <bitset>
26#include <iostream>
Jason M. Billsd1a19f62019-08-06 11:52:58 -070027#include <variant>
Jason M. Bills1490b142019-07-01 15:48:43 -070028
29namespace host_error_monitor
30{
31static boost::asio::io_service io;
32static std::shared_ptr<sdbusplus::asio::connection> conn;
Jason M. Billsc4b91f22019-11-26 17:04:50 -080033static std::shared_ptr<sdbusplus::asio::dbus_interface> hostErrorTimeoutIface;
Jason M. Bills1490b142019-07-01 15:48:43 -070034
Yong Li1429ca82020-04-27 16:49:45 +080035using Association = std::tuple<std::string, std::string, std::string>;
36static std::shared_ptr<sdbusplus::asio::dbus_interface> associationSSBThermTrip;
37static std::shared_ptr<sdbusplus::asio::dbus_interface> associationCATAssert;
38
39static const constexpr char* rootPath = "/xyz/openbmc_project/CallbackManager";
40
// Cached host power state; true while the host is reported as Off
static bool hostOff = true;

// How long CATERR must stay asserted before it is handled, in milliseconds
static size_t caterrTimeoutMs = 2000;
// 10 minutes maximum
static constexpr size_t caterrTimeoutMsMax = 10 * 60 * 1000;
static constexpr size_t errTimeoutMs = 90 * 1000;
static constexpr size_t smiTimeoutMs = 90 * 1000;
// Time allowed for a crashdump to complete, in seconds
static constexpr size_t crashdumpTimeoutS = 5 * 60;
48
49// Timers
50// Timer for CATERR asserted
51static boost::asio::steady_timer caterrAssertTimer(io);
Jason M. Bills8c584392019-08-19 11:05:51 -070052// Timer for ERR0 asserted
53static boost::asio::steady_timer err0AssertTimer(io);
Jason M. Bills75af3962019-08-19 11:07:17 -070054// Timer for ERR1 asserted
55static boost::asio::steady_timer err1AssertTimer(io);
Jason M. Bills6a2cb692019-08-06 11:03:49 -070056// Timer for ERR2 asserted
57static boost::asio::steady_timer err2AssertTimer(io);
Jason M. Bills89922f82019-08-06 11:10:02 -070058// Timer for SMI asserted
59static boost::asio::steady_timer smiAssertTimer(io);
Jason M. Bills1490b142019-07-01 15:48:43 -070060
61// GPIO Lines and Event Descriptors
62static gpiod::line caterrLine;
63static boost::asio::posix::stream_descriptor caterrEvent(io);
Jason M. Bills8c584392019-08-19 11:05:51 -070064static gpiod::line err0Line;
65static boost::asio::posix::stream_descriptor err0Event(io);
Jason M. Bills75af3962019-08-19 11:07:17 -070066static gpiod::line err1Line;
67static boost::asio::posix::stream_descriptor err1Event(io);
Jason M. Bills6a2cb692019-08-06 11:03:49 -070068static gpiod::line err2Line;
69static boost::asio::posix::stream_descriptor err2Event(io);
Jason M. Bills89922f82019-08-06 11:10:02 -070070static gpiod::line smiLine;
71static boost::asio::posix::stream_descriptor smiEvent(io);
Jason M. Bills45e87e02019-09-09 14:45:38 -070072static gpiod::line cpu1FIVRFaultLine;
Jason M. Bills78c5eed2019-08-28 14:00:40 -070073static gpiod::line cpu1ThermtripLine;
74static boost::asio::posix::stream_descriptor cpu1ThermtripEvent(io);
Jason M. Bills45e87e02019-09-09 14:45:38 -070075static gpiod::line cpu2FIVRFaultLine;
Jason M. Bills78c5eed2019-08-28 14:00:40 -070076static gpiod::line cpu2ThermtripLine;
77static boost::asio::posix::stream_descriptor cpu2ThermtripEvent(io);
Jason M. Bills250fa632019-08-28 15:58:25 -070078static gpiod::line cpu1VRHotLine;
79static boost::asio::posix::stream_descriptor cpu1VRHotEvent(io);
80static gpiod::line cpu2VRHotLine;
Jason M. Bills9647ba72019-08-29 14:19:19 -070081static boost::asio::posix::stream_descriptor cpu1MemABCDVRHotEvent(io);
82static gpiod::line cpu1MemEFGHVRHotLine;
83static boost::asio::posix::stream_descriptor cpu1MemEFGHVRHotEvent(io);
84static gpiod::line cpu2MemABCDVRHotLine;
Jason M. Bills250fa632019-08-28 15:58:25 -070085static boost::asio::posix::stream_descriptor cpu2VRHotEvent(io);
Jason M. Bills9647ba72019-08-29 14:19:19 -070086static gpiod::line cpu1MemABCDVRHotLine;
87static boost::asio::posix::stream_descriptor cpu2MemABCDVRHotEvent(io);
88static gpiod::line cpu2MemEFGHVRHotLine;
89static boost::asio::posix::stream_descriptor cpu2MemEFGHVRHotEvent(io);
Chen Yugange6c0f1c2019-08-02 20:36:42 +080090//----------------------------------
91// PCH_BMC_THERMTRIP function related definition
92//----------------------------------
Chen Yugange6c0f1c2019-08-02 20:36:42 +080093static gpiod::line pchThermtripLine;
94static boost::asio::posix::stream_descriptor pchThermtripEvent(io);
jayaprakash Mutyala009adbc2019-12-24 22:08:07 +000095//----------------------------------
96// CPU_MEM_THERM_EVENT function related definition
97//----------------------------------
98static gpiod::line cpu1MemtripLine;
99static boost::asio::posix::stream_descriptor cpu1MemtripEvent(io);
100static gpiod::line cpu2MemtripLine;
101static boost::asio::posix::stream_descriptor cpu2MemtripEvent(io);
jayaprakash Mutyala53099c42020-03-15 00:16:26 +0000102//---------------------------------
103// CPU_MISMATCH function related definition
104//---------------------------------
105static gpiod::line cpu1MismatchLine;
106static gpiod::line cpu2MismatchLine;
Jason M. Bills1490b142019-07-01 15:48:43 -0700107
// Beep priority values for CPU errors
static constexpr uint8_t beepCPUIERR = 4;
static constexpr uint8_t beepCPUErr2 = 5;
111
112static void beep(const uint8_t& beepPriority)
113{
114 conn->async_method_call(
115 [](boost::system::error_code ec) {
116 if (ec)
117 {
118 std::cerr << "beep returned error with "
119 "async_method_call (ec = "
120 << ec << ")\n";
121 return;
122 }
123 },
124 "xyz.openbmc_project.BeepCode", "/xyz/openbmc_project/BeepCode",
125 "xyz.openbmc_project.BeepCode", "Beep", uint8_t(beepPriority));
126}
127
Jason M. Billsa3397932019-08-06 11:07:21 -0700128static void cpuIERRLog()
129{
130 sd_journal_send("MESSAGE=HostError: IERR", "PRIORITY=%i", LOG_INFO,
131 "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError",
132 "REDFISH_MESSAGE_ARGS=%s", "IERR", NULL);
133}
134
135static void cpuIERRLog(const int cpuNum)
136{
137 std::string msg = "IERR on CPU " + std::to_string(cpuNum + 1);
138
139 sd_journal_send("MESSAGE=HostError: %s", msg.c_str(), "PRIORITY=%i",
140 LOG_INFO, "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError",
141 "REDFISH_MESSAGE_ARGS=%s", msg.c_str(), NULL);
142}
143
144static void cpuIERRLog(const int cpuNum, const std::string& type)
145{
146 std::string msg = type + " IERR on CPU " + std::to_string(cpuNum + 1);
147
148 sd_journal_send("MESSAGE=HostError: %s", msg.c_str(), "PRIORITY=%i",
149 LOG_INFO, "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError",
150 "REDFISH_MESSAGE_ARGS=%s", msg.c_str(), NULL);
151}
152
Jason M. Billscbf78532019-08-16 15:32:11 -0700153static void cpuERRXLog(const int errPin)
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700154{
Jason M. Billscbf78532019-08-16 15:32:11 -0700155 std::string msg = "ERR" + std::to_string(errPin) + " Timeout";
156
157 sd_journal_send("MESSAGE=HostError: %s", msg.c_str(), "PRIORITY=%i",
158 LOG_INFO, "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError",
159 "REDFISH_MESSAGE_ARGS=%s", msg.c_str(), NULL);
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700160}
161
Jason M. Billscbf78532019-08-16 15:32:11 -0700162static void cpuERRXLog(const int errPin, const int cpuNum)
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700163{
Jason M. Billscbf78532019-08-16 15:32:11 -0700164 std::string msg = "ERR" + std::to_string(errPin) + " Timeout on CPU " +
165 std::to_string(cpuNum + 1);
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700166
167 sd_journal_send("MESSAGE=HostError: %s", msg.c_str(), "PRIORITY=%i",
168 LOG_INFO, "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError",
169 "REDFISH_MESSAGE_ARGS=%s", msg.c_str(), NULL);
170}
171
Jason M. Bills89922f82019-08-06 11:10:02 -0700172static void smiTimeoutLog()
173{
174 sd_journal_send("MESSAGE=HostError: SMI Timeout", "PRIORITY=%i", LOG_INFO,
175 "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError",
176 "REDFISH_MESSAGE_ARGS=%s", "SMI Timeout", NULL);
177}
178
Jason M. Bills45e87e02019-09-09 14:45:38 -0700179static void cpuBootFIVRFaultLog(const int cpuNum)
180{
181 std::string msg = "Boot FIVR Fault on CPU " + std::to_string(cpuNum);
182
183 sd_journal_send("MESSAGE=HostError: %s", msg.c_str(), "PRIORITY=%i",
184 LOG_INFO, "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError",
185 "REDFISH_MESSAGE_ARGS=%s", msg.c_str(), NULL);
186}
187
Jason M. Bills78c5eed2019-08-28 14:00:40 -0700188static void cpuThermTripLog(const int cpuNum)
189{
190 std::string msg = "CPU " + std::to_string(cpuNum) + " thermal trip";
191
192 sd_journal_send("MESSAGE=HostError: %s", msg.c_str(), "PRIORITY=%i",
193 LOG_INFO, "REDFISH_MESSAGE_ID=%s",
194 "OpenBMC.0.1.CPUThermalTrip", "REDFISH_MESSAGE_ARGS=%d",
195 cpuNum, NULL);
196}
197
jayaprakash Mutyala009adbc2019-12-24 22:08:07 +0000198static void memThermTripLog(const int cpuNum)
199{
200 std::string cpuNumber = "CPU " + std::to_string(cpuNum);
201 std::string msg = cpuNumber + " Memory Thermal trip.";
202
203 sd_journal_send("MESSAGE=HostError: %s", msg.c_str(), "PRIORITY=%i",
204 LOG_ERR, "REDFISH_MESSAGE_ID=%s",
205 "OpenBMC.0.1.MemoryThermTrip", "REDFISH_MESSAGE_ARGS=%s",
206 cpuNumber.c_str(), NULL);
207}
208
jayaprakash Mutyala53099c42020-03-15 00:16:26 +0000209static void cpuMismatchLog(const int cpuNum)
210{
211 std::string msg = "CPU " + std::to_string(cpuNum) + " mismatch";
212
213 sd_journal_send("MESSAGE= %s", msg.c_str(), "PRIORITY=%i", LOG_ERR,
214 "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUMismatch",
215 "REDFISH_MESSAGE_ARGS=%d", cpuNum, NULL);
216}
217
Jason M. Bills250fa632019-08-28 15:58:25 -0700218static void cpuVRHotLog(const std::string& vr)
219{
220 std::string msg = vr + " Voltage Regulator Overheated.";
221
222 sd_journal_send("MESSAGE=HostError: %s", msg.c_str(), "PRIORITY=%i",
223 LOG_INFO, "REDFISH_MESSAGE_ID=%s",
224 "OpenBMC.0.1.VoltageRegulatorOverheated",
225 "REDFISH_MESSAGE_ARGS=%s", vr.c_str(), NULL);
226}
227
Jason M. Bills08866542019-08-16 12:04:19 -0700228static void ssbThermTripLog()
229{
230 sd_journal_send("MESSAGE=HostError: SSB thermal trip", "PRIORITY=%i",
231 LOG_INFO, "REDFISH_MESSAGE_ID=%s",
232 "OpenBMC.0.1.SsbThermalTrip", NULL);
233}
234
Jason M. Billsa15c2522019-08-16 10:01:44 -0700235static void initializeErrorState();
Jason M. Bills1490b142019-07-01 15:48:43 -0700236static void initializeHostState()
237{
238 conn->async_method_call(
239 [](boost::system::error_code ec,
240 const std::variant<std::string>& property) {
241 if (ec)
242 {
243 return;
244 }
245 const std::string* state = std::get_if<std::string>(&property);
246 if (state == nullptr)
247 {
248 std::cerr << "Unable to read host state value\n";
249 return;
250 }
251 hostOff = *state == "xyz.openbmc_project.State.Host.HostState.Off";
Jason M. Billsa15c2522019-08-16 10:01:44 -0700252 // If the system is on, initialize the error state
253 if (!hostOff)
254 {
255 initializeErrorState();
256 }
Jason M. Bills1490b142019-07-01 15:48:43 -0700257 },
258 "xyz.openbmc_project.State.Host", "/xyz/openbmc_project/state/host0",
259 "org.freedesktop.DBus.Properties", "Get",
260 "xyz.openbmc_project.State.Host", "CurrentHostState");
261}
262
263static std::shared_ptr<sdbusplus::bus::match::match> startHostStateMonitor()
264{
265 return std::make_shared<sdbusplus::bus::match::match>(
266 *conn,
267 "type='signal',interface='org.freedesktop.DBus.Properties',"
Jason M. Bills2fbb9ea2020-06-19 14:46:54 -0700268 "member='PropertiesChanged',arg0='xyz.openbmc_project.State.Host'",
Jason M. Bills1490b142019-07-01 15:48:43 -0700269 [](sdbusplus::message::message& msg) {
270 std::string interfaceName;
271 boost::container::flat_map<std::string, std::variant<std::string>>
272 propertiesChanged;
Jason M. Bills1490b142019-07-01 15:48:43 -0700273 try
274 {
275 msg.read(interfaceName, propertiesChanged);
Jason M. Bills1490b142019-07-01 15:48:43 -0700276 }
277 catch (std::exception& e)
278 {
279 std::cerr << "Unable to read host state\n";
280 return;
281 }
Jason M. Bills566ccc42020-06-18 16:38:26 -0700282 // We only want to check for CurrentHostState
283 if (propertiesChanged.begin()->first != "CurrentHostState")
284 {
285 return;
286 }
287 std::string* state =
288 std::get_if<std::string>(&(propertiesChanged.begin()->second));
289 if (state == nullptr)
290 {
291 std::cerr << propertiesChanged.begin()->first
292 << " property invalid\n";
293 return;
294 }
295
296 hostOff = *state == "xyz.openbmc_project.State.Host.HostState.Off";
Jason M. Bills1490b142019-07-01 15:48:43 -0700297
Jason M. Bills1490b142019-07-01 15:48:43 -0700298 if (hostOff)
299 {
Jason M. Billse94f5e12019-09-13 11:11:34 -0700300 // No host events should fire while off, so cancel any pending
301 // timers
Jason M. Bills1490b142019-07-01 15:48:43 -0700302 caterrAssertTimer.cancel();
Jason M. Bills8c584392019-08-19 11:05:51 -0700303 err0AssertTimer.cancel();
Jason M. Bills75af3962019-08-19 11:07:17 -0700304 err1AssertTimer.cancel();
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700305 err2AssertTimer.cancel();
Jason M. Bills89922f82019-08-06 11:10:02 -0700306 smiAssertTimer.cancel();
Jason M. Bills1490b142019-07-01 15:48:43 -0700307 }
Jason M. Billse94f5e12019-09-13 11:11:34 -0700308 else
309 {
310 // Handle any initial errors when the host turns on
311 initializeErrorState();
312 }
Jason M. Bills1490b142019-07-01 15:48:43 -0700313 });
314}
315
316static bool requestGPIOEvents(
317 const std::string& name, const std::function<void()>& handler,
318 gpiod::line& gpioLine,
319 boost::asio::posix::stream_descriptor& gpioEventDescriptor)
320{
321 // Find the GPIO line
322 gpioLine = gpiod::find_line(name);
323 if (!gpioLine)
324 {
325 std::cerr << "Failed to find the " << name << " line\n";
326 return false;
327 }
328
329 try
330 {
331 gpioLine.request(
332 {"host-error-monitor", gpiod::line_request::EVENT_BOTH_EDGES});
333 }
334 catch (std::exception&)
335 {
336 std::cerr << "Failed to request events for " << name << "\n";
337 return false;
338 }
339
340 int gpioLineFd = gpioLine.event_get_fd();
341 if (gpioLineFd < 0)
342 {
343 std::cerr << "Failed to get " << name << " fd\n";
344 return false;
345 }
346
347 gpioEventDescriptor.assign(gpioLineFd);
348
349 gpioEventDescriptor.async_wait(
350 boost::asio::posix::stream_descriptor::wait_read,
351 [&name, handler](const boost::system::error_code ec) {
352 if (ec)
353 {
354 std::cerr << name << " fd handler error: " << ec.message()
355 << "\n";
356 return;
357 }
358 handler();
359 });
360 return true;
361}
362
Jason M. Bills45e87e02019-09-09 14:45:38 -0700363static bool requestGPIOInput(const std::string& name, gpiod::line& gpioLine)
364{
365 // Find the GPIO line
366 gpioLine = gpiod::find_line(name);
367 if (!gpioLine)
368 {
369 std::cerr << "Failed to find the " << name << " line.\n";
370 return false;
371 }
372
373 // Request GPIO input
374 try
375 {
376 gpioLine.request({__FUNCTION__, gpiod::line_request::DIRECTION_INPUT});
377 }
378 catch (std::exception&)
379 {
380 std::cerr << "Failed to request " << name << " input\n";
381 return false;
382 }
383
384 return true;
385}
386
Jason M. Bills1490b142019-07-01 15:48:43 -0700387static void startPowerCycle()
388{
389 conn->async_method_call(
390 [](boost::system::error_code ec) {
391 if (ec)
392 {
393 std::cerr << "failed to set Chassis State\n";
394 }
395 },
396 "xyz.openbmc_project.State.Chassis",
397 "/xyz/openbmc_project/state/chassis0",
398 "org.freedesktop.DBus.Properties", "Set",
399 "xyz.openbmc_project.State.Chassis", "RequestedPowerTransition",
400 std::variant<std::string>{
401 "xyz.openbmc_project.State.Chassis.Transition.PowerCycle"});
402}
403
Jason M. Bills9a9bf982020-08-10 11:58:18 -0700404static void startWarmReset()
405{
406 conn->async_method_call(
407 [](boost::system::error_code ec) {
408 if (ec)
409 {
410 std::cerr << "failed to set Host State\n";
411 }
412 },
413 "xyz.openbmc_project.State.Host", "/xyz/openbmc_project/state/host0",
414 "org.freedesktop.DBus.Properties", "Set",
415 "xyz.openbmc_project.State.Host", "RequestedHostTransition",
416 std::variant<std::string>{
417 "xyz.openbmc_project.State.Host.Transition.ForceWarmReboot"});
418}
419
Jason M. Billsb61766b2019-11-26 17:02:44 -0800420static void startCrashdumpAndRecovery(bool recoverSystem,
421 const std::string& triggerType)
Jason M. Bills1490b142019-07-01 15:48:43 -0700422{
423 std::cout << "Starting crashdump\n";
424 static std::shared_ptr<sdbusplus::bus::match::match> crashdumpCompleteMatch;
425 static boost::asio::steady_timer crashdumpTimer(io);
426
427 crashdumpCompleteMatch = std::make_shared<sdbusplus::bus::match::match>(
428 *conn,
429 "type='signal',interface='org.freedesktop.DBus.Properties',"
430 "member='PropertiesChanged',arg0namespace='com.intel.crashdump'",
431 [recoverSystem](sdbusplus::message::message& msg) {
432 crashdumpTimer.cancel();
433 std::cout << "Crashdump completed\n";
434 if (recoverSystem)
435 {
436 std::cout << "Recovering the system\n";
Jason M. Bills9a9bf982020-08-10 11:58:18 -0700437 startWarmReset();
Jason M. Bills1490b142019-07-01 15:48:43 -0700438 }
439 crashdumpCompleteMatch.reset();
440 });
441
442 crashdumpTimer.expires_after(std::chrono::seconds(crashdumpTimeoutS));
443 crashdumpTimer.async_wait([](const boost::system::error_code ec) {
444 if (ec)
445 {
446 // operation_aborted is expected if timer is canceled
447 if (ec != boost::asio::error::operation_aborted)
448 {
449 std::cerr << "Crashdump async_wait failed: " << ec.message()
450 << "\n";
451 }
452 std::cout << "Crashdump timer canceled\n";
453 return;
454 }
455 std::cerr << "Crashdump failed to complete before timeout\n";
456 crashdumpCompleteMatch.reset();
457 });
458
459 conn->async_method_call(
460 [](boost::system::error_code ec) {
461 if (ec)
462 {
463 std::cerr << "failed to start Crashdump\n";
464 crashdumpTimer.cancel();
465 crashdumpCompleteMatch.reset();
466 }
467 },
468 "com.intel.crashdump", "/com/intel/crashdump",
Jason M. Billsb61766b2019-11-26 17:02:44 -0800469 "com.intel.crashdump.Stored", "GenerateStoredLog", triggerType);
Jason M. Bills1490b142019-07-01 15:48:43 -0700470}
471
Jason M. Billsd1a19f62019-08-06 11:52:58 -0700472static void incrementCPUErrorCount(int cpuNum)
473{
474 std::string propertyName = "ErrorCountCPU" + std::to_string(cpuNum + 1);
475
476 // Get the current count
477 conn->async_method_call(
478 [propertyName](boost::system::error_code ec,
479 const std::variant<uint8_t>& property) {
480 if (ec)
481 {
482 std::cerr << "Failed to read " << propertyName << ": "
483 << ec.message() << "\n";
484 return;
485 }
486 const uint8_t* errorCountVariant = std::get_if<uint8_t>(&property);
487 if (errorCountVariant == nullptr)
488 {
489 std::cerr << propertyName << " invalid\n";
490 return;
491 }
492 uint8_t errorCount = *errorCountVariant;
493 if (errorCount == std::numeric_limits<uint8_t>::max())
494 {
495 std::cerr << "Maximum error count reached\n";
496 return;
497 }
498 // Increment the count
499 errorCount++;
500 conn->async_method_call(
501 [propertyName](boost::system::error_code ec) {
502 if (ec)
503 {
504 std::cerr << "Failed to set " << propertyName << ": "
505 << ec.message() << "\n";
506 }
507 },
508 "xyz.openbmc_project.Settings",
509 "/xyz/openbmc_project/control/processor_error_config",
510 "org.freedesktop.DBus.Properties", "Set",
511 "xyz.openbmc_project.Control.Processor.ErrConfig", propertyName,
512 std::variant<uint8_t>{errorCount});
513 },
514 "xyz.openbmc_project.Settings",
515 "/xyz/openbmc_project/control/processor_error_config",
516 "org.freedesktop.DBus.Properties", "Get",
517 "xyz.openbmc_project.Control.Processor.ErrConfig", propertyName);
518}
519
Jason M. Billsa3397932019-08-06 11:07:21 -0700520static bool checkIERRCPUs()
521{
522 bool cpuIERRFound = false;
523 for (int cpu = 0, addr = MIN_CLIENT_ADDR; addr <= MAX_CLIENT_ADDR;
524 cpu++, addr++)
525 {
526 uint8_t cc = 0;
527 CPUModel model{};
528 uint8_t stepping = 0;
529 if (peci_GetCPUID(addr, &model, &stepping, &cc) != PECI_CC_SUCCESS)
530 {
531 std::cerr << "Cannot get CPUID!\n";
532 continue;
533 }
534
535 switch (model)
536 {
537 case skx:
538 {
539 // First check the MCA_ERR_SRC_LOG to see if this is the CPU
540 // that caused the IERR
541 uint32_t mcaErrSrcLog = 0;
542 if (peci_RdPkgConfig(addr, 0, 5, 4, (uint8_t*)&mcaErrSrcLog,
543 &cc) != PECI_CC_SUCCESS)
544 {
545 continue;
546 }
547 // Check MSMI_INTERNAL (20) and IERR_INTERNAL (27)
548 if ((mcaErrSrcLog & (1 << 20)) || (mcaErrSrcLog & (1 << 27)))
549 {
550 // TODO: Light the CPU fault LED?
551 cpuIERRFound = true;
Jason M. Billsd1a19f62019-08-06 11:52:58 -0700552 incrementCPUErrorCount(cpu);
Jason M. Billsa3397932019-08-06 11:07:21 -0700553 // Next check if it's a CPU/VR mismatch by reading the
554 // IA32_MC4_STATUS MSR (0x411)
555 uint64_t mc4Status = 0;
556 if (peci_RdIAMSR(addr, 0, 0x411, &mc4Status, &cc) !=
557 PECI_CC_SUCCESS)
558 {
559 continue;
560 }
561 // Check MSEC bits 31:24 for
562 // MCA_SVID_VCCIN_VR_ICC_MAX_FAILURE (0x40),
563 // MCA_SVID_VCCIN_VR_VOUT_FAILURE (0x42), or
564 // MCA_SVID_CPU_VR_CAPABILITY_ERROR (0x43)
565 if ((mc4Status & (0x40 << 24)) ||
566 (mc4Status & (0x42 << 24)) ||
567 (mc4Status & (0x43 << 24)))
568 {
569 cpuIERRLog(cpu, "CPU/VR Mismatch");
570 continue;
571 }
572
573 // Next check if it's a Core FIVR fault by looking for a
574 // non-zero value of CORE_FIVR_ERR_LOG (B(1) D30 F2 offset
575 // 80h)
576 uint32_t coreFIVRErrLog = 0;
577 if (peci_RdPCIConfigLocal(
578 addr, 1, 30, 2, 0x80, sizeof(uint32_t),
579 (uint8_t*)&coreFIVRErrLog, &cc) != PECI_CC_SUCCESS)
580 {
581 continue;
582 }
583 if (coreFIVRErrLog)
584 {
585 cpuIERRLog(cpu, "Core FIVR Fault");
586 continue;
587 }
588
589 // Next check if it's an Uncore FIVR fault by looking for a
590 // non-zero value of UNCORE_FIVR_ERR_LOG (B(1) D30 F2 offset
591 // 84h)
592 uint32_t uncoreFIVRErrLog = 0;
593 if (peci_RdPCIConfigLocal(addr, 1, 30, 2, 0x84,
594 sizeof(uint32_t),
595 (uint8_t*)&uncoreFIVRErrLog,
596 &cc) != PECI_CC_SUCCESS)
597 {
598 continue;
599 }
600 if (uncoreFIVRErrLog)
601 {
602 cpuIERRLog(cpu, "Uncore FIVR Fault");
603 continue;
604 }
605
606 // Last if CORE_FIVR_ERR_LOG and UNCORE_FIVR_ERR_LOG are
607 // both zero, but MSEC bits 31:24 have either
608 // MCA_FIVR_CATAS_OVERVOL_FAULT (0x51) or
609 // MCA_FIVR_CATAS_OVERCUR_FAULT (0x52), then log it as an
610 // uncore FIVR fault
611 if (!coreFIVRErrLog && !uncoreFIVRErrLog &&
612 ((mc4Status & (0x51 << 24)) ||
613 (mc4Status & (0x52 << 24))))
614 {
615 cpuIERRLog(cpu, "Uncore FIVR Fault");
616 continue;
617 }
618 cpuIERRLog(cpu);
619 }
620 break;
621 }
622 case icx:
623 {
624 // First check the MCA_ERR_SRC_LOG to see if this is the CPU
625 // that caused the IERR
626 uint32_t mcaErrSrcLog = 0;
627 if (peci_RdPkgConfig(addr, 0, 5, 4, (uint8_t*)&mcaErrSrcLog,
628 &cc) != PECI_CC_SUCCESS)
629 {
630 continue;
631 }
632 // Check MSMI_INTERNAL (20) and IERR_INTERNAL (27)
633 if ((mcaErrSrcLog & (1 << 20)) || (mcaErrSrcLog & (1 << 27)))
634 {
635 // TODO: Light the CPU fault LED?
636 cpuIERRFound = true;
Jason M. Billsd1a19f62019-08-06 11:52:58 -0700637 incrementCPUErrorCount(cpu);
Jason M. Billsa3397932019-08-06 11:07:21 -0700638 // Next check if it's a CPU/VR mismatch by reading the
639 // IA32_MC4_STATUS MSR (0x411)
640 uint64_t mc4Status = 0;
641 if (peci_RdIAMSR(addr, 0, 0x411, &mc4Status, &cc) !=
642 PECI_CC_SUCCESS)
643 {
644 continue;
645 }
646 // TODO: Update MSEC/MSCOD_31_24 check
647 // Check MSEC bits 31:24 for
648 // MCA_SVID_VCCIN_VR_ICC_MAX_FAILURE (0x40),
649 // MCA_SVID_VCCIN_VR_VOUT_FAILURE (0x42), or
650 // MCA_SVID_CPU_VR_CAPABILITY_ERROR (0x43)
651 if ((mc4Status & (0x40 << 24)) ||
652 (mc4Status & (0x42 << 24)) ||
653 (mc4Status & (0x43 << 24)))
654 {
655 cpuIERRLog(cpu, "CPU/VR Mismatch");
656 continue;
657 }
658
659 // Next check if it's a Core FIVR fault by looking for a
660 // non-zero value of CORE_FIVR_ERR_LOG (B(31) D30 F2 offsets
661 // C0h and C4h) (Note: Bus 31 is accessed on PECI as bus 14)
662 uint32_t coreFIVRErrLog0 = 0;
663 uint32_t coreFIVRErrLog1 = 0;
664 if (peci_RdEndPointConfigPciLocal(
665 addr, 0, 14, 30, 2, 0xC0, sizeof(uint32_t),
666 (uint8_t*)&coreFIVRErrLog0, &cc) != PECI_CC_SUCCESS)
667 {
668 continue;
669 }
670 if (peci_RdEndPointConfigPciLocal(
671 addr, 0, 14, 30, 2, 0xC4, sizeof(uint32_t),
672 (uint8_t*)&coreFIVRErrLog1, &cc) != PECI_CC_SUCCESS)
673 {
674 continue;
675 }
676 if (coreFIVRErrLog0 || coreFIVRErrLog1)
677 {
678 cpuIERRLog(cpu, "Core FIVR Fault");
679 continue;
680 }
681
682 // Next check if it's an Uncore FIVR fault by looking for a
683 // non-zero value of UNCORE_FIVR_ERR_LOG (B(31) D30 F2
684 // offset 84h) (Note: Bus 31 is accessed on PECI as bus 14)
685 uint32_t uncoreFIVRErrLog = 0;
686 if (peci_RdEndPointConfigPciLocal(
687 addr, 0, 14, 30, 2, 0x84, sizeof(uint32_t),
688 (uint8_t*)&uncoreFIVRErrLog,
689 &cc) != PECI_CC_SUCCESS)
690 {
691 continue;
692 }
693 if (uncoreFIVRErrLog)
694 {
695 cpuIERRLog(cpu, "Uncore FIVR Fault");
696 continue;
697 }
698
699 // TODO: Update MSEC/MSCOD_31_24 check
700 // Last if CORE_FIVR_ERR_LOG and UNCORE_FIVR_ERR_LOG are
701 // both zero, but MSEC bits 31:24 have either
702 // MCA_FIVR_CATAS_OVERVOL_FAULT (0x51) or
703 // MCA_FIVR_CATAS_OVERCUR_FAULT (0x52), then log it as an
704 // uncore FIVR fault
705 if (!coreFIVRErrLog0 && !coreFIVRErrLog1 &&
706 !uncoreFIVRErrLog &&
707 ((mc4Status & (0x51 << 24)) ||
708 (mc4Status & (0x52 << 24))))
709 {
710 cpuIERRLog(cpu, "Uncore FIVR Fault");
711 continue;
712 }
713 cpuIERRLog(cpu);
714 }
715 break;
716 }
717 }
718 }
719 return cpuIERRFound;
720}
721
Jason M. Billsa15c2522019-08-16 10:01:44 -0700722static void caterrAssertHandler()
723{
Jason M. Billsa15c2522019-08-16 10:01:44 -0700724 caterrAssertTimer.expires_after(std::chrono::milliseconds(caterrTimeoutMs));
725 caterrAssertTimer.async_wait([](const boost::system::error_code ec) {
726 if (ec)
727 {
728 // operation_aborted is expected if timer is canceled
729 // before completion.
730 if (ec != boost::asio::error::operation_aborted)
731 {
732 std::cerr << "caterr timeout async_wait failed: "
733 << ec.message() << "\n";
734 }
Jason M. Billsa15c2522019-08-16 10:01:44 -0700735 return;
736 }
Jason M. Billsa3397932019-08-06 11:07:21 -0700737 std::cerr << "CATERR asserted for " << std::to_string(caterrTimeoutMs)
738 << " ms\n";
Yong Li8c798c72020-04-22 15:29:07 +0800739 beep(beepCPUIERR);
Jason M. Billsa3397932019-08-06 11:07:21 -0700740 if (!checkIERRCPUs())
741 {
742 cpuIERRLog();
743 }
Jason M. Billsa15c2522019-08-16 10:01:44 -0700744 conn->async_method_call(
745 [](boost::system::error_code ec,
746 const std::variant<bool>& property) {
747 if (ec)
748 {
749 return;
750 }
751 const bool* reset = std::get_if<bool>(&property);
752 if (reset == nullptr)
753 {
754 std::cerr << "Unable to read reset on CATERR value\n";
755 return;
756 }
Jason M. Billsb61766b2019-11-26 17:02:44 -0800757 startCrashdumpAndRecovery(*reset, "IERR");
Jason M. Billsa15c2522019-08-16 10:01:44 -0700758 },
759 "xyz.openbmc_project.Settings",
760 "/xyz/openbmc_project/control/processor_error_config",
761 "org.freedesktop.DBus.Properties", "Get",
762 "xyz.openbmc_project.Control.Processor.ErrConfig", "ResetOnCATERR");
763 });
764}
765
Jason M. Bills1490b142019-07-01 15:48:43 -0700766static void caterrHandler()
767{
768 if (!hostOff)
769 {
770 gpiod::line_event gpioLineEvent = caterrLine.event_read();
771
772 bool caterr =
773 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
Yong Li1429ca82020-04-27 16:49:45 +0800774
775 std::vector<Association> associations;
Jason M. Bills1490b142019-07-01 15:48:43 -0700776 if (caterr)
777 {
Jason M. Billsa15c2522019-08-16 10:01:44 -0700778 caterrAssertHandler();
Yong Li1429ca82020-04-27 16:49:45 +0800779 associations.emplace_back(
780 "", "critical",
781 "/xyz/openbmc_project/host_error_monitor/cat_error");
782 associations.emplace_back("", "critical",
783 host_error_monitor::rootPath);
Jason M. Bills1490b142019-07-01 15:48:43 -0700784 }
785 else
786 {
787 caterrAssertTimer.cancel();
Yong Li1429ca82020-04-27 16:49:45 +0800788 associations.emplace_back("", "", "");
Jason M. Bills1490b142019-07-01 15:48:43 -0700789 }
Yong Li1429ca82020-04-27 16:49:45 +0800790 host_error_monitor::associationCATAssert->set_property("Associations",
791 associations);
Jason M. Bills1490b142019-07-01 15:48:43 -0700792 }
793 caterrEvent.async_wait(boost::asio::posix::stream_descriptor::wait_read,
794 [](const boost::system::error_code ec) {
795 if (ec)
796 {
797 std::cerr << "caterr handler error: "
798 << ec.message() << "\n";
799 return;
800 }
801 caterrHandler();
802 });
803}
Jason M. Bills78c5eed2019-08-28 14:00:40 -0700804
Jason M. Billse94f5e12019-09-13 11:11:34 -0700805static void cpu1ThermtripAssertHandler()
806{
Jason M. Bills45e87e02019-09-09 14:45:38 -0700807 if (cpu1FIVRFaultLine.get_value() == 0)
808 {
809 cpuBootFIVRFaultLog(1);
810 }
811 else
812 {
813 cpuThermTripLog(1);
814 }
Jason M. Billse94f5e12019-09-13 11:11:34 -0700815}
816
Jason M. Bills78c5eed2019-08-28 14:00:40 -0700817static void cpu1ThermtripHandler()
818{
Jason M. Bills84951142020-04-17 15:57:11 -0700819 gpiod::line_event gpioLineEvent = cpu1ThermtripLine.event_read();
Jason M. Bills78c5eed2019-08-28 14:00:40 -0700820
Jason M. Bills84951142020-04-17 15:57:11 -0700821 bool cpu1Thermtrip =
822 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
823 if (cpu1Thermtrip)
824 {
825 cpu1ThermtripAssertHandler();
Jason M. Bills78c5eed2019-08-28 14:00:40 -0700826 }
Jason M. Bills84951142020-04-17 15:57:11 -0700827
Jason M. Bills78c5eed2019-08-28 14:00:40 -0700828 cpu1ThermtripEvent.async_wait(
829 boost::asio::posix::stream_descriptor::wait_read,
830 [](const boost::system::error_code ec) {
831 if (ec)
832 {
833 std::cerr << "CPU 1 Thermtrip handler error: " << ec.message()
834 << "\n";
835 return;
836 }
837 cpu1ThermtripHandler();
838 });
839}
840
jayaprakash Mutyala009adbc2019-12-24 22:08:07 +0000841static void cpu1MemtripHandler()
842{
Jason M. Bills5287c022020-05-19 11:16:09 -0700843 gpiod::line_event gpioLineEvent = cpu1MemtripLine.event_read();
jayaprakash Mutyala009adbc2019-12-24 22:08:07 +0000844
Jason M. Bills5287c022020-05-19 11:16:09 -0700845 bool cpu1Memtrip =
846 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
847 if (cpu1Memtrip)
848 {
849 memThermTripLog(1);
jayaprakash Mutyala009adbc2019-12-24 22:08:07 +0000850 }
Jason M. Bills5287c022020-05-19 11:16:09 -0700851
jayaprakash Mutyala009adbc2019-12-24 22:08:07 +0000852 cpu1MemtripEvent.async_wait(
853 boost::asio::posix::stream_descriptor::wait_read,
854 [](const boost::system::error_code ec) {
855 if (ec)
856 {
857 std::cerr << "CPU 1 Memory Thermaltrip handler error: "
858 << ec.message() << "\n";
859 return;
860 }
861 cpu1MemtripHandler();
862 });
863}
864
Jason M. Billse94f5e12019-09-13 11:11:34 -0700865static void cpu2ThermtripAssertHandler()
866{
Jason M. Bills45e87e02019-09-09 14:45:38 -0700867 if (cpu2FIVRFaultLine.get_value() == 0)
868 {
869 cpuBootFIVRFaultLog(2);
870 }
871 else
872 {
873 cpuThermTripLog(2);
874 }
Jason M. Billse94f5e12019-09-13 11:11:34 -0700875}
876
Jason M. Bills78c5eed2019-08-28 14:00:40 -0700877static void cpu2ThermtripHandler()
878{
Jason M. Bills84951142020-04-17 15:57:11 -0700879 gpiod::line_event gpioLineEvent = cpu2ThermtripLine.event_read();
Jason M. Bills78c5eed2019-08-28 14:00:40 -0700880
Jason M. Bills84951142020-04-17 15:57:11 -0700881 bool cpu2Thermtrip =
882 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
883 if (cpu2Thermtrip)
884 {
885 cpu2ThermtripAssertHandler();
Jason M. Bills78c5eed2019-08-28 14:00:40 -0700886 }
Jason M. Bills84951142020-04-17 15:57:11 -0700887
Jason M. Bills78c5eed2019-08-28 14:00:40 -0700888 cpu2ThermtripEvent.async_wait(
889 boost::asio::posix::stream_descriptor::wait_read,
890 [](const boost::system::error_code ec) {
891 if (ec)
892 {
893 std::cerr << "CPU 2 Thermtrip handler error: " << ec.message()
894 << "\n";
895 return;
896 }
897 cpu2ThermtripHandler();
898 });
899}
900
jayaprakash Mutyala009adbc2019-12-24 22:08:07 +0000901static void cpu2MemtripHandler()
902{
Jason M. Bills5287c022020-05-19 11:16:09 -0700903 gpiod::line_event gpioLineEvent = cpu2MemtripLine.event_read();
jayaprakash Mutyala009adbc2019-12-24 22:08:07 +0000904
Jason M. Bills5287c022020-05-19 11:16:09 -0700905 bool cpu2Memtrip =
906 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
907 if (cpu2Memtrip)
908 {
909 memThermTripLog(2);
jayaprakash Mutyala009adbc2019-12-24 22:08:07 +0000910 }
Jason M. Bills5287c022020-05-19 11:16:09 -0700911
jayaprakash Mutyala009adbc2019-12-24 22:08:07 +0000912 cpu2MemtripEvent.async_wait(
913 boost::asio::posix::stream_descriptor::wait_read,
914 [](const boost::system::error_code ec) {
915 if (ec)
916 {
917 std::cerr << "CPU 2 Memory Thermaltrip handler error: "
918 << ec.message() << "\n";
919 return;
920 }
921 cpu2MemtripHandler();
922 });
923}
924
Jason M. Billse94f5e12019-09-13 11:11:34 -0700925static void cpu1VRHotAssertHandler()
926{
927 cpuVRHotLog("CPU 1");
928}
929
Jason M. Bills250fa632019-08-28 15:58:25 -0700930static void cpu1VRHotHandler()
931{
Jason M. Bills84951142020-04-17 15:57:11 -0700932 gpiod::line_event gpioLineEvent = cpu1VRHotLine.event_read();
Jason M. Bills250fa632019-08-28 15:58:25 -0700933
Jason M. Bills84951142020-04-17 15:57:11 -0700934 bool cpu1VRHot =
935 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
936 if (cpu1VRHot)
937 {
938 cpu1VRHotAssertHandler();
Jason M. Bills250fa632019-08-28 15:58:25 -0700939 }
Jason M. Bills84951142020-04-17 15:57:11 -0700940
Jason M. Bills250fa632019-08-28 15:58:25 -0700941 cpu1VRHotEvent.async_wait(boost::asio::posix::stream_descriptor::wait_read,
942 [](const boost::system::error_code ec) {
943 if (ec)
944 {
945 std::cerr << "CPU 1 VRHot handler error: "
946 << ec.message() << "\n";
947 return;
948 }
949 cpu1VRHotHandler();
950 });
951}
952
Jason M. Billse94f5e12019-09-13 11:11:34 -0700953static void cpu1MemABCDVRHotAssertHandler()
954{
955 cpuVRHotLog("CPU 1 Memory ABCD");
956}
957
Jason M. Bills9647ba72019-08-29 14:19:19 -0700958static void cpu1MemABCDVRHotHandler()
959{
Jason M. Bills84951142020-04-17 15:57:11 -0700960 gpiod::line_event gpioLineEvent = cpu1MemABCDVRHotLine.event_read();
Jason M. Bills9647ba72019-08-29 14:19:19 -0700961
Jason M. Bills84951142020-04-17 15:57:11 -0700962 bool cpu1MemABCDVRHot =
963 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
964 if (cpu1MemABCDVRHot)
965 {
966 cpu1MemABCDVRHotAssertHandler();
Jason M. Bills9647ba72019-08-29 14:19:19 -0700967 }
Jason M. Bills84951142020-04-17 15:57:11 -0700968
Jason M. Bills9647ba72019-08-29 14:19:19 -0700969 cpu1MemABCDVRHotEvent.async_wait(
970 boost::asio::posix::stream_descriptor::wait_read,
971 [](const boost::system::error_code ec) {
972 if (ec)
973 {
974 std::cerr << "CPU 1 Memory ABCD VRHot handler error: "
975 << ec.message() << "\n";
976 return;
977 }
978 cpu1MemABCDVRHotHandler();
979 });
980}
981
Jason M. Billse94f5e12019-09-13 11:11:34 -0700982static void cpu1MemEFGHVRHotAssertHandler()
983{
984 cpuVRHotLog("CPU 1 Memory EFGH");
985}
986
Jason M. Bills9647ba72019-08-29 14:19:19 -0700987static void cpu1MemEFGHVRHotHandler()
988{
Jason M. Bills84951142020-04-17 15:57:11 -0700989 gpiod::line_event gpioLineEvent = cpu1MemEFGHVRHotLine.event_read();
Jason M. Bills9647ba72019-08-29 14:19:19 -0700990
Jason M. Bills84951142020-04-17 15:57:11 -0700991 bool cpu1MemEFGHVRHot =
992 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
993 if (cpu1MemEFGHVRHot)
994 {
995 cpu1MemEFGHVRHotAssertHandler();
Jason M. Bills9647ba72019-08-29 14:19:19 -0700996 }
Jason M. Bills84951142020-04-17 15:57:11 -0700997
Jason M. Bills9647ba72019-08-29 14:19:19 -0700998 cpu1MemEFGHVRHotEvent.async_wait(
999 boost::asio::posix::stream_descriptor::wait_read,
1000 [](const boost::system::error_code ec) {
1001 if (ec)
1002 {
1003 std::cerr << "CPU 1 Memory EFGH VRHot handler error: "
1004 << ec.message() << "\n";
1005 return;
1006 }
1007 cpu1MemEFGHVRHotHandler();
1008 });
1009}
1010
Jason M. Billse94f5e12019-09-13 11:11:34 -07001011static void cpu2VRHotAssertHandler()
1012{
1013 cpuVRHotLog("CPU 2");
1014}
1015
Jason M. Bills250fa632019-08-28 15:58:25 -07001016static void cpu2VRHotHandler()
1017{
Jason M. Bills84951142020-04-17 15:57:11 -07001018 gpiod::line_event gpioLineEvent = cpu2VRHotLine.event_read();
Jason M. Bills250fa632019-08-28 15:58:25 -07001019
Jason M. Bills84951142020-04-17 15:57:11 -07001020 bool cpu2VRHot =
1021 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
1022 if (cpu2VRHot)
1023 {
1024 cpu2VRHotAssertHandler();
Jason M. Bills250fa632019-08-28 15:58:25 -07001025 }
Jason M. Bills84951142020-04-17 15:57:11 -07001026
Jason M. Bills250fa632019-08-28 15:58:25 -07001027 cpu2VRHotEvent.async_wait(boost::asio::posix::stream_descriptor::wait_read,
1028 [](const boost::system::error_code ec) {
1029 if (ec)
1030 {
1031 std::cerr << "CPU 2 VRHot handler error: "
1032 << ec.message() << "\n";
1033 return;
1034 }
1035 cpu2VRHotHandler();
1036 });
1037}
1038
Jason M. Billse94f5e12019-09-13 11:11:34 -07001039static void cpu2MemABCDVRHotAssertHandler()
1040{
1041 cpuVRHotLog("CPU 2 Memory ABCD");
1042}
1043
Jason M. Bills9647ba72019-08-29 14:19:19 -07001044static void cpu2MemABCDVRHotHandler()
1045{
Jason M. Bills84951142020-04-17 15:57:11 -07001046 gpiod::line_event gpioLineEvent = cpu2MemABCDVRHotLine.event_read();
Jason M. Bills9647ba72019-08-29 14:19:19 -07001047
Jason M. Bills84951142020-04-17 15:57:11 -07001048 bool cpu2MemABCDVRHot =
1049 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
1050 if (cpu2MemABCDVRHot)
1051 {
1052 cpu2MemABCDVRHotAssertHandler();
Jason M. Bills9647ba72019-08-29 14:19:19 -07001053 }
Jason M. Bills84951142020-04-17 15:57:11 -07001054
Jason M. Bills9647ba72019-08-29 14:19:19 -07001055 cpu2MemABCDVRHotEvent.async_wait(
1056 boost::asio::posix::stream_descriptor::wait_read,
1057 [](const boost::system::error_code ec) {
1058 if (ec)
1059 {
1060 std::cerr << "CPU 2 Memory ABCD VRHot handler error: "
1061 << ec.message() << "\n";
1062 return;
1063 }
1064 cpu2MemABCDVRHotHandler();
1065 });
1066}
1067
Jason M. Billse94f5e12019-09-13 11:11:34 -07001068static void cpu2MemEFGHVRHotAssertHandler()
1069{
1070 cpuVRHotLog("CPU 2 Memory EFGH");
1071}
1072
Jason M. Bills9647ba72019-08-29 14:19:19 -07001073static void cpu2MemEFGHVRHotHandler()
1074{
Jason M. Bills84951142020-04-17 15:57:11 -07001075 gpiod::line_event gpioLineEvent = cpu2MemEFGHVRHotLine.event_read();
Jason M. Bills9647ba72019-08-29 14:19:19 -07001076
Jason M. Bills84951142020-04-17 15:57:11 -07001077 bool cpu2MemEFGHVRHot =
1078 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
1079 if (cpu2MemEFGHVRHot)
1080 {
1081 cpu2MemEFGHVRHotAssertHandler();
Jason M. Bills9647ba72019-08-29 14:19:19 -07001082 }
Jason M. Bills84951142020-04-17 15:57:11 -07001083
Jason M. Bills9647ba72019-08-29 14:19:19 -07001084 cpu2MemEFGHVRHotEvent.async_wait(
1085 boost::asio::posix::stream_descriptor::wait_read,
1086 [](const boost::system::error_code ec) {
1087 if (ec)
1088 {
1089 std::cerr << "CPU 2 Memory EFGH VRHot handler error: "
1090 << ec.message() << "\n";
1091 return;
1092 }
1093 cpu2MemEFGHVRHotHandler();
1094 });
1095}
1096
Chen Yugange6c0f1c2019-08-02 20:36:42 +08001097static void pchThermtripHandler()
1098{
Yong Li1429ca82020-04-27 16:49:45 +08001099 std::vector<Association> associations;
1100
Jason M. Bills84951142020-04-17 15:57:11 -07001101 gpiod::line_event gpioLineEvent = pchThermtripLine.event_read();
Chen Yugange6c0f1c2019-08-02 20:36:42 +08001102
Jason M. Bills84951142020-04-17 15:57:11 -07001103 bool pchThermtrip =
1104 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
1105 if (pchThermtrip)
1106 {
1107 ssbThermTripLog();
Yong Li1429ca82020-04-27 16:49:45 +08001108 associations.emplace_back(
1109 "", "critical",
1110 "/xyz/openbmc_project/host_error_monitor/ssb_thermal_trip");
1111 associations.emplace_back("", "critical", host_error_monitor::rootPath);
Chen Yugange6c0f1c2019-08-02 20:36:42 +08001112 }
Yong Li1429ca82020-04-27 16:49:45 +08001113 else
1114 {
1115 associations.emplace_back("", "", "");
1116 }
1117 host_error_monitor::associationSSBThermTrip->set_property("Associations",
1118 associations);
Jason M. Bills84951142020-04-17 15:57:11 -07001119
Chen Yugange6c0f1c2019-08-02 20:36:42 +08001120 pchThermtripEvent.async_wait(
1121 boost::asio::posix::stream_descriptor::wait_read,
1122 [](const boost::system::error_code ec) {
1123 if (ec)
1124 {
1125 std::cerr << "PCH Thermal trip handler error: " << ec.message()
1126 << "\n";
1127 return;
1128 }
1129 pchThermtripHandler();
1130 });
1131}
1132
Jason M. Billscbf78532019-08-16 15:32:11 -07001133static std::bitset<MAX_CPUS> checkERRPinCPUs(const int errPin)
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001134{
Jason M. Billscbf78532019-08-16 15:32:11 -07001135 int errPinSts = (1 << errPin);
1136 std::bitset<MAX_CPUS> errPinCPUs = 0;
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001137 for (int cpu = 0, addr = MIN_CLIENT_ADDR; addr <= MAX_CLIENT_ADDR;
1138 cpu++, addr++)
1139 {
1140 if (peci_Ping(addr) == PECI_CC_SUCCESS)
1141 {
1142 uint8_t cc = 0;
1143 CPUModel model{};
1144 uint8_t stepping = 0;
1145 if (peci_GetCPUID(addr, &model, &stepping, &cc) != PECI_CC_SUCCESS)
1146 {
1147 std::cerr << "Cannot get CPUID!\n";
1148 continue;
1149 }
1150
1151 switch (model)
1152 {
1153 case skx:
1154 {
1155 // Check the ERRPINSTS to see if this is the CPU that caused
Jason M. Billscbf78532019-08-16 15:32:11 -07001156 // the ERRx (B(0) D8 F0 offset 210h)
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001157 uint32_t errpinsts = 0;
1158 if (peci_RdPCIConfigLocal(
1159 addr, 0, 8, 0, 0x210, sizeof(uint32_t),
1160 (uint8_t*)&errpinsts, &cc) == PECI_CC_SUCCESS)
1161 {
Jason M. Billscbf78532019-08-16 15:32:11 -07001162 errPinCPUs[cpu] = (errpinsts & errPinSts) != 0;
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001163 }
1164 break;
1165 }
1166 case icx:
1167 {
1168 // Check the ERRPINSTS to see if this is the CPU that caused
Jason M. Billscbf78532019-08-16 15:32:11 -07001169 // the ERRx (B(30) D0 F3 offset 274h) (Note: Bus 30 is
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001170 // accessed on PECI as bus 13)
1171 uint32_t errpinsts = 0;
1172 if (peci_RdEndPointConfigPciLocal(
1173 addr, 0, 13, 0, 3, 0x274, sizeof(uint32_t),
1174 (uint8_t*)&errpinsts, &cc) == PECI_CC_SUCCESS)
1175 {
Jason M. Billscbf78532019-08-16 15:32:11 -07001176 errPinCPUs[cpu] = (errpinsts & errPinSts) != 0;
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001177 }
1178 break;
1179 }
1180 }
1181 }
1182 }
Jason M. Billscbf78532019-08-16 15:32:11 -07001183 return errPinCPUs;
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001184}
1185
Jason M. Billscbf78532019-08-16 15:32:11 -07001186static void errXAssertHandler(const int errPin,
1187 boost::asio::steady_timer& errXAssertTimer)
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001188{
Jason M. Billscbf78532019-08-16 15:32:11 -07001189 // ERRx status is not guaranteed through the timeout, so save which
1190 // CPUs have it asserted
1191 std::bitset<MAX_CPUS> errPinCPUs = checkERRPinCPUs(errPin);
1192 errXAssertTimer.expires_after(std::chrono::milliseconds(errTimeoutMs));
1193 errXAssertTimer.async_wait([errPin, errPinCPUs](
1194 const boost::system::error_code ec) {
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001195 if (ec)
1196 {
1197 // operation_aborted is expected if timer is canceled before
1198 // completion.
1199 if (ec != boost::asio::error::operation_aborted)
1200 {
1201 std::cerr << "err2 timeout async_wait failed: " << ec.message()
1202 << "\n";
1203 }
1204 return;
1205 }
Jason M. Billscbf78532019-08-16 15:32:11 -07001206 std::cerr << "ERR" << std::to_string(errPin) << " asserted for "
1207 << std::to_string(errTimeoutMs) << " ms\n";
1208 if (errPinCPUs.count())
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001209 {
Jason M. Billscbf78532019-08-16 15:32:11 -07001210 for (int i = 0; i < errPinCPUs.size(); i++)
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001211 {
Jason M. Billscbf78532019-08-16 15:32:11 -07001212 if (errPinCPUs[i])
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001213 {
Jason M. Billscbf78532019-08-16 15:32:11 -07001214 cpuERRXLog(errPin, i);
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001215 }
1216 }
1217 }
1218 else
1219 {
Jason M. Billscbf78532019-08-16 15:32:11 -07001220 cpuERRXLog(errPin);
1221 }
1222 });
1223}
1224
Jason M. Bills8c584392019-08-19 11:05:51 -07001225static void err0AssertHandler()
1226{
1227 // Handle the standard ERR0 detection and logging
1228 const static constexpr int err0 = 0;
1229 errXAssertHandler(err0, err0AssertTimer);
1230}
1231
1232static void err0Handler()
1233{
1234 if (!hostOff)
1235 {
1236 gpiod::line_event gpioLineEvent = err0Line.event_read();
1237
1238 bool err0 = gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
1239 if (err0)
1240 {
1241 err0AssertHandler();
1242 }
1243 else
1244 {
1245 err0AssertTimer.cancel();
1246 }
1247 }
1248 err0Event.async_wait(boost::asio::posix::stream_descriptor::wait_read,
1249 [](const boost::system::error_code ec) {
1250 if (ec)
1251 {
1252 std::cerr
1253 << "err0 handler error: " << ec.message()
1254 << "\n";
1255 return;
1256 }
1257 err0Handler();
1258 });
1259}
1260
Jason M. Bills75af3962019-08-19 11:07:17 -07001261static void err1AssertHandler()
1262{
1263 // Handle the standard ERR1 detection and logging
1264 const static constexpr int err1 = 1;
1265 errXAssertHandler(err1, err1AssertTimer);
1266}
1267
1268static void err1Handler()
1269{
1270 if (!hostOff)
1271 {
1272 gpiod::line_event gpioLineEvent = err1Line.event_read();
1273
1274 bool err1 = gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
1275 if (err1)
1276 {
1277 err1AssertHandler();
1278 }
1279 else
1280 {
1281 err1AssertTimer.cancel();
1282 }
1283 }
1284 err1Event.async_wait(boost::asio::posix::stream_descriptor::wait_read,
1285 [](const boost::system::error_code ec) {
1286 if (ec)
1287 {
1288 std::cerr
1289 << "err1 handler error: " << ec.message()
1290 << "\n";
1291 return;
1292 }
1293 err1Handler();
1294 });
1295}
1296
Jason M. Billscbf78532019-08-16 15:32:11 -07001297static void err2AssertHandler()
1298{
1299 // Handle the standard ERR2 detection and logging
1300 const static constexpr int err2 = 2;
1301 errXAssertHandler(err2, err2AssertTimer);
1302 // Also handle reset for ERR2
1303 err2AssertTimer.async_wait([](const boost::system::error_code ec) {
1304 if (ec)
1305 {
1306 // operation_aborted is expected if timer is canceled before
1307 // completion.
1308 if (ec != boost::asio::error::operation_aborted)
1309 {
1310 std::cerr << "err2 timeout async_wait failed: " << ec.message()
1311 << "\n";
1312 }
1313 return;
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001314 }
1315 conn->async_method_call(
1316 [](boost::system::error_code ec,
1317 const std::variant<bool>& property) {
1318 if (ec)
1319 {
1320 return;
1321 }
1322 const bool* reset = std::get_if<bool>(&property);
1323 if (reset == nullptr)
1324 {
1325 std::cerr << "Unable to read reset on ERR2 value\n";
1326 return;
1327 }
Jason M. Billsb61766b2019-11-26 17:02:44 -08001328 startCrashdumpAndRecovery(*reset, "ERR2 Timeout");
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001329 },
1330 "xyz.openbmc_project.Settings",
1331 "/xyz/openbmc_project/control/processor_error_config",
1332 "org.freedesktop.DBus.Properties", "Get",
1333 "xyz.openbmc_project.Control.Processor.ErrConfig", "ResetOnERR2");
Yong Li061eb032020-02-26 15:06:18 +08001334
1335 beep(beepCPUErr2);
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001336 });
1337}
1338
1339static void err2Handler()
1340{
1341 if (!hostOff)
1342 {
1343 gpiod::line_event gpioLineEvent = err2Line.event_read();
1344
1345 bool err2 = gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
1346 if (err2)
1347 {
1348 err2AssertHandler();
1349 }
1350 else
1351 {
1352 err2AssertTimer.cancel();
1353 }
1354 }
1355 err2Event.async_wait(boost::asio::posix::stream_descriptor::wait_read,
1356 [](const boost::system::error_code ec) {
1357 if (ec)
1358 {
1359 std::cerr
1360 << "err2 handler error: " << ec.message()
1361 << "\n";
1362 return;
1363 }
1364 err2Handler();
1365 });
1366}
1367
Jason M. Bills89922f82019-08-06 11:10:02 -07001368static void smiAssertHandler()
1369{
1370 smiAssertTimer.expires_after(std::chrono::milliseconds(smiTimeoutMs));
1371 smiAssertTimer.async_wait([](const boost::system::error_code ec) {
1372 if (ec)
1373 {
1374 // operation_aborted is expected if timer is canceled before
1375 // completion.
1376 if (ec != boost::asio::error::operation_aborted)
1377 {
1378 std::cerr << "smi timeout async_wait failed: " << ec.message()
1379 << "\n";
1380 }
1381 return;
1382 }
1383 std::cerr << "SMI asserted for " << std::to_string(smiTimeoutMs)
1384 << " ms\n";
1385 smiTimeoutLog();
1386 conn->async_method_call(
1387 [](boost::system::error_code ec,
1388 const std::variant<bool>& property) {
1389 if (ec)
1390 {
1391 return;
1392 }
1393 const bool* reset = std::get_if<bool>(&property);
1394 if (reset == nullptr)
1395 {
1396 std::cerr << "Unable to read reset on SMI value\n";
1397 return;
1398 }
Jason M. Bills94785442020-01-07 15:22:09 -08001399#ifdef HOST_ERROR_CRASHDUMP_ON_SMI_TIMEOUT
Jason M. Billsb61766b2019-11-26 17:02:44 -08001400 startCrashdumpAndRecovery(*reset, "SMI Timeout");
Jason M. Bills94785442020-01-07 15:22:09 -08001401#else
1402 if (*reset)
1403 {
1404 std::cout << "Recovering the system\n";
Jason M. Bills9a9bf982020-08-10 11:58:18 -07001405 startWarmReset();
Jason M. Bills94785442020-01-07 15:22:09 -08001406 }
1407#endif
Jason M. Bills89922f82019-08-06 11:10:02 -07001408 },
1409 "xyz.openbmc_project.Settings",
1410 "/xyz/openbmc_project/control/bmc_reset_disables",
1411 "org.freedesktop.DBus.Properties", "Get",
1412 "xyz.openbmc_project.Control.ResetDisables", "ResetOnSMI");
1413 });
1414}
1415
1416static void smiHandler()
1417{
1418 if (!hostOff)
1419 {
1420 gpiod::line_event gpioLineEvent = smiLine.event_read();
1421
1422 bool smi = gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
1423 if (smi)
1424 {
1425 smiAssertHandler();
1426 }
1427 else
1428 {
1429 smiAssertTimer.cancel();
1430 }
1431 }
1432 smiEvent.async_wait(boost::asio::posix::stream_descriptor::wait_read,
1433 [](const boost::system::error_code ec) {
1434 if (ec)
1435 {
1436 std::cerr
1437 << "smi handler error: " << ec.message()
1438 << "\n";
1439 return;
1440 }
1441 smiHandler();
1442 });
1443}
1444
Jason M. Billsa15c2522019-08-16 10:01:44 -07001445static void initializeErrorState()
1446{
jayaprakash Mutyala53099c42020-03-15 00:16:26 +00001447 // Handle CPU1_MISMATCH if it's asserted now
1448 if (cpu1MismatchLine.get_value() == 1)
1449 {
1450 cpuMismatchLog(1);
1451 }
1452
1453 // Handle CPU2_MISMATCH if it's asserted now
1454 if (cpu2MismatchLine.get_value() == 1)
1455 {
1456 cpuMismatchLog(2);
1457 }
1458
Jason M. Billsa15c2522019-08-16 10:01:44 -07001459 // Handle CPU_CATERR if it's asserted now
1460 if (caterrLine.get_value() == 0)
1461 {
1462 caterrAssertHandler();
Yong Li1429ca82020-04-27 16:49:45 +08001463 std::vector<Association> associations;
1464 associations.emplace_back(
1465 "", "critical", "/xyz/openbmc_project/host_error_monitor/cat_err");
1466 associations.emplace_back("", "critical", host_error_monitor::rootPath);
1467 host_error_monitor::associationCATAssert->set_property("Associations",
1468 associations);
Jason M. Billsa15c2522019-08-16 10:01:44 -07001469 }
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001470
Jason M. Bills8c584392019-08-19 11:05:51 -07001471 // Handle CPU_ERR0 if it's asserted now
1472 if (err0Line.get_value() == 0)
1473 {
1474 err0AssertHandler();
1475 }
1476
Jason M. Bills75af3962019-08-19 11:07:17 -07001477 // Handle CPU_ERR1 if it's asserted now
1478 if (err1Line.get_value() == 0)
1479 {
1480 err1AssertHandler();
1481 }
1482
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001483 // Handle CPU_ERR2 if it's asserted now
1484 if (err2Line.get_value() == 0)
1485 {
1486 err2AssertHandler();
1487 }
Jason M. Bills89922f82019-08-06 11:10:02 -07001488
1489 // Handle SMI if it's asserted now
1490 if (smiLine.get_value() == 0)
1491 {
1492 smiAssertHandler();
1493 }
Jason M. Bills08866542019-08-16 12:04:19 -07001494
Jason M. Billse94f5e12019-09-13 11:11:34 -07001495 // Handle CPU1_THERMTRIP if it's asserted now
1496 if (cpu1ThermtripLine.get_value() == 0)
1497 {
1498 cpu1ThermtripAssertHandler();
1499 }
1500
1501 // Handle CPU2_THERMTRIP if it's asserted now
1502 if (cpu2ThermtripLine.get_value() == 0)
1503 {
1504 cpu2ThermtripAssertHandler();
1505 }
1506
jayaprakash Mutyala009adbc2019-12-24 22:08:07 +00001507 // Handle CPU1_MEM_THERM_EVENT (CPU1 DIMM Thermal trip) if it's asserted now
1508 if (cpu1MemtripLine.get_value() == 0)
1509 {
1510 memThermTripLog(1);
1511 }
1512
1513 // Handle CPU2_MEM_THERM_EVENT (CPU2 DIMM Thermal trip) if it's asserted now
1514 if (cpu2MemtripLine.get_value() == 0)
1515 {
1516 memThermTripLog(2);
1517 }
1518
Jason M. Billse94f5e12019-09-13 11:11:34 -07001519 // Handle CPU1_VRHOT if it's asserted now
1520 if (cpu1VRHotLine.get_value() == 0)
1521 {
1522 cpu1VRHotAssertHandler();
1523 }
1524
1525 // Handle CPU1_MEM_ABCD_VRHOT if it's asserted now
1526 if (cpu1MemABCDVRHotLine.get_value() == 0)
1527 {
1528 cpu1MemABCDVRHotAssertHandler();
1529 }
1530
1531 // Handle CPU1_MEM_EFGH_VRHOT if it's asserted now
1532 if (cpu1MemEFGHVRHotLine.get_value() == 0)
1533 {
1534 cpu1MemEFGHVRHotAssertHandler();
1535 }
1536
1537 // Handle CPU2_VRHOT if it's asserted now
1538 if (cpu2VRHotLine.get_value() == 0)
1539 {
1540 cpu2VRHotAssertHandler();
1541 }
1542
1543 // Handle CPU2_MEM_ABCD_VRHOT if it's asserted now
1544 if (cpu2MemABCDVRHotLine.get_value() == 0)
1545 {
1546 cpu2MemABCDVRHotAssertHandler();
1547 }
1548
1549 // Handle CPU2_MEM_EFGH_VRHOT if it's asserted now
1550 if (cpu2MemEFGHVRHotLine.get_value() == 0)
1551 {
1552 cpu2MemEFGHVRHotAssertHandler();
1553 }
1554
Jason M. Bills08866542019-08-16 12:04:19 -07001555 // Handle PCH_BMC_THERMTRIP if it's asserted now
1556 if (pchThermtripLine.get_value() == 0)
1557 {
1558 ssbThermTripLog();
Yong Li1429ca82020-04-27 16:49:45 +08001559 std::vector<Association> associations;
1560 associations.emplace_back(
1561 "", "critical",
1562 "/xyz/openbmc_project/host_error_monitor/ssb_thermal_trip");
1563 associations.emplace_back("", "critical", host_error_monitor::rootPath);
1564 host_error_monitor::associationSSBThermTrip->set_property(
1565 "Associations", associations);
Jason M. Bills08866542019-08-16 12:04:19 -07001566 }
Jason M. Billsa15c2522019-08-16 10:01:44 -07001567}
Jason M. Bills1490b142019-07-01 15:48:43 -07001568} // namespace host_error_monitor
1569
1570int main(int argc, char* argv[])
1571{
1572 // setup connection to dbus
1573 host_error_monitor::conn =
1574 std::make_shared<sdbusplus::asio::connection>(host_error_monitor::io);
1575
Jason M. Billsc4b91f22019-11-26 17:04:50 -08001576 // Host Error Monitor Service
Jason M. Bills1490b142019-07-01 15:48:43 -07001577 host_error_monitor::conn->request_name(
1578 "xyz.openbmc_project.HostErrorMonitor");
1579 sdbusplus::asio::object_server server =
1580 sdbusplus::asio::object_server(host_error_monitor::conn);
1581
Yong Li1429ca82020-04-27 16:49:45 +08001582 // Associations interface for led status
1583 std::vector<host_error_monitor::Association> associations;
1584 associations.emplace_back("", "", "");
1585 host_error_monitor::associationSSBThermTrip = server.add_interface(
1586 "/xyz/openbmc_project/host_error_monitor/ssb_thermal_trip",
1587 "xyz.openbmc_project.Association.Definitions");
1588 host_error_monitor::associationSSBThermTrip->register_property(
1589 "Associations", associations);
1590 host_error_monitor::associationSSBThermTrip->initialize();
1591
1592 host_error_monitor::associationCATAssert = server.add_interface(
1593 "/xyz/openbmc_project/host_error_monitor/cat_assert",
1594 "xyz.openbmc_project.Association.Definitions");
1595 host_error_monitor::associationCATAssert->register_property("Associations",
1596 associations);
1597 host_error_monitor::associationCATAssert->initialize();
1598
Jason M. Billsc4b91f22019-11-26 17:04:50 -08001599 // Restart Cause Interface
1600 host_error_monitor::hostErrorTimeoutIface =
1601 server.add_interface("/xyz/openbmc_project/host_error_monitor",
1602 "xyz.openbmc_project.HostErrorMonitor.Timeout");
1603
1604 host_error_monitor::hostErrorTimeoutIface->register_property(
1605 "IERRTimeoutMs", host_error_monitor::caterrTimeoutMs,
1606 [](const std::size_t& requested, std::size_t& resp) {
1607 if (requested > host_error_monitor::caterrTimeoutMsMax)
1608 {
1609 std::cerr << "IERRTimeoutMs update to " << requested
1610 << "ms rejected. Cannot be greater than "
1611 << host_error_monitor::caterrTimeoutMsMax << "ms.\n";
1612 return 0;
1613 }
1614 std::cerr << "IERRTimeoutMs updated to " << requested << "ms\n";
1615 host_error_monitor::caterrTimeoutMs = requested;
1616 resp = requested;
1617 return 1;
1618 },
1619 [](std::size_t& resp) { return host_error_monitor::caterrTimeoutMs; });
1620 host_error_monitor::hostErrorTimeoutIface->initialize();
1621
Jason M. Bills1490b142019-07-01 15:48:43 -07001622 // Start tracking host state
1623 std::shared_ptr<sdbusplus::bus::match::match> hostStateMonitor =
1624 host_error_monitor::startHostStateMonitor();
1625
jayaprakash Mutyala53099c42020-03-15 00:16:26 +00001626 // Request CPU1_MISMATCH GPIO events
1627 if (!host_error_monitor::requestGPIOInput(
1628 "CPU1_MISMATCH", host_error_monitor::cpu1MismatchLine))
1629 {
1630 return -1;
1631 }
1632
1633 // Request CPU2_MISMATCH GPIO events
1634 if (!host_error_monitor::requestGPIOInput(
1635 "CPU2_MISMATCH", host_error_monitor::cpu2MismatchLine))
1636 {
1637 return -1;
1638 }
1639
Jason M. Bills1490b142019-07-01 15:48:43 -07001640 // Initialize the host state
1641 host_error_monitor::initializeHostState();
1642
1643 // Request CPU_CATERR GPIO events
1644 if (!host_error_monitor::requestGPIOEvents(
1645 "CPU_CATERR", host_error_monitor::caterrHandler,
1646 host_error_monitor::caterrLine, host_error_monitor::caterrEvent))
1647 {
1648 return -1;
1649 }
1650
Jason M. Bills8c584392019-08-19 11:05:51 -07001651 // Request CPU_ERR0 GPIO events
1652 if (!host_error_monitor::requestGPIOEvents(
1653 "CPU_ERR0", host_error_monitor::err0Handler,
1654 host_error_monitor::err0Line, host_error_monitor::err0Event))
1655 {
1656 return -1;
1657 }
1658
Jason M. Bills75af3962019-08-19 11:07:17 -07001659 // Request CPU_ERR1 GPIO events
1660 if (!host_error_monitor::requestGPIOEvents(
1661 "CPU_ERR1", host_error_monitor::err1Handler,
1662 host_error_monitor::err1Line, host_error_monitor::err1Event))
1663 {
1664 return -1;
1665 }
1666
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001667 // Request CPU_ERR2 GPIO events
1668 if (!host_error_monitor::requestGPIOEvents(
1669 "CPU_ERR2", host_error_monitor::err2Handler,
1670 host_error_monitor::err2Line, host_error_monitor::err2Event))
1671 {
1672 return -1;
1673 }
1674
Jason M. Bills89922f82019-08-06 11:10:02 -07001675 // Request SMI GPIO events
1676 if (!host_error_monitor::requestGPIOEvents(
1677 "SMI", host_error_monitor::smiHandler, host_error_monitor::smiLine,
1678 host_error_monitor::smiEvent))
1679 {
1680 return -1;
1681 }
1682
Jason M. Bills45e87e02019-09-09 14:45:38 -07001683 // Request CPU1_FIVR_FAULT GPIO input
1684 if (!host_error_monitor::requestGPIOInput(
1685 "CPU1_FIVR_FAULT", host_error_monitor::cpu1FIVRFaultLine))
1686 {
1687 return -1;
1688 }
1689
Jason M. Bills78c5eed2019-08-28 14:00:40 -07001690 // Request CPU1_THERMTRIP GPIO events
1691 if (!host_error_monitor::requestGPIOEvents(
1692 "CPU1_THERMTRIP", host_error_monitor::cpu1ThermtripHandler,
1693 host_error_monitor::cpu1ThermtripLine,
1694 host_error_monitor::cpu1ThermtripEvent))
1695 {
1696 return -1;
1697 }
1698
Jason M. Bills45e87e02019-09-09 14:45:38 -07001699 // Request CPU2_FIVR_FAULT GPIO input
1700 if (!host_error_monitor::requestGPIOInput(
1701 "CPU2_FIVR_FAULT", host_error_monitor::cpu2FIVRFaultLine))
1702 {
1703 return -1;
1704 }
1705
Jason M. Bills78c5eed2019-08-28 14:00:40 -07001706 // Request CPU2_THERMTRIP GPIO events
1707 if (!host_error_monitor::requestGPIOEvents(
1708 "CPU2_THERMTRIP", host_error_monitor::cpu2ThermtripHandler,
1709 host_error_monitor::cpu2ThermtripLine,
1710 host_error_monitor::cpu2ThermtripEvent))
1711 {
1712 return -1;
1713 }
1714
Jason M. Bills250fa632019-08-28 15:58:25 -07001715 // Request CPU1_VRHOT GPIO events
1716 if (!host_error_monitor::requestGPIOEvents(
1717 "CPU1_VRHOT", host_error_monitor::cpu1VRHotHandler,
1718 host_error_monitor::cpu1VRHotLine,
1719 host_error_monitor::cpu1VRHotEvent))
1720 {
1721 return -1;
1722 }
1723
Jason M. Bills9647ba72019-08-29 14:19:19 -07001724 // Request CPU1_MEM_ABCD_VRHOT GPIO events
1725 if (!host_error_monitor::requestGPIOEvents(
1726 "CPU1_MEM_ABCD_VRHOT", host_error_monitor::cpu1MemABCDVRHotHandler,
1727 host_error_monitor::cpu1MemABCDVRHotLine,
1728 host_error_monitor::cpu1MemABCDVRHotEvent))
1729 {
1730 return -1;
1731 }
1732
1733 // Request CPU1_MEM_EFGH_VRHOT GPIO events
1734 if (!host_error_monitor::requestGPIOEvents(
1735 "CPU1_MEM_EFGH_VRHOT", host_error_monitor::cpu1MemEFGHVRHotHandler,
1736 host_error_monitor::cpu1MemEFGHVRHotLine,
1737 host_error_monitor::cpu1MemEFGHVRHotEvent))
1738 {
1739 return -1;
1740 }
1741
Jason M. Bills250fa632019-08-28 15:58:25 -07001742 // Request CPU2_VRHOT GPIO events
1743 if (!host_error_monitor::requestGPIOEvents(
1744 "CPU2_VRHOT", host_error_monitor::cpu2VRHotHandler,
1745 host_error_monitor::cpu2VRHotLine,
1746 host_error_monitor::cpu2VRHotEvent))
1747 {
1748 return -1;
1749 }
1750
Jason M. Bills9647ba72019-08-29 14:19:19 -07001751 // Request CPU2_MEM_ABCD_VRHOT GPIO events
1752 if (!host_error_monitor::requestGPIOEvents(
1753 "CPU2_MEM_ABCD_VRHOT", host_error_monitor::cpu2MemABCDVRHotHandler,
1754 host_error_monitor::cpu2MemABCDVRHotLine,
1755 host_error_monitor::cpu2MemABCDVRHotEvent))
1756 {
1757 return -1;
1758 }
1759
1760 // Request CPU2_MEM_EFGH_VRHOT GPIO events
1761 if (!host_error_monitor::requestGPIOEvents(
1762 "CPU2_MEM_EFGH_VRHOT", host_error_monitor::cpu2MemEFGHVRHotHandler,
1763 host_error_monitor::cpu2MemEFGHVRHotLine,
1764 host_error_monitor::cpu2MemEFGHVRHotEvent))
1765 {
1766 return -1;
1767 }
1768
Chen Yugange6c0f1c2019-08-02 20:36:42 +08001769 // Request PCH_BMC_THERMTRIP GPIO events
1770 if (!host_error_monitor::requestGPIOEvents(
1771 "PCH_BMC_THERMTRIP", host_error_monitor::pchThermtripHandler,
1772 host_error_monitor::pchThermtripLine,
1773 host_error_monitor::pchThermtripEvent))
1774 {
1775 return -1;
1776 }
1777
jayaprakash Mutyala009adbc2019-12-24 22:08:07 +00001778 // Request CPU1_MEM_THERM_EVENT GPIO events
1779 if (!host_error_monitor::requestGPIOEvents(
1780 "CPU1_MEM_THERM_EVENT", host_error_monitor::cpu1MemtripHandler,
1781 host_error_monitor::cpu1MemtripLine,
1782 host_error_monitor::cpu1MemtripEvent))
1783 {
1784 return -1;
1785 }
1786
1787 // Request CPU2_MEM_THERM_EVENT GPIO events
1788 if (!host_error_monitor::requestGPIOEvents(
1789 "CPU2_MEM_THERM_EVENT", host_error_monitor::cpu2MemtripHandler,
1790 host_error_monitor::cpu2MemtripLine,
1791 host_error_monitor::cpu2MemtripEvent))
1792 {
1793 return -1;
1794 }
1795
Jason M. Bills1490b142019-07-01 15:48:43 -07001796 host_error_monitor::io.run();
1797
1798 return 0;
1799}