blob: 873348fe57869d838b5bd9e72305ed28f98d903f [file] [log] [blame]
Jason M. Bills1490b142019-07-01 15:48:43 -07001/*
2// Copyright (c) 2019 Intel Corporation
3//
4// Licensed under the Apache License, Version 2.0 (the "License");
5// you may not use this file except in compliance with the License.
6// You may obtain a copy of the License at
7//
8// http://www.apache.org/licenses/LICENSE-2.0
9//
10// Unless required by applicable law or agreed to in writing, software
11// distributed under the License is distributed on an "AS IS" BASIS,
12// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13// See the License for the specific language governing permissions and
14// limitations under the License.
15*/
Jason M. Bills6a2cb692019-08-06 11:03:49 -070016#include <peci.h>
Chen Yugange6c0f1c2019-08-02 20:36:42 +080017#include <systemd/sd-journal.h>
18
Jason M. Bills08b2c7a2020-08-28 15:39:14 -070019#include <boost/asio/io_service.hpp>
Jason M. Bills1490b142019-07-01 15:48:43 -070020#include <boost/asio/posix/stream_descriptor.hpp>
Jason M. Bills08b2c7a2020-08-28 15:39:14 -070021#include <boost/asio/steady_timer.hpp>
Jason M. Bills1490b142019-07-01 15:48:43 -070022#include <gpiod.hpp>
Jason M. Bills1490b142019-07-01 15:48:43 -070023#include <sdbusplus/asio/object_server.hpp>
Jason M. Bills48e5dff2020-06-10 13:47:47 -070024
25#include <bitset>
26#include <iostream>
Jason M. Billsd1a19f62019-08-06 11:52:58 -070027#include <variant>
Jason M. Bills1490b142019-07-01 15:48:43 -070028
29namespace host_error_monitor
30{
31static boost::asio::io_service io;
32static std::shared_ptr<sdbusplus::asio::connection> conn;
Jason M. Billsc4b91f22019-11-26 17:04:50 -080033static std::shared_ptr<sdbusplus::asio::dbus_interface> hostErrorTimeoutIface;
Jason M. Bills1490b142019-07-01 15:48:43 -070034
Yong Li1429ca82020-04-27 16:49:45 +080035using Association = std::tuple<std::string, std::string, std::string>;
36static std::shared_ptr<sdbusplus::asio::dbus_interface> associationSSBThermTrip;
37static std::shared_ptr<sdbusplus::asio::dbus_interface> associationCATAssert;
38
39static const constexpr char* rootPath = "/xyz/openbmc_project/CallbackManager";
40
Jason M. Bills1490b142019-07-01 15:48:43 -070041static bool hostOff = true;
42
Jason M. Billsc4b91f22019-11-26 17:04:50 -080043static size_t caterrTimeoutMs = 2000;
44const static constexpr size_t caterrTimeoutMsMax = 600000; // 10 minutes maximum
Jason M. Billscbf78532019-08-16 15:32:11 -070045const static constexpr size_t errTimeoutMs = 90000;
Jason M. Bills89922f82019-08-06 11:10:02 -070046const static constexpr size_t smiTimeoutMs = 90000;
Jason M. Bills1490b142019-07-01 15:48:43 -070047const static constexpr size_t crashdumpTimeoutS = 300;
48
49// Timers
50// Timer for CATERR asserted
51static boost::asio::steady_timer caterrAssertTimer(io);
Jason M. Bills8c584392019-08-19 11:05:51 -070052// Timer for ERR0 asserted
53static boost::asio::steady_timer err0AssertTimer(io);
Jason M. Bills75af3962019-08-19 11:07:17 -070054// Timer for ERR1 asserted
55static boost::asio::steady_timer err1AssertTimer(io);
Jason M. Bills6a2cb692019-08-06 11:03:49 -070056// Timer for ERR2 asserted
57static boost::asio::steady_timer err2AssertTimer(io);
Jason M. Bills89922f82019-08-06 11:10:02 -070058// Timer for SMI asserted
59static boost::asio::steady_timer smiAssertTimer(io);
Jason M. Bills1490b142019-07-01 15:48:43 -070060
61// GPIO Lines and Event Descriptors
62static gpiod::line caterrLine;
63static boost::asio::posix::stream_descriptor caterrEvent(io);
Jason M. Bills8c584392019-08-19 11:05:51 -070064static gpiod::line err0Line;
65static boost::asio::posix::stream_descriptor err0Event(io);
Jason M. Bills75af3962019-08-19 11:07:17 -070066static gpiod::line err1Line;
67static boost::asio::posix::stream_descriptor err1Event(io);
Jason M. Bills6a2cb692019-08-06 11:03:49 -070068static gpiod::line err2Line;
69static boost::asio::posix::stream_descriptor err2Event(io);
Jason M. Bills89922f82019-08-06 11:10:02 -070070static gpiod::line smiLine;
71static boost::asio::posix::stream_descriptor smiEvent(io);
Jason M. Bills45e87e02019-09-09 14:45:38 -070072static gpiod::line cpu1FIVRFaultLine;
Jason M. Bills78c5eed2019-08-28 14:00:40 -070073static gpiod::line cpu1ThermtripLine;
74static boost::asio::posix::stream_descriptor cpu1ThermtripEvent(io);
Jason M. Bills45e87e02019-09-09 14:45:38 -070075static gpiod::line cpu2FIVRFaultLine;
Jason M. Bills78c5eed2019-08-28 14:00:40 -070076static gpiod::line cpu2ThermtripLine;
77static boost::asio::posix::stream_descriptor cpu2ThermtripEvent(io);
Jason M. Bills250fa632019-08-28 15:58:25 -070078static gpiod::line cpu1VRHotLine;
79static boost::asio::posix::stream_descriptor cpu1VRHotEvent(io);
80static gpiod::line cpu2VRHotLine;
Jason M. Bills9647ba72019-08-29 14:19:19 -070081static boost::asio::posix::stream_descriptor cpu1MemABCDVRHotEvent(io);
82static gpiod::line cpu1MemEFGHVRHotLine;
83static boost::asio::posix::stream_descriptor cpu1MemEFGHVRHotEvent(io);
84static gpiod::line cpu2MemABCDVRHotLine;
Jason M. Bills250fa632019-08-28 15:58:25 -070085static boost::asio::posix::stream_descriptor cpu2VRHotEvent(io);
Jason M. Bills9647ba72019-08-29 14:19:19 -070086static gpiod::line cpu1MemABCDVRHotLine;
87static boost::asio::posix::stream_descriptor cpu2MemABCDVRHotEvent(io);
88static gpiod::line cpu2MemEFGHVRHotLine;
89static boost::asio::posix::stream_descriptor cpu2MemEFGHVRHotEvent(io);
Chen Yugange6c0f1c2019-08-02 20:36:42 +080090//----------------------------------
91// PCH_BMC_THERMTRIP function related definition
92//----------------------------------
Chen Yugange6c0f1c2019-08-02 20:36:42 +080093static gpiod::line pchThermtripLine;
94static boost::asio::posix::stream_descriptor pchThermtripEvent(io);
jayaprakash Mutyala009adbc2019-12-24 22:08:07 +000095//----------------------------------
96// CPU_MEM_THERM_EVENT function related definition
97//----------------------------------
98static gpiod::line cpu1MemtripLine;
99static boost::asio::posix::stream_descriptor cpu1MemtripEvent(io);
100static gpiod::line cpu2MemtripLine;
101static boost::asio::posix::stream_descriptor cpu2MemtripEvent(io);
jayaprakash Mutyala53099c42020-03-15 00:16:26 +0000102//---------------------------------
103// CPU_MISMATCH function related definition
104//---------------------------------
105static gpiod::line cpu1MismatchLine;
106static gpiod::line cpu2MismatchLine;
Jason M. Bills1490b142019-07-01 15:48:43 -0700107
Yong Li061eb032020-02-26 15:06:18 +0800108// beep function for CPU error
Yong Li8c798c72020-04-22 15:29:07 +0800109const static constexpr uint8_t beepCPUIERR = 4;
Yong Li061eb032020-02-26 15:06:18 +0800110const static constexpr uint8_t beepCPUErr2 = 5;
111
112static void beep(const uint8_t& beepPriority)
113{
114 conn->async_method_call(
115 [](boost::system::error_code ec) {
116 if (ec)
117 {
118 std::cerr << "beep returned error with "
119 "async_method_call (ec = "
120 << ec << ")\n";
121 return;
122 }
123 },
124 "xyz.openbmc_project.BeepCode", "/xyz/openbmc_project/BeepCode",
125 "xyz.openbmc_project.BeepCode", "Beep", uint8_t(beepPriority));
126}
127
Jason M. Billsa3397932019-08-06 11:07:21 -0700128static void cpuIERRLog()
129{
130 sd_journal_send("MESSAGE=HostError: IERR", "PRIORITY=%i", LOG_INFO,
131 "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError",
132 "REDFISH_MESSAGE_ARGS=%s", "IERR", NULL);
133}
134
135static void cpuIERRLog(const int cpuNum)
136{
137 std::string msg = "IERR on CPU " + std::to_string(cpuNum + 1);
138
139 sd_journal_send("MESSAGE=HostError: %s", msg.c_str(), "PRIORITY=%i",
140 LOG_INFO, "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError",
141 "REDFISH_MESSAGE_ARGS=%s", msg.c_str(), NULL);
142}
143
144static void cpuIERRLog(const int cpuNum, const std::string& type)
145{
146 std::string msg = type + " IERR on CPU " + std::to_string(cpuNum + 1);
147
148 sd_journal_send("MESSAGE=HostError: %s", msg.c_str(), "PRIORITY=%i",
149 LOG_INFO, "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError",
150 "REDFISH_MESSAGE_ARGS=%s", msg.c_str(), NULL);
151}
152
Jason M. Billscbf78532019-08-16 15:32:11 -0700153static void cpuERRXLog(const int errPin)
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700154{
Jason M. Billscbf78532019-08-16 15:32:11 -0700155 std::string msg = "ERR" + std::to_string(errPin) + " Timeout";
156
157 sd_journal_send("MESSAGE=HostError: %s", msg.c_str(), "PRIORITY=%i",
158 LOG_INFO, "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError",
159 "REDFISH_MESSAGE_ARGS=%s", msg.c_str(), NULL);
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700160}
161
Jason M. Billscbf78532019-08-16 15:32:11 -0700162static void cpuERRXLog(const int errPin, const int cpuNum)
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700163{
Jason M. Billscbf78532019-08-16 15:32:11 -0700164 std::string msg = "ERR" + std::to_string(errPin) + " Timeout on CPU " +
165 std::to_string(cpuNum + 1);
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700166
167 sd_journal_send("MESSAGE=HostError: %s", msg.c_str(), "PRIORITY=%i",
168 LOG_INFO, "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError",
169 "REDFISH_MESSAGE_ARGS=%s", msg.c_str(), NULL);
170}
171
Jason M. Bills89922f82019-08-06 11:10:02 -0700172static void smiTimeoutLog()
173{
174 sd_journal_send("MESSAGE=HostError: SMI Timeout", "PRIORITY=%i", LOG_INFO,
175 "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError",
176 "REDFISH_MESSAGE_ARGS=%s", "SMI Timeout", NULL);
177}
178
Jason M. Bills45e87e02019-09-09 14:45:38 -0700179static void cpuBootFIVRFaultLog(const int cpuNum)
180{
181 std::string msg = "Boot FIVR Fault on CPU " + std::to_string(cpuNum);
182
183 sd_journal_send("MESSAGE=HostError: %s", msg.c_str(), "PRIORITY=%i",
184 LOG_INFO, "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError",
185 "REDFISH_MESSAGE_ARGS=%s", msg.c_str(), NULL);
186}
187
Jason M. Bills78c5eed2019-08-28 14:00:40 -0700188static void cpuThermTripLog(const int cpuNum)
189{
190 std::string msg = "CPU " + std::to_string(cpuNum) + " thermal trip";
191
192 sd_journal_send("MESSAGE=HostError: %s", msg.c_str(), "PRIORITY=%i",
193 LOG_INFO, "REDFISH_MESSAGE_ID=%s",
194 "OpenBMC.0.1.CPUThermalTrip", "REDFISH_MESSAGE_ARGS=%d",
195 cpuNum, NULL);
196}
197
jayaprakash Mutyala009adbc2019-12-24 22:08:07 +0000198static void memThermTripLog(const int cpuNum)
199{
200 std::string cpuNumber = "CPU " + std::to_string(cpuNum);
201 std::string msg = cpuNumber + " Memory Thermal trip.";
202
203 sd_journal_send("MESSAGE=HostError: %s", msg.c_str(), "PRIORITY=%i",
204 LOG_ERR, "REDFISH_MESSAGE_ID=%s",
205 "OpenBMC.0.1.MemoryThermTrip", "REDFISH_MESSAGE_ARGS=%s",
206 cpuNumber.c_str(), NULL);
207}
208
jayaprakash Mutyala53099c42020-03-15 00:16:26 +0000209static void cpuMismatchLog(const int cpuNum)
210{
211 std::string msg = "CPU " + std::to_string(cpuNum) + " mismatch";
212
213 sd_journal_send("MESSAGE= %s", msg.c_str(), "PRIORITY=%i", LOG_ERR,
214 "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUMismatch",
215 "REDFISH_MESSAGE_ARGS=%d", cpuNum, NULL);
216}
217
Jason M. Bills250fa632019-08-28 15:58:25 -0700218static void cpuVRHotLog(const std::string& vr)
219{
220 std::string msg = vr + " Voltage Regulator Overheated.";
221
222 sd_journal_send("MESSAGE=HostError: %s", msg.c_str(), "PRIORITY=%i",
223 LOG_INFO, "REDFISH_MESSAGE_ID=%s",
224 "OpenBMC.0.1.VoltageRegulatorOverheated",
225 "REDFISH_MESSAGE_ARGS=%s", vr.c_str(), NULL);
226}
227
Jason M. Bills08866542019-08-16 12:04:19 -0700228static void ssbThermTripLog()
229{
230 sd_journal_send("MESSAGE=HostError: SSB thermal trip", "PRIORITY=%i",
231 LOG_INFO, "REDFISH_MESSAGE_ID=%s",
232 "OpenBMC.0.1.SsbThermalTrip", NULL);
233}
234
Jason M. Billsa15c2522019-08-16 10:01:44 -0700235static void initializeErrorState();
Jason M. Bills1490b142019-07-01 15:48:43 -0700236static void initializeHostState()
237{
238 conn->async_method_call(
239 [](boost::system::error_code ec,
240 const std::variant<std::string>& property) {
241 if (ec)
242 {
243 return;
244 }
245 const std::string* state = std::get_if<std::string>(&property);
246 if (state == nullptr)
247 {
248 std::cerr << "Unable to read host state value\n";
249 return;
250 }
251 hostOff = *state == "xyz.openbmc_project.State.Host.HostState.Off";
Jason M. Billsa15c2522019-08-16 10:01:44 -0700252 // If the system is on, initialize the error state
253 if (!hostOff)
254 {
255 initializeErrorState();
256 }
Jason M. Bills1490b142019-07-01 15:48:43 -0700257 },
258 "xyz.openbmc_project.State.Host", "/xyz/openbmc_project/state/host0",
259 "org.freedesktop.DBus.Properties", "Get",
260 "xyz.openbmc_project.State.Host", "CurrentHostState");
261}
262
263static std::shared_ptr<sdbusplus::bus::match::match> startHostStateMonitor()
264{
265 return std::make_shared<sdbusplus::bus::match::match>(
266 *conn,
267 "type='signal',interface='org.freedesktop.DBus.Properties',"
Jason M. Bills2fbb9ea2020-06-19 14:46:54 -0700268 "member='PropertiesChanged',arg0='xyz.openbmc_project.State.Host'",
Jason M. Bills1490b142019-07-01 15:48:43 -0700269 [](sdbusplus::message::message& msg) {
270 std::string interfaceName;
271 boost::container::flat_map<std::string, std::variant<std::string>>
272 propertiesChanged;
Jason M. Bills1490b142019-07-01 15:48:43 -0700273 try
274 {
275 msg.read(interfaceName, propertiesChanged);
Jason M. Bills1490b142019-07-01 15:48:43 -0700276 }
277 catch (std::exception& e)
278 {
279 std::cerr << "Unable to read host state\n";
280 return;
281 }
Jason M. Bills566ccc42020-06-18 16:38:26 -0700282 // We only want to check for CurrentHostState
283 if (propertiesChanged.begin()->first != "CurrentHostState")
284 {
285 return;
286 }
287 std::string* state =
288 std::get_if<std::string>(&(propertiesChanged.begin()->second));
289 if (state == nullptr)
290 {
291 std::cerr << propertiesChanged.begin()->first
292 << " property invalid\n";
293 return;
294 }
295
296 hostOff = *state == "xyz.openbmc_project.State.Host.HostState.Off";
Jason M. Bills1490b142019-07-01 15:48:43 -0700297
Jason M. Bills1490b142019-07-01 15:48:43 -0700298 if (hostOff)
299 {
Jason M. Billse94f5e12019-09-13 11:11:34 -0700300 // No host events should fire while off, so cancel any pending
301 // timers
Jason M. Bills1490b142019-07-01 15:48:43 -0700302 caterrAssertTimer.cancel();
Jason M. Bills8c584392019-08-19 11:05:51 -0700303 err0AssertTimer.cancel();
Jason M. Bills75af3962019-08-19 11:07:17 -0700304 err1AssertTimer.cancel();
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700305 err2AssertTimer.cancel();
Jason M. Bills89922f82019-08-06 11:10:02 -0700306 smiAssertTimer.cancel();
Jason M. Bills1490b142019-07-01 15:48:43 -0700307 }
Jason M. Billse94f5e12019-09-13 11:11:34 -0700308 else
309 {
310 // Handle any initial errors when the host turns on
311 initializeErrorState();
312 }
Jason M. Bills1490b142019-07-01 15:48:43 -0700313 });
314}
315
316static bool requestGPIOEvents(
317 const std::string& name, const std::function<void()>& handler,
318 gpiod::line& gpioLine,
319 boost::asio::posix::stream_descriptor& gpioEventDescriptor)
320{
321 // Find the GPIO line
322 gpioLine = gpiod::find_line(name);
323 if (!gpioLine)
324 {
325 std::cerr << "Failed to find the " << name << " line\n";
326 return false;
327 }
328
329 try
330 {
331 gpioLine.request(
332 {"host-error-monitor", gpiod::line_request::EVENT_BOTH_EDGES});
333 }
334 catch (std::exception&)
335 {
336 std::cerr << "Failed to request events for " << name << "\n";
337 return false;
338 }
339
340 int gpioLineFd = gpioLine.event_get_fd();
341 if (gpioLineFd < 0)
342 {
343 std::cerr << "Failed to get " << name << " fd\n";
344 return false;
345 }
346
347 gpioEventDescriptor.assign(gpioLineFd);
348
349 gpioEventDescriptor.async_wait(
350 boost::asio::posix::stream_descriptor::wait_read,
351 [&name, handler](const boost::system::error_code ec) {
352 if (ec)
353 {
354 std::cerr << name << " fd handler error: " << ec.message()
355 << "\n";
356 return;
357 }
358 handler();
359 });
360 return true;
361}
362
Jason M. Bills45e87e02019-09-09 14:45:38 -0700363static bool requestGPIOInput(const std::string& name, gpiod::line& gpioLine)
364{
365 // Find the GPIO line
366 gpioLine = gpiod::find_line(name);
367 if (!gpioLine)
368 {
369 std::cerr << "Failed to find the " << name << " line.\n";
370 return false;
371 }
372
373 // Request GPIO input
374 try
375 {
376 gpioLine.request({__FUNCTION__, gpiod::line_request::DIRECTION_INPUT});
377 }
378 catch (std::exception&)
379 {
380 std::cerr << "Failed to request " << name << " input\n";
381 return false;
382 }
383
384 return true;
385}
386
Jason M. Bills1490b142019-07-01 15:48:43 -0700387static void startPowerCycle()
388{
389 conn->async_method_call(
390 [](boost::system::error_code ec) {
391 if (ec)
392 {
393 std::cerr << "failed to set Chassis State\n";
394 }
395 },
396 "xyz.openbmc_project.State.Chassis",
397 "/xyz/openbmc_project/state/chassis0",
398 "org.freedesktop.DBus.Properties", "Set",
399 "xyz.openbmc_project.State.Chassis", "RequestedPowerTransition",
400 std::variant<std::string>{
401 "xyz.openbmc_project.State.Chassis.Transition.PowerCycle"});
402}
403
Jason M. Bills9a9bf982020-08-10 11:58:18 -0700404static void startWarmReset()
405{
406 conn->async_method_call(
407 [](boost::system::error_code ec) {
408 if (ec)
409 {
410 std::cerr << "failed to set Host State\n";
411 }
412 },
413 "xyz.openbmc_project.State.Host", "/xyz/openbmc_project/state/host0",
414 "org.freedesktop.DBus.Properties", "Set",
415 "xyz.openbmc_project.State.Host", "RequestedHostTransition",
416 std::variant<std::string>{
417 "xyz.openbmc_project.State.Host.Transition.ForceWarmReboot"});
418}
419
Jason M. Billsb61766b2019-11-26 17:02:44 -0800420static void startCrashdumpAndRecovery(bool recoverSystem,
421 const std::string& triggerType)
Jason M. Bills1490b142019-07-01 15:48:43 -0700422{
Jason M. Billsd69549b2020-08-27 11:42:43 -0700423 std::cerr << "Starting crashdump\n";
Jason M. Bills1490b142019-07-01 15:48:43 -0700424 static std::shared_ptr<sdbusplus::bus::match::match> crashdumpCompleteMatch;
425 static boost::asio::steady_timer crashdumpTimer(io);
426
427 crashdumpCompleteMatch = std::make_shared<sdbusplus::bus::match::match>(
428 *conn,
429 "type='signal',interface='org.freedesktop.DBus.Properties',"
430 "member='PropertiesChanged',arg0namespace='com.intel.crashdump'",
431 [recoverSystem](sdbusplus::message::message& msg) {
432 crashdumpTimer.cancel();
Jason M. Billsd69549b2020-08-27 11:42:43 -0700433 std::cerr << "Crashdump completed\n";
Jason M. Bills1490b142019-07-01 15:48:43 -0700434 if (recoverSystem)
435 {
Jason M. Billsd69549b2020-08-27 11:42:43 -0700436 std::cerr << "Recovering the system\n";
Jason M. Bills9a9bf982020-08-10 11:58:18 -0700437 startWarmReset();
Jason M. Bills1490b142019-07-01 15:48:43 -0700438 }
439 crashdumpCompleteMatch.reset();
440 });
441
442 crashdumpTimer.expires_after(std::chrono::seconds(crashdumpTimeoutS));
443 crashdumpTimer.async_wait([](const boost::system::error_code ec) {
444 if (ec)
445 {
446 // operation_aborted is expected if timer is canceled
447 if (ec != boost::asio::error::operation_aborted)
448 {
449 std::cerr << "Crashdump async_wait failed: " << ec.message()
450 << "\n";
451 }
Jason M. Billsd69549b2020-08-27 11:42:43 -0700452 std::cerr << "Crashdump timer canceled\n";
Jason M. Bills1490b142019-07-01 15:48:43 -0700453 return;
454 }
455 std::cerr << "Crashdump failed to complete before timeout\n";
456 crashdumpCompleteMatch.reset();
457 });
458
459 conn->async_method_call(
460 [](boost::system::error_code ec) {
461 if (ec)
462 {
463 std::cerr << "failed to start Crashdump\n";
464 crashdumpTimer.cancel();
Jason M. Bills1490b142019-07-01 15:48:43 -0700465 }
466 },
467 "com.intel.crashdump", "/com/intel/crashdump",
Jason M. Billsb61766b2019-11-26 17:02:44 -0800468 "com.intel.crashdump.Stored", "GenerateStoredLog", triggerType);
Jason M. Bills1490b142019-07-01 15:48:43 -0700469}
470
Jason M. Billsd1a19f62019-08-06 11:52:58 -0700471static void incrementCPUErrorCount(int cpuNum)
472{
473 std::string propertyName = "ErrorCountCPU" + std::to_string(cpuNum + 1);
474
475 // Get the current count
476 conn->async_method_call(
477 [propertyName](boost::system::error_code ec,
478 const std::variant<uint8_t>& property) {
479 if (ec)
480 {
481 std::cerr << "Failed to read " << propertyName << ": "
482 << ec.message() << "\n";
483 return;
484 }
485 const uint8_t* errorCountVariant = std::get_if<uint8_t>(&property);
486 if (errorCountVariant == nullptr)
487 {
488 std::cerr << propertyName << " invalid\n";
489 return;
490 }
491 uint8_t errorCount = *errorCountVariant;
492 if (errorCount == std::numeric_limits<uint8_t>::max())
493 {
494 std::cerr << "Maximum error count reached\n";
495 return;
496 }
497 // Increment the count
498 errorCount++;
499 conn->async_method_call(
500 [propertyName](boost::system::error_code ec) {
501 if (ec)
502 {
503 std::cerr << "Failed to set " << propertyName << ": "
504 << ec.message() << "\n";
505 }
506 },
507 "xyz.openbmc_project.Settings",
508 "/xyz/openbmc_project/control/processor_error_config",
509 "org.freedesktop.DBus.Properties", "Set",
510 "xyz.openbmc_project.Control.Processor.ErrConfig", propertyName,
511 std::variant<uint8_t>{errorCount});
512 },
513 "xyz.openbmc_project.Settings",
514 "/xyz/openbmc_project/control/processor_error_config",
515 "org.freedesktop.DBus.Properties", "Get",
516 "xyz.openbmc_project.Control.Processor.ErrConfig", propertyName);
517}
518
Jason M. Billsa3397932019-08-06 11:07:21 -0700519static bool checkIERRCPUs()
520{
521 bool cpuIERRFound = false;
522 for (int cpu = 0, addr = MIN_CLIENT_ADDR; addr <= MAX_CLIENT_ADDR;
523 cpu++, addr++)
524 {
525 uint8_t cc = 0;
526 CPUModel model{};
527 uint8_t stepping = 0;
528 if (peci_GetCPUID(addr, &model, &stepping, &cc) != PECI_CC_SUCCESS)
529 {
530 std::cerr << "Cannot get CPUID!\n";
531 continue;
532 }
533
534 switch (model)
535 {
536 case skx:
537 {
538 // First check the MCA_ERR_SRC_LOG to see if this is the CPU
539 // that caused the IERR
540 uint32_t mcaErrSrcLog = 0;
541 if (peci_RdPkgConfig(addr, 0, 5, 4, (uint8_t*)&mcaErrSrcLog,
542 &cc) != PECI_CC_SUCCESS)
543 {
544 continue;
545 }
546 // Check MSMI_INTERNAL (20) and IERR_INTERNAL (27)
547 if ((mcaErrSrcLog & (1 << 20)) || (mcaErrSrcLog & (1 << 27)))
548 {
549 // TODO: Light the CPU fault LED?
550 cpuIERRFound = true;
Jason M. Billsd1a19f62019-08-06 11:52:58 -0700551 incrementCPUErrorCount(cpu);
Jason M. Billsa3397932019-08-06 11:07:21 -0700552 // Next check if it's a CPU/VR mismatch by reading the
553 // IA32_MC4_STATUS MSR (0x411)
554 uint64_t mc4Status = 0;
555 if (peci_RdIAMSR(addr, 0, 0x411, &mc4Status, &cc) !=
556 PECI_CC_SUCCESS)
557 {
558 continue;
559 }
560 // Check MSEC bits 31:24 for
561 // MCA_SVID_VCCIN_VR_ICC_MAX_FAILURE (0x40),
562 // MCA_SVID_VCCIN_VR_VOUT_FAILURE (0x42), or
563 // MCA_SVID_CPU_VR_CAPABILITY_ERROR (0x43)
564 if ((mc4Status & (0x40 << 24)) ||
565 (mc4Status & (0x42 << 24)) ||
566 (mc4Status & (0x43 << 24)))
567 {
568 cpuIERRLog(cpu, "CPU/VR Mismatch");
569 continue;
570 }
571
572 // Next check if it's a Core FIVR fault by looking for a
573 // non-zero value of CORE_FIVR_ERR_LOG (B(1) D30 F2 offset
574 // 80h)
575 uint32_t coreFIVRErrLog = 0;
576 if (peci_RdPCIConfigLocal(
577 addr, 1, 30, 2, 0x80, sizeof(uint32_t),
578 (uint8_t*)&coreFIVRErrLog, &cc) != PECI_CC_SUCCESS)
579 {
580 continue;
581 }
582 if (coreFIVRErrLog)
583 {
584 cpuIERRLog(cpu, "Core FIVR Fault");
585 continue;
586 }
587
588 // Next check if it's an Uncore FIVR fault by looking for a
589 // non-zero value of UNCORE_FIVR_ERR_LOG (B(1) D30 F2 offset
590 // 84h)
591 uint32_t uncoreFIVRErrLog = 0;
592 if (peci_RdPCIConfigLocal(addr, 1, 30, 2, 0x84,
593 sizeof(uint32_t),
594 (uint8_t*)&uncoreFIVRErrLog,
595 &cc) != PECI_CC_SUCCESS)
596 {
597 continue;
598 }
599 if (uncoreFIVRErrLog)
600 {
601 cpuIERRLog(cpu, "Uncore FIVR Fault");
602 continue;
603 }
604
605 // Last if CORE_FIVR_ERR_LOG and UNCORE_FIVR_ERR_LOG are
606 // both zero, but MSEC bits 31:24 have either
607 // MCA_FIVR_CATAS_OVERVOL_FAULT (0x51) or
608 // MCA_FIVR_CATAS_OVERCUR_FAULT (0x52), then log it as an
609 // uncore FIVR fault
610 if (!coreFIVRErrLog && !uncoreFIVRErrLog &&
611 ((mc4Status & (0x51 << 24)) ||
612 (mc4Status & (0x52 << 24))))
613 {
614 cpuIERRLog(cpu, "Uncore FIVR Fault");
615 continue;
616 }
617 cpuIERRLog(cpu);
618 }
619 break;
620 }
621 case icx:
622 {
623 // First check the MCA_ERR_SRC_LOG to see if this is the CPU
624 // that caused the IERR
625 uint32_t mcaErrSrcLog = 0;
626 if (peci_RdPkgConfig(addr, 0, 5, 4, (uint8_t*)&mcaErrSrcLog,
627 &cc) != PECI_CC_SUCCESS)
628 {
629 continue;
630 }
631 // Check MSMI_INTERNAL (20) and IERR_INTERNAL (27)
632 if ((mcaErrSrcLog & (1 << 20)) || (mcaErrSrcLog & (1 << 27)))
633 {
634 // TODO: Light the CPU fault LED?
635 cpuIERRFound = true;
Jason M. Billsd1a19f62019-08-06 11:52:58 -0700636 incrementCPUErrorCount(cpu);
Jason M. Billsa3397932019-08-06 11:07:21 -0700637 // Next check if it's a CPU/VR mismatch by reading the
638 // IA32_MC4_STATUS MSR (0x411)
639 uint64_t mc4Status = 0;
640 if (peci_RdIAMSR(addr, 0, 0x411, &mc4Status, &cc) !=
641 PECI_CC_SUCCESS)
642 {
643 continue;
644 }
645 // TODO: Update MSEC/MSCOD_31_24 check
646 // Check MSEC bits 31:24 for
647 // MCA_SVID_VCCIN_VR_ICC_MAX_FAILURE (0x40),
648 // MCA_SVID_VCCIN_VR_VOUT_FAILURE (0x42), or
649 // MCA_SVID_CPU_VR_CAPABILITY_ERROR (0x43)
650 if ((mc4Status & (0x40 << 24)) ||
651 (mc4Status & (0x42 << 24)) ||
652 (mc4Status & (0x43 << 24)))
653 {
654 cpuIERRLog(cpu, "CPU/VR Mismatch");
655 continue;
656 }
657
658 // Next check if it's a Core FIVR fault by looking for a
659 // non-zero value of CORE_FIVR_ERR_LOG (B(31) D30 F2 offsets
660 // C0h and C4h) (Note: Bus 31 is accessed on PECI as bus 14)
661 uint32_t coreFIVRErrLog0 = 0;
662 uint32_t coreFIVRErrLog1 = 0;
663 if (peci_RdEndPointConfigPciLocal(
664 addr, 0, 14, 30, 2, 0xC0, sizeof(uint32_t),
665 (uint8_t*)&coreFIVRErrLog0, &cc) != PECI_CC_SUCCESS)
666 {
667 continue;
668 }
669 if (peci_RdEndPointConfigPciLocal(
670 addr, 0, 14, 30, 2, 0xC4, sizeof(uint32_t),
671 (uint8_t*)&coreFIVRErrLog1, &cc) != PECI_CC_SUCCESS)
672 {
673 continue;
674 }
675 if (coreFIVRErrLog0 || coreFIVRErrLog1)
676 {
677 cpuIERRLog(cpu, "Core FIVR Fault");
678 continue;
679 }
680
681 // Next check if it's an Uncore FIVR fault by looking for a
682 // non-zero value of UNCORE_FIVR_ERR_LOG (B(31) D30 F2
683 // offset 84h) (Note: Bus 31 is accessed on PECI as bus 14)
684 uint32_t uncoreFIVRErrLog = 0;
685 if (peci_RdEndPointConfigPciLocal(
686 addr, 0, 14, 30, 2, 0x84, sizeof(uint32_t),
687 (uint8_t*)&uncoreFIVRErrLog,
688 &cc) != PECI_CC_SUCCESS)
689 {
690 continue;
691 }
692 if (uncoreFIVRErrLog)
693 {
694 cpuIERRLog(cpu, "Uncore FIVR Fault");
695 continue;
696 }
697
698 // TODO: Update MSEC/MSCOD_31_24 check
699 // Last if CORE_FIVR_ERR_LOG and UNCORE_FIVR_ERR_LOG are
700 // both zero, but MSEC bits 31:24 have either
701 // MCA_FIVR_CATAS_OVERVOL_FAULT (0x51) or
702 // MCA_FIVR_CATAS_OVERCUR_FAULT (0x52), then log it as an
703 // uncore FIVR fault
704 if (!coreFIVRErrLog0 && !coreFIVRErrLog1 &&
705 !uncoreFIVRErrLog &&
706 ((mc4Status & (0x51 << 24)) ||
707 (mc4Status & (0x52 << 24))))
708 {
709 cpuIERRLog(cpu, "Uncore FIVR Fault");
710 continue;
711 }
712 cpuIERRLog(cpu);
713 }
714 break;
715 }
716 }
717 }
718 return cpuIERRFound;
719}
720
Jason M. Billsa15c2522019-08-16 10:01:44 -0700721static void caterrAssertHandler()
722{
Jason M. Billsa15c2522019-08-16 10:01:44 -0700723 caterrAssertTimer.expires_after(std::chrono::milliseconds(caterrTimeoutMs));
724 caterrAssertTimer.async_wait([](const boost::system::error_code ec) {
725 if (ec)
726 {
727 // operation_aborted is expected if timer is canceled
728 // before completion.
729 if (ec != boost::asio::error::operation_aborted)
730 {
731 std::cerr << "caterr timeout async_wait failed: "
732 << ec.message() << "\n";
733 }
Jason M. Billsa15c2522019-08-16 10:01:44 -0700734 return;
735 }
Jason M. Billsa3397932019-08-06 11:07:21 -0700736 std::cerr << "CATERR asserted for " << std::to_string(caterrTimeoutMs)
737 << " ms\n";
Yong Li8c798c72020-04-22 15:29:07 +0800738 beep(beepCPUIERR);
Jason M. Billsa3397932019-08-06 11:07:21 -0700739 if (!checkIERRCPUs())
740 {
741 cpuIERRLog();
742 }
Jason M. Billsa15c2522019-08-16 10:01:44 -0700743 conn->async_method_call(
744 [](boost::system::error_code ec,
745 const std::variant<bool>& property) {
746 if (ec)
747 {
748 return;
749 }
750 const bool* reset = std::get_if<bool>(&property);
751 if (reset == nullptr)
752 {
753 std::cerr << "Unable to read reset on CATERR value\n";
754 return;
755 }
Jason M. Billsb61766b2019-11-26 17:02:44 -0800756 startCrashdumpAndRecovery(*reset, "IERR");
Jason M. Billsa15c2522019-08-16 10:01:44 -0700757 },
758 "xyz.openbmc_project.Settings",
759 "/xyz/openbmc_project/control/processor_error_config",
760 "org.freedesktop.DBus.Properties", "Get",
761 "xyz.openbmc_project.Control.Processor.ErrConfig", "ResetOnCATERR");
762 });
763}
764
Jason M. Bills1490b142019-07-01 15:48:43 -0700765static void caterrHandler()
766{
767 if (!hostOff)
768 {
769 gpiod::line_event gpioLineEvent = caterrLine.event_read();
770
771 bool caterr =
772 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
Yong Li1429ca82020-04-27 16:49:45 +0800773
774 std::vector<Association> associations;
Jason M. Bills1490b142019-07-01 15:48:43 -0700775 if (caterr)
776 {
Jason M. Billsa15c2522019-08-16 10:01:44 -0700777 caterrAssertHandler();
Yong Li1429ca82020-04-27 16:49:45 +0800778 associations.emplace_back(
779 "", "critical",
780 "/xyz/openbmc_project/host_error_monitor/cat_error");
781 associations.emplace_back("", "critical",
782 host_error_monitor::rootPath);
Jason M. Bills1490b142019-07-01 15:48:43 -0700783 }
784 else
785 {
786 caterrAssertTimer.cancel();
Yong Li1429ca82020-04-27 16:49:45 +0800787 associations.emplace_back("", "", "");
Jason M. Bills1490b142019-07-01 15:48:43 -0700788 }
Yong Li1429ca82020-04-27 16:49:45 +0800789 host_error_monitor::associationCATAssert->set_property("Associations",
790 associations);
Jason M. Bills1490b142019-07-01 15:48:43 -0700791 }
792 caterrEvent.async_wait(boost::asio::posix::stream_descriptor::wait_read,
793 [](const boost::system::error_code ec) {
794 if (ec)
795 {
796 std::cerr << "caterr handler error: "
797 << ec.message() << "\n";
798 return;
799 }
800 caterrHandler();
801 });
802}
Jason M. Bills78c5eed2019-08-28 14:00:40 -0700803
Jason M. Billse94f5e12019-09-13 11:11:34 -0700804static void cpu1ThermtripAssertHandler()
805{
Jason M. Bills45e87e02019-09-09 14:45:38 -0700806 if (cpu1FIVRFaultLine.get_value() == 0)
807 {
808 cpuBootFIVRFaultLog(1);
809 }
810 else
811 {
812 cpuThermTripLog(1);
813 }
Jason M. Billse94f5e12019-09-13 11:11:34 -0700814}
815
Jason M. Bills78c5eed2019-08-28 14:00:40 -0700816static void cpu1ThermtripHandler()
817{
Jason M. Bills84951142020-04-17 15:57:11 -0700818 gpiod::line_event gpioLineEvent = cpu1ThermtripLine.event_read();
Jason M. Bills78c5eed2019-08-28 14:00:40 -0700819
Jason M. Bills84951142020-04-17 15:57:11 -0700820 bool cpu1Thermtrip =
821 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
822 if (cpu1Thermtrip)
823 {
824 cpu1ThermtripAssertHandler();
Jason M. Bills78c5eed2019-08-28 14:00:40 -0700825 }
Jason M. Bills84951142020-04-17 15:57:11 -0700826
Jason M. Bills78c5eed2019-08-28 14:00:40 -0700827 cpu1ThermtripEvent.async_wait(
828 boost::asio::posix::stream_descriptor::wait_read,
829 [](const boost::system::error_code ec) {
830 if (ec)
831 {
832 std::cerr << "CPU 1 Thermtrip handler error: " << ec.message()
833 << "\n";
834 return;
835 }
836 cpu1ThermtripHandler();
837 });
838}
839
jayaprakash Mutyala009adbc2019-12-24 22:08:07 +0000840static void cpu1MemtripHandler()
841{
Jason M. Bills5287c022020-05-19 11:16:09 -0700842 gpiod::line_event gpioLineEvent = cpu1MemtripLine.event_read();
jayaprakash Mutyala009adbc2019-12-24 22:08:07 +0000843
Jason M. Bills5287c022020-05-19 11:16:09 -0700844 bool cpu1Memtrip =
845 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
846 if (cpu1Memtrip)
847 {
848 memThermTripLog(1);
jayaprakash Mutyala009adbc2019-12-24 22:08:07 +0000849 }
Jason M. Bills5287c022020-05-19 11:16:09 -0700850
jayaprakash Mutyala009adbc2019-12-24 22:08:07 +0000851 cpu1MemtripEvent.async_wait(
852 boost::asio::posix::stream_descriptor::wait_read,
853 [](const boost::system::error_code ec) {
854 if (ec)
855 {
856 std::cerr << "CPU 1 Memory Thermaltrip handler error: "
857 << ec.message() << "\n";
858 return;
859 }
860 cpu1MemtripHandler();
861 });
862}
863
Jason M. Billse94f5e12019-09-13 11:11:34 -0700864static void cpu2ThermtripAssertHandler()
865{
Jason M. Bills45e87e02019-09-09 14:45:38 -0700866 if (cpu2FIVRFaultLine.get_value() == 0)
867 {
868 cpuBootFIVRFaultLog(2);
869 }
870 else
871 {
872 cpuThermTripLog(2);
873 }
Jason M. Billse94f5e12019-09-13 11:11:34 -0700874}
875
Jason M. Bills78c5eed2019-08-28 14:00:40 -0700876static void cpu2ThermtripHandler()
877{
Jason M. Bills84951142020-04-17 15:57:11 -0700878 gpiod::line_event gpioLineEvent = cpu2ThermtripLine.event_read();
Jason M. Bills78c5eed2019-08-28 14:00:40 -0700879
Jason M. Bills84951142020-04-17 15:57:11 -0700880 bool cpu2Thermtrip =
881 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
882 if (cpu2Thermtrip)
883 {
884 cpu2ThermtripAssertHandler();
Jason M. Bills78c5eed2019-08-28 14:00:40 -0700885 }
Jason M. Bills84951142020-04-17 15:57:11 -0700886
Jason M. Bills78c5eed2019-08-28 14:00:40 -0700887 cpu2ThermtripEvent.async_wait(
888 boost::asio::posix::stream_descriptor::wait_read,
889 [](const boost::system::error_code ec) {
890 if (ec)
891 {
892 std::cerr << "CPU 2 Thermtrip handler error: " << ec.message()
893 << "\n";
894 return;
895 }
896 cpu2ThermtripHandler();
897 });
898}
899
jayaprakash Mutyala009adbc2019-12-24 22:08:07 +0000900static void cpu2MemtripHandler()
901{
Jason M. Bills5287c022020-05-19 11:16:09 -0700902 gpiod::line_event gpioLineEvent = cpu2MemtripLine.event_read();
jayaprakash Mutyala009adbc2019-12-24 22:08:07 +0000903
Jason M. Bills5287c022020-05-19 11:16:09 -0700904 bool cpu2Memtrip =
905 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
906 if (cpu2Memtrip)
907 {
908 memThermTripLog(2);
jayaprakash Mutyala009adbc2019-12-24 22:08:07 +0000909 }
Jason M. Bills5287c022020-05-19 11:16:09 -0700910
jayaprakash Mutyala009adbc2019-12-24 22:08:07 +0000911 cpu2MemtripEvent.async_wait(
912 boost::asio::posix::stream_descriptor::wait_read,
913 [](const boost::system::error_code ec) {
914 if (ec)
915 {
916 std::cerr << "CPU 2 Memory Thermaltrip handler error: "
917 << ec.message() << "\n";
918 return;
919 }
920 cpu2MemtripHandler();
921 });
922}
923
Jason M. Billse94f5e12019-09-13 11:11:34 -0700924static void cpu1VRHotAssertHandler()
925{
926 cpuVRHotLog("CPU 1");
927}
928
Jason M. Bills250fa632019-08-28 15:58:25 -0700929static void cpu1VRHotHandler()
930{
Jason M. Bills84951142020-04-17 15:57:11 -0700931 gpiod::line_event gpioLineEvent = cpu1VRHotLine.event_read();
Jason M. Bills250fa632019-08-28 15:58:25 -0700932
Jason M. Bills84951142020-04-17 15:57:11 -0700933 bool cpu1VRHot =
934 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
935 if (cpu1VRHot)
936 {
937 cpu1VRHotAssertHandler();
Jason M. Bills250fa632019-08-28 15:58:25 -0700938 }
Jason M. Bills84951142020-04-17 15:57:11 -0700939
Jason M. Bills250fa632019-08-28 15:58:25 -0700940 cpu1VRHotEvent.async_wait(boost::asio::posix::stream_descriptor::wait_read,
941 [](const boost::system::error_code ec) {
942 if (ec)
943 {
944 std::cerr << "CPU 1 VRHot handler error: "
945 << ec.message() << "\n";
946 return;
947 }
948 cpu1VRHotHandler();
949 });
950}
951
Jason M. Billse94f5e12019-09-13 11:11:34 -0700952static void cpu1MemABCDVRHotAssertHandler()
953{
954 cpuVRHotLog("CPU 1 Memory ABCD");
955}
956
Jason M. Bills9647ba72019-08-29 14:19:19 -0700957static void cpu1MemABCDVRHotHandler()
958{
Jason M. Bills84951142020-04-17 15:57:11 -0700959 gpiod::line_event gpioLineEvent = cpu1MemABCDVRHotLine.event_read();
Jason M. Bills9647ba72019-08-29 14:19:19 -0700960
Jason M. Bills84951142020-04-17 15:57:11 -0700961 bool cpu1MemABCDVRHot =
962 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
963 if (cpu1MemABCDVRHot)
964 {
965 cpu1MemABCDVRHotAssertHandler();
Jason M. Bills9647ba72019-08-29 14:19:19 -0700966 }
Jason M. Bills84951142020-04-17 15:57:11 -0700967
Jason M. Bills9647ba72019-08-29 14:19:19 -0700968 cpu1MemABCDVRHotEvent.async_wait(
969 boost::asio::posix::stream_descriptor::wait_read,
970 [](const boost::system::error_code ec) {
971 if (ec)
972 {
973 std::cerr << "CPU 1 Memory ABCD VRHot handler error: "
974 << ec.message() << "\n";
975 return;
976 }
977 cpu1MemABCDVRHotHandler();
978 });
979}
980
Jason M. Billse94f5e12019-09-13 11:11:34 -0700981static void cpu1MemEFGHVRHotAssertHandler()
982{
983 cpuVRHotLog("CPU 1 Memory EFGH");
984}
985
Jason M. Bills9647ba72019-08-29 14:19:19 -0700986static void cpu1MemEFGHVRHotHandler()
987{
Jason M. Bills84951142020-04-17 15:57:11 -0700988 gpiod::line_event gpioLineEvent = cpu1MemEFGHVRHotLine.event_read();
Jason M. Bills9647ba72019-08-29 14:19:19 -0700989
Jason M. Bills84951142020-04-17 15:57:11 -0700990 bool cpu1MemEFGHVRHot =
991 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
992 if (cpu1MemEFGHVRHot)
993 {
994 cpu1MemEFGHVRHotAssertHandler();
Jason M. Bills9647ba72019-08-29 14:19:19 -0700995 }
Jason M. Bills84951142020-04-17 15:57:11 -0700996
Jason M. Bills9647ba72019-08-29 14:19:19 -0700997 cpu1MemEFGHVRHotEvent.async_wait(
998 boost::asio::posix::stream_descriptor::wait_read,
999 [](const boost::system::error_code ec) {
1000 if (ec)
1001 {
1002 std::cerr << "CPU 1 Memory EFGH VRHot handler error: "
1003 << ec.message() << "\n";
1004 return;
1005 }
1006 cpu1MemEFGHVRHotHandler();
1007 });
1008}
1009
Jason M. Billse94f5e12019-09-13 11:11:34 -07001010static void cpu2VRHotAssertHandler()
1011{
1012 cpuVRHotLog("CPU 2");
1013}
1014
Jason M. Bills250fa632019-08-28 15:58:25 -07001015static void cpu2VRHotHandler()
1016{
Jason M. Bills84951142020-04-17 15:57:11 -07001017 gpiod::line_event gpioLineEvent = cpu2VRHotLine.event_read();
Jason M. Bills250fa632019-08-28 15:58:25 -07001018
Jason M. Bills84951142020-04-17 15:57:11 -07001019 bool cpu2VRHot =
1020 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
1021 if (cpu2VRHot)
1022 {
1023 cpu2VRHotAssertHandler();
Jason M. Bills250fa632019-08-28 15:58:25 -07001024 }
Jason M. Bills84951142020-04-17 15:57:11 -07001025
Jason M. Bills250fa632019-08-28 15:58:25 -07001026 cpu2VRHotEvent.async_wait(boost::asio::posix::stream_descriptor::wait_read,
1027 [](const boost::system::error_code ec) {
1028 if (ec)
1029 {
1030 std::cerr << "CPU 2 VRHot handler error: "
1031 << ec.message() << "\n";
1032 return;
1033 }
1034 cpu2VRHotHandler();
1035 });
1036}
1037
Jason M. Billse94f5e12019-09-13 11:11:34 -07001038static void cpu2MemABCDVRHotAssertHandler()
1039{
1040 cpuVRHotLog("CPU 2 Memory ABCD");
1041}
1042
Jason M. Bills9647ba72019-08-29 14:19:19 -07001043static void cpu2MemABCDVRHotHandler()
1044{
Jason M. Bills84951142020-04-17 15:57:11 -07001045 gpiod::line_event gpioLineEvent = cpu2MemABCDVRHotLine.event_read();
Jason M. Bills9647ba72019-08-29 14:19:19 -07001046
Jason M. Bills84951142020-04-17 15:57:11 -07001047 bool cpu2MemABCDVRHot =
1048 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
1049 if (cpu2MemABCDVRHot)
1050 {
1051 cpu2MemABCDVRHotAssertHandler();
Jason M. Bills9647ba72019-08-29 14:19:19 -07001052 }
Jason M. Bills84951142020-04-17 15:57:11 -07001053
Jason M. Bills9647ba72019-08-29 14:19:19 -07001054 cpu2MemABCDVRHotEvent.async_wait(
1055 boost::asio::posix::stream_descriptor::wait_read,
1056 [](const boost::system::error_code ec) {
1057 if (ec)
1058 {
1059 std::cerr << "CPU 2 Memory ABCD VRHot handler error: "
1060 << ec.message() << "\n";
1061 return;
1062 }
1063 cpu2MemABCDVRHotHandler();
1064 });
1065}
1066
Jason M. Billse94f5e12019-09-13 11:11:34 -07001067static void cpu2MemEFGHVRHotAssertHandler()
1068{
1069 cpuVRHotLog("CPU 2 Memory EFGH");
1070}
1071
Jason M. Bills9647ba72019-08-29 14:19:19 -07001072static void cpu2MemEFGHVRHotHandler()
1073{
Jason M. Bills84951142020-04-17 15:57:11 -07001074 gpiod::line_event gpioLineEvent = cpu2MemEFGHVRHotLine.event_read();
Jason M. Bills9647ba72019-08-29 14:19:19 -07001075
Jason M. Bills84951142020-04-17 15:57:11 -07001076 bool cpu2MemEFGHVRHot =
1077 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
1078 if (cpu2MemEFGHVRHot)
1079 {
1080 cpu2MemEFGHVRHotAssertHandler();
Jason M. Bills9647ba72019-08-29 14:19:19 -07001081 }
Jason M. Bills84951142020-04-17 15:57:11 -07001082
Jason M. Bills9647ba72019-08-29 14:19:19 -07001083 cpu2MemEFGHVRHotEvent.async_wait(
1084 boost::asio::posix::stream_descriptor::wait_read,
1085 [](const boost::system::error_code ec) {
1086 if (ec)
1087 {
1088 std::cerr << "CPU 2 Memory EFGH VRHot handler error: "
1089 << ec.message() << "\n";
1090 return;
1091 }
1092 cpu2MemEFGHVRHotHandler();
1093 });
1094}
1095
Chen Yugange6c0f1c2019-08-02 20:36:42 +08001096static void pchThermtripHandler()
1097{
Yong Li1429ca82020-04-27 16:49:45 +08001098 std::vector<Association> associations;
1099
Jason M. Bills84951142020-04-17 15:57:11 -07001100 gpiod::line_event gpioLineEvent = pchThermtripLine.event_read();
Chen Yugange6c0f1c2019-08-02 20:36:42 +08001101
Jason M. Bills84951142020-04-17 15:57:11 -07001102 bool pchThermtrip =
1103 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
1104 if (pchThermtrip)
1105 {
1106 ssbThermTripLog();
Yong Li1429ca82020-04-27 16:49:45 +08001107 associations.emplace_back(
1108 "", "critical",
1109 "/xyz/openbmc_project/host_error_monitor/ssb_thermal_trip");
1110 associations.emplace_back("", "critical", host_error_monitor::rootPath);
Chen Yugange6c0f1c2019-08-02 20:36:42 +08001111 }
Yong Li1429ca82020-04-27 16:49:45 +08001112 else
1113 {
1114 associations.emplace_back("", "", "");
1115 }
1116 host_error_monitor::associationSSBThermTrip->set_property("Associations",
1117 associations);
Jason M. Bills84951142020-04-17 15:57:11 -07001118
Chen Yugange6c0f1c2019-08-02 20:36:42 +08001119 pchThermtripEvent.async_wait(
1120 boost::asio::posix::stream_descriptor::wait_read,
1121 [](const boost::system::error_code ec) {
1122 if (ec)
1123 {
1124 std::cerr << "PCH Thermal trip handler error: " << ec.message()
1125 << "\n";
1126 return;
1127 }
1128 pchThermtripHandler();
1129 });
1130}
1131
Jason M. Billscbf78532019-08-16 15:32:11 -07001132static std::bitset<MAX_CPUS> checkERRPinCPUs(const int errPin)
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001133{
Jason M. Billscbf78532019-08-16 15:32:11 -07001134 int errPinSts = (1 << errPin);
1135 std::bitset<MAX_CPUS> errPinCPUs = 0;
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001136 for (int cpu = 0, addr = MIN_CLIENT_ADDR; addr <= MAX_CLIENT_ADDR;
1137 cpu++, addr++)
1138 {
1139 if (peci_Ping(addr) == PECI_CC_SUCCESS)
1140 {
1141 uint8_t cc = 0;
1142 CPUModel model{};
1143 uint8_t stepping = 0;
1144 if (peci_GetCPUID(addr, &model, &stepping, &cc) != PECI_CC_SUCCESS)
1145 {
1146 std::cerr << "Cannot get CPUID!\n";
1147 continue;
1148 }
1149
1150 switch (model)
1151 {
1152 case skx:
1153 {
1154 // Check the ERRPINSTS to see if this is the CPU that caused
Jason M. Billscbf78532019-08-16 15:32:11 -07001155 // the ERRx (B(0) D8 F0 offset 210h)
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001156 uint32_t errpinsts = 0;
1157 if (peci_RdPCIConfigLocal(
1158 addr, 0, 8, 0, 0x210, sizeof(uint32_t),
1159 (uint8_t*)&errpinsts, &cc) == PECI_CC_SUCCESS)
1160 {
Jason M. Billscbf78532019-08-16 15:32:11 -07001161 errPinCPUs[cpu] = (errpinsts & errPinSts) != 0;
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001162 }
1163 break;
1164 }
1165 case icx:
1166 {
1167 // Check the ERRPINSTS to see if this is the CPU that caused
Jason M. Billscbf78532019-08-16 15:32:11 -07001168 // the ERRx (B(30) D0 F3 offset 274h) (Note: Bus 30 is
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001169 // accessed on PECI as bus 13)
1170 uint32_t errpinsts = 0;
1171 if (peci_RdEndPointConfigPciLocal(
1172 addr, 0, 13, 0, 3, 0x274, sizeof(uint32_t),
1173 (uint8_t*)&errpinsts, &cc) == PECI_CC_SUCCESS)
1174 {
Jason M. Billscbf78532019-08-16 15:32:11 -07001175 errPinCPUs[cpu] = (errpinsts & errPinSts) != 0;
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001176 }
1177 break;
1178 }
1179 }
1180 }
1181 }
Jason M. Billscbf78532019-08-16 15:32:11 -07001182 return errPinCPUs;
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001183}
1184
Jason M. Billscbf78532019-08-16 15:32:11 -07001185static void errXAssertHandler(const int errPin,
1186 boost::asio::steady_timer& errXAssertTimer)
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001187{
Jason M. Billscbf78532019-08-16 15:32:11 -07001188 // ERRx status is not guaranteed through the timeout, so save which
1189 // CPUs have it asserted
1190 std::bitset<MAX_CPUS> errPinCPUs = checkERRPinCPUs(errPin);
1191 errXAssertTimer.expires_after(std::chrono::milliseconds(errTimeoutMs));
1192 errXAssertTimer.async_wait([errPin, errPinCPUs](
1193 const boost::system::error_code ec) {
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001194 if (ec)
1195 {
1196 // operation_aborted is expected if timer is canceled before
1197 // completion.
1198 if (ec != boost::asio::error::operation_aborted)
1199 {
1200 std::cerr << "err2 timeout async_wait failed: " << ec.message()
1201 << "\n";
1202 }
1203 return;
1204 }
Jason M. Billscbf78532019-08-16 15:32:11 -07001205 std::cerr << "ERR" << std::to_string(errPin) << " asserted for "
1206 << std::to_string(errTimeoutMs) << " ms\n";
1207 if (errPinCPUs.count())
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001208 {
Jason M. Billscbf78532019-08-16 15:32:11 -07001209 for (int i = 0; i < errPinCPUs.size(); i++)
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001210 {
Jason M. Billscbf78532019-08-16 15:32:11 -07001211 if (errPinCPUs[i])
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001212 {
Jason M. Billscbf78532019-08-16 15:32:11 -07001213 cpuERRXLog(errPin, i);
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001214 }
1215 }
1216 }
1217 else
1218 {
Jason M. Billscbf78532019-08-16 15:32:11 -07001219 cpuERRXLog(errPin);
1220 }
1221 });
1222}
1223
Jason M. Bills8c584392019-08-19 11:05:51 -07001224static void err0AssertHandler()
1225{
1226 // Handle the standard ERR0 detection and logging
1227 const static constexpr int err0 = 0;
1228 errXAssertHandler(err0, err0AssertTimer);
1229}
1230
1231static void err0Handler()
1232{
1233 if (!hostOff)
1234 {
1235 gpiod::line_event gpioLineEvent = err0Line.event_read();
1236
1237 bool err0 = gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
1238 if (err0)
1239 {
1240 err0AssertHandler();
1241 }
1242 else
1243 {
1244 err0AssertTimer.cancel();
1245 }
1246 }
1247 err0Event.async_wait(boost::asio::posix::stream_descriptor::wait_read,
1248 [](const boost::system::error_code ec) {
1249 if (ec)
1250 {
1251 std::cerr
1252 << "err0 handler error: " << ec.message()
1253 << "\n";
1254 return;
1255 }
1256 err0Handler();
1257 });
1258}
1259
Jason M. Bills75af3962019-08-19 11:07:17 -07001260static void err1AssertHandler()
1261{
1262 // Handle the standard ERR1 detection and logging
1263 const static constexpr int err1 = 1;
1264 errXAssertHandler(err1, err1AssertTimer);
1265}
1266
1267static void err1Handler()
1268{
1269 if (!hostOff)
1270 {
1271 gpiod::line_event gpioLineEvent = err1Line.event_read();
1272
1273 bool err1 = gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
1274 if (err1)
1275 {
1276 err1AssertHandler();
1277 }
1278 else
1279 {
1280 err1AssertTimer.cancel();
1281 }
1282 }
1283 err1Event.async_wait(boost::asio::posix::stream_descriptor::wait_read,
1284 [](const boost::system::error_code ec) {
1285 if (ec)
1286 {
1287 std::cerr
1288 << "err1 handler error: " << ec.message()
1289 << "\n";
1290 return;
1291 }
1292 err1Handler();
1293 });
1294}
1295
Jason M. Billscbf78532019-08-16 15:32:11 -07001296static void err2AssertHandler()
1297{
1298 // Handle the standard ERR2 detection and logging
1299 const static constexpr int err2 = 2;
1300 errXAssertHandler(err2, err2AssertTimer);
1301 // Also handle reset for ERR2
1302 err2AssertTimer.async_wait([](const boost::system::error_code ec) {
1303 if (ec)
1304 {
1305 // operation_aborted is expected if timer is canceled before
1306 // completion.
1307 if (ec != boost::asio::error::operation_aborted)
1308 {
1309 std::cerr << "err2 timeout async_wait failed: " << ec.message()
1310 << "\n";
1311 }
1312 return;
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001313 }
1314 conn->async_method_call(
1315 [](boost::system::error_code ec,
1316 const std::variant<bool>& property) {
1317 if (ec)
1318 {
1319 return;
1320 }
1321 const bool* reset = std::get_if<bool>(&property);
1322 if (reset == nullptr)
1323 {
1324 std::cerr << "Unable to read reset on ERR2 value\n";
1325 return;
1326 }
Jason M. Billsb61766b2019-11-26 17:02:44 -08001327 startCrashdumpAndRecovery(*reset, "ERR2 Timeout");
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001328 },
1329 "xyz.openbmc_project.Settings",
1330 "/xyz/openbmc_project/control/processor_error_config",
1331 "org.freedesktop.DBus.Properties", "Get",
1332 "xyz.openbmc_project.Control.Processor.ErrConfig", "ResetOnERR2");
Yong Li061eb032020-02-26 15:06:18 +08001333
1334 beep(beepCPUErr2);
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001335 });
1336}
1337
1338static void err2Handler()
1339{
1340 if (!hostOff)
1341 {
1342 gpiod::line_event gpioLineEvent = err2Line.event_read();
1343
1344 bool err2 = gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
1345 if (err2)
1346 {
1347 err2AssertHandler();
1348 }
1349 else
1350 {
1351 err2AssertTimer.cancel();
1352 }
1353 }
1354 err2Event.async_wait(boost::asio::posix::stream_descriptor::wait_read,
1355 [](const boost::system::error_code ec) {
1356 if (ec)
1357 {
1358 std::cerr
1359 << "err2 handler error: " << ec.message()
1360 << "\n";
1361 return;
1362 }
1363 err2Handler();
1364 });
1365}
1366
Jason M. Bills89922f82019-08-06 11:10:02 -07001367static void smiAssertHandler()
1368{
1369 smiAssertTimer.expires_after(std::chrono::milliseconds(smiTimeoutMs));
1370 smiAssertTimer.async_wait([](const boost::system::error_code ec) {
1371 if (ec)
1372 {
1373 // operation_aborted is expected if timer is canceled before
1374 // completion.
1375 if (ec != boost::asio::error::operation_aborted)
1376 {
1377 std::cerr << "smi timeout async_wait failed: " << ec.message()
1378 << "\n";
1379 }
1380 return;
1381 }
1382 std::cerr << "SMI asserted for " << std::to_string(smiTimeoutMs)
1383 << " ms\n";
1384 smiTimeoutLog();
1385 conn->async_method_call(
1386 [](boost::system::error_code ec,
1387 const std::variant<bool>& property) {
1388 if (ec)
1389 {
1390 return;
1391 }
1392 const bool* reset = std::get_if<bool>(&property);
1393 if (reset == nullptr)
1394 {
1395 std::cerr << "Unable to read reset on SMI value\n";
1396 return;
1397 }
Jason M. Bills94785442020-01-07 15:22:09 -08001398#ifdef HOST_ERROR_CRASHDUMP_ON_SMI_TIMEOUT
Jason M. Billsb61766b2019-11-26 17:02:44 -08001399 startCrashdumpAndRecovery(*reset, "SMI Timeout");
Jason M. Bills94785442020-01-07 15:22:09 -08001400#else
1401 if (*reset)
1402 {
Jason M. Billsd69549b2020-08-27 11:42:43 -07001403 std::cerr << "Recovering the system\n";
Jason M. Bills9a9bf982020-08-10 11:58:18 -07001404 startWarmReset();
Jason M. Bills94785442020-01-07 15:22:09 -08001405 }
1406#endif
Jason M. Bills89922f82019-08-06 11:10:02 -07001407 },
1408 "xyz.openbmc_project.Settings",
1409 "/xyz/openbmc_project/control/bmc_reset_disables",
1410 "org.freedesktop.DBus.Properties", "Get",
1411 "xyz.openbmc_project.Control.ResetDisables", "ResetOnSMI");
1412 });
1413}
1414
1415static void smiHandler()
1416{
1417 if (!hostOff)
1418 {
1419 gpiod::line_event gpioLineEvent = smiLine.event_read();
1420
1421 bool smi = gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
1422 if (smi)
1423 {
1424 smiAssertHandler();
1425 }
1426 else
1427 {
1428 smiAssertTimer.cancel();
1429 }
1430 }
1431 smiEvent.async_wait(boost::asio::posix::stream_descriptor::wait_read,
1432 [](const boost::system::error_code ec) {
1433 if (ec)
1434 {
1435 std::cerr
1436 << "smi handler error: " << ec.message()
1437 << "\n";
1438 return;
1439 }
1440 smiHandler();
1441 });
1442}
1443
Jason M. Billsa15c2522019-08-16 10:01:44 -07001444static void initializeErrorState()
1445{
jayaprakash Mutyala53099c42020-03-15 00:16:26 +00001446 // Handle CPU1_MISMATCH if it's asserted now
1447 if (cpu1MismatchLine.get_value() == 1)
1448 {
1449 cpuMismatchLog(1);
1450 }
1451
1452 // Handle CPU2_MISMATCH if it's asserted now
1453 if (cpu2MismatchLine.get_value() == 1)
1454 {
1455 cpuMismatchLog(2);
1456 }
1457
Jason M. Billsa15c2522019-08-16 10:01:44 -07001458 // Handle CPU_CATERR if it's asserted now
1459 if (caterrLine.get_value() == 0)
1460 {
1461 caterrAssertHandler();
Yong Li1429ca82020-04-27 16:49:45 +08001462 std::vector<Association> associations;
1463 associations.emplace_back(
1464 "", "critical", "/xyz/openbmc_project/host_error_monitor/cat_err");
1465 associations.emplace_back("", "critical", host_error_monitor::rootPath);
1466 host_error_monitor::associationCATAssert->set_property("Associations",
1467 associations);
Jason M. Billsa15c2522019-08-16 10:01:44 -07001468 }
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001469
Jason M. Bills8c584392019-08-19 11:05:51 -07001470 // Handle CPU_ERR0 if it's asserted now
1471 if (err0Line.get_value() == 0)
1472 {
1473 err0AssertHandler();
1474 }
1475
Jason M. Bills75af3962019-08-19 11:07:17 -07001476 // Handle CPU_ERR1 if it's asserted now
1477 if (err1Line.get_value() == 0)
1478 {
1479 err1AssertHandler();
1480 }
1481
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001482 // Handle CPU_ERR2 if it's asserted now
1483 if (err2Line.get_value() == 0)
1484 {
1485 err2AssertHandler();
1486 }
Jason M. Bills89922f82019-08-06 11:10:02 -07001487
1488 // Handle SMI if it's asserted now
1489 if (smiLine.get_value() == 0)
1490 {
1491 smiAssertHandler();
1492 }
Jason M. Bills08866542019-08-16 12:04:19 -07001493
Jason M. Billse94f5e12019-09-13 11:11:34 -07001494 // Handle CPU1_THERMTRIP if it's asserted now
1495 if (cpu1ThermtripLine.get_value() == 0)
1496 {
1497 cpu1ThermtripAssertHandler();
1498 }
1499
1500 // Handle CPU2_THERMTRIP if it's asserted now
1501 if (cpu2ThermtripLine.get_value() == 0)
1502 {
1503 cpu2ThermtripAssertHandler();
1504 }
1505
jayaprakash Mutyala009adbc2019-12-24 22:08:07 +00001506 // Handle CPU1_MEM_THERM_EVENT (CPU1 DIMM Thermal trip) if it's asserted now
1507 if (cpu1MemtripLine.get_value() == 0)
1508 {
1509 memThermTripLog(1);
1510 }
1511
1512 // Handle CPU2_MEM_THERM_EVENT (CPU2 DIMM Thermal trip) if it's asserted now
1513 if (cpu2MemtripLine.get_value() == 0)
1514 {
1515 memThermTripLog(2);
1516 }
1517
Jason M. Billse94f5e12019-09-13 11:11:34 -07001518 // Handle CPU1_VRHOT if it's asserted now
1519 if (cpu1VRHotLine.get_value() == 0)
1520 {
1521 cpu1VRHotAssertHandler();
1522 }
1523
1524 // Handle CPU1_MEM_ABCD_VRHOT if it's asserted now
1525 if (cpu1MemABCDVRHotLine.get_value() == 0)
1526 {
1527 cpu1MemABCDVRHotAssertHandler();
1528 }
1529
1530 // Handle CPU1_MEM_EFGH_VRHOT if it's asserted now
1531 if (cpu1MemEFGHVRHotLine.get_value() == 0)
1532 {
1533 cpu1MemEFGHVRHotAssertHandler();
1534 }
1535
1536 // Handle CPU2_VRHOT if it's asserted now
1537 if (cpu2VRHotLine.get_value() == 0)
1538 {
1539 cpu2VRHotAssertHandler();
1540 }
1541
1542 // Handle CPU2_MEM_ABCD_VRHOT if it's asserted now
1543 if (cpu2MemABCDVRHotLine.get_value() == 0)
1544 {
1545 cpu2MemABCDVRHotAssertHandler();
1546 }
1547
1548 // Handle CPU2_MEM_EFGH_VRHOT if it's asserted now
1549 if (cpu2MemEFGHVRHotLine.get_value() == 0)
1550 {
1551 cpu2MemEFGHVRHotAssertHandler();
1552 }
1553
Jason M. Bills08866542019-08-16 12:04:19 -07001554 // Handle PCH_BMC_THERMTRIP if it's asserted now
1555 if (pchThermtripLine.get_value() == 0)
1556 {
1557 ssbThermTripLog();
Yong Li1429ca82020-04-27 16:49:45 +08001558 std::vector<Association> associations;
1559 associations.emplace_back(
1560 "", "critical",
1561 "/xyz/openbmc_project/host_error_monitor/ssb_thermal_trip");
1562 associations.emplace_back("", "critical", host_error_monitor::rootPath);
1563 host_error_monitor::associationSSBThermTrip->set_property(
1564 "Associations", associations);
Jason M. Bills08866542019-08-16 12:04:19 -07001565 }
Jason M. Billsa15c2522019-08-16 10:01:44 -07001566}
Jason M. Bills1490b142019-07-01 15:48:43 -07001567} // namespace host_error_monitor
1568
1569int main(int argc, char* argv[])
1570{
1571 // setup connection to dbus
1572 host_error_monitor::conn =
1573 std::make_shared<sdbusplus::asio::connection>(host_error_monitor::io);
1574
Jason M. Billsc4b91f22019-11-26 17:04:50 -08001575 // Host Error Monitor Service
Jason M. Bills1490b142019-07-01 15:48:43 -07001576 host_error_monitor::conn->request_name(
1577 "xyz.openbmc_project.HostErrorMonitor");
1578 sdbusplus::asio::object_server server =
1579 sdbusplus::asio::object_server(host_error_monitor::conn);
1580
Yong Li1429ca82020-04-27 16:49:45 +08001581 // Associations interface for led status
1582 std::vector<host_error_monitor::Association> associations;
1583 associations.emplace_back("", "", "");
1584 host_error_monitor::associationSSBThermTrip = server.add_interface(
1585 "/xyz/openbmc_project/host_error_monitor/ssb_thermal_trip",
1586 "xyz.openbmc_project.Association.Definitions");
1587 host_error_monitor::associationSSBThermTrip->register_property(
1588 "Associations", associations);
1589 host_error_monitor::associationSSBThermTrip->initialize();
1590
1591 host_error_monitor::associationCATAssert = server.add_interface(
1592 "/xyz/openbmc_project/host_error_monitor/cat_assert",
1593 "xyz.openbmc_project.Association.Definitions");
1594 host_error_monitor::associationCATAssert->register_property("Associations",
1595 associations);
1596 host_error_monitor::associationCATAssert->initialize();
1597
Jason M. Billsc4b91f22019-11-26 17:04:50 -08001598 // Restart Cause Interface
1599 host_error_monitor::hostErrorTimeoutIface =
1600 server.add_interface("/xyz/openbmc_project/host_error_monitor",
1601 "xyz.openbmc_project.HostErrorMonitor.Timeout");
1602
1603 host_error_monitor::hostErrorTimeoutIface->register_property(
1604 "IERRTimeoutMs", host_error_monitor::caterrTimeoutMs,
1605 [](const std::size_t& requested, std::size_t& resp) {
1606 if (requested > host_error_monitor::caterrTimeoutMsMax)
1607 {
1608 std::cerr << "IERRTimeoutMs update to " << requested
1609 << "ms rejected. Cannot be greater than "
1610 << host_error_monitor::caterrTimeoutMsMax << "ms.\n";
1611 return 0;
1612 }
1613 std::cerr << "IERRTimeoutMs updated to " << requested << "ms\n";
1614 host_error_monitor::caterrTimeoutMs = requested;
1615 resp = requested;
1616 return 1;
1617 },
1618 [](std::size_t& resp) { return host_error_monitor::caterrTimeoutMs; });
1619 host_error_monitor::hostErrorTimeoutIface->initialize();
1620
Jason M. Bills1490b142019-07-01 15:48:43 -07001621 // Start tracking host state
1622 std::shared_ptr<sdbusplus::bus::match::match> hostStateMonitor =
1623 host_error_monitor::startHostStateMonitor();
1624
jayaprakash Mutyala53099c42020-03-15 00:16:26 +00001625 // Request CPU1_MISMATCH GPIO events
1626 if (!host_error_monitor::requestGPIOInput(
1627 "CPU1_MISMATCH", host_error_monitor::cpu1MismatchLine))
1628 {
1629 return -1;
1630 }
1631
1632 // Request CPU2_MISMATCH GPIO events
1633 if (!host_error_monitor::requestGPIOInput(
1634 "CPU2_MISMATCH", host_error_monitor::cpu2MismatchLine))
1635 {
1636 return -1;
1637 }
1638
Jason M. Bills1490b142019-07-01 15:48:43 -07001639 // Initialize the host state
1640 host_error_monitor::initializeHostState();
1641
1642 // Request CPU_CATERR GPIO events
1643 if (!host_error_monitor::requestGPIOEvents(
1644 "CPU_CATERR", host_error_monitor::caterrHandler,
1645 host_error_monitor::caterrLine, host_error_monitor::caterrEvent))
1646 {
1647 return -1;
1648 }
1649
Jason M. Bills8c584392019-08-19 11:05:51 -07001650 // Request CPU_ERR0 GPIO events
1651 if (!host_error_monitor::requestGPIOEvents(
1652 "CPU_ERR0", host_error_monitor::err0Handler,
1653 host_error_monitor::err0Line, host_error_monitor::err0Event))
1654 {
1655 return -1;
1656 }
1657
Jason M. Bills75af3962019-08-19 11:07:17 -07001658 // Request CPU_ERR1 GPIO events
1659 if (!host_error_monitor::requestGPIOEvents(
1660 "CPU_ERR1", host_error_monitor::err1Handler,
1661 host_error_monitor::err1Line, host_error_monitor::err1Event))
1662 {
1663 return -1;
1664 }
1665
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001666 // Request CPU_ERR2 GPIO events
1667 if (!host_error_monitor::requestGPIOEvents(
1668 "CPU_ERR2", host_error_monitor::err2Handler,
1669 host_error_monitor::err2Line, host_error_monitor::err2Event))
1670 {
1671 return -1;
1672 }
1673
Jason M. Bills89922f82019-08-06 11:10:02 -07001674 // Request SMI GPIO events
1675 if (!host_error_monitor::requestGPIOEvents(
1676 "SMI", host_error_monitor::smiHandler, host_error_monitor::smiLine,
1677 host_error_monitor::smiEvent))
1678 {
1679 return -1;
1680 }
1681
Jason M. Bills45e87e02019-09-09 14:45:38 -07001682 // Request CPU1_FIVR_FAULT GPIO input
1683 if (!host_error_monitor::requestGPIOInput(
1684 "CPU1_FIVR_FAULT", host_error_monitor::cpu1FIVRFaultLine))
1685 {
1686 return -1;
1687 }
1688
Jason M. Bills78c5eed2019-08-28 14:00:40 -07001689 // Request CPU1_THERMTRIP GPIO events
1690 if (!host_error_monitor::requestGPIOEvents(
1691 "CPU1_THERMTRIP", host_error_monitor::cpu1ThermtripHandler,
1692 host_error_monitor::cpu1ThermtripLine,
1693 host_error_monitor::cpu1ThermtripEvent))
1694 {
1695 return -1;
1696 }
1697
Jason M. Bills45e87e02019-09-09 14:45:38 -07001698 // Request CPU2_FIVR_FAULT GPIO input
1699 if (!host_error_monitor::requestGPIOInput(
1700 "CPU2_FIVR_FAULT", host_error_monitor::cpu2FIVRFaultLine))
1701 {
1702 return -1;
1703 }
1704
Jason M. Bills78c5eed2019-08-28 14:00:40 -07001705 // Request CPU2_THERMTRIP GPIO events
1706 if (!host_error_monitor::requestGPIOEvents(
1707 "CPU2_THERMTRIP", host_error_monitor::cpu2ThermtripHandler,
1708 host_error_monitor::cpu2ThermtripLine,
1709 host_error_monitor::cpu2ThermtripEvent))
1710 {
1711 return -1;
1712 }
1713
Jason M. Bills250fa632019-08-28 15:58:25 -07001714 // Request CPU1_VRHOT GPIO events
1715 if (!host_error_monitor::requestGPIOEvents(
1716 "CPU1_VRHOT", host_error_monitor::cpu1VRHotHandler,
1717 host_error_monitor::cpu1VRHotLine,
1718 host_error_monitor::cpu1VRHotEvent))
1719 {
1720 return -1;
1721 }
1722
Jason M. Bills9647ba72019-08-29 14:19:19 -07001723 // Request CPU1_MEM_ABCD_VRHOT GPIO events
1724 if (!host_error_monitor::requestGPIOEvents(
1725 "CPU1_MEM_ABCD_VRHOT", host_error_monitor::cpu1MemABCDVRHotHandler,
1726 host_error_monitor::cpu1MemABCDVRHotLine,
1727 host_error_monitor::cpu1MemABCDVRHotEvent))
1728 {
1729 return -1;
1730 }
1731
1732 // Request CPU1_MEM_EFGH_VRHOT GPIO events
1733 if (!host_error_monitor::requestGPIOEvents(
1734 "CPU1_MEM_EFGH_VRHOT", host_error_monitor::cpu1MemEFGHVRHotHandler,
1735 host_error_monitor::cpu1MemEFGHVRHotLine,
1736 host_error_monitor::cpu1MemEFGHVRHotEvent))
1737 {
1738 return -1;
1739 }
1740
Jason M. Bills250fa632019-08-28 15:58:25 -07001741 // Request CPU2_VRHOT GPIO events
1742 if (!host_error_monitor::requestGPIOEvents(
1743 "CPU2_VRHOT", host_error_monitor::cpu2VRHotHandler,
1744 host_error_monitor::cpu2VRHotLine,
1745 host_error_monitor::cpu2VRHotEvent))
1746 {
1747 return -1;
1748 }
1749
Jason M. Bills9647ba72019-08-29 14:19:19 -07001750 // Request CPU2_MEM_ABCD_VRHOT GPIO events
1751 if (!host_error_monitor::requestGPIOEvents(
1752 "CPU2_MEM_ABCD_VRHOT", host_error_monitor::cpu2MemABCDVRHotHandler,
1753 host_error_monitor::cpu2MemABCDVRHotLine,
1754 host_error_monitor::cpu2MemABCDVRHotEvent))
1755 {
1756 return -1;
1757 }
1758
1759 // Request CPU2_MEM_EFGH_VRHOT GPIO events
1760 if (!host_error_monitor::requestGPIOEvents(
1761 "CPU2_MEM_EFGH_VRHOT", host_error_monitor::cpu2MemEFGHVRHotHandler,
1762 host_error_monitor::cpu2MemEFGHVRHotLine,
1763 host_error_monitor::cpu2MemEFGHVRHotEvent))
1764 {
1765 return -1;
1766 }
1767
Chen Yugange6c0f1c2019-08-02 20:36:42 +08001768 // Request PCH_BMC_THERMTRIP GPIO events
1769 if (!host_error_monitor::requestGPIOEvents(
1770 "PCH_BMC_THERMTRIP", host_error_monitor::pchThermtripHandler,
1771 host_error_monitor::pchThermtripLine,
1772 host_error_monitor::pchThermtripEvent))
1773 {
1774 return -1;
1775 }
1776
jayaprakash Mutyala009adbc2019-12-24 22:08:07 +00001777 // Request CPU1_MEM_THERM_EVENT GPIO events
1778 if (!host_error_monitor::requestGPIOEvents(
1779 "CPU1_MEM_THERM_EVENT", host_error_monitor::cpu1MemtripHandler,
1780 host_error_monitor::cpu1MemtripLine,
1781 host_error_monitor::cpu1MemtripEvent))
1782 {
1783 return -1;
1784 }
1785
1786 // Request CPU2_MEM_THERM_EVENT GPIO events
1787 if (!host_error_monitor::requestGPIOEvents(
1788 "CPU2_MEM_THERM_EVENT", host_error_monitor::cpu2MemtripHandler,
1789 host_error_monitor::cpu2MemtripLine,
1790 host_error_monitor::cpu2MemtripEvent))
1791 {
1792 return -1;
1793 }
1794
Jason M. Bills1490b142019-07-01 15:48:43 -07001795 host_error_monitor::io.run();
1796
1797 return 0;
1798}