/*
// Copyright (c) 2019 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
*/
Jason M. Bills6a2cb692019-08-06 11:03:49 -070016#include <peci.h>
Chen Yugange6c0f1c2019-08-02 20:36:42 +080017#include <systemd/sd-journal.h>
18
Jason M. Bills08b2c7a2020-08-28 15:39:14 -070019#include <boost/asio/io_service.hpp>
Jason M. Bills1490b142019-07-01 15:48:43 -070020#include <boost/asio/posix/stream_descriptor.hpp>
Jason M. Bills08b2c7a2020-08-28 15:39:14 -070021#include <boost/asio/steady_timer.hpp>
Jason M. Bills1490b142019-07-01 15:48:43 -070022#include <gpiod.hpp>
Jason M. Bills1490b142019-07-01 15:48:43 -070023#include <sdbusplus/asio/object_server.hpp>
Jason M. Bills48e5dff2020-06-10 13:47:47 -070024
25#include <bitset>
26#include <iostream>
Jason M. Billsd1a19f62019-08-06 11:52:58 -070027#include <variant>
Jason M. Bills1490b142019-07-01 15:48:43 -070028
29namespace host_error_monitor
30{
31static boost::asio::io_service io;
32static std::shared_ptr<sdbusplus::asio::connection> conn;
Jason M. Billsc4b91f22019-11-26 17:04:50 -080033static std::shared_ptr<sdbusplus::asio::dbus_interface> hostErrorTimeoutIface;
Jason M. Bills1490b142019-07-01 15:48:43 -070034
Yong Li1429ca82020-04-27 16:49:45 +080035using Association = std::tuple<std::string, std::string, std::string>;
36static std::shared_ptr<sdbusplus::asio::dbus_interface> associationSSBThermTrip;
37static std::shared_ptr<sdbusplus::asio::dbus_interface> associationCATAssert;
38
39static const constexpr char* rootPath = "/xyz/openbmc_project/CallbackManager";
40
Jason M. Bills1490b142019-07-01 15:48:43 -070041static bool hostOff = true;
42
Jason M. Billsc4b91f22019-11-26 17:04:50 -080043static size_t caterrTimeoutMs = 2000;
44const static constexpr size_t caterrTimeoutMsMax = 600000; // 10 minutes maximum
Jason M. Billscbf78532019-08-16 15:32:11 -070045const static constexpr size_t errTimeoutMs = 90000;
Jason M. Bills89922f82019-08-06 11:10:02 -070046const static constexpr size_t smiTimeoutMs = 90000;
Jason M. Bills1490b142019-07-01 15:48:43 -070047
48// Timers
49// Timer for CATERR asserted
50static boost::asio::steady_timer caterrAssertTimer(io);
Jason M. Bills8c584392019-08-19 11:05:51 -070051// Timer for ERR0 asserted
52static boost::asio::steady_timer err0AssertTimer(io);
Jason M. Bills75af3962019-08-19 11:07:17 -070053// Timer for ERR1 asserted
54static boost::asio::steady_timer err1AssertTimer(io);
Jason M. Bills6a2cb692019-08-06 11:03:49 -070055// Timer for ERR2 asserted
56static boost::asio::steady_timer err2AssertTimer(io);
Jason M. Bills89922f82019-08-06 11:10:02 -070057// Timer for SMI asserted
58static boost::asio::steady_timer smiAssertTimer(io);
Jason M. Bills1490b142019-07-01 15:48:43 -070059
60// GPIO Lines and Event Descriptors
61static gpiod::line caterrLine;
62static boost::asio::posix::stream_descriptor caterrEvent(io);
Jason M. Bills8c584392019-08-19 11:05:51 -070063static gpiod::line err0Line;
64static boost::asio::posix::stream_descriptor err0Event(io);
Jason M. Bills75af3962019-08-19 11:07:17 -070065static gpiod::line err1Line;
66static boost::asio::posix::stream_descriptor err1Event(io);
Jason M. Bills6a2cb692019-08-06 11:03:49 -070067static gpiod::line err2Line;
68static boost::asio::posix::stream_descriptor err2Event(io);
Jason M. Bills89922f82019-08-06 11:10:02 -070069static gpiod::line smiLine;
70static boost::asio::posix::stream_descriptor smiEvent(io);
Jason M. Bills45e87e02019-09-09 14:45:38 -070071static gpiod::line cpu1FIVRFaultLine;
Jason M. Bills78c5eed2019-08-28 14:00:40 -070072static gpiod::line cpu1ThermtripLine;
73static boost::asio::posix::stream_descriptor cpu1ThermtripEvent(io);
Jason M. Bills45e87e02019-09-09 14:45:38 -070074static gpiod::line cpu2FIVRFaultLine;
Jason M. Bills78c5eed2019-08-28 14:00:40 -070075static gpiod::line cpu2ThermtripLine;
76static boost::asio::posix::stream_descriptor cpu2ThermtripEvent(io);
Jason M. Bills250fa632019-08-28 15:58:25 -070077static gpiod::line cpu1VRHotLine;
78static boost::asio::posix::stream_descriptor cpu1VRHotEvent(io);
79static gpiod::line cpu2VRHotLine;
Jason M. Bills9647ba72019-08-29 14:19:19 -070080static boost::asio::posix::stream_descriptor cpu1MemABCDVRHotEvent(io);
81static gpiod::line cpu1MemEFGHVRHotLine;
82static boost::asio::posix::stream_descriptor cpu1MemEFGHVRHotEvent(io);
83static gpiod::line cpu2MemABCDVRHotLine;
Jason M. Bills250fa632019-08-28 15:58:25 -070084static boost::asio::posix::stream_descriptor cpu2VRHotEvent(io);
Jason M. Bills9647ba72019-08-29 14:19:19 -070085static gpiod::line cpu1MemABCDVRHotLine;
86static boost::asio::posix::stream_descriptor cpu2MemABCDVRHotEvent(io);
87static gpiod::line cpu2MemEFGHVRHotLine;
88static boost::asio::posix::stream_descriptor cpu2MemEFGHVRHotEvent(io);
Chen Yugange6c0f1c2019-08-02 20:36:42 +080089//----------------------------------
90// PCH_BMC_THERMTRIP function related definition
91//----------------------------------
Chen Yugange6c0f1c2019-08-02 20:36:42 +080092static gpiod::line pchThermtripLine;
93static boost::asio::posix::stream_descriptor pchThermtripEvent(io);
jayaprakash Mutyala009adbc2019-12-24 22:08:07 +000094//----------------------------------
95// CPU_MEM_THERM_EVENT function related definition
96//----------------------------------
97static gpiod::line cpu1MemtripLine;
98static boost::asio::posix::stream_descriptor cpu1MemtripEvent(io);
99static gpiod::line cpu2MemtripLine;
100static boost::asio::posix::stream_descriptor cpu2MemtripEvent(io);
jayaprakash Mutyala53099c42020-03-15 00:16:26 +0000101//---------------------------------
102// CPU_MISMATCH function related definition
103//---------------------------------
104static gpiod::line cpu1MismatchLine;
105static gpiod::line cpu2MismatchLine;
Jason M. Bills1490b142019-07-01 15:48:43 -0700106
// Beep priority values for CPU errors
static constexpr uint8_t beepCPUIERR = 4;
static constexpr uint8_t beepCPUErr2 = 5;
110
111static void beep(const uint8_t& beepPriority)
112{
113 conn->async_method_call(
114 [](boost::system::error_code ec) {
115 if (ec)
116 {
117 std::cerr << "beep returned error with "
118 "async_method_call (ec = "
119 << ec << ")\n";
120 return;
121 }
122 },
123 "xyz.openbmc_project.BeepCode", "/xyz/openbmc_project/BeepCode",
124 "xyz.openbmc_project.BeepCode", "Beep", uint8_t(beepPriority));
125}
126
Jason M. Billsa3397932019-08-06 11:07:21 -0700127static void cpuIERRLog()
128{
129 sd_journal_send("MESSAGE=HostError: IERR", "PRIORITY=%i", LOG_INFO,
130 "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError",
131 "REDFISH_MESSAGE_ARGS=%s", "IERR", NULL);
132}
133
134static void cpuIERRLog(const int cpuNum)
135{
136 std::string msg = "IERR on CPU " + std::to_string(cpuNum + 1);
137
138 sd_journal_send("MESSAGE=HostError: %s", msg.c_str(), "PRIORITY=%i",
139 LOG_INFO, "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError",
140 "REDFISH_MESSAGE_ARGS=%s", msg.c_str(), NULL);
141}
142
143static void cpuIERRLog(const int cpuNum, const std::string& type)
144{
145 std::string msg = type + " IERR on CPU " + std::to_string(cpuNum + 1);
146
147 sd_journal_send("MESSAGE=HostError: %s", msg.c_str(), "PRIORITY=%i",
148 LOG_INFO, "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError",
149 "REDFISH_MESSAGE_ARGS=%s", msg.c_str(), NULL);
150}
151
Jason M. Billscbf78532019-08-16 15:32:11 -0700152static void cpuERRXLog(const int errPin)
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700153{
Jason M. Billscbf78532019-08-16 15:32:11 -0700154 std::string msg = "ERR" + std::to_string(errPin) + " Timeout";
155
156 sd_journal_send("MESSAGE=HostError: %s", msg.c_str(), "PRIORITY=%i",
157 LOG_INFO, "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError",
158 "REDFISH_MESSAGE_ARGS=%s", msg.c_str(), NULL);
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700159}
160
Jason M. Billscbf78532019-08-16 15:32:11 -0700161static void cpuERRXLog(const int errPin, const int cpuNum)
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700162{
Jason M. Billscbf78532019-08-16 15:32:11 -0700163 std::string msg = "ERR" + std::to_string(errPin) + " Timeout on CPU " +
164 std::to_string(cpuNum + 1);
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700165
166 sd_journal_send("MESSAGE=HostError: %s", msg.c_str(), "PRIORITY=%i",
167 LOG_INFO, "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError",
168 "REDFISH_MESSAGE_ARGS=%s", msg.c_str(), NULL);
169}
170
Jason M. Bills89922f82019-08-06 11:10:02 -0700171static void smiTimeoutLog()
172{
173 sd_journal_send("MESSAGE=HostError: SMI Timeout", "PRIORITY=%i", LOG_INFO,
174 "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError",
175 "REDFISH_MESSAGE_ARGS=%s", "SMI Timeout", NULL);
176}
177
Jason M. Bills45e87e02019-09-09 14:45:38 -0700178static void cpuBootFIVRFaultLog(const int cpuNum)
179{
180 std::string msg = "Boot FIVR Fault on CPU " + std::to_string(cpuNum);
181
182 sd_journal_send("MESSAGE=HostError: %s", msg.c_str(), "PRIORITY=%i",
183 LOG_INFO, "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError",
184 "REDFISH_MESSAGE_ARGS=%s", msg.c_str(), NULL);
185}
186
Jason M. Bills78c5eed2019-08-28 14:00:40 -0700187static void cpuThermTripLog(const int cpuNum)
188{
189 std::string msg = "CPU " + std::to_string(cpuNum) + " thermal trip";
190
191 sd_journal_send("MESSAGE=HostError: %s", msg.c_str(), "PRIORITY=%i",
192 LOG_INFO, "REDFISH_MESSAGE_ID=%s",
193 "OpenBMC.0.1.CPUThermalTrip", "REDFISH_MESSAGE_ARGS=%d",
194 cpuNum, NULL);
195}
196
jayaprakash Mutyala009adbc2019-12-24 22:08:07 +0000197static void memThermTripLog(const int cpuNum)
198{
199 std::string cpuNumber = "CPU " + std::to_string(cpuNum);
200 std::string msg = cpuNumber + " Memory Thermal trip.";
201
202 sd_journal_send("MESSAGE=HostError: %s", msg.c_str(), "PRIORITY=%i",
203 LOG_ERR, "REDFISH_MESSAGE_ID=%s",
204 "OpenBMC.0.1.MemoryThermTrip", "REDFISH_MESSAGE_ARGS=%s",
205 cpuNumber.c_str(), NULL);
206}
207
jayaprakash Mutyala53099c42020-03-15 00:16:26 +0000208static void cpuMismatchLog(const int cpuNum)
209{
210 std::string msg = "CPU " + std::to_string(cpuNum) + " mismatch";
211
212 sd_journal_send("MESSAGE= %s", msg.c_str(), "PRIORITY=%i", LOG_ERR,
213 "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUMismatch",
214 "REDFISH_MESSAGE_ARGS=%d", cpuNum, NULL);
215}
216
Jason M. Bills250fa632019-08-28 15:58:25 -0700217static void cpuVRHotLog(const std::string& vr)
218{
219 std::string msg = vr + " Voltage Regulator Overheated.";
220
221 sd_journal_send("MESSAGE=HostError: %s", msg.c_str(), "PRIORITY=%i",
222 LOG_INFO, "REDFISH_MESSAGE_ID=%s",
223 "OpenBMC.0.1.VoltageRegulatorOverheated",
224 "REDFISH_MESSAGE_ARGS=%s", vr.c_str(), NULL);
225}
226
Jason M. Bills08866542019-08-16 12:04:19 -0700227static void ssbThermTripLog()
228{
229 sd_journal_send("MESSAGE=HostError: SSB thermal trip", "PRIORITY=%i",
230 LOG_INFO, "REDFISH_MESSAGE_ID=%s",
231 "OpenBMC.0.1.SsbThermalTrip", NULL);
232}
233
Jason M. Billsa15c2522019-08-16 10:01:44 -0700234static void initializeErrorState();
Jason M. Bills1490b142019-07-01 15:48:43 -0700235static void initializeHostState()
236{
237 conn->async_method_call(
238 [](boost::system::error_code ec,
239 const std::variant<std::string>& property) {
240 if (ec)
241 {
242 return;
243 }
244 const std::string* state = std::get_if<std::string>(&property);
245 if (state == nullptr)
246 {
247 std::cerr << "Unable to read host state value\n";
248 return;
249 }
250 hostOff = *state == "xyz.openbmc_project.State.Host.HostState.Off";
Jason M. Billsa15c2522019-08-16 10:01:44 -0700251 // If the system is on, initialize the error state
252 if (!hostOff)
253 {
254 initializeErrorState();
255 }
Jason M. Bills1490b142019-07-01 15:48:43 -0700256 },
257 "xyz.openbmc_project.State.Host", "/xyz/openbmc_project/state/host0",
258 "org.freedesktop.DBus.Properties", "Get",
259 "xyz.openbmc_project.State.Host", "CurrentHostState");
260}
261
262static std::shared_ptr<sdbusplus::bus::match::match> startHostStateMonitor()
263{
264 return std::make_shared<sdbusplus::bus::match::match>(
265 *conn,
266 "type='signal',interface='org.freedesktop.DBus.Properties',"
Jason M. Bills2fbb9ea2020-06-19 14:46:54 -0700267 "member='PropertiesChanged',arg0='xyz.openbmc_project.State.Host'",
Jason M. Bills1490b142019-07-01 15:48:43 -0700268 [](sdbusplus::message::message& msg) {
269 std::string interfaceName;
270 boost::container::flat_map<std::string, std::variant<std::string>>
271 propertiesChanged;
Jason M. Bills1490b142019-07-01 15:48:43 -0700272 try
273 {
274 msg.read(interfaceName, propertiesChanged);
Jason M. Bills1490b142019-07-01 15:48:43 -0700275 }
276 catch (std::exception& e)
277 {
278 std::cerr << "Unable to read host state\n";
279 return;
280 }
Jason M. Bills566ccc42020-06-18 16:38:26 -0700281 // We only want to check for CurrentHostState
282 if (propertiesChanged.begin()->first != "CurrentHostState")
283 {
284 return;
285 }
286 std::string* state =
287 std::get_if<std::string>(&(propertiesChanged.begin()->second));
288 if (state == nullptr)
289 {
290 std::cerr << propertiesChanged.begin()->first
291 << " property invalid\n";
292 return;
293 }
294
295 hostOff = *state == "xyz.openbmc_project.State.Host.HostState.Off";
Jason M. Bills1490b142019-07-01 15:48:43 -0700296
Jason M. Bills1490b142019-07-01 15:48:43 -0700297 if (hostOff)
298 {
Jason M. Billse94f5e12019-09-13 11:11:34 -0700299 // No host events should fire while off, so cancel any pending
300 // timers
Jason M. Bills1490b142019-07-01 15:48:43 -0700301 caterrAssertTimer.cancel();
Jason M. Bills8c584392019-08-19 11:05:51 -0700302 err0AssertTimer.cancel();
Jason M. Bills75af3962019-08-19 11:07:17 -0700303 err1AssertTimer.cancel();
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700304 err2AssertTimer.cancel();
Jason M. Bills89922f82019-08-06 11:10:02 -0700305 smiAssertTimer.cancel();
Jason M. Bills1490b142019-07-01 15:48:43 -0700306 }
Jason M. Billse94f5e12019-09-13 11:11:34 -0700307 else
308 {
309 // Handle any initial errors when the host turns on
310 initializeErrorState();
311 }
Jason M. Bills1490b142019-07-01 15:48:43 -0700312 });
313}
314
315static bool requestGPIOEvents(
316 const std::string& name, const std::function<void()>& handler,
317 gpiod::line& gpioLine,
318 boost::asio::posix::stream_descriptor& gpioEventDescriptor)
319{
320 // Find the GPIO line
321 gpioLine = gpiod::find_line(name);
322 if (!gpioLine)
323 {
324 std::cerr << "Failed to find the " << name << " line\n";
325 return false;
326 }
327
328 try
329 {
330 gpioLine.request(
331 {"host-error-monitor", gpiod::line_request::EVENT_BOTH_EDGES});
332 }
333 catch (std::exception&)
334 {
335 std::cerr << "Failed to request events for " << name << "\n";
336 return false;
337 }
338
339 int gpioLineFd = gpioLine.event_get_fd();
340 if (gpioLineFd < 0)
341 {
342 std::cerr << "Failed to get " << name << " fd\n";
343 return false;
344 }
345
346 gpioEventDescriptor.assign(gpioLineFd);
347
348 gpioEventDescriptor.async_wait(
349 boost::asio::posix::stream_descriptor::wait_read,
350 [&name, handler](const boost::system::error_code ec) {
351 if (ec)
352 {
353 std::cerr << name << " fd handler error: " << ec.message()
354 << "\n";
355 return;
356 }
357 handler();
358 });
359 return true;
360}
361
Jason M. Bills45e87e02019-09-09 14:45:38 -0700362static bool requestGPIOInput(const std::string& name, gpiod::line& gpioLine)
363{
364 // Find the GPIO line
365 gpioLine = gpiod::find_line(name);
366 if (!gpioLine)
367 {
368 std::cerr << "Failed to find the " << name << " line.\n";
369 return false;
370 }
371
372 // Request GPIO input
373 try
374 {
375 gpioLine.request({__FUNCTION__, gpiod::line_request::DIRECTION_INPUT});
376 }
377 catch (std::exception&)
378 {
379 std::cerr << "Failed to request " << name << " input\n";
380 return false;
381 }
382
383 return true;
384}
385
Jason M. Bills1490b142019-07-01 15:48:43 -0700386static void startPowerCycle()
387{
388 conn->async_method_call(
389 [](boost::system::error_code ec) {
390 if (ec)
391 {
392 std::cerr << "failed to set Chassis State\n";
393 }
394 },
395 "xyz.openbmc_project.State.Chassis",
396 "/xyz/openbmc_project/state/chassis0",
397 "org.freedesktop.DBus.Properties", "Set",
398 "xyz.openbmc_project.State.Chassis", "RequestedPowerTransition",
399 std::variant<std::string>{
400 "xyz.openbmc_project.State.Chassis.Transition.PowerCycle"});
401}
402
Jason M. Bills9a9bf982020-08-10 11:58:18 -0700403static void startWarmReset()
404{
405 conn->async_method_call(
406 [](boost::system::error_code ec) {
407 if (ec)
408 {
409 std::cerr << "failed to set Host State\n";
410 }
411 },
412 "xyz.openbmc_project.State.Host", "/xyz/openbmc_project/state/host0",
413 "org.freedesktop.DBus.Properties", "Set",
414 "xyz.openbmc_project.State.Host", "RequestedHostTransition",
415 std::variant<std::string>{
416 "xyz.openbmc_project.State.Host.Transition.ForceWarmReboot"});
417}
418
Jason M. Billsb61766b2019-11-26 17:02:44 -0800419static void startCrashdumpAndRecovery(bool recoverSystem,
420 const std::string& triggerType)
Jason M. Bills1490b142019-07-01 15:48:43 -0700421{
Jason M. Billsd69549b2020-08-27 11:42:43 -0700422 std::cerr << "Starting crashdump\n";
Jason M. Bills1490b142019-07-01 15:48:43 -0700423 static std::shared_ptr<sdbusplus::bus::match::match> crashdumpCompleteMatch;
Jason M. Bills1490b142019-07-01 15:48:43 -0700424
425 crashdumpCompleteMatch = std::make_shared<sdbusplus::bus::match::match>(
426 *conn,
Jason M. Billsc015c552020-08-27 11:02:47 -0700427 "type='signal',interface='com.intel.crashdump.Stored',member='"
428 "CrashdumpComplete'",
Jason M. Bills1490b142019-07-01 15:48:43 -0700429 [recoverSystem](sdbusplus::message::message& msg) {
Jason M. Billsd69549b2020-08-27 11:42:43 -0700430 std::cerr << "Crashdump completed\n";
Jason M. Bills1490b142019-07-01 15:48:43 -0700431 if (recoverSystem)
432 {
Jason M. Billsd69549b2020-08-27 11:42:43 -0700433 std::cerr << "Recovering the system\n";
Jason M. Bills9a9bf982020-08-10 11:58:18 -0700434 startWarmReset();
Jason M. Bills1490b142019-07-01 15:48:43 -0700435 }
436 crashdumpCompleteMatch.reset();
437 });
438
Jason M. Bills1490b142019-07-01 15:48:43 -0700439 conn->async_method_call(
440 [](boost::system::error_code ec) {
441 if (ec)
442 {
443 std::cerr << "failed to start Crashdump\n";
Jason M. Bills1490b142019-07-01 15:48:43 -0700444 }
445 },
446 "com.intel.crashdump", "/com/intel/crashdump",
Jason M. Billsb61766b2019-11-26 17:02:44 -0800447 "com.intel.crashdump.Stored", "GenerateStoredLog", triggerType);
Jason M. Bills1490b142019-07-01 15:48:43 -0700448}
449
Jason M. Billsd1a19f62019-08-06 11:52:58 -0700450static void incrementCPUErrorCount(int cpuNum)
451{
452 std::string propertyName = "ErrorCountCPU" + std::to_string(cpuNum + 1);
453
454 // Get the current count
455 conn->async_method_call(
456 [propertyName](boost::system::error_code ec,
457 const std::variant<uint8_t>& property) {
458 if (ec)
459 {
460 std::cerr << "Failed to read " << propertyName << ": "
461 << ec.message() << "\n";
462 return;
463 }
464 const uint8_t* errorCountVariant = std::get_if<uint8_t>(&property);
465 if (errorCountVariant == nullptr)
466 {
467 std::cerr << propertyName << " invalid\n";
468 return;
469 }
470 uint8_t errorCount = *errorCountVariant;
471 if (errorCount == std::numeric_limits<uint8_t>::max())
472 {
473 std::cerr << "Maximum error count reached\n";
474 return;
475 }
476 // Increment the count
477 errorCount++;
478 conn->async_method_call(
479 [propertyName](boost::system::error_code ec) {
480 if (ec)
481 {
482 std::cerr << "Failed to set " << propertyName << ": "
483 << ec.message() << "\n";
484 }
485 },
486 "xyz.openbmc_project.Settings",
487 "/xyz/openbmc_project/control/processor_error_config",
488 "org.freedesktop.DBus.Properties", "Set",
489 "xyz.openbmc_project.Control.Processor.ErrConfig", propertyName,
490 std::variant<uint8_t>{errorCount});
491 },
492 "xyz.openbmc_project.Settings",
493 "/xyz/openbmc_project/control/processor_error_config",
494 "org.freedesktop.DBus.Properties", "Get",
495 "xyz.openbmc_project.Control.Processor.ErrConfig", propertyName);
496}
497
Jason M. Billsa3397932019-08-06 11:07:21 -0700498static bool checkIERRCPUs()
499{
500 bool cpuIERRFound = false;
501 for (int cpu = 0, addr = MIN_CLIENT_ADDR; addr <= MAX_CLIENT_ADDR;
502 cpu++, addr++)
503 {
504 uint8_t cc = 0;
505 CPUModel model{};
506 uint8_t stepping = 0;
507 if (peci_GetCPUID(addr, &model, &stepping, &cc) != PECI_CC_SUCCESS)
508 {
509 std::cerr << "Cannot get CPUID!\n";
510 continue;
511 }
512
513 switch (model)
514 {
515 case skx:
516 {
517 // First check the MCA_ERR_SRC_LOG to see if this is the CPU
518 // that caused the IERR
519 uint32_t mcaErrSrcLog = 0;
520 if (peci_RdPkgConfig(addr, 0, 5, 4, (uint8_t*)&mcaErrSrcLog,
521 &cc) != PECI_CC_SUCCESS)
522 {
523 continue;
524 }
525 // Check MSMI_INTERNAL (20) and IERR_INTERNAL (27)
526 if ((mcaErrSrcLog & (1 << 20)) || (mcaErrSrcLog & (1 << 27)))
527 {
528 // TODO: Light the CPU fault LED?
529 cpuIERRFound = true;
Jason M. Billsd1a19f62019-08-06 11:52:58 -0700530 incrementCPUErrorCount(cpu);
Jason M. Billsa3397932019-08-06 11:07:21 -0700531 // Next check if it's a CPU/VR mismatch by reading the
532 // IA32_MC4_STATUS MSR (0x411)
533 uint64_t mc4Status = 0;
534 if (peci_RdIAMSR(addr, 0, 0x411, &mc4Status, &cc) !=
535 PECI_CC_SUCCESS)
536 {
537 continue;
538 }
539 // Check MSEC bits 31:24 for
540 // MCA_SVID_VCCIN_VR_ICC_MAX_FAILURE (0x40),
541 // MCA_SVID_VCCIN_VR_VOUT_FAILURE (0x42), or
542 // MCA_SVID_CPU_VR_CAPABILITY_ERROR (0x43)
543 if ((mc4Status & (0x40 << 24)) ||
544 (mc4Status & (0x42 << 24)) ||
545 (mc4Status & (0x43 << 24)))
546 {
547 cpuIERRLog(cpu, "CPU/VR Mismatch");
548 continue;
549 }
550
551 // Next check if it's a Core FIVR fault by looking for a
552 // non-zero value of CORE_FIVR_ERR_LOG (B(1) D30 F2 offset
553 // 80h)
554 uint32_t coreFIVRErrLog = 0;
555 if (peci_RdPCIConfigLocal(
556 addr, 1, 30, 2, 0x80, sizeof(uint32_t),
557 (uint8_t*)&coreFIVRErrLog, &cc) != PECI_CC_SUCCESS)
558 {
559 continue;
560 }
561 if (coreFIVRErrLog)
562 {
563 cpuIERRLog(cpu, "Core FIVR Fault");
564 continue;
565 }
566
567 // Next check if it's an Uncore FIVR fault by looking for a
568 // non-zero value of UNCORE_FIVR_ERR_LOG (B(1) D30 F2 offset
569 // 84h)
570 uint32_t uncoreFIVRErrLog = 0;
571 if (peci_RdPCIConfigLocal(addr, 1, 30, 2, 0x84,
572 sizeof(uint32_t),
573 (uint8_t*)&uncoreFIVRErrLog,
574 &cc) != PECI_CC_SUCCESS)
575 {
576 continue;
577 }
578 if (uncoreFIVRErrLog)
579 {
580 cpuIERRLog(cpu, "Uncore FIVR Fault");
581 continue;
582 }
583
584 // Last if CORE_FIVR_ERR_LOG and UNCORE_FIVR_ERR_LOG are
585 // both zero, but MSEC bits 31:24 have either
586 // MCA_FIVR_CATAS_OVERVOL_FAULT (0x51) or
587 // MCA_FIVR_CATAS_OVERCUR_FAULT (0x52), then log it as an
588 // uncore FIVR fault
589 if (!coreFIVRErrLog && !uncoreFIVRErrLog &&
590 ((mc4Status & (0x51 << 24)) ||
591 (mc4Status & (0x52 << 24))))
592 {
593 cpuIERRLog(cpu, "Uncore FIVR Fault");
594 continue;
595 }
596 cpuIERRLog(cpu);
597 }
598 break;
599 }
600 case icx:
601 {
602 // First check the MCA_ERR_SRC_LOG to see if this is the CPU
603 // that caused the IERR
604 uint32_t mcaErrSrcLog = 0;
605 if (peci_RdPkgConfig(addr, 0, 5, 4, (uint8_t*)&mcaErrSrcLog,
606 &cc) != PECI_CC_SUCCESS)
607 {
608 continue;
609 }
610 // Check MSMI_INTERNAL (20) and IERR_INTERNAL (27)
611 if ((mcaErrSrcLog & (1 << 20)) || (mcaErrSrcLog & (1 << 27)))
612 {
613 // TODO: Light the CPU fault LED?
614 cpuIERRFound = true;
Jason M. Billsd1a19f62019-08-06 11:52:58 -0700615 incrementCPUErrorCount(cpu);
Jason M. Billsa3397932019-08-06 11:07:21 -0700616 // Next check if it's a CPU/VR mismatch by reading the
617 // IA32_MC4_STATUS MSR (0x411)
618 uint64_t mc4Status = 0;
619 if (peci_RdIAMSR(addr, 0, 0x411, &mc4Status, &cc) !=
620 PECI_CC_SUCCESS)
621 {
622 continue;
623 }
624 // TODO: Update MSEC/MSCOD_31_24 check
625 // Check MSEC bits 31:24 for
626 // MCA_SVID_VCCIN_VR_ICC_MAX_FAILURE (0x40),
627 // MCA_SVID_VCCIN_VR_VOUT_FAILURE (0x42), or
628 // MCA_SVID_CPU_VR_CAPABILITY_ERROR (0x43)
629 if ((mc4Status & (0x40 << 24)) ||
630 (mc4Status & (0x42 << 24)) ||
631 (mc4Status & (0x43 << 24)))
632 {
633 cpuIERRLog(cpu, "CPU/VR Mismatch");
634 continue;
635 }
636
637 // Next check if it's a Core FIVR fault by looking for a
638 // non-zero value of CORE_FIVR_ERR_LOG (B(31) D30 F2 offsets
639 // C0h and C4h) (Note: Bus 31 is accessed on PECI as bus 14)
640 uint32_t coreFIVRErrLog0 = 0;
641 uint32_t coreFIVRErrLog1 = 0;
642 if (peci_RdEndPointConfigPciLocal(
643 addr, 0, 14, 30, 2, 0xC0, sizeof(uint32_t),
644 (uint8_t*)&coreFIVRErrLog0, &cc) != PECI_CC_SUCCESS)
645 {
646 continue;
647 }
648 if (peci_RdEndPointConfigPciLocal(
649 addr, 0, 14, 30, 2, 0xC4, sizeof(uint32_t),
650 (uint8_t*)&coreFIVRErrLog1, &cc) != PECI_CC_SUCCESS)
651 {
652 continue;
653 }
654 if (coreFIVRErrLog0 || coreFIVRErrLog1)
655 {
656 cpuIERRLog(cpu, "Core FIVR Fault");
657 continue;
658 }
659
660 // Next check if it's an Uncore FIVR fault by looking for a
661 // non-zero value of UNCORE_FIVR_ERR_LOG (B(31) D30 F2
662 // offset 84h) (Note: Bus 31 is accessed on PECI as bus 14)
663 uint32_t uncoreFIVRErrLog = 0;
664 if (peci_RdEndPointConfigPciLocal(
665 addr, 0, 14, 30, 2, 0x84, sizeof(uint32_t),
666 (uint8_t*)&uncoreFIVRErrLog,
667 &cc) != PECI_CC_SUCCESS)
668 {
669 continue;
670 }
671 if (uncoreFIVRErrLog)
672 {
673 cpuIERRLog(cpu, "Uncore FIVR Fault");
674 continue;
675 }
676
677 // TODO: Update MSEC/MSCOD_31_24 check
678 // Last if CORE_FIVR_ERR_LOG and UNCORE_FIVR_ERR_LOG are
679 // both zero, but MSEC bits 31:24 have either
680 // MCA_FIVR_CATAS_OVERVOL_FAULT (0x51) or
681 // MCA_FIVR_CATAS_OVERCUR_FAULT (0x52), then log it as an
682 // uncore FIVR fault
683 if (!coreFIVRErrLog0 && !coreFIVRErrLog1 &&
684 !uncoreFIVRErrLog &&
685 ((mc4Status & (0x51 << 24)) ||
686 (mc4Status & (0x52 << 24))))
687 {
688 cpuIERRLog(cpu, "Uncore FIVR Fault");
689 continue;
690 }
691 cpuIERRLog(cpu);
692 }
693 break;
694 }
695 }
696 }
697 return cpuIERRFound;
698}
699
Jason M. Billsa15c2522019-08-16 10:01:44 -0700700static void caterrAssertHandler()
701{
Jason M. Billsa15c2522019-08-16 10:01:44 -0700702 caterrAssertTimer.expires_after(std::chrono::milliseconds(caterrTimeoutMs));
703 caterrAssertTimer.async_wait([](const boost::system::error_code ec) {
704 if (ec)
705 {
706 // operation_aborted is expected if timer is canceled
707 // before completion.
708 if (ec != boost::asio::error::operation_aborted)
709 {
710 std::cerr << "caterr timeout async_wait failed: "
711 << ec.message() << "\n";
712 }
Jason M. Billsa15c2522019-08-16 10:01:44 -0700713 return;
714 }
Jason M. Billsa3397932019-08-06 11:07:21 -0700715 std::cerr << "CATERR asserted for " << std::to_string(caterrTimeoutMs)
716 << " ms\n";
Yong Li8c798c72020-04-22 15:29:07 +0800717 beep(beepCPUIERR);
Jason M. Billsa3397932019-08-06 11:07:21 -0700718 if (!checkIERRCPUs())
719 {
720 cpuIERRLog();
721 }
Jason M. Billsa15c2522019-08-16 10:01:44 -0700722 conn->async_method_call(
723 [](boost::system::error_code ec,
724 const std::variant<bool>& property) {
725 if (ec)
726 {
727 return;
728 }
729 const bool* reset = std::get_if<bool>(&property);
730 if (reset == nullptr)
731 {
732 std::cerr << "Unable to read reset on CATERR value\n";
733 return;
734 }
Jason M. Billsb61766b2019-11-26 17:02:44 -0800735 startCrashdumpAndRecovery(*reset, "IERR");
Jason M. Billsa15c2522019-08-16 10:01:44 -0700736 },
737 "xyz.openbmc_project.Settings",
738 "/xyz/openbmc_project/control/processor_error_config",
739 "org.freedesktop.DBus.Properties", "Get",
740 "xyz.openbmc_project.Control.Processor.ErrConfig", "ResetOnCATERR");
741 });
742}
743
Jason M. Bills1490b142019-07-01 15:48:43 -0700744static void caterrHandler()
745{
746 if (!hostOff)
747 {
748 gpiod::line_event gpioLineEvent = caterrLine.event_read();
749
750 bool caterr =
751 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
Yong Li1429ca82020-04-27 16:49:45 +0800752
753 std::vector<Association> associations;
Jason M. Bills1490b142019-07-01 15:48:43 -0700754 if (caterr)
755 {
Jason M. Billsa15c2522019-08-16 10:01:44 -0700756 caterrAssertHandler();
Yong Li1429ca82020-04-27 16:49:45 +0800757 associations.emplace_back(
758 "", "critical",
759 "/xyz/openbmc_project/host_error_monitor/cat_error");
760 associations.emplace_back("", "critical",
761 host_error_monitor::rootPath);
Jason M. Bills1490b142019-07-01 15:48:43 -0700762 }
763 else
764 {
765 caterrAssertTimer.cancel();
Yong Li1429ca82020-04-27 16:49:45 +0800766 associations.emplace_back("", "", "");
Jason M. Bills1490b142019-07-01 15:48:43 -0700767 }
Yong Li1429ca82020-04-27 16:49:45 +0800768 host_error_monitor::associationCATAssert->set_property("Associations",
769 associations);
Jason M. Bills1490b142019-07-01 15:48:43 -0700770 }
771 caterrEvent.async_wait(boost::asio::posix::stream_descriptor::wait_read,
772 [](const boost::system::error_code ec) {
773 if (ec)
774 {
775 std::cerr << "caterr handler error: "
776 << ec.message() << "\n";
777 return;
778 }
779 caterrHandler();
780 });
781}
Jason M. Bills78c5eed2019-08-28 14:00:40 -0700782
Jason M. Billse94f5e12019-09-13 11:11:34 -0700783static void cpu1ThermtripAssertHandler()
784{
Jason M. Bills45e87e02019-09-09 14:45:38 -0700785 if (cpu1FIVRFaultLine.get_value() == 0)
786 {
787 cpuBootFIVRFaultLog(1);
788 }
789 else
790 {
791 cpuThermTripLog(1);
792 }
Jason M. Billse94f5e12019-09-13 11:11:34 -0700793}
794
Jason M. Bills78c5eed2019-08-28 14:00:40 -0700795static void cpu1ThermtripHandler()
796{
Jason M. Bills84951142020-04-17 15:57:11 -0700797 gpiod::line_event gpioLineEvent = cpu1ThermtripLine.event_read();
Jason M. Bills78c5eed2019-08-28 14:00:40 -0700798
Jason M. Bills84951142020-04-17 15:57:11 -0700799 bool cpu1Thermtrip =
800 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
801 if (cpu1Thermtrip)
802 {
803 cpu1ThermtripAssertHandler();
Jason M. Bills78c5eed2019-08-28 14:00:40 -0700804 }
Jason M. Bills84951142020-04-17 15:57:11 -0700805
Jason M. Bills78c5eed2019-08-28 14:00:40 -0700806 cpu1ThermtripEvent.async_wait(
807 boost::asio::posix::stream_descriptor::wait_read,
808 [](const boost::system::error_code ec) {
809 if (ec)
810 {
811 std::cerr << "CPU 1 Thermtrip handler error: " << ec.message()
812 << "\n";
813 return;
814 }
815 cpu1ThermtripHandler();
816 });
817}
818
jayaprakash Mutyala009adbc2019-12-24 22:08:07 +0000819static void cpu1MemtripHandler()
820{
Jason M. Bills5287c022020-05-19 11:16:09 -0700821 gpiod::line_event gpioLineEvent = cpu1MemtripLine.event_read();
jayaprakash Mutyala009adbc2019-12-24 22:08:07 +0000822
Jason M. Bills5287c022020-05-19 11:16:09 -0700823 bool cpu1Memtrip =
824 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
825 if (cpu1Memtrip)
826 {
827 memThermTripLog(1);
jayaprakash Mutyala009adbc2019-12-24 22:08:07 +0000828 }
Jason M. Bills5287c022020-05-19 11:16:09 -0700829
jayaprakash Mutyala009adbc2019-12-24 22:08:07 +0000830 cpu1MemtripEvent.async_wait(
831 boost::asio::posix::stream_descriptor::wait_read,
832 [](const boost::system::error_code ec) {
833 if (ec)
834 {
835 std::cerr << "CPU 1 Memory Thermaltrip handler error: "
836 << ec.message() << "\n";
837 return;
838 }
839 cpu1MemtripHandler();
840 });
841}
842
Jason M. Billse94f5e12019-09-13 11:11:34 -0700843static void cpu2ThermtripAssertHandler()
844{
Jason M. Bills45e87e02019-09-09 14:45:38 -0700845 if (cpu2FIVRFaultLine.get_value() == 0)
846 {
847 cpuBootFIVRFaultLog(2);
848 }
849 else
850 {
851 cpuThermTripLog(2);
852 }
Jason M. Billse94f5e12019-09-13 11:11:34 -0700853}
854
Jason M. Bills78c5eed2019-08-28 14:00:40 -0700855static void cpu2ThermtripHandler()
856{
Jason M. Bills84951142020-04-17 15:57:11 -0700857 gpiod::line_event gpioLineEvent = cpu2ThermtripLine.event_read();
Jason M. Bills78c5eed2019-08-28 14:00:40 -0700858
Jason M. Bills84951142020-04-17 15:57:11 -0700859 bool cpu2Thermtrip =
860 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
861 if (cpu2Thermtrip)
862 {
863 cpu2ThermtripAssertHandler();
Jason M. Bills78c5eed2019-08-28 14:00:40 -0700864 }
Jason M. Bills84951142020-04-17 15:57:11 -0700865
Jason M. Bills78c5eed2019-08-28 14:00:40 -0700866 cpu2ThermtripEvent.async_wait(
867 boost::asio::posix::stream_descriptor::wait_read,
868 [](const boost::system::error_code ec) {
869 if (ec)
870 {
871 std::cerr << "CPU 2 Thermtrip handler error: " << ec.message()
872 << "\n";
873 return;
874 }
875 cpu2ThermtripHandler();
876 });
877}
878
jayaprakash Mutyala009adbc2019-12-24 22:08:07 +0000879static void cpu2MemtripHandler()
880{
Jason M. Bills5287c022020-05-19 11:16:09 -0700881 gpiod::line_event gpioLineEvent = cpu2MemtripLine.event_read();
jayaprakash Mutyala009adbc2019-12-24 22:08:07 +0000882
Jason M. Bills5287c022020-05-19 11:16:09 -0700883 bool cpu2Memtrip =
884 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
885 if (cpu2Memtrip)
886 {
887 memThermTripLog(2);
jayaprakash Mutyala009adbc2019-12-24 22:08:07 +0000888 }
Jason M. Bills5287c022020-05-19 11:16:09 -0700889
jayaprakash Mutyala009adbc2019-12-24 22:08:07 +0000890 cpu2MemtripEvent.async_wait(
891 boost::asio::posix::stream_descriptor::wait_read,
892 [](const boost::system::error_code ec) {
893 if (ec)
894 {
895 std::cerr << "CPU 2 Memory Thermaltrip handler error: "
896 << ec.message() << "\n";
897 return;
898 }
899 cpu2MemtripHandler();
900 });
901}
902
Jason M. Billse94f5e12019-09-13 11:11:34 -0700903static void cpu1VRHotAssertHandler()
904{
905 cpuVRHotLog("CPU 1");
906}
907
Jason M. Bills250fa632019-08-28 15:58:25 -0700908static void cpu1VRHotHandler()
909{
Jason M. Bills84951142020-04-17 15:57:11 -0700910 gpiod::line_event gpioLineEvent = cpu1VRHotLine.event_read();
Jason M. Bills250fa632019-08-28 15:58:25 -0700911
Jason M. Bills84951142020-04-17 15:57:11 -0700912 bool cpu1VRHot =
913 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
914 if (cpu1VRHot)
915 {
916 cpu1VRHotAssertHandler();
Jason M. Bills250fa632019-08-28 15:58:25 -0700917 }
Jason M. Bills84951142020-04-17 15:57:11 -0700918
Jason M. Bills250fa632019-08-28 15:58:25 -0700919 cpu1VRHotEvent.async_wait(boost::asio::posix::stream_descriptor::wait_read,
920 [](const boost::system::error_code ec) {
921 if (ec)
922 {
923 std::cerr << "CPU 1 VRHot handler error: "
924 << ec.message() << "\n";
925 return;
926 }
927 cpu1VRHotHandler();
928 });
929}
930
Jason M. Billse94f5e12019-09-13 11:11:34 -0700931static void cpu1MemABCDVRHotAssertHandler()
932{
933 cpuVRHotLog("CPU 1 Memory ABCD");
934}
935
Jason M. Bills9647ba72019-08-29 14:19:19 -0700936static void cpu1MemABCDVRHotHandler()
937{
Jason M. Bills84951142020-04-17 15:57:11 -0700938 gpiod::line_event gpioLineEvent = cpu1MemABCDVRHotLine.event_read();
Jason M. Bills9647ba72019-08-29 14:19:19 -0700939
Jason M. Bills84951142020-04-17 15:57:11 -0700940 bool cpu1MemABCDVRHot =
941 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
942 if (cpu1MemABCDVRHot)
943 {
944 cpu1MemABCDVRHotAssertHandler();
Jason M. Bills9647ba72019-08-29 14:19:19 -0700945 }
Jason M. Bills84951142020-04-17 15:57:11 -0700946
Jason M. Bills9647ba72019-08-29 14:19:19 -0700947 cpu1MemABCDVRHotEvent.async_wait(
948 boost::asio::posix::stream_descriptor::wait_read,
949 [](const boost::system::error_code ec) {
950 if (ec)
951 {
952 std::cerr << "CPU 1 Memory ABCD VRHot handler error: "
953 << ec.message() << "\n";
954 return;
955 }
956 cpu1MemABCDVRHotHandler();
957 });
958}
959
Jason M. Billse94f5e12019-09-13 11:11:34 -0700960static void cpu1MemEFGHVRHotAssertHandler()
961{
962 cpuVRHotLog("CPU 1 Memory EFGH");
963}
964
Jason M. Bills9647ba72019-08-29 14:19:19 -0700965static void cpu1MemEFGHVRHotHandler()
966{
Jason M. Bills84951142020-04-17 15:57:11 -0700967 gpiod::line_event gpioLineEvent = cpu1MemEFGHVRHotLine.event_read();
Jason M. Bills9647ba72019-08-29 14:19:19 -0700968
Jason M. Bills84951142020-04-17 15:57:11 -0700969 bool cpu1MemEFGHVRHot =
970 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
971 if (cpu1MemEFGHVRHot)
972 {
973 cpu1MemEFGHVRHotAssertHandler();
Jason M. Bills9647ba72019-08-29 14:19:19 -0700974 }
Jason M. Bills84951142020-04-17 15:57:11 -0700975
Jason M. Bills9647ba72019-08-29 14:19:19 -0700976 cpu1MemEFGHVRHotEvent.async_wait(
977 boost::asio::posix::stream_descriptor::wait_read,
978 [](const boost::system::error_code ec) {
979 if (ec)
980 {
981 std::cerr << "CPU 1 Memory EFGH VRHot handler error: "
982 << ec.message() << "\n";
983 return;
984 }
985 cpu1MemEFGHVRHotHandler();
986 });
987}
988
Jason M. Billse94f5e12019-09-13 11:11:34 -0700989static void cpu2VRHotAssertHandler()
990{
991 cpuVRHotLog("CPU 2");
992}
993
Jason M. Bills250fa632019-08-28 15:58:25 -0700994static void cpu2VRHotHandler()
995{
Jason M. Bills84951142020-04-17 15:57:11 -0700996 gpiod::line_event gpioLineEvent = cpu2VRHotLine.event_read();
Jason M. Bills250fa632019-08-28 15:58:25 -0700997
Jason M. Bills84951142020-04-17 15:57:11 -0700998 bool cpu2VRHot =
999 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
1000 if (cpu2VRHot)
1001 {
1002 cpu2VRHotAssertHandler();
Jason M. Bills250fa632019-08-28 15:58:25 -07001003 }
Jason M. Bills84951142020-04-17 15:57:11 -07001004
Jason M. Bills250fa632019-08-28 15:58:25 -07001005 cpu2VRHotEvent.async_wait(boost::asio::posix::stream_descriptor::wait_read,
1006 [](const boost::system::error_code ec) {
1007 if (ec)
1008 {
1009 std::cerr << "CPU 2 VRHot handler error: "
1010 << ec.message() << "\n";
1011 return;
1012 }
1013 cpu2VRHotHandler();
1014 });
1015}
1016
Jason M. Billse94f5e12019-09-13 11:11:34 -07001017static void cpu2MemABCDVRHotAssertHandler()
1018{
1019 cpuVRHotLog("CPU 2 Memory ABCD");
1020}
1021
Jason M. Bills9647ba72019-08-29 14:19:19 -07001022static void cpu2MemABCDVRHotHandler()
1023{
Jason M. Bills84951142020-04-17 15:57:11 -07001024 gpiod::line_event gpioLineEvent = cpu2MemABCDVRHotLine.event_read();
Jason M. Bills9647ba72019-08-29 14:19:19 -07001025
Jason M. Bills84951142020-04-17 15:57:11 -07001026 bool cpu2MemABCDVRHot =
1027 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
1028 if (cpu2MemABCDVRHot)
1029 {
1030 cpu2MemABCDVRHotAssertHandler();
Jason M. Bills9647ba72019-08-29 14:19:19 -07001031 }
Jason M. Bills84951142020-04-17 15:57:11 -07001032
Jason M. Bills9647ba72019-08-29 14:19:19 -07001033 cpu2MemABCDVRHotEvent.async_wait(
1034 boost::asio::posix::stream_descriptor::wait_read,
1035 [](const boost::system::error_code ec) {
1036 if (ec)
1037 {
1038 std::cerr << "CPU 2 Memory ABCD VRHot handler error: "
1039 << ec.message() << "\n";
1040 return;
1041 }
1042 cpu2MemABCDVRHotHandler();
1043 });
1044}
1045
Jason M. Billse94f5e12019-09-13 11:11:34 -07001046static void cpu2MemEFGHVRHotAssertHandler()
1047{
1048 cpuVRHotLog("CPU 2 Memory EFGH");
1049}
1050
Jason M. Bills9647ba72019-08-29 14:19:19 -07001051static void cpu2MemEFGHVRHotHandler()
1052{
Jason M. Bills84951142020-04-17 15:57:11 -07001053 gpiod::line_event gpioLineEvent = cpu2MemEFGHVRHotLine.event_read();
Jason M. Bills9647ba72019-08-29 14:19:19 -07001054
Jason M. Bills84951142020-04-17 15:57:11 -07001055 bool cpu2MemEFGHVRHot =
1056 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
1057 if (cpu2MemEFGHVRHot)
1058 {
1059 cpu2MemEFGHVRHotAssertHandler();
Jason M. Bills9647ba72019-08-29 14:19:19 -07001060 }
Jason M. Bills84951142020-04-17 15:57:11 -07001061
Jason M. Bills9647ba72019-08-29 14:19:19 -07001062 cpu2MemEFGHVRHotEvent.async_wait(
1063 boost::asio::posix::stream_descriptor::wait_read,
1064 [](const boost::system::error_code ec) {
1065 if (ec)
1066 {
1067 std::cerr << "CPU 2 Memory EFGH VRHot handler error: "
1068 << ec.message() << "\n";
1069 return;
1070 }
1071 cpu2MemEFGHVRHotHandler();
1072 });
1073}
1074
Chen Yugange6c0f1c2019-08-02 20:36:42 +08001075static void pchThermtripHandler()
1076{
Yong Li1429ca82020-04-27 16:49:45 +08001077 std::vector<Association> associations;
1078
Jason M. Bills84951142020-04-17 15:57:11 -07001079 gpiod::line_event gpioLineEvent = pchThermtripLine.event_read();
Chen Yugange6c0f1c2019-08-02 20:36:42 +08001080
Jason M. Bills84951142020-04-17 15:57:11 -07001081 bool pchThermtrip =
1082 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
1083 if (pchThermtrip)
1084 {
1085 ssbThermTripLog();
Yong Li1429ca82020-04-27 16:49:45 +08001086 associations.emplace_back(
1087 "", "critical",
1088 "/xyz/openbmc_project/host_error_monitor/ssb_thermal_trip");
1089 associations.emplace_back("", "critical", host_error_monitor::rootPath);
Chen Yugange6c0f1c2019-08-02 20:36:42 +08001090 }
Yong Li1429ca82020-04-27 16:49:45 +08001091 else
1092 {
1093 associations.emplace_back("", "", "");
1094 }
1095 host_error_monitor::associationSSBThermTrip->set_property("Associations",
1096 associations);
Jason M. Bills84951142020-04-17 15:57:11 -07001097
Chen Yugange6c0f1c2019-08-02 20:36:42 +08001098 pchThermtripEvent.async_wait(
1099 boost::asio::posix::stream_descriptor::wait_read,
1100 [](const boost::system::error_code ec) {
1101 if (ec)
1102 {
1103 std::cerr << "PCH Thermal trip handler error: " << ec.message()
1104 << "\n";
1105 return;
1106 }
1107 pchThermtripHandler();
1108 });
1109}
1110
Jason M. Billscbf78532019-08-16 15:32:11 -07001111static std::bitset<MAX_CPUS> checkERRPinCPUs(const int errPin)
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001112{
Jason M. Billscbf78532019-08-16 15:32:11 -07001113 int errPinSts = (1 << errPin);
1114 std::bitset<MAX_CPUS> errPinCPUs = 0;
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001115 for (int cpu = 0, addr = MIN_CLIENT_ADDR; addr <= MAX_CLIENT_ADDR;
1116 cpu++, addr++)
1117 {
1118 if (peci_Ping(addr) == PECI_CC_SUCCESS)
1119 {
1120 uint8_t cc = 0;
1121 CPUModel model{};
1122 uint8_t stepping = 0;
1123 if (peci_GetCPUID(addr, &model, &stepping, &cc) != PECI_CC_SUCCESS)
1124 {
1125 std::cerr << "Cannot get CPUID!\n";
1126 continue;
1127 }
1128
1129 switch (model)
1130 {
1131 case skx:
1132 {
1133 // Check the ERRPINSTS to see if this is the CPU that caused
Jason M. Billscbf78532019-08-16 15:32:11 -07001134 // the ERRx (B(0) D8 F0 offset 210h)
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001135 uint32_t errpinsts = 0;
1136 if (peci_RdPCIConfigLocal(
1137 addr, 0, 8, 0, 0x210, sizeof(uint32_t),
1138 (uint8_t*)&errpinsts, &cc) == PECI_CC_SUCCESS)
1139 {
Jason M. Billscbf78532019-08-16 15:32:11 -07001140 errPinCPUs[cpu] = (errpinsts & errPinSts) != 0;
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001141 }
1142 break;
1143 }
1144 case icx:
1145 {
1146 // Check the ERRPINSTS to see if this is the CPU that caused
Jason M. Billscbf78532019-08-16 15:32:11 -07001147 // the ERRx (B(30) D0 F3 offset 274h) (Note: Bus 30 is
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001148 // accessed on PECI as bus 13)
1149 uint32_t errpinsts = 0;
1150 if (peci_RdEndPointConfigPciLocal(
1151 addr, 0, 13, 0, 3, 0x274, sizeof(uint32_t),
1152 (uint8_t*)&errpinsts, &cc) == PECI_CC_SUCCESS)
1153 {
Jason M. Billscbf78532019-08-16 15:32:11 -07001154 errPinCPUs[cpu] = (errpinsts & errPinSts) != 0;
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001155 }
1156 break;
1157 }
1158 }
1159 }
1160 }
Jason M. Billscbf78532019-08-16 15:32:11 -07001161 return errPinCPUs;
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001162}
1163
Jason M. Billscbf78532019-08-16 15:32:11 -07001164static void errXAssertHandler(const int errPin,
1165 boost::asio::steady_timer& errXAssertTimer)
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001166{
Jason M. Billscbf78532019-08-16 15:32:11 -07001167 // ERRx status is not guaranteed through the timeout, so save which
1168 // CPUs have it asserted
1169 std::bitset<MAX_CPUS> errPinCPUs = checkERRPinCPUs(errPin);
1170 errXAssertTimer.expires_after(std::chrono::milliseconds(errTimeoutMs));
1171 errXAssertTimer.async_wait([errPin, errPinCPUs](
1172 const boost::system::error_code ec) {
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001173 if (ec)
1174 {
1175 // operation_aborted is expected if timer is canceled before
1176 // completion.
1177 if (ec != boost::asio::error::operation_aborted)
1178 {
1179 std::cerr << "err2 timeout async_wait failed: " << ec.message()
1180 << "\n";
1181 }
1182 return;
1183 }
Jason M. Billscbf78532019-08-16 15:32:11 -07001184 std::cerr << "ERR" << std::to_string(errPin) << " asserted for "
1185 << std::to_string(errTimeoutMs) << " ms\n";
1186 if (errPinCPUs.count())
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001187 {
Jason M. Billscbf78532019-08-16 15:32:11 -07001188 for (int i = 0; i < errPinCPUs.size(); i++)
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001189 {
Jason M. Billscbf78532019-08-16 15:32:11 -07001190 if (errPinCPUs[i])
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001191 {
Jason M. Billscbf78532019-08-16 15:32:11 -07001192 cpuERRXLog(errPin, i);
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001193 }
1194 }
1195 }
1196 else
1197 {
Jason M. Billscbf78532019-08-16 15:32:11 -07001198 cpuERRXLog(errPin);
1199 }
1200 });
1201}
1202
Jason M. Bills8c584392019-08-19 11:05:51 -07001203static void err0AssertHandler()
1204{
1205 // Handle the standard ERR0 detection and logging
1206 const static constexpr int err0 = 0;
1207 errXAssertHandler(err0, err0AssertTimer);
1208}
1209
1210static void err0Handler()
1211{
1212 if (!hostOff)
1213 {
1214 gpiod::line_event gpioLineEvent = err0Line.event_read();
1215
1216 bool err0 = gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
1217 if (err0)
1218 {
1219 err0AssertHandler();
1220 }
1221 else
1222 {
1223 err0AssertTimer.cancel();
1224 }
1225 }
1226 err0Event.async_wait(boost::asio::posix::stream_descriptor::wait_read,
1227 [](const boost::system::error_code ec) {
1228 if (ec)
1229 {
1230 std::cerr
1231 << "err0 handler error: " << ec.message()
1232 << "\n";
1233 return;
1234 }
1235 err0Handler();
1236 });
1237}
1238
Jason M. Bills75af3962019-08-19 11:07:17 -07001239static void err1AssertHandler()
1240{
1241 // Handle the standard ERR1 detection and logging
1242 const static constexpr int err1 = 1;
1243 errXAssertHandler(err1, err1AssertTimer);
1244}
1245
1246static void err1Handler()
1247{
1248 if (!hostOff)
1249 {
1250 gpiod::line_event gpioLineEvent = err1Line.event_read();
1251
1252 bool err1 = gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
1253 if (err1)
1254 {
1255 err1AssertHandler();
1256 }
1257 else
1258 {
1259 err1AssertTimer.cancel();
1260 }
1261 }
1262 err1Event.async_wait(boost::asio::posix::stream_descriptor::wait_read,
1263 [](const boost::system::error_code ec) {
1264 if (ec)
1265 {
1266 std::cerr
1267 << "err1 handler error: " << ec.message()
1268 << "\n";
1269 return;
1270 }
1271 err1Handler();
1272 });
1273}
1274
Jason M. Billscbf78532019-08-16 15:32:11 -07001275static void err2AssertHandler()
1276{
1277 // Handle the standard ERR2 detection and logging
1278 const static constexpr int err2 = 2;
1279 errXAssertHandler(err2, err2AssertTimer);
1280 // Also handle reset for ERR2
1281 err2AssertTimer.async_wait([](const boost::system::error_code ec) {
1282 if (ec)
1283 {
1284 // operation_aborted is expected if timer is canceled before
1285 // completion.
1286 if (ec != boost::asio::error::operation_aborted)
1287 {
1288 std::cerr << "err2 timeout async_wait failed: " << ec.message()
1289 << "\n";
1290 }
1291 return;
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001292 }
1293 conn->async_method_call(
1294 [](boost::system::error_code ec,
1295 const std::variant<bool>& property) {
1296 if (ec)
1297 {
1298 return;
1299 }
1300 const bool* reset = std::get_if<bool>(&property);
1301 if (reset == nullptr)
1302 {
1303 std::cerr << "Unable to read reset on ERR2 value\n";
1304 return;
1305 }
Jason M. Billsb61766b2019-11-26 17:02:44 -08001306 startCrashdumpAndRecovery(*reset, "ERR2 Timeout");
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001307 },
1308 "xyz.openbmc_project.Settings",
1309 "/xyz/openbmc_project/control/processor_error_config",
1310 "org.freedesktop.DBus.Properties", "Get",
1311 "xyz.openbmc_project.Control.Processor.ErrConfig", "ResetOnERR2");
Yong Li061eb032020-02-26 15:06:18 +08001312
1313 beep(beepCPUErr2);
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001314 });
1315}
1316
1317static void err2Handler()
1318{
1319 if (!hostOff)
1320 {
1321 gpiod::line_event gpioLineEvent = err2Line.event_read();
1322
1323 bool err2 = gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
1324 if (err2)
1325 {
1326 err2AssertHandler();
1327 }
1328 else
1329 {
1330 err2AssertTimer.cancel();
1331 }
1332 }
1333 err2Event.async_wait(boost::asio::posix::stream_descriptor::wait_read,
1334 [](const boost::system::error_code ec) {
1335 if (ec)
1336 {
1337 std::cerr
1338 << "err2 handler error: " << ec.message()
1339 << "\n";
1340 return;
1341 }
1342 err2Handler();
1343 });
1344}
1345
Jason M. Bills89922f82019-08-06 11:10:02 -07001346static void smiAssertHandler()
1347{
1348 smiAssertTimer.expires_after(std::chrono::milliseconds(smiTimeoutMs));
1349 smiAssertTimer.async_wait([](const boost::system::error_code ec) {
1350 if (ec)
1351 {
1352 // operation_aborted is expected if timer is canceled before
1353 // completion.
1354 if (ec != boost::asio::error::operation_aborted)
1355 {
1356 std::cerr << "smi timeout async_wait failed: " << ec.message()
1357 << "\n";
1358 }
1359 return;
1360 }
1361 std::cerr << "SMI asserted for " << std::to_string(smiTimeoutMs)
1362 << " ms\n";
1363 smiTimeoutLog();
1364 conn->async_method_call(
1365 [](boost::system::error_code ec,
1366 const std::variant<bool>& property) {
1367 if (ec)
1368 {
1369 return;
1370 }
1371 const bool* reset = std::get_if<bool>(&property);
1372 if (reset == nullptr)
1373 {
1374 std::cerr << "Unable to read reset on SMI value\n";
1375 return;
1376 }
Jason M. Bills94785442020-01-07 15:22:09 -08001377#ifdef HOST_ERROR_CRASHDUMP_ON_SMI_TIMEOUT
Jason M. Billsb61766b2019-11-26 17:02:44 -08001378 startCrashdumpAndRecovery(*reset, "SMI Timeout");
Jason M. Bills94785442020-01-07 15:22:09 -08001379#else
1380 if (*reset)
1381 {
Jason M. Billsd69549b2020-08-27 11:42:43 -07001382 std::cerr << "Recovering the system\n";
Jason M. Bills9a9bf982020-08-10 11:58:18 -07001383 startWarmReset();
Jason M. Bills94785442020-01-07 15:22:09 -08001384 }
1385#endif
Jason M. Bills89922f82019-08-06 11:10:02 -07001386 },
1387 "xyz.openbmc_project.Settings",
1388 "/xyz/openbmc_project/control/bmc_reset_disables",
1389 "org.freedesktop.DBus.Properties", "Get",
1390 "xyz.openbmc_project.Control.ResetDisables", "ResetOnSMI");
1391 });
1392}
1393
1394static void smiHandler()
1395{
1396 if (!hostOff)
1397 {
1398 gpiod::line_event gpioLineEvent = smiLine.event_read();
1399
1400 bool smi = gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
1401 if (smi)
1402 {
1403 smiAssertHandler();
1404 }
1405 else
1406 {
1407 smiAssertTimer.cancel();
1408 }
1409 }
1410 smiEvent.async_wait(boost::asio::posix::stream_descriptor::wait_read,
1411 [](const boost::system::error_code ec) {
1412 if (ec)
1413 {
1414 std::cerr
1415 << "smi handler error: " << ec.message()
1416 << "\n";
1417 return;
1418 }
1419 smiHandler();
1420 });
1421}
1422
Jason M. Billsa15c2522019-08-16 10:01:44 -07001423static void initializeErrorState()
1424{
jayaprakash Mutyala53099c42020-03-15 00:16:26 +00001425 // Handle CPU1_MISMATCH if it's asserted now
1426 if (cpu1MismatchLine.get_value() == 1)
1427 {
1428 cpuMismatchLog(1);
1429 }
1430
1431 // Handle CPU2_MISMATCH if it's asserted now
1432 if (cpu2MismatchLine.get_value() == 1)
1433 {
1434 cpuMismatchLog(2);
1435 }
1436
Jason M. Billsa15c2522019-08-16 10:01:44 -07001437 // Handle CPU_CATERR if it's asserted now
1438 if (caterrLine.get_value() == 0)
1439 {
1440 caterrAssertHandler();
Yong Li1429ca82020-04-27 16:49:45 +08001441 std::vector<Association> associations;
1442 associations.emplace_back(
1443 "", "critical", "/xyz/openbmc_project/host_error_monitor/cat_err");
1444 associations.emplace_back("", "critical", host_error_monitor::rootPath);
1445 host_error_monitor::associationCATAssert->set_property("Associations",
1446 associations);
Jason M. Billsa15c2522019-08-16 10:01:44 -07001447 }
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001448
Jason M. Bills8c584392019-08-19 11:05:51 -07001449 // Handle CPU_ERR0 if it's asserted now
1450 if (err0Line.get_value() == 0)
1451 {
1452 err0AssertHandler();
1453 }
1454
Jason M. Bills75af3962019-08-19 11:07:17 -07001455 // Handle CPU_ERR1 if it's asserted now
1456 if (err1Line.get_value() == 0)
1457 {
1458 err1AssertHandler();
1459 }
1460
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001461 // Handle CPU_ERR2 if it's asserted now
1462 if (err2Line.get_value() == 0)
1463 {
1464 err2AssertHandler();
1465 }
Jason M. Bills89922f82019-08-06 11:10:02 -07001466
1467 // Handle SMI if it's asserted now
1468 if (smiLine.get_value() == 0)
1469 {
1470 smiAssertHandler();
1471 }
Jason M. Bills08866542019-08-16 12:04:19 -07001472
Jason M. Billse94f5e12019-09-13 11:11:34 -07001473 // Handle CPU1_THERMTRIP if it's asserted now
1474 if (cpu1ThermtripLine.get_value() == 0)
1475 {
1476 cpu1ThermtripAssertHandler();
1477 }
1478
1479 // Handle CPU2_THERMTRIP if it's asserted now
1480 if (cpu2ThermtripLine.get_value() == 0)
1481 {
1482 cpu2ThermtripAssertHandler();
1483 }
1484
jayaprakash Mutyala009adbc2019-12-24 22:08:07 +00001485 // Handle CPU1_MEM_THERM_EVENT (CPU1 DIMM Thermal trip) if it's asserted now
1486 if (cpu1MemtripLine.get_value() == 0)
1487 {
1488 memThermTripLog(1);
1489 }
1490
1491 // Handle CPU2_MEM_THERM_EVENT (CPU2 DIMM Thermal trip) if it's asserted now
1492 if (cpu2MemtripLine.get_value() == 0)
1493 {
1494 memThermTripLog(2);
1495 }
1496
Jason M. Billse94f5e12019-09-13 11:11:34 -07001497 // Handle CPU1_VRHOT if it's asserted now
1498 if (cpu1VRHotLine.get_value() == 0)
1499 {
1500 cpu1VRHotAssertHandler();
1501 }
1502
1503 // Handle CPU1_MEM_ABCD_VRHOT if it's asserted now
1504 if (cpu1MemABCDVRHotLine.get_value() == 0)
1505 {
1506 cpu1MemABCDVRHotAssertHandler();
1507 }
1508
1509 // Handle CPU1_MEM_EFGH_VRHOT if it's asserted now
1510 if (cpu1MemEFGHVRHotLine.get_value() == 0)
1511 {
1512 cpu1MemEFGHVRHotAssertHandler();
1513 }
1514
1515 // Handle CPU2_VRHOT if it's asserted now
1516 if (cpu2VRHotLine.get_value() == 0)
1517 {
1518 cpu2VRHotAssertHandler();
1519 }
1520
1521 // Handle CPU2_MEM_ABCD_VRHOT if it's asserted now
1522 if (cpu2MemABCDVRHotLine.get_value() == 0)
1523 {
1524 cpu2MemABCDVRHotAssertHandler();
1525 }
1526
1527 // Handle CPU2_MEM_EFGH_VRHOT if it's asserted now
1528 if (cpu2MemEFGHVRHotLine.get_value() == 0)
1529 {
1530 cpu2MemEFGHVRHotAssertHandler();
1531 }
1532
Jason M. Bills08866542019-08-16 12:04:19 -07001533 // Handle PCH_BMC_THERMTRIP if it's asserted now
1534 if (pchThermtripLine.get_value() == 0)
1535 {
1536 ssbThermTripLog();
Yong Li1429ca82020-04-27 16:49:45 +08001537 std::vector<Association> associations;
1538 associations.emplace_back(
1539 "", "critical",
1540 "/xyz/openbmc_project/host_error_monitor/ssb_thermal_trip");
1541 associations.emplace_back("", "critical", host_error_monitor::rootPath);
1542 host_error_monitor::associationSSBThermTrip->set_property(
1543 "Associations", associations);
Jason M. Bills08866542019-08-16 12:04:19 -07001544 }
Jason M. Billsa15c2522019-08-16 10:01:44 -07001545}
Jason M. Bills1490b142019-07-01 15:48:43 -07001546} // namespace host_error_monitor
1547
1548int main(int argc, char* argv[])
1549{
1550 // setup connection to dbus
1551 host_error_monitor::conn =
1552 std::make_shared<sdbusplus::asio::connection>(host_error_monitor::io);
1553
Jason M. Billsc4b91f22019-11-26 17:04:50 -08001554 // Host Error Monitor Service
Jason M. Bills1490b142019-07-01 15:48:43 -07001555 host_error_monitor::conn->request_name(
1556 "xyz.openbmc_project.HostErrorMonitor");
1557 sdbusplus::asio::object_server server =
1558 sdbusplus::asio::object_server(host_error_monitor::conn);
1559
Yong Li1429ca82020-04-27 16:49:45 +08001560 // Associations interface for led status
1561 std::vector<host_error_monitor::Association> associations;
1562 associations.emplace_back("", "", "");
1563 host_error_monitor::associationSSBThermTrip = server.add_interface(
1564 "/xyz/openbmc_project/host_error_monitor/ssb_thermal_trip",
1565 "xyz.openbmc_project.Association.Definitions");
1566 host_error_monitor::associationSSBThermTrip->register_property(
1567 "Associations", associations);
1568 host_error_monitor::associationSSBThermTrip->initialize();
1569
1570 host_error_monitor::associationCATAssert = server.add_interface(
1571 "/xyz/openbmc_project/host_error_monitor/cat_assert",
1572 "xyz.openbmc_project.Association.Definitions");
1573 host_error_monitor::associationCATAssert->register_property("Associations",
1574 associations);
1575 host_error_monitor::associationCATAssert->initialize();
1576
Jason M. Billsc4b91f22019-11-26 17:04:50 -08001577 // Restart Cause Interface
1578 host_error_monitor::hostErrorTimeoutIface =
1579 server.add_interface("/xyz/openbmc_project/host_error_monitor",
1580 "xyz.openbmc_project.HostErrorMonitor.Timeout");
1581
1582 host_error_monitor::hostErrorTimeoutIface->register_property(
1583 "IERRTimeoutMs", host_error_monitor::caterrTimeoutMs,
1584 [](const std::size_t& requested, std::size_t& resp) {
1585 if (requested > host_error_monitor::caterrTimeoutMsMax)
1586 {
1587 std::cerr << "IERRTimeoutMs update to " << requested
1588 << "ms rejected. Cannot be greater than "
1589 << host_error_monitor::caterrTimeoutMsMax << "ms.\n";
1590 return 0;
1591 }
1592 std::cerr << "IERRTimeoutMs updated to " << requested << "ms\n";
1593 host_error_monitor::caterrTimeoutMs = requested;
1594 resp = requested;
1595 return 1;
1596 },
1597 [](std::size_t& resp) { return host_error_monitor::caterrTimeoutMs; });
1598 host_error_monitor::hostErrorTimeoutIface->initialize();
1599
Jason M. Bills1490b142019-07-01 15:48:43 -07001600 // Start tracking host state
1601 std::shared_ptr<sdbusplus::bus::match::match> hostStateMonitor =
1602 host_error_monitor::startHostStateMonitor();
1603
jayaprakash Mutyala53099c42020-03-15 00:16:26 +00001604 // Request CPU1_MISMATCH GPIO events
1605 if (!host_error_monitor::requestGPIOInput(
1606 "CPU1_MISMATCH", host_error_monitor::cpu1MismatchLine))
1607 {
1608 return -1;
1609 }
1610
1611 // Request CPU2_MISMATCH GPIO events
1612 if (!host_error_monitor::requestGPIOInput(
1613 "CPU2_MISMATCH", host_error_monitor::cpu2MismatchLine))
1614 {
1615 return -1;
1616 }
1617
Jason M. Bills1490b142019-07-01 15:48:43 -07001618 // Initialize the host state
1619 host_error_monitor::initializeHostState();
1620
1621 // Request CPU_CATERR GPIO events
1622 if (!host_error_monitor::requestGPIOEvents(
1623 "CPU_CATERR", host_error_monitor::caterrHandler,
1624 host_error_monitor::caterrLine, host_error_monitor::caterrEvent))
1625 {
1626 return -1;
1627 }
1628
Jason M. Bills8c584392019-08-19 11:05:51 -07001629 // Request CPU_ERR0 GPIO events
1630 if (!host_error_monitor::requestGPIOEvents(
1631 "CPU_ERR0", host_error_monitor::err0Handler,
1632 host_error_monitor::err0Line, host_error_monitor::err0Event))
1633 {
1634 return -1;
1635 }
1636
Jason M. Bills75af3962019-08-19 11:07:17 -07001637 // Request CPU_ERR1 GPIO events
1638 if (!host_error_monitor::requestGPIOEvents(
1639 "CPU_ERR1", host_error_monitor::err1Handler,
1640 host_error_monitor::err1Line, host_error_monitor::err1Event))
1641 {
1642 return -1;
1643 }
1644
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001645 // Request CPU_ERR2 GPIO events
1646 if (!host_error_monitor::requestGPIOEvents(
1647 "CPU_ERR2", host_error_monitor::err2Handler,
1648 host_error_monitor::err2Line, host_error_monitor::err2Event))
1649 {
1650 return -1;
1651 }
1652
Jason M. Bills89922f82019-08-06 11:10:02 -07001653 // Request SMI GPIO events
1654 if (!host_error_monitor::requestGPIOEvents(
1655 "SMI", host_error_monitor::smiHandler, host_error_monitor::smiLine,
1656 host_error_monitor::smiEvent))
1657 {
1658 return -1;
1659 }
1660
Jason M. Bills45e87e02019-09-09 14:45:38 -07001661 // Request CPU1_FIVR_FAULT GPIO input
1662 if (!host_error_monitor::requestGPIOInput(
1663 "CPU1_FIVR_FAULT", host_error_monitor::cpu1FIVRFaultLine))
1664 {
1665 return -1;
1666 }
1667
Jason M. Bills78c5eed2019-08-28 14:00:40 -07001668 // Request CPU1_THERMTRIP GPIO events
1669 if (!host_error_monitor::requestGPIOEvents(
1670 "CPU1_THERMTRIP", host_error_monitor::cpu1ThermtripHandler,
1671 host_error_monitor::cpu1ThermtripLine,
1672 host_error_monitor::cpu1ThermtripEvent))
1673 {
1674 return -1;
1675 }
1676
Jason M. Bills45e87e02019-09-09 14:45:38 -07001677 // Request CPU2_FIVR_FAULT GPIO input
1678 if (!host_error_monitor::requestGPIOInput(
1679 "CPU2_FIVR_FAULT", host_error_monitor::cpu2FIVRFaultLine))
1680 {
1681 return -1;
1682 }
1683
Jason M. Bills78c5eed2019-08-28 14:00:40 -07001684 // Request CPU2_THERMTRIP GPIO events
1685 if (!host_error_monitor::requestGPIOEvents(
1686 "CPU2_THERMTRIP", host_error_monitor::cpu2ThermtripHandler,
1687 host_error_monitor::cpu2ThermtripLine,
1688 host_error_monitor::cpu2ThermtripEvent))
1689 {
1690 return -1;
1691 }
1692
Jason M. Bills250fa632019-08-28 15:58:25 -07001693 // Request CPU1_VRHOT GPIO events
1694 if (!host_error_monitor::requestGPIOEvents(
1695 "CPU1_VRHOT", host_error_monitor::cpu1VRHotHandler,
1696 host_error_monitor::cpu1VRHotLine,
1697 host_error_monitor::cpu1VRHotEvent))
1698 {
1699 return -1;
1700 }
1701
Jason M. Bills9647ba72019-08-29 14:19:19 -07001702 // Request CPU1_MEM_ABCD_VRHOT GPIO events
1703 if (!host_error_monitor::requestGPIOEvents(
1704 "CPU1_MEM_ABCD_VRHOT", host_error_monitor::cpu1MemABCDVRHotHandler,
1705 host_error_monitor::cpu1MemABCDVRHotLine,
1706 host_error_monitor::cpu1MemABCDVRHotEvent))
1707 {
1708 return -1;
1709 }
1710
1711 // Request CPU1_MEM_EFGH_VRHOT GPIO events
1712 if (!host_error_monitor::requestGPIOEvents(
1713 "CPU1_MEM_EFGH_VRHOT", host_error_monitor::cpu1MemEFGHVRHotHandler,
1714 host_error_monitor::cpu1MemEFGHVRHotLine,
1715 host_error_monitor::cpu1MemEFGHVRHotEvent))
1716 {
1717 return -1;
1718 }
1719
Jason M. Bills250fa632019-08-28 15:58:25 -07001720 // Request CPU2_VRHOT GPIO events
1721 if (!host_error_monitor::requestGPIOEvents(
1722 "CPU2_VRHOT", host_error_monitor::cpu2VRHotHandler,
1723 host_error_monitor::cpu2VRHotLine,
1724 host_error_monitor::cpu2VRHotEvent))
1725 {
1726 return -1;
1727 }
1728
Jason M. Bills9647ba72019-08-29 14:19:19 -07001729 // Request CPU2_MEM_ABCD_VRHOT GPIO events
1730 if (!host_error_monitor::requestGPIOEvents(
1731 "CPU2_MEM_ABCD_VRHOT", host_error_monitor::cpu2MemABCDVRHotHandler,
1732 host_error_monitor::cpu2MemABCDVRHotLine,
1733 host_error_monitor::cpu2MemABCDVRHotEvent))
1734 {
1735 return -1;
1736 }
1737
1738 // Request CPU2_MEM_EFGH_VRHOT GPIO events
1739 if (!host_error_monitor::requestGPIOEvents(
1740 "CPU2_MEM_EFGH_VRHOT", host_error_monitor::cpu2MemEFGHVRHotHandler,
1741 host_error_monitor::cpu2MemEFGHVRHotLine,
1742 host_error_monitor::cpu2MemEFGHVRHotEvent))
1743 {
1744 return -1;
1745 }
1746
Chen Yugange6c0f1c2019-08-02 20:36:42 +08001747 // Request PCH_BMC_THERMTRIP GPIO events
1748 if (!host_error_monitor::requestGPIOEvents(
1749 "PCH_BMC_THERMTRIP", host_error_monitor::pchThermtripHandler,
1750 host_error_monitor::pchThermtripLine,
1751 host_error_monitor::pchThermtripEvent))
1752 {
1753 return -1;
1754 }
1755
jayaprakash Mutyala009adbc2019-12-24 22:08:07 +00001756 // Request CPU1_MEM_THERM_EVENT GPIO events
1757 if (!host_error_monitor::requestGPIOEvents(
1758 "CPU1_MEM_THERM_EVENT", host_error_monitor::cpu1MemtripHandler,
1759 host_error_monitor::cpu1MemtripLine,
1760 host_error_monitor::cpu1MemtripEvent))
1761 {
1762 return -1;
1763 }
1764
1765 // Request CPU2_MEM_THERM_EVENT GPIO events
1766 if (!host_error_monitor::requestGPIOEvents(
1767 "CPU2_MEM_THERM_EVENT", host_error_monitor::cpu2MemtripHandler,
1768 host_error_monitor::cpu2MemtripLine,
1769 host_error_monitor::cpu2MemtripEvent))
1770 {
1771 return -1;
1772 }
1773
Jason M. Bills1490b142019-07-01 15:48:43 -07001774 host_error_monitor::io.run();
1775
1776 return 0;
1777}