blob: 313ef29e0b813fe8472c8e72d2547979a91bde90 [file] [log] [blame]
/*
// Copyright (c) 2019 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
*/
Jason M. Bills6a2cb692019-08-06 11:03:49 -070016#include <peci.h>
Chen Yugange6c0f1c2019-08-02 20:36:42 +080017#include <systemd/sd-journal.h>
18
Jason M. Bills1490b142019-07-01 15:48:43 -070019#include <boost/asio/posix/stream_descriptor.hpp>
20#include <gpiod.hpp>
Jason M. Bills1490b142019-07-01 15:48:43 -070021#include <sdbusplus/asio/object_server.hpp>
Jason M. Bills48e5dff2020-06-10 13:47:47 -070022
23#include <bitset>
24#include <iostream>
Jason M. Billsd1a19f62019-08-06 11:52:58 -070025#include <variant>
Jason M. Bills1490b142019-07-01 15:48:43 -070026
27namespace host_error_monitor
28{
29static boost::asio::io_service io;
30static std::shared_ptr<sdbusplus::asio::connection> conn;
Jason M. Billsc4b91f22019-11-26 17:04:50 -080031static std::shared_ptr<sdbusplus::asio::dbus_interface> hostErrorTimeoutIface;
Jason M. Bills1490b142019-07-01 15:48:43 -070032
Yong Li1429ca82020-04-27 16:49:45 +080033using Association = std::tuple<std::string, std::string, std::string>;
34static std::shared_ptr<sdbusplus::asio::dbus_interface> associationSSBThermTrip;
35static std::shared_ptr<sdbusplus::asio::dbus_interface> associationCATAssert;
36
37static const constexpr char* rootPath = "/xyz/openbmc_project/CallbackManager";
38
Jason M. Bills1490b142019-07-01 15:48:43 -070039static bool hostOff = true;
40
Jason M. Billsc4b91f22019-11-26 17:04:50 -080041static size_t caterrTimeoutMs = 2000;
42const static constexpr size_t caterrTimeoutMsMax = 600000; // 10 minutes maximum
Jason M. Billscbf78532019-08-16 15:32:11 -070043const static constexpr size_t errTimeoutMs = 90000;
Jason M. Bills89922f82019-08-06 11:10:02 -070044const static constexpr size_t smiTimeoutMs = 90000;
Jason M. Bills1490b142019-07-01 15:48:43 -070045const static constexpr size_t crashdumpTimeoutS = 300;
46
47// Timers
48// Timer for CATERR asserted
49static boost::asio::steady_timer caterrAssertTimer(io);
Jason M. Bills8c584392019-08-19 11:05:51 -070050// Timer for ERR0 asserted
51static boost::asio::steady_timer err0AssertTimer(io);
Jason M. Bills75af3962019-08-19 11:07:17 -070052// Timer for ERR1 asserted
53static boost::asio::steady_timer err1AssertTimer(io);
Jason M. Bills6a2cb692019-08-06 11:03:49 -070054// Timer for ERR2 asserted
55static boost::asio::steady_timer err2AssertTimer(io);
Jason M. Bills89922f82019-08-06 11:10:02 -070056// Timer for SMI asserted
57static boost::asio::steady_timer smiAssertTimer(io);
Jason M. Bills1490b142019-07-01 15:48:43 -070058
59// GPIO Lines and Event Descriptors
60static gpiod::line caterrLine;
61static boost::asio::posix::stream_descriptor caterrEvent(io);
Jason M. Bills8c584392019-08-19 11:05:51 -070062static gpiod::line err0Line;
63static boost::asio::posix::stream_descriptor err0Event(io);
Jason M. Bills75af3962019-08-19 11:07:17 -070064static gpiod::line err1Line;
65static boost::asio::posix::stream_descriptor err1Event(io);
Jason M. Bills6a2cb692019-08-06 11:03:49 -070066static gpiod::line err2Line;
67static boost::asio::posix::stream_descriptor err2Event(io);
Jason M. Bills89922f82019-08-06 11:10:02 -070068static gpiod::line smiLine;
69static boost::asio::posix::stream_descriptor smiEvent(io);
Jason M. Bills45e87e02019-09-09 14:45:38 -070070static gpiod::line cpu1FIVRFaultLine;
Jason M. Bills78c5eed2019-08-28 14:00:40 -070071static gpiod::line cpu1ThermtripLine;
72static boost::asio::posix::stream_descriptor cpu1ThermtripEvent(io);
Jason M. Bills45e87e02019-09-09 14:45:38 -070073static gpiod::line cpu2FIVRFaultLine;
Jason M. Bills78c5eed2019-08-28 14:00:40 -070074static gpiod::line cpu2ThermtripLine;
75static boost::asio::posix::stream_descriptor cpu2ThermtripEvent(io);
Jason M. Bills250fa632019-08-28 15:58:25 -070076static gpiod::line cpu1VRHotLine;
77static boost::asio::posix::stream_descriptor cpu1VRHotEvent(io);
78static gpiod::line cpu2VRHotLine;
Jason M. Bills9647ba72019-08-29 14:19:19 -070079static boost::asio::posix::stream_descriptor cpu1MemABCDVRHotEvent(io);
80static gpiod::line cpu1MemEFGHVRHotLine;
81static boost::asio::posix::stream_descriptor cpu1MemEFGHVRHotEvent(io);
82static gpiod::line cpu2MemABCDVRHotLine;
Jason M. Bills250fa632019-08-28 15:58:25 -070083static boost::asio::posix::stream_descriptor cpu2VRHotEvent(io);
Jason M. Bills9647ba72019-08-29 14:19:19 -070084static gpiod::line cpu1MemABCDVRHotLine;
85static boost::asio::posix::stream_descriptor cpu2MemABCDVRHotEvent(io);
86static gpiod::line cpu2MemEFGHVRHotLine;
87static boost::asio::posix::stream_descriptor cpu2MemEFGHVRHotEvent(io);
Chen Yugange6c0f1c2019-08-02 20:36:42 +080088//----------------------------------
89// PCH_BMC_THERMTRIP function related definition
90//----------------------------------
Chen Yugange6c0f1c2019-08-02 20:36:42 +080091static gpiod::line pchThermtripLine;
92static boost::asio::posix::stream_descriptor pchThermtripEvent(io);
jayaprakash Mutyala009adbc2019-12-24 22:08:07 +000093//----------------------------------
94// CPU_MEM_THERM_EVENT function related definition
95//----------------------------------
96static gpiod::line cpu1MemtripLine;
97static boost::asio::posix::stream_descriptor cpu1MemtripEvent(io);
98static gpiod::line cpu2MemtripLine;
99static boost::asio::posix::stream_descriptor cpu2MemtripEvent(io);
jayaprakash Mutyala53099c42020-03-15 00:16:26 +0000100//---------------------------------
101// CPU_MISMATCH function related definition
102//---------------------------------
103static gpiod::line cpu1MismatchLine;
104static gpiod::line cpu2MismatchLine;
Jason M. Bills1490b142019-07-01 15:48:43 -0700105
Yong Li061eb032020-02-26 15:06:18 +0800106// beep function for CPU error
Yong Li8c798c72020-04-22 15:29:07 +0800107const static constexpr uint8_t beepCPUIERR = 4;
Yong Li061eb032020-02-26 15:06:18 +0800108const static constexpr uint8_t beepCPUErr2 = 5;
109
110static void beep(const uint8_t& beepPriority)
111{
112 conn->async_method_call(
113 [](boost::system::error_code ec) {
114 if (ec)
115 {
116 std::cerr << "beep returned error with "
117 "async_method_call (ec = "
118 << ec << ")\n";
119 return;
120 }
121 },
122 "xyz.openbmc_project.BeepCode", "/xyz/openbmc_project/BeepCode",
123 "xyz.openbmc_project.BeepCode", "Beep", uint8_t(beepPriority));
124}
125
Jason M. Billsa3397932019-08-06 11:07:21 -0700126static void cpuIERRLog()
127{
128 sd_journal_send("MESSAGE=HostError: IERR", "PRIORITY=%i", LOG_INFO,
129 "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError",
130 "REDFISH_MESSAGE_ARGS=%s", "IERR", NULL);
131}
132
133static void cpuIERRLog(const int cpuNum)
134{
135 std::string msg = "IERR on CPU " + std::to_string(cpuNum + 1);
136
137 sd_journal_send("MESSAGE=HostError: %s", msg.c_str(), "PRIORITY=%i",
138 LOG_INFO, "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError",
139 "REDFISH_MESSAGE_ARGS=%s", msg.c_str(), NULL);
140}
141
142static void cpuIERRLog(const int cpuNum, const std::string& type)
143{
144 std::string msg = type + " IERR on CPU " + std::to_string(cpuNum + 1);
145
146 sd_journal_send("MESSAGE=HostError: %s", msg.c_str(), "PRIORITY=%i",
147 LOG_INFO, "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError",
148 "REDFISH_MESSAGE_ARGS=%s", msg.c_str(), NULL);
149}
150
Jason M. Billscbf78532019-08-16 15:32:11 -0700151static void cpuERRXLog(const int errPin)
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700152{
Jason M. Billscbf78532019-08-16 15:32:11 -0700153 std::string msg = "ERR" + std::to_string(errPin) + " Timeout";
154
155 sd_journal_send("MESSAGE=HostError: %s", msg.c_str(), "PRIORITY=%i",
156 LOG_INFO, "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError",
157 "REDFISH_MESSAGE_ARGS=%s", msg.c_str(), NULL);
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700158}
159
Jason M. Billscbf78532019-08-16 15:32:11 -0700160static void cpuERRXLog(const int errPin, const int cpuNum)
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700161{
Jason M. Billscbf78532019-08-16 15:32:11 -0700162 std::string msg = "ERR" + std::to_string(errPin) + " Timeout on CPU " +
163 std::to_string(cpuNum + 1);
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700164
165 sd_journal_send("MESSAGE=HostError: %s", msg.c_str(), "PRIORITY=%i",
166 LOG_INFO, "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError",
167 "REDFISH_MESSAGE_ARGS=%s", msg.c_str(), NULL);
168}
169
Jason M. Bills89922f82019-08-06 11:10:02 -0700170static void smiTimeoutLog()
171{
172 sd_journal_send("MESSAGE=HostError: SMI Timeout", "PRIORITY=%i", LOG_INFO,
173 "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError",
174 "REDFISH_MESSAGE_ARGS=%s", "SMI Timeout", NULL);
175}
176
Jason M. Bills45e87e02019-09-09 14:45:38 -0700177static void cpuBootFIVRFaultLog(const int cpuNum)
178{
179 std::string msg = "Boot FIVR Fault on CPU " + std::to_string(cpuNum);
180
181 sd_journal_send("MESSAGE=HostError: %s", msg.c_str(), "PRIORITY=%i",
182 LOG_INFO, "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError",
183 "REDFISH_MESSAGE_ARGS=%s", msg.c_str(), NULL);
184}
185
Jason M. Bills78c5eed2019-08-28 14:00:40 -0700186static void cpuThermTripLog(const int cpuNum)
187{
188 std::string msg = "CPU " + std::to_string(cpuNum) + " thermal trip";
189
190 sd_journal_send("MESSAGE=HostError: %s", msg.c_str(), "PRIORITY=%i",
191 LOG_INFO, "REDFISH_MESSAGE_ID=%s",
192 "OpenBMC.0.1.CPUThermalTrip", "REDFISH_MESSAGE_ARGS=%d",
193 cpuNum, NULL);
194}
195
jayaprakash Mutyala009adbc2019-12-24 22:08:07 +0000196static void memThermTripLog(const int cpuNum)
197{
198 std::string cpuNumber = "CPU " + std::to_string(cpuNum);
199 std::string msg = cpuNumber + " Memory Thermal trip.";
200
201 sd_journal_send("MESSAGE=HostError: %s", msg.c_str(), "PRIORITY=%i",
202 LOG_ERR, "REDFISH_MESSAGE_ID=%s",
203 "OpenBMC.0.1.MemoryThermTrip", "REDFISH_MESSAGE_ARGS=%s",
204 cpuNumber.c_str(), NULL);
205}
206
jayaprakash Mutyala53099c42020-03-15 00:16:26 +0000207static void cpuMismatchLog(const int cpuNum)
208{
209 std::string msg = "CPU " + std::to_string(cpuNum) + " mismatch";
210
211 sd_journal_send("MESSAGE= %s", msg.c_str(), "PRIORITY=%i", LOG_ERR,
212 "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUMismatch",
213 "REDFISH_MESSAGE_ARGS=%d", cpuNum, NULL);
214}
215
Jason M. Bills250fa632019-08-28 15:58:25 -0700216static void cpuVRHotLog(const std::string& vr)
217{
218 std::string msg = vr + " Voltage Regulator Overheated.";
219
220 sd_journal_send("MESSAGE=HostError: %s", msg.c_str(), "PRIORITY=%i",
221 LOG_INFO, "REDFISH_MESSAGE_ID=%s",
222 "OpenBMC.0.1.VoltageRegulatorOverheated",
223 "REDFISH_MESSAGE_ARGS=%s", vr.c_str(), NULL);
224}
225
Jason M. Bills08866542019-08-16 12:04:19 -0700226static void ssbThermTripLog()
227{
228 sd_journal_send("MESSAGE=HostError: SSB thermal trip", "PRIORITY=%i",
229 LOG_INFO, "REDFISH_MESSAGE_ID=%s",
230 "OpenBMC.0.1.SsbThermalTrip", NULL);
231}
232
Jason M. Billsa15c2522019-08-16 10:01:44 -0700233static void initializeErrorState();
Jason M. Bills1490b142019-07-01 15:48:43 -0700234static void initializeHostState()
235{
236 conn->async_method_call(
237 [](boost::system::error_code ec,
238 const std::variant<std::string>& property) {
239 if (ec)
240 {
241 return;
242 }
243 const std::string* state = std::get_if<std::string>(&property);
244 if (state == nullptr)
245 {
246 std::cerr << "Unable to read host state value\n";
247 return;
248 }
249 hostOff = *state == "xyz.openbmc_project.State.Host.HostState.Off";
Jason M. Billsa15c2522019-08-16 10:01:44 -0700250 // If the system is on, initialize the error state
251 if (!hostOff)
252 {
253 initializeErrorState();
254 }
Jason M. Bills1490b142019-07-01 15:48:43 -0700255 },
256 "xyz.openbmc_project.State.Host", "/xyz/openbmc_project/state/host0",
257 "org.freedesktop.DBus.Properties", "Get",
258 "xyz.openbmc_project.State.Host", "CurrentHostState");
259}
260
261static std::shared_ptr<sdbusplus::bus::match::match> startHostStateMonitor()
262{
263 return std::make_shared<sdbusplus::bus::match::match>(
264 *conn,
265 "type='signal',interface='org.freedesktop.DBus.Properties',"
Jason M. Bills2fbb9ea2020-06-19 14:46:54 -0700266 "member='PropertiesChanged',arg0='xyz.openbmc_project.State.Host'",
Jason M. Bills1490b142019-07-01 15:48:43 -0700267 [](sdbusplus::message::message& msg) {
268 std::string interfaceName;
269 boost::container::flat_map<std::string, std::variant<std::string>>
270 propertiesChanged;
Jason M. Bills1490b142019-07-01 15:48:43 -0700271 try
272 {
273 msg.read(interfaceName, propertiesChanged);
Jason M. Bills1490b142019-07-01 15:48:43 -0700274 }
275 catch (std::exception& e)
276 {
277 std::cerr << "Unable to read host state\n";
278 return;
279 }
Jason M. Bills566ccc42020-06-18 16:38:26 -0700280 // We only want to check for CurrentHostState
281 if (propertiesChanged.begin()->first != "CurrentHostState")
282 {
283 return;
284 }
285 std::string* state =
286 std::get_if<std::string>(&(propertiesChanged.begin()->second));
287 if (state == nullptr)
288 {
289 std::cerr << propertiesChanged.begin()->first
290 << " property invalid\n";
291 return;
292 }
293
294 hostOff = *state == "xyz.openbmc_project.State.Host.HostState.Off";
Jason M. Bills1490b142019-07-01 15:48:43 -0700295
Jason M. Bills1490b142019-07-01 15:48:43 -0700296 if (hostOff)
297 {
Jason M. Billse94f5e12019-09-13 11:11:34 -0700298 // No host events should fire while off, so cancel any pending
299 // timers
Jason M. Bills1490b142019-07-01 15:48:43 -0700300 caterrAssertTimer.cancel();
Jason M. Bills8c584392019-08-19 11:05:51 -0700301 err0AssertTimer.cancel();
Jason M. Bills75af3962019-08-19 11:07:17 -0700302 err1AssertTimer.cancel();
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700303 err2AssertTimer.cancel();
Jason M. Bills89922f82019-08-06 11:10:02 -0700304 smiAssertTimer.cancel();
Jason M. Bills1490b142019-07-01 15:48:43 -0700305 }
Jason M. Billse94f5e12019-09-13 11:11:34 -0700306 else
307 {
308 // Handle any initial errors when the host turns on
309 initializeErrorState();
310 }
Jason M. Bills1490b142019-07-01 15:48:43 -0700311 });
312}
313
314static bool requestGPIOEvents(
315 const std::string& name, const std::function<void()>& handler,
316 gpiod::line& gpioLine,
317 boost::asio::posix::stream_descriptor& gpioEventDescriptor)
318{
319 // Find the GPIO line
320 gpioLine = gpiod::find_line(name);
321 if (!gpioLine)
322 {
323 std::cerr << "Failed to find the " << name << " line\n";
324 return false;
325 }
326
327 try
328 {
329 gpioLine.request(
330 {"host-error-monitor", gpiod::line_request::EVENT_BOTH_EDGES});
331 }
332 catch (std::exception&)
333 {
334 std::cerr << "Failed to request events for " << name << "\n";
335 return false;
336 }
337
338 int gpioLineFd = gpioLine.event_get_fd();
339 if (gpioLineFd < 0)
340 {
341 std::cerr << "Failed to get " << name << " fd\n";
342 return false;
343 }
344
345 gpioEventDescriptor.assign(gpioLineFd);
346
347 gpioEventDescriptor.async_wait(
348 boost::asio::posix::stream_descriptor::wait_read,
349 [&name, handler](const boost::system::error_code ec) {
350 if (ec)
351 {
352 std::cerr << name << " fd handler error: " << ec.message()
353 << "\n";
354 return;
355 }
356 handler();
357 });
358 return true;
359}
360
Jason M. Bills45e87e02019-09-09 14:45:38 -0700361static bool requestGPIOInput(const std::string& name, gpiod::line& gpioLine)
362{
363 // Find the GPIO line
364 gpioLine = gpiod::find_line(name);
365 if (!gpioLine)
366 {
367 std::cerr << "Failed to find the " << name << " line.\n";
368 return false;
369 }
370
371 // Request GPIO input
372 try
373 {
374 gpioLine.request({__FUNCTION__, gpiod::line_request::DIRECTION_INPUT});
375 }
376 catch (std::exception&)
377 {
378 std::cerr << "Failed to request " << name << " input\n";
379 return false;
380 }
381
382 return true;
383}
384
Jason M. Bills1490b142019-07-01 15:48:43 -0700385static void startPowerCycle()
386{
387 conn->async_method_call(
388 [](boost::system::error_code ec) {
389 if (ec)
390 {
391 std::cerr << "failed to set Chassis State\n";
392 }
393 },
394 "xyz.openbmc_project.State.Chassis",
395 "/xyz/openbmc_project/state/chassis0",
396 "org.freedesktop.DBus.Properties", "Set",
397 "xyz.openbmc_project.State.Chassis", "RequestedPowerTransition",
398 std::variant<std::string>{
399 "xyz.openbmc_project.State.Chassis.Transition.PowerCycle"});
400}
401
Jason M. Bills9a9bf982020-08-10 11:58:18 -0700402static void startWarmReset()
403{
404 conn->async_method_call(
405 [](boost::system::error_code ec) {
406 if (ec)
407 {
408 std::cerr << "failed to set Host State\n";
409 }
410 },
411 "xyz.openbmc_project.State.Host", "/xyz/openbmc_project/state/host0",
412 "org.freedesktop.DBus.Properties", "Set",
413 "xyz.openbmc_project.State.Host", "RequestedHostTransition",
414 std::variant<std::string>{
415 "xyz.openbmc_project.State.Host.Transition.ForceWarmReboot"});
416}
417
Jason M. Billsb61766b2019-11-26 17:02:44 -0800418static void startCrashdumpAndRecovery(bool recoverSystem,
419 const std::string& triggerType)
Jason M. Bills1490b142019-07-01 15:48:43 -0700420{
421 std::cout << "Starting crashdump\n";
422 static std::shared_ptr<sdbusplus::bus::match::match> crashdumpCompleteMatch;
423 static boost::asio::steady_timer crashdumpTimer(io);
424
425 crashdumpCompleteMatch = std::make_shared<sdbusplus::bus::match::match>(
426 *conn,
427 "type='signal',interface='org.freedesktop.DBus.Properties',"
428 "member='PropertiesChanged',arg0namespace='com.intel.crashdump'",
429 [recoverSystem](sdbusplus::message::message& msg) {
430 crashdumpTimer.cancel();
431 std::cout << "Crashdump completed\n";
432 if (recoverSystem)
433 {
434 std::cout << "Recovering the system\n";
Jason M. Bills9a9bf982020-08-10 11:58:18 -0700435 startWarmReset();
Jason M. Bills1490b142019-07-01 15:48:43 -0700436 }
437 crashdumpCompleteMatch.reset();
438 });
439
440 crashdumpTimer.expires_after(std::chrono::seconds(crashdumpTimeoutS));
441 crashdumpTimer.async_wait([](const boost::system::error_code ec) {
442 if (ec)
443 {
444 // operation_aborted is expected if timer is canceled
445 if (ec != boost::asio::error::operation_aborted)
446 {
447 std::cerr << "Crashdump async_wait failed: " << ec.message()
448 << "\n";
449 }
450 std::cout << "Crashdump timer canceled\n";
451 return;
452 }
453 std::cerr << "Crashdump failed to complete before timeout\n";
454 crashdumpCompleteMatch.reset();
455 });
456
457 conn->async_method_call(
458 [](boost::system::error_code ec) {
459 if (ec)
460 {
461 std::cerr << "failed to start Crashdump\n";
462 crashdumpTimer.cancel();
463 crashdumpCompleteMatch.reset();
464 }
465 },
466 "com.intel.crashdump", "/com/intel/crashdump",
Jason M. Billsb61766b2019-11-26 17:02:44 -0800467 "com.intel.crashdump.Stored", "GenerateStoredLog", triggerType);
Jason M. Bills1490b142019-07-01 15:48:43 -0700468}
469
Jason M. Billsd1a19f62019-08-06 11:52:58 -0700470static void incrementCPUErrorCount(int cpuNum)
471{
472 std::string propertyName = "ErrorCountCPU" + std::to_string(cpuNum + 1);
473
474 // Get the current count
475 conn->async_method_call(
476 [propertyName](boost::system::error_code ec,
477 const std::variant<uint8_t>& property) {
478 if (ec)
479 {
480 std::cerr << "Failed to read " << propertyName << ": "
481 << ec.message() << "\n";
482 return;
483 }
484 const uint8_t* errorCountVariant = std::get_if<uint8_t>(&property);
485 if (errorCountVariant == nullptr)
486 {
487 std::cerr << propertyName << " invalid\n";
488 return;
489 }
490 uint8_t errorCount = *errorCountVariant;
491 if (errorCount == std::numeric_limits<uint8_t>::max())
492 {
493 std::cerr << "Maximum error count reached\n";
494 return;
495 }
496 // Increment the count
497 errorCount++;
498 conn->async_method_call(
499 [propertyName](boost::system::error_code ec) {
500 if (ec)
501 {
502 std::cerr << "Failed to set " << propertyName << ": "
503 << ec.message() << "\n";
504 }
505 },
506 "xyz.openbmc_project.Settings",
507 "/xyz/openbmc_project/control/processor_error_config",
508 "org.freedesktop.DBus.Properties", "Set",
509 "xyz.openbmc_project.Control.Processor.ErrConfig", propertyName,
510 std::variant<uint8_t>{errorCount});
511 },
512 "xyz.openbmc_project.Settings",
513 "/xyz/openbmc_project/control/processor_error_config",
514 "org.freedesktop.DBus.Properties", "Get",
515 "xyz.openbmc_project.Control.Processor.ErrConfig", propertyName);
516}
517
Jason M. Billsa3397932019-08-06 11:07:21 -0700518static bool checkIERRCPUs()
519{
520 bool cpuIERRFound = false;
521 for (int cpu = 0, addr = MIN_CLIENT_ADDR; addr <= MAX_CLIENT_ADDR;
522 cpu++, addr++)
523 {
524 uint8_t cc = 0;
525 CPUModel model{};
526 uint8_t stepping = 0;
527 if (peci_GetCPUID(addr, &model, &stepping, &cc) != PECI_CC_SUCCESS)
528 {
529 std::cerr << "Cannot get CPUID!\n";
530 continue;
531 }
532
533 switch (model)
534 {
535 case skx:
536 {
537 // First check the MCA_ERR_SRC_LOG to see if this is the CPU
538 // that caused the IERR
539 uint32_t mcaErrSrcLog = 0;
540 if (peci_RdPkgConfig(addr, 0, 5, 4, (uint8_t*)&mcaErrSrcLog,
541 &cc) != PECI_CC_SUCCESS)
542 {
543 continue;
544 }
545 // Check MSMI_INTERNAL (20) and IERR_INTERNAL (27)
546 if ((mcaErrSrcLog & (1 << 20)) || (mcaErrSrcLog & (1 << 27)))
547 {
548 // TODO: Light the CPU fault LED?
549 cpuIERRFound = true;
Jason M. Billsd1a19f62019-08-06 11:52:58 -0700550 incrementCPUErrorCount(cpu);
Jason M. Billsa3397932019-08-06 11:07:21 -0700551 // Next check if it's a CPU/VR mismatch by reading the
552 // IA32_MC4_STATUS MSR (0x411)
553 uint64_t mc4Status = 0;
554 if (peci_RdIAMSR(addr, 0, 0x411, &mc4Status, &cc) !=
555 PECI_CC_SUCCESS)
556 {
557 continue;
558 }
559 // Check MSEC bits 31:24 for
560 // MCA_SVID_VCCIN_VR_ICC_MAX_FAILURE (0x40),
561 // MCA_SVID_VCCIN_VR_VOUT_FAILURE (0x42), or
562 // MCA_SVID_CPU_VR_CAPABILITY_ERROR (0x43)
563 if ((mc4Status & (0x40 << 24)) ||
564 (mc4Status & (0x42 << 24)) ||
565 (mc4Status & (0x43 << 24)))
566 {
567 cpuIERRLog(cpu, "CPU/VR Mismatch");
568 continue;
569 }
570
571 // Next check if it's a Core FIVR fault by looking for a
572 // non-zero value of CORE_FIVR_ERR_LOG (B(1) D30 F2 offset
573 // 80h)
574 uint32_t coreFIVRErrLog = 0;
575 if (peci_RdPCIConfigLocal(
576 addr, 1, 30, 2, 0x80, sizeof(uint32_t),
577 (uint8_t*)&coreFIVRErrLog, &cc) != PECI_CC_SUCCESS)
578 {
579 continue;
580 }
581 if (coreFIVRErrLog)
582 {
583 cpuIERRLog(cpu, "Core FIVR Fault");
584 continue;
585 }
586
587 // Next check if it's an Uncore FIVR fault by looking for a
588 // non-zero value of UNCORE_FIVR_ERR_LOG (B(1) D30 F2 offset
589 // 84h)
590 uint32_t uncoreFIVRErrLog = 0;
591 if (peci_RdPCIConfigLocal(addr, 1, 30, 2, 0x84,
592 sizeof(uint32_t),
593 (uint8_t*)&uncoreFIVRErrLog,
594 &cc) != PECI_CC_SUCCESS)
595 {
596 continue;
597 }
598 if (uncoreFIVRErrLog)
599 {
600 cpuIERRLog(cpu, "Uncore FIVR Fault");
601 continue;
602 }
603
604 // Last if CORE_FIVR_ERR_LOG and UNCORE_FIVR_ERR_LOG are
605 // both zero, but MSEC bits 31:24 have either
606 // MCA_FIVR_CATAS_OVERVOL_FAULT (0x51) or
607 // MCA_FIVR_CATAS_OVERCUR_FAULT (0x52), then log it as an
608 // uncore FIVR fault
609 if (!coreFIVRErrLog && !uncoreFIVRErrLog &&
610 ((mc4Status & (0x51 << 24)) ||
611 (mc4Status & (0x52 << 24))))
612 {
613 cpuIERRLog(cpu, "Uncore FIVR Fault");
614 continue;
615 }
616 cpuIERRLog(cpu);
617 }
618 break;
619 }
620 case icx:
621 {
622 // First check the MCA_ERR_SRC_LOG to see if this is the CPU
623 // that caused the IERR
624 uint32_t mcaErrSrcLog = 0;
625 if (peci_RdPkgConfig(addr, 0, 5, 4, (uint8_t*)&mcaErrSrcLog,
626 &cc) != PECI_CC_SUCCESS)
627 {
628 continue;
629 }
630 // Check MSMI_INTERNAL (20) and IERR_INTERNAL (27)
631 if ((mcaErrSrcLog & (1 << 20)) || (mcaErrSrcLog & (1 << 27)))
632 {
633 // TODO: Light the CPU fault LED?
634 cpuIERRFound = true;
Jason M. Billsd1a19f62019-08-06 11:52:58 -0700635 incrementCPUErrorCount(cpu);
Jason M. Billsa3397932019-08-06 11:07:21 -0700636 // Next check if it's a CPU/VR mismatch by reading the
637 // IA32_MC4_STATUS MSR (0x411)
638 uint64_t mc4Status = 0;
639 if (peci_RdIAMSR(addr, 0, 0x411, &mc4Status, &cc) !=
640 PECI_CC_SUCCESS)
641 {
642 continue;
643 }
644 // TODO: Update MSEC/MSCOD_31_24 check
645 // Check MSEC bits 31:24 for
646 // MCA_SVID_VCCIN_VR_ICC_MAX_FAILURE (0x40),
647 // MCA_SVID_VCCIN_VR_VOUT_FAILURE (0x42), or
648 // MCA_SVID_CPU_VR_CAPABILITY_ERROR (0x43)
649 if ((mc4Status & (0x40 << 24)) ||
650 (mc4Status & (0x42 << 24)) ||
651 (mc4Status & (0x43 << 24)))
652 {
653 cpuIERRLog(cpu, "CPU/VR Mismatch");
654 continue;
655 }
656
657 // Next check if it's a Core FIVR fault by looking for a
658 // non-zero value of CORE_FIVR_ERR_LOG (B(31) D30 F2 offsets
659 // C0h and C4h) (Note: Bus 31 is accessed on PECI as bus 14)
660 uint32_t coreFIVRErrLog0 = 0;
661 uint32_t coreFIVRErrLog1 = 0;
662 if (peci_RdEndPointConfigPciLocal(
663 addr, 0, 14, 30, 2, 0xC0, sizeof(uint32_t),
664 (uint8_t*)&coreFIVRErrLog0, &cc) != PECI_CC_SUCCESS)
665 {
666 continue;
667 }
668 if (peci_RdEndPointConfigPciLocal(
669 addr, 0, 14, 30, 2, 0xC4, sizeof(uint32_t),
670 (uint8_t*)&coreFIVRErrLog1, &cc) != PECI_CC_SUCCESS)
671 {
672 continue;
673 }
674 if (coreFIVRErrLog0 || coreFIVRErrLog1)
675 {
676 cpuIERRLog(cpu, "Core FIVR Fault");
677 continue;
678 }
679
680 // Next check if it's an Uncore FIVR fault by looking for a
681 // non-zero value of UNCORE_FIVR_ERR_LOG (B(31) D30 F2
682 // offset 84h) (Note: Bus 31 is accessed on PECI as bus 14)
683 uint32_t uncoreFIVRErrLog = 0;
684 if (peci_RdEndPointConfigPciLocal(
685 addr, 0, 14, 30, 2, 0x84, sizeof(uint32_t),
686 (uint8_t*)&uncoreFIVRErrLog,
687 &cc) != PECI_CC_SUCCESS)
688 {
689 continue;
690 }
691 if (uncoreFIVRErrLog)
692 {
693 cpuIERRLog(cpu, "Uncore FIVR Fault");
694 continue;
695 }
696
697 // TODO: Update MSEC/MSCOD_31_24 check
698 // Last if CORE_FIVR_ERR_LOG and UNCORE_FIVR_ERR_LOG are
699 // both zero, but MSEC bits 31:24 have either
700 // MCA_FIVR_CATAS_OVERVOL_FAULT (0x51) or
701 // MCA_FIVR_CATAS_OVERCUR_FAULT (0x52), then log it as an
702 // uncore FIVR fault
703 if (!coreFIVRErrLog0 && !coreFIVRErrLog1 &&
704 !uncoreFIVRErrLog &&
705 ((mc4Status & (0x51 << 24)) ||
706 (mc4Status & (0x52 << 24))))
707 {
708 cpuIERRLog(cpu, "Uncore FIVR Fault");
709 continue;
710 }
711 cpuIERRLog(cpu);
712 }
713 break;
714 }
715 }
716 }
717 return cpuIERRFound;
718}
719
Jason M. Billsa15c2522019-08-16 10:01:44 -0700720static void caterrAssertHandler()
721{
Jason M. Billsa15c2522019-08-16 10:01:44 -0700722 caterrAssertTimer.expires_after(std::chrono::milliseconds(caterrTimeoutMs));
723 caterrAssertTimer.async_wait([](const boost::system::error_code ec) {
724 if (ec)
725 {
726 // operation_aborted is expected if timer is canceled
727 // before completion.
728 if (ec != boost::asio::error::operation_aborted)
729 {
730 std::cerr << "caterr timeout async_wait failed: "
731 << ec.message() << "\n";
732 }
Jason M. Billsa15c2522019-08-16 10:01:44 -0700733 return;
734 }
Jason M. Billsa3397932019-08-06 11:07:21 -0700735 std::cerr << "CATERR asserted for " << std::to_string(caterrTimeoutMs)
736 << " ms\n";
Yong Li8c798c72020-04-22 15:29:07 +0800737 beep(beepCPUIERR);
Jason M. Billsa3397932019-08-06 11:07:21 -0700738 if (!checkIERRCPUs())
739 {
740 cpuIERRLog();
741 }
Jason M. Billsa15c2522019-08-16 10:01:44 -0700742 conn->async_method_call(
743 [](boost::system::error_code ec,
744 const std::variant<bool>& property) {
745 if (ec)
746 {
747 return;
748 }
749 const bool* reset = std::get_if<bool>(&property);
750 if (reset == nullptr)
751 {
752 std::cerr << "Unable to read reset on CATERR value\n";
753 return;
754 }
Jason M. Billsb61766b2019-11-26 17:02:44 -0800755 startCrashdumpAndRecovery(*reset, "IERR");
Jason M. Billsa15c2522019-08-16 10:01:44 -0700756 },
757 "xyz.openbmc_project.Settings",
758 "/xyz/openbmc_project/control/processor_error_config",
759 "org.freedesktop.DBus.Properties", "Get",
760 "xyz.openbmc_project.Control.Processor.ErrConfig", "ResetOnCATERR");
761 });
762}
763
Jason M. Bills1490b142019-07-01 15:48:43 -0700764static void caterrHandler()
765{
766 if (!hostOff)
767 {
768 gpiod::line_event gpioLineEvent = caterrLine.event_read();
769
770 bool caterr =
771 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
Yong Li1429ca82020-04-27 16:49:45 +0800772
773 std::vector<Association> associations;
Jason M. Bills1490b142019-07-01 15:48:43 -0700774 if (caterr)
775 {
Jason M. Billsa15c2522019-08-16 10:01:44 -0700776 caterrAssertHandler();
Yong Li1429ca82020-04-27 16:49:45 +0800777 associations.emplace_back(
778 "", "critical",
779 "/xyz/openbmc_project/host_error_monitor/cat_error");
780 associations.emplace_back("", "critical",
781 host_error_monitor::rootPath);
Jason M. Bills1490b142019-07-01 15:48:43 -0700782 }
783 else
784 {
785 caterrAssertTimer.cancel();
Yong Li1429ca82020-04-27 16:49:45 +0800786 associations.emplace_back("", "", "");
Jason M. Bills1490b142019-07-01 15:48:43 -0700787 }
Yong Li1429ca82020-04-27 16:49:45 +0800788 host_error_monitor::associationCATAssert->set_property("Associations",
789 associations);
Jason M. Bills1490b142019-07-01 15:48:43 -0700790 }
791 caterrEvent.async_wait(boost::asio::posix::stream_descriptor::wait_read,
792 [](const boost::system::error_code ec) {
793 if (ec)
794 {
795 std::cerr << "caterr handler error: "
796 << ec.message() << "\n";
797 return;
798 }
799 caterrHandler();
800 });
801}
Jason M. Bills78c5eed2019-08-28 14:00:40 -0700802
Jason M. Billse94f5e12019-09-13 11:11:34 -0700803static void cpu1ThermtripAssertHandler()
804{
Jason M. Bills45e87e02019-09-09 14:45:38 -0700805 if (cpu1FIVRFaultLine.get_value() == 0)
806 {
807 cpuBootFIVRFaultLog(1);
808 }
809 else
810 {
811 cpuThermTripLog(1);
812 }
Jason M. Billse94f5e12019-09-13 11:11:34 -0700813}
814
Jason M. Bills78c5eed2019-08-28 14:00:40 -0700815static void cpu1ThermtripHandler()
816{
Jason M. Bills84951142020-04-17 15:57:11 -0700817 gpiod::line_event gpioLineEvent = cpu1ThermtripLine.event_read();
Jason M. Bills78c5eed2019-08-28 14:00:40 -0700818
Jason M. Bills84951142020-04-17 15:57:11 -0700819 bool cpu1Thermtrip =
820 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
821 if (cpu1Thermtrip)
822 {
823 cpu1ThermtripAssertHandler();
Jason M. Bills78c5eed2019-08-28 14:00:40 -0700824 }
Jason M. Bills84951142020-04-17 15:57:11 -0700825
Jason M. Bills78c5eed2019-08-28 14:00:40 -0700826 cpu1ThermtripEvent.async_wait(
827 boost::asio::posix::stream_descriptor::wait_read,
828 [](const boost::system::error_code ec) {
829 if (ec)
830 {
831 std::cerr << "CPU 1 Thermtrip handler error: " << ec.message()
832 << "\n";
833 return;
834 }
835 cpu1ThermtripHandler();
836 });
837}
838
jayaprakash Mutyala009adbc2019-12-24 22:08:07 +0000839static void cpu1MemtripHandler()
840{
Jason M. Bills5287c022020-05-19 11:16:09 -0700841 gpiod::line_event gpioLineEvent = cpu1MemtripLine.event_read();
jayaprakash Mutyala009adbc2019-12-24 22:08:07 +0000842
Jason M. Bills5287c022020-05-19 11:16:09 -0700843 bool cpu1Memtrip =
844 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
845 if (cpu1Memtrip)
846 {
847 memThermTripLog(1);
jayaprakash Mutyala009adbc2019-12-24 22:08:07 +0000848 }
Jason M. Bills5287c022020-05-19 11:16:09 -0700849
jayaprakash Mutyala009adbc2019-12-24 22:08:07 +0000850 cpu1MemtripEvent.async_wait(
851 boost::asio::posix::stream_descriptor::wait_read,
852 [](const boost::system::error_code ec) {
853 if (ec)
854 {
855 std::cerr << "CPU 1 Memory Thermaltrip handler error: "
856 << ec.message() << "\n";
857 return;
858 }
859 cpu1MemtripHandler();
860 });
861}
862
Jason M. Billse94f5e12019-09-13 11:11:34 -0700863static void cpu2ThermtripAssertHandler()
864{
Jason M. Bills45e87e02019-09-09 14:45:38 -0700865 if (cpu2FIVRFaultLine.get_value() == 0)
866 {
867 cpuBootFIVRFaultLog(2);
868 }
869 else
870 {
871 cpuThermTripLog(2);
872 }
Jason M. Billse94f5e12019-09-13 11:11:34 -0700873}
874
Jason M. Bills78c5eed2019-08-28 14:00:40 -0700875static void cpu2ThermtripHandler()
876{
Jason M. Bills84951142020-04-17 15:57:11 -0700877 gpiod::line_event gpioLineEvent = cpu2ThermtripLine.event_read();
Jason M. Bills78c5eed2019-08-28 14:00:40 -0700878
Jason M. Bills84951142020-04-17 15:57:11 -0700879 bool cpu2Thermtrip =
880 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
881 if (cpu2Thermtrip)
882 {
883 cpu2ThermtripAssertHandler();
Jason M. Bills78c5eed2019-08-28 14:00:40 -0700884 }
Jason M. Bills84951142020-04-17 15:57:11 -0700885
Jason M. Bills78c5eed2019-08-28 14:00:40 -0700886 cpu2ThermtripEvent.async_wait(
887 boost::asio::posix::stream_descriptor::wait_read,
888 [](const boost::system::error_code ec) {
889 if (ec)
890 {
891 std::cerr << "CPU 2 Thermtrip handler error: " << ec.message()
892 << "\n";
893 return;
894 }
895 cpu2ThermtripHandler();
896 });
897}
898
jayaprakash Mutyala009adbc2019-12-24 22:08:07 +0000899static void cpu2MemtripHandler()
900{
Jason M. Bills5287c022020-05-19 11:16:09 -0700901 gpiod::line_event gpioLineEvent = cpu2MemtripLine.event_read();
jayaprakash Mutyala009adbc2019-12-24 22:08:07 +0000902
Jason M. Bills5287c022020-05-19 11:16:09 -0700903 bool cpu2Memtrip =
904 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
905 if (cpu2Memtrip)
906 {
907 memThermTripLog(2);
jayaprakash Mutyala009adbc2019-12-24 22:08:07 +0000908 }
Jason M. Bills5287c022020-05-19 11:16:09 -0700909
jayaprakash Mutyala009adbc2019-12-24 22:08:07 +0000910 cpu2MemtripEvent.async_wait(
911 boost::asio::posix::stream_descriptor::wait_read,
912 [](const boost::system::error_code ec) {
913 if (ec)
914 {
915 std::cerr << "CPU 2 Memory Thermaltrip handler error: "
916 << ec.message() << "\n";
917 return;
918 }
919 cpu2MemtripHandler();
920 });
921}
922
Jason M. Billse94f5e12019-09-13 11:11:34 -0700923static void cpu1VRHotAssertHandler()
924{
925 cpuVRHotLog("CPU 1");
926}
927
Jason M. Bills250fa632019-08-28 15:58:25 -0700928static void cpu1VRHotHandler()
929{
Jason M. Bills84951142020-04-17 15:57:11 -0700930 gpiod::line_event gpioLineEvent = cpu1VRHotLine.event_read();
Jason M. Bills250fa632019-08-28 15:58:25 -0700931
Jason M. Bills84951142020-04-17 15:57:11 -0700932 bool cpu1VRHot =
933 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
934 if (cpu1VRHot)
935 {
936 cpu1VRHotAssertHandler();
Jason M. Bills250fa632019-08-28 15:58:25 -0700937 }
Jason M. Bills84951142020-04-17 15:57:11 -0700938
Jason M. Bills250fa632019-08-28 15:58:25 -0700939 cpu1VRHotEvent.async_wait(boost::asio::posix::stream_descriptor::wait_read,
940 [](const boost::system::error_code ec) {
941 if (ec)
942 {
943 std::cerr << "CPU 1 VRHot handler error: "
944 << ec.message() << "\n";
945 return;
946 }
947 cpu1VRHotHandler();
948 });
949}
950
Jason M. Billse94f5e12019-09-13 11:11:34 -0700951static void cpu1MemABCDVRHotAssertHandler()
952{
953 cpuVRHotLog("CPU 1 Memory ABCD");
954}
955
Jason M. Bills9647ba72019-08-29 14:19:19 -0700956static void cpu1MemABCDVRHotHandler()
957{
Jason M. Bills84951142020-04-17 15:57:11 -0700958 gpiod::line_event gpioLineEvent = cpu1MemABCDVRHotLine.event_read();
Jason M. Bills9647ba72019-08-29 14:19:19 -0700959
Jason M. Bills84951142020-04-17 15:57:11 -0700960 bool cpu1MemABCDVRHot =
961 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
962 if (cpu1MemABCDVRHot)
963 {
964 cpu1MemABCDVRHotAssertHandler();
Jason M. Bills9647ba72019-08-29 14:19:19 -0700965 }
Jason M. Bills84951142020-04-17 15:57:11 -0700966
Jason M. Bills9647ba72019-08-29 14:19:19 -0700967 cpu1MemABCDVRHotEvent.async_wait(
968 boost::asio::posix::stream_descriptor::wait_read,
969 [](const boost::system::error_code ec) {
970 if (ec)
971 {
972 std::cerr << "CPU 1 Memory ABCD VRHot handler error: "
973 << ec.message() << "\n";
974 return;
975 }
976 cpu1MemABCDVRHotHandler();
977 });
978}
979
Jason M. Billse94f5e12019-09-13 11:11:34 -0700980static void cpu1MemEFGHVRHotAssertHandler()
981{
982 cpuVRHotLog("CPU 1 Memory EFGH");
983}
984
Jason M. Bills9647ba72019-08-29 14:19:19 -0700985static void cpu1MemEFGHVRHotHandler()
986{
Jason M. Bills84951142020-04-17 15:57:11 -0700987 gpiod::line_event gpioLineEvent = cpu1MemEFGHVRHotLine.event_read();
Jason M. Bills9647ba72019-08-29 14:19:19 -0700988
Jason M. Bills84951142020-04-17 15:57:11 -0700989 bool cpu1MemEFGHVRHot =
990 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
991 if (cpu1MemEFGHVRHot)
992 {
993 cpu1MemEFGHVRHotAssertHandler();
Jason M. Bills9647ba72019-08-29 14:19:19 -0700994 }
Jason M. Bills84951142020-04-17 15:57:11 -0700995
Jason M. Bills9647ba72019-08-29 14:19:19 -0700996 cpu1MemEFGHVRHotEvent.async_wait(
997 boost::asio::posix::stream_descriptor::wait_read,
998 [](const boost::system::error_code ec) {
999 if (ec)
1000 {
1001 std::cerr << "CPU 1 Memory EFGH VRHot handler error: "
1002 << ec.message() << "\n";
1003 return;
1004 }
1005 cpu1MemEFGHVRHotHandler();
1006 });
1007}
1008
Jason M. Billse94f5e12019-09-13 11:11:34 -07001009static void cpu2VRHotAssertHandler()
1010{
1011 cpuVRHotLog("CPU 2");
1012}
1013
Jason M. Bills250fa632019-08-28 15:58:25 -07001014static void cpu2VRHotHandler()
1015{
Jason M. Bills84951142020-04-17 15:57:11 -07001016 gpiod::line_event gpioLineEvent = cpu2VRHotLine.event_read();
Jason M. Bills250fa632019-08-28 15:58:25 -07001017
Jason M. Bills84951142020-04-17 15:57:11 -07001018 bool cpu2VRHot =
1019 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
1020 if (cpu2VRHot)
1021 {
1022 cpu2VRHotAssertHandler();
Jason M. Bills250fa632019-08-28 15:58:25 -07001023 }
Jason M. Bills84951142020-04-17 15:57:11 -07001024
Jason M. Bills250fa632019-08-28 15:58:25 -07001025 cpu2VRHotEvent.async_wait(boost::asio::posix::stream_descriptor::wait_read,
1026 [](const boost::system::error_code ec) {
1027 if (ec)
1028 {
1029 std::cerr << "CPU 2 VRHot handler error: "
1030 << ec.message() << "\n";
1031 return;
1032 }
1033 cpu2VRHotHandler();
1034 });
1035}
1036
Jason M. Billse94f5e12019-09-13 11:11:34 -07001037static void cpu2MemABCDVRHotAssertHandler()
1038{
1039 cpuVRHotLog("CPU 2 Memory ABCD");
1040}
1041
Jason M. Bills9647ba72019-08-29 14:19:19 -07001042static void cpu2MemABCDVRHotHandler()
1043{
Jason M. Bills84951142020-04-17 15:57:11 -07001044 gpiod::line_event gpioLineEvent = cpu2MemABCDVRHotLine.event_read();
Jason M. Bills9647ba72019-08-29 14:19:19 -07001045
Jason M. Bills84951142020-04-17 15:57:11 -07001046 bool cpu2MemABCDVRHot =
1047 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
1048 if (cpu2MemABCDVRHot)
1049 {
1050 cpu2MemABCDVRHotAssertHandler();
Jason M. Bills9647ba72019-08-29 14:19:19 -07001051 }
Jason M. Bills84951142020-04-17 15:57:11 -07001052
Jason M. Bills9647ba72019-08-29 14:19:19 -07001053 cpu2MemABCDVRHotEvent.async_wait(
1054 boost::asio::posix::stream_descriptor::wait_read,
1055 [](const boost::system::error_code ec) {
1056 if (ec)
1057 {
1058 std::cerr << "CPU 2 Memory ABCD VRHot handler error: "
1059 << ec.message() << "\n";
1060 return;
1061 }
1062 cpu2MemABCDVRHotHandler();
1063 });
1064}
1065
Jason M. Billse94f5e12019-09-13 11:11:34 -07001066static void cpu2MemEFGHVRHotAssertHandler()
1067{
1068 cpuVRHotLog("CPU 2 Memory EFGH");
1069}
1070
Jason M. Bills9647ba72019-08-29 14:19:19 -07001071static void cpu2MemEFGHVRHotHandler()
1072{
Jason M. Bills84951142020-04-17 15:57:11 -07001073 gpiod::line_event gpioLineEvent = cpu2MemEFGHVRHotLine.event_read();
Jason M. Bills9647ba72019-08-29 14:19:19 -07001074
Jason M. Bills84951142020-04-17 15:57:11 -07001075 bool cpu2MemEFGHVRHot =
1076 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
1077 if (cpu2MemEFGHVRHot)
1078 {
1079 cpu2MemEFGHVRHotAssertHandler();
Jason M. Bills9647ba72019-08-29 14:19:19 -07001080 }
Jason M. Bills84951142020-04-17 15:57:11 -07001081
Jason M. Bills9647ba72019-08-29 14:19:19 -07001082 cpu2MemEFGHVRHotEvent.async_wait(
1083 boost::asio::posix::stream_descriptor::wait_read,
1084 [](const boost::system::error_code ec) {
1085 if (ec)
1086 {
1087 std::cerr << "CPU 2 Memory EFGH VRHot handler error: "
1088 << ec.message() << "\n";
1089 return;
1090 }
1091 cpu2MemEFGHVRHotHandler();
1092 });
1093}
1094
Chen Yugange6c0f1c2019-08-02 20:36:42 +08001095static void pchThermtripHandler()
1096{
Yong Li1429ca82020-04-27 16:49:45 +08001097 std::vector<Association> associations;
1098
Jason M. Bills84951142020-04-17 15:57:11 -07001099 gpiod::line_event gpioLineEvent = pchThermtripLine.event_read();
Chen Yugange6c0f1c2019-08-02 20:36:42 +08001100
Jason M. Bills84951142020-04-17 15:57:11 -07001101 bool pchThermtrip =
1102 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
1103 if (pchThermtrip)
1104 {
1105 ssbThermTripLog();
Yong Li1429ca82020-04-27 16:49:45 +08001106 associations.emplace_back(
1107 "", "critical",
1108 "/xyz/openbmc_project/host_error_monitor/ssb_thermal_trip");
1109 associations.emplace_back("", "critical", host_error_monitor::rootPath);
Chen Yugange6c0f1c2019-08-02 20:36:42 +08001110 }
Yong Li1429ca82020-04-27 16:49:45 +08001111 else
1112 {
1113 associations.emplace_back("", "", "");
1114 }
1115 host_error_monitor::associationSSBThermTrip->set_property("Associations",
1116 associations);
Jason M. Bills84951142020-04-17 15:57:11 -07001117
Chen Yugange6c0f1c2019-08-02 20:36:42 +08001118 pchThermtripEvent.async_wait(
1119 boost::asio::posix::stream_descriptor::wait_read,
1120 [](const boost::system::error_code ec) {
1121 if (ec)
1122 {
1123 std::cerr << "PCH Thermal trip handler error: " << ec.message()
1124 << "\n";
1125 return;
1126 }
1127 pchThermtripHandler();
1128 });
1129}
1130
Jason M. Billscbf78532019-08-16 15:32:11 -07001131static std::bitset<MAX_CPUS> checkERRPinCPUs(const int errPin)
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001132{
Jason M. Billscbf78532019-08-16 15:32:11 -07001133 int errPinSts = (1 << errPin);
1134 std::bitset<MAX_CPUS> errPinCPUs = 0;
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001135 for (int cpu = 0, addr = MIN_CLIENT_ADDR; addr <= MAX_CLIENT_ADDR;
1136 cpu++, addr++)
1137 {
1138 if (peci_Ping(addr) == PECI_CC_SUCCESS)
1139 {
1140 uint8_t cc = 0;
1141 CPUModel model{};
1142 uint8_t stepping = 0;
1143 if (peci_GetCPUID(addr, &model, &stepping, &cc) != PECI_CC_SUCCESS)
1144 {
1145 std::cerr << "Cannot get CPUID!\n";
1146 continue;
1147 }
1148
1149 switch (model)
1150 {
1151 case skx:
1152 {
1153 // Check the ERRPINSTS to see if this is the CPU that caused
Jason M. Billscbf78532019-08-16 15:32:11 -07001154 // the ERRx (B(0) D8 F0 offset 210h)
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001155 uint32_t errpinsts = 0;
1156 if (peci_RdPCIConfigLocal(
1157 addr, 0, 8, 0, 0x210, sizeof(uint32_t),
1158 (uint8_t*)&errpinsts, &cc) == PECI_CC_SUCCESS)
1159 {
Jason M. Billscbf78532019-08-16 15:32:11 -07001160 errPinCPUs[cpu] = (errpinsts & errPinSts) != 0;
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001161 }
1162 break;
1163 }
1164 case icx:
1165 {
1166 // Check the ERRPINSTS to see if this is the CPU that caused
Jason M. Billscbf78532019-08-16 15:32:11 -07001167 // the ERRx (B(30) D0 F3 offset 274h) (Note: Bus 30 is
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001168 // accessed on PECI as bus 13)
1169 uint32_t errpinsts = 0;
1170 if (peci_RdEndPointConfigPciLocal(
1171 addr, 0, 13, 0, 3, 0x274, sizeof(uint32_t),
1172 (uint8_t*)&errpinsts, &cc) == PECI_CC_SUCCESS)
1173 {
Jason M. Billscbf78532019-08-16 15:32:11 -07001174 errPinCPUs[cpu] = (errpinsts & errPinSts) != 0;
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001175 }
1176 break;
1177 }
1178 }
1179 }
1180 }
Jason M. Billscbf78532019-08-16 15:32:11 -07001181 return errPinCPUs;
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001182}
1183
Jason M. Billscbf78532019-08-16 15:32:11 -07001184static void errXAssertHandler(const int errPin,
1185 boost::asio::steady_timer& errXAssertTimer)
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001186{
Jason M. Billscbf78532019-08-16 15:32:11 -07001187 // ERRx status is not guaranteed through the timeout, so save which
1188 // CPUs have it asserted
1189 std::bitset<MAX_CPUS> errPinCPUs = checkERRPinCPUs(errPin);
1190 errXAssertTimer.expires_after(std::chrono::milliseconds(errTimeoutMs));
1191 errXAssertTimer.async_wait([errPin, errPinCPUs](
1192 const boost::system::error_code ec) {
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001193 if (ec)
1194 {
1195 // operation_aborted is expected if timer is canceled before
1196 // completion.
1197 if (ec != boost::asio::error::operation_aborted)
1198 {
1199 std::cerr << "err2 timeout async_wait failed: " << ec.message()
1200 << "\n";
1201 }
1202 return;
1203 }
Jason M. Billscbf78532019-08-16 15:32:11 -07001204 std::cerr << "ERR" << std::to_string(errPin) << " asserted for "
1205 << std::to_string(errTimeoutMs) << " ms\n";
1206 if (errPinCPUs.count())
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001207 {
Jason M. Billscbf78532019-08-16 15:32:11 -07001208 for (int i = 0; i < errPinCPUs.size(); i++)
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001209 {
Jason M. Billscbf78532019-08-16 15:32:11 -07001210 if (errPinCPUs[i])
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001211 {
Jason M. Billscbf78532019-08-16 15:32:11 -07001212 cpuERRXLog(errPin, i);
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001213 }
1214 }
1215 }
1216 else
1217 {
Jason M. Billscbf78532019-08-16 15:32:11 -07001218 cpuERRXLog(errPin);
1219 }
1220 });
1221}
1222
Jason M. Bills8c584392019-08-19 11:05:51 -07001223static void err0AssertHandler()
1224{
1225 // Handle the standard ERR0 detection and logging
1226 const static constexpr int err0 = 0;
1227 errXAssertHandler(err0, err0AssertTimer);
1228}
1229
1230static void err0Handler()
1231{
1232 if (!hostOff)
1233 {
1234 gpiod::line_event gpioLineEvent = err0Line.event_read();
1235
1236 bool err0 = gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
1237 if (err0)
1238 {
1239 err0AssertHandler();
1240 }
1241 else
1242 {
1243 err0AssertTimer.cancel();
1244 }
1245 }
1246 err0Event.async_wait(boost::asio::posix::stream_descriptor::wait_read,
1247 [](const boost::system::error_code ec) {
1248 if (ec)
1249 {
1250 std::cerr
1251 << "err0 handler error: " << ec.message()
1252 << "\n";
1253 return;
1254 }
1255 err0Handler();
1256 });
1257}
1258
Jason M. Bills75af3962019-08-19 11:07:17 -07001259static void err1AssertHandler()
1260{
1261 // Handle the standard ERR1 detection and logging
1262 const static constexpr int err1 = 1;
1263 errXAssertHandler(err1, err1AssertTimer);
1264}
1265
1266static void err1Handler()
1267{
1268 if (!hostOff)
1269 {
1270 gpiod::line_event gpioLineEvent = err1Line.event_read();
1271
1272 bool err1 = gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
1273 if (err1)
1274 {
1275 err1AssertHandler();
1276 }
1277 else
1278 {
1279 err1AssertTimer.cancel();
1280 }
1281 }
1282 err1Event.async_wait(boost::asio::posix::stream_descriptor::wait_read,
1283 [](const boost::system::error_code ec) {
1284 if (ec)
1285 {
1286 std::cerr
1287 << "err1 handler error: " << ec.message()
1288 << "\n";
1289 return;
1290 }
1291 err1Handler();
1292 });
1293}
1294
Jason M. Billscbf78532019-08-16 15:32:11 -07001295static void err2AssertHandler()
1296{
1297 // Handle the standard ERR2 detection and logging
1298 const static constexpr int err2 = 2;
1299 errXAssertHandler(err2, err2AssertTimer);
1300 // Also handle reset for ERR2
1301 err2AssertTimer.async_wait([](const boost::system::error_code ec) {
1302 if (ec)
1303 {
1304 // operation_aborted is expected if timer is canceled before
1305 // completion.
1306 if (ec != boost::asio::error::operation_aborted)
1307 {
1308 std::cerr << "err2 timeout async_wait failed: " << ec.message()
1309 << "\n";
1310 }
1311 return;
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001312 }
1313 conn->async_method_call(
1314 [](boost::system::error_code ec,
1315 const std::variant<bool>& property) {
1316 if (ec)
1317 {
1318 return;
1319 }
1320 const bool* reset = std::get_if<bool>(&property);
1321 if (reset == nullptr)
1322 {
1323 std::cerr << "Unable to read reset on ERR2 value\n";
1324 return;
1325 }
Jason M. Billsb61766b2019-11-26 17:02:44 -08001326 startCrashdumpAndRecovery(*reset, "ERR2 Timeout");
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001327 },
1328 "xyz.openbmc_project.Settings",
1329 "/xyz/openbmc_project/control/processor_error_config",
1330 "org.freedesktop.DBus.Properties", "Get",
1331 "xyz.openbmc_project.Control.Processor.ErrConfig", "ResetOnERR2");
Yong Li061eb032020-02-26 15:06:18 +08001332
1333 beep(beepCPUErr2);
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001334 });
1335}
1336
1337static void err2Handler()
1338{
1339 if (!hostOff)
1340 {
1341 gpiod::line_event gpioLineEvent = err2Line.event_read();
1342
1343 bool err2 = gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
1344 if (err2)
1345 {
1346 err2AssertHandler();
1347 }
1348 else
1349 {
1350 err2AssertTimer.cancel();
1351 }
1352 }
1353 err2Event.async_wait(boost::asio::posix::stream_descriptor::wait_read,
1354 [](const boost::system::error_code ec) {
1355 if (ec)
1356 {
1357 std::cerr
1358 << "err2 handler error: " << ec.message()
1359 << "\n";
1360 return;
1361 }
1362 err2Handler();
1363 });
1364}
1365
Jason M. Bills89922f82019-08-06 11:10:02 -07001366static void smiAssertHandler()
1367{
1368 smiAssertTimer.expires_after(std::chrono::milliseconds(smiTimeoutMs));
1369 smiAssertTimer.async_wait([](const boost::system::error_code ec) {
1370 if (ec)
1371 {
1372 // operation_aborted is expected if timer is canceled before
1373 // completion.
1374 if (ec != boost::asio::error::operation_aborted)
1375 {
1376 std::cerr << "smi timeout async_wait failed: " << ec.message()
1377 << "\n";
1378 }
1379 return;
1380 }
1381 std::cerr << "SMI asserted for " << std::to_string(smiTimeoutMs)
1382 << " ms\n";
1383 smiTimeoutLog();
1384 conn->async_method_call(
1385 [](boost::system::error_code ec,
1386 const std::variant<bool>& property) {
1387 if (ec)
1388 {
1389 return;
1390 }
1391 const bool* reset = std::get_if<bool>(&property);
1392 if (reset == nullptr)
1393 {
1394 std::cerr << "Unable to read reset on SMI value\n";
1395 return;
1396 }
Jason M. Bills94785442020-01-07 15:22:09 -08001397#ifdef HOST_ERROR_CRASHDUMP_ON_SMI_TIMEOUT
Jason M. Billsb61766b2019-11-26 17:02:44 -08001398 startCrashdumpAndRecovery(*reset, "SMI Timeout");
Jason M. Bills94785442020-01-07 15:22:09 -08001399#else
1400 if (*reset)
1401 {
1402 std::cout << "Recovering the system\n";
Jason M. Bills9a9bf982020-08-10 11:58:18 -07001403 startWarmReset();
Jason M. Bills94785442020-01-07 15:22:09 -08001404 }
1405#endif
Jason M. Bills89922f82019-08-06 11:10:02 -07001406 },
1407 "xyz.openbmc_project.Settings",
1408 "/xyz/openbmc_project/control/bmc_reset_disables",
1409 "org.freedesktop.DBus.Properties", "Get",
1410 "xyz.openbmc_project.Control.ResetDisables", "ResetOnSMI");
1411 });
1412}
1413
1414static void smiHandler()
1415{
1416 if (!hostOff)
1417 {
1418 gpiod::line_event gpioLineEvent = smiLine.event_read();
1419
1420 bool smi = gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
1421 if (smi)
1422 {
1423 smiAssertHandler();
1424 }
1425 else
1426 {
1427 smiAssertTimer.cancel();
1428 }
1429 }
1430 smiEvent.async_wait(boost::asio::posix::stream_descriptor::wait_read,
1431 [](const boost::system::error_code ec) {
1432 if (ec)
1433 {
1434 std::cerr
1435 << "smi handler error: " << ec.message()
1436 << "\n";
1437 return;
1438 }
1439 smiHandler();
1440 });
1441}
1442
Jason M. Billsa15c2522019-08-16 10:01:44 -07001443static void initializeErrorState()
1444{
jayaprakash Mutyala53099c42020-03-15 00:16:26 +00001445 // Handle CPU1_MISMATCH if it's asserted now
1446 if (cpu1MismatchLine.get_value() == 1)
1447 {
1448 cpuMismatchLog(1);
1449 }
1450
1451 // Handle CPU2_MISMATCH if it's asserted now
1452 if (cpu2MismatchLine.get_value() == 1)
1453 {
1454 cpuMismatchLog(2);
1455 }
1456
Jason M. Billsa15c2522019-08-16 10:01:44 -07001457 // Handle CPU_CATERR if it's asserted now
1458 if (caterrLine.get_value() == 0)
1459 {
1460 caterrAssertHandler();
Yong Li1429ca82020-04-27 16:49:45 +08001461 std::vector<Association> associations;
1462 associations.emplace_back(
1463 "", "critical", "/xyz/openbmc_project/host_error_monitor/cat_err");
1464 associations.emplace_back("", "critical", host_error_monitor::rootPath);
1465 host_error_monitor::associationCATAssert->set_property("Associations",
1466 associations);
Jason M. Billsa15c2522019-08-16 10:01:44 -07001467 }
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001468
Jason M. Bills8c584392019-08-19 11:05:51 -07001469 // Handle CPU_ERR0 if it's asserted now
1470 if (err0Line.get_value() == 0)
1471 {
1472 err0AssertHandler();
1473 }
1474
Jason M. Bills75af3962019-08-19 11:07:17 -07001475 // Handle CPU_ERR1 if it's asserted now
1476 if (err1Line.get_value() == 0)
1477 {
1478 err1AssertHandler();
1479 }
1480
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001481 // Handle CPU_ERR2 if it's asserted now
1482 if (err2Line.get_value() == 0)
1483 {
1484 err2AssertHandler();
1485 }
Jason M. Bills89922f82019-08-06 11:10:02 -07001486
1487 // Handle SMI if it's asserted now
1488 if (smiLine.get_value() == 0)
1489 {
1490 smiAssertHandler();
1491 }
Jason M. Bills08866542019-08-16 12:04:19 -07001492
Jason M. Billse94f5e12019-09-13 11:11:34 -07001493 // Handle CPU1_THERMTRIP if it's asserted now
1494 if (cpu1ThermtripLine.get_value() == 0)
1495 {
1496 cpu1ThermtripAssertHandler();
1497 }
1498
1499 // Handle CPU2_THERMTRIP if it's asserted now
1500 if (cpu2ThermtripLine.get_value() == 0)
1501 {
1502 cpu2ThermtripAssertHandler();
1503 }
1504
jayaprakash Mutyala009adbc2019-12-24 22:08:07 +00001505 // Handle CPU1_MEM_THERM_EVENT (CPU1 DIMM Thermal trip) if it's asserted now
1506 if (cpu1MemtripLine.get_value() == 0)
1507 {
1508 memThermTripLog(1);
1509 }
1510
1511 // Handle CPU2_MEM_THERM_EVENT (CPU2 DIMM Thermal trip) if it's asserted now
1512 if (cpu2MemtripLine.get_value() == 0)
1513 {
1514 memThermTripLog(2);
1515 }
1516
Jason M. Billse94f5e12019-09-13 11:11:34 -07001517 // Handle CPU1_VRHOT if it's asserted now
1518 if (cpu1VRHotLine.get_value() == 0)
1519 {
1520 cpu1VRHotAssertHandler();
1521 }
1522
1523 // Handle CPU1_MEM_ABCD_VRHOT if it's asserted now
1524 if (cpu1MemABCDVRHotLine.get_value() == 0)
1525 {
1526 cpu1MemABCDVRHotAssertHandler();
1527 }
1528
1529 // Handle CPU1_MEM_EFGH_VRHOT if it's asserted now
1530 if (cpu1MemEFGHVRHotLine.get_value() == 0)
1531 {
1532 cpu1MemEFGHVRHotAssertHandler();
1533 }
1534
1535 // Handle CPU2_VRHOT if it's asserted now
1536 if (cpu2VRHotLine.get_value() == 0)
1537 {
1538 cpu2VRHotAssertHandler();
1539 }
1540
1541 // Handle CPU2_MEM_ABCD_VRHOT if it's asserted now
1542 if (cpu2MemABCDVRHotLine.get_value() == 0)
1543 {
1544 cpu2MemABCDVRHotAssertHandler();
1545 }
1546
1547 // Handle CPU2_MEM_EFGH_VRHOT if it's asserted now
1548 if (cpu2MemEFGHVRHotLine.get_value() == 0)
1549 {
1550 cpu2MemEFGHVRHotAssertHandler();
1551 }
1552
Jason M. Bills08866542019-08-16 12:04:19 -07001553 // Handle PCH_BMC_THERMTRIP if it's asserted now
1554 if (pchThermtripLine.get_value() == 0)
1555 {
1556 ssbThermTripLog();
Yong Li1429ca82020-04-27 16:49:45 +08001557 std::vector<Association> associations;
1558 associations.emplace_back(
1559 "", "critical",
1560 "/xyz/openbmc_project/host_error_monitor/ssb_thermal_trip");
1561 associations.emplace_back("", "critical", host_error_monitor::rootPath);
1562 host_error_monitor::associationSSBThermTrip->set_property(
1563 "Associations", associations);
Jason M. Bills08866542019-08-16 12:04:19 -07001564 }
Jason M. Billsa15c2522019-08-16 10:01:44 -07001565}
Jason M. Bills1490b142019-07-01 15:48:43 -07001566} // namespace host_error_monitor
1567
1568int main(int argc, char* argv[])
1569{
1570 // setup connection to dbus
1571 host_error_monitor::conn =
1572 std::make_shared<sdbusplus::asio::connection>(host_error_monitor::io);
1573
Jason M. Billsc4b91f22019-11-26 17:04:50 -08001574 // Host Error Monitor Service
Jason M. Bills1490b142019-07-01 15:48:43 -07001575 host_error_monitor::conn->request_name(
1576 "xyz.openbmc_project.HostErrorMonitor");
1577 sdbusplus::asio::object_server server =
1578 sdbusplus::asio::object_server(host_error_monitor::conn);
1579
Yong Li1429ca82020-04-27 16:49:45 +08001580 // Associations interface for led status
1581 std::vector<host_error_monitor::Association> associations;
1582 associations.emplace_back("", "", "");
1583 host_error_monitor::associationSSBThermTrip = server.add_interface(
1584 "/xyz/openbmc_project/host_error_monitor/ssb_thermal_trip",
1585 "xyz.openbmc_project.Association.Definitions");
1586 host_error_monitor::associationSSBThermTrip->register_property(
1587 "Associations", associations);
1588 host_error_monitor::associationSSBThermTrip->initialize();
1589
1590 host_error_monitor::associationCATAssert = server.add_interface(
1591 "/xyz/openbmc_project/host_error_monitor/cat_assert",
1592 "xyz.openbmc_project.Association.Definitions");
1593 host_error_monitor::associationCATAssert->register_property("Associations",
1594 associations);
1595 host_error_monitor::associationCATAssert->initialize();
1596
Jason M. Billsc4b91f22019-11-26 17:04:50 -08001597 // Restart Cause Interface
1598 host_error_monitor::hostErrorTimeoutIface =
1599 server.add_interface("/xyz/openbmc_project/host_error_monitor",
1600 "xyz.openbmc_project.HostErrorMonitor.Timeout");
1601
1602 host_error_monitor::hostErrorTimeoutIface->register_property(
1603 "IERRTimeoutMs", host_error_monitor::caterrTimeoutMs,
1604 [](const std::size_t& requested, std::size_t& resp) {
1605 if (requested > host_error_monitor::caterrTimeoutMsMax)
1606 {
1607 std::cerr << "IERRTimeoutMs update to " << requested
1608 << "ms rejected. Cannot be greater than "
1609 << host_error_monitor::caterrTimeoutMsMax << "ms.\n";
1610 return 0;
1611 }
1612 std::cerr << "IERRTimeoutMs updated to " << requested << "ms\n";
1613 host_error_monitor::caterrTimeoutMs = requested;
1614 resp = requested;
1615 return 1;
1616 },
1617 [](std::size_t& resp) { return host_error_monitor::caterrTimeoutMs; });
1618 host_error_monitor::hostErrorTimeoutIface->initialize();
1619
Jason M. Bills1490b142019-07-01 15:48:43 -07001620 // Start tracking host state
1621 std::shared_ptr<sdbusplus::bus::match::match> hostStateMonitor =
1622 host_error_monitor::startHostStateMonitor();
1623
jayaprakash Mutyala53099c42020-03-15 00:16:26 +00001624 // Request CPU1_MISMATCH GPIO events
1625 if (!host_error_monitor::requestGPIOInput(
1626 "CPU1_MISMATCH", host_error_monitor::cpu1MismatchLine))
1627 {
1628 return -1;
1629 }
1630
1631 // Request CPU2_MISMATCH GPIO events
1632 if (!host_error_monitor::requestGPIOInput(
1633 "CPU2_MISMATCH", host_error_monitor::cpu2MismatchLine))
1634 {
1635 return -1;
1636 }
1637
Jason M. Bills1490b142019-07-01 15:48:43 -07001638 // Initialize the host state
1639 host_error_monitor::initializeHostState();
1640
1641 // Request CPU_CATERR GPIO events
1642 if (!host_error_monitor::requestGPIOEvents(
1643 "CPU_CATERR", host_error_monitor::caterrHandler,
1644 host_error_monitor::caterrLine, host_error_monitor::caterrEvent))
1645 {
1646 return -1;
1647 }
1648
Jason M. Bills8c584392019-08-19 11:05:51 -07001649 // Request CPU_ERR0 GPIO events
1650 if (!host_error_monitor::requestGPIOEvents(
1651 "CPU_ERR0", host_error_monitor::err0Handler,
1652 host_error_monitor::err0Line, host_error_monitor::err0Event))
1653 {
1654 return -1;
1655 }
1656
Jason M. Bills75af3962019-08-19 11:07:17 -07001657 // Request CPU_ERR1 GPIO events
1658 if (!host_error_monitor::requestGPIOEvents(
1659 "CPU_ERR1", host_error_monitor::err1Handler,
1660 host_error_monitor::err1Line, host_error_monitor::err1Event))
1661 {
1662 return -1;
1663 }
1664
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001665 // Request CPU_ERR2 GPIO events
1666 if (!host_error_monitor::requestGPIOEvents(
1667 "CPU_ERR2", host_error_monitor::err2Handler,
1668 host_error_monitor::err2Line, host_error_monitor::err2Event))
1669 {
1670 return -1;
1671 }
1672
Jason M. Bills89922f82019-08-06 11:10:02 -07001673 // Request SMI GPIO events
1674 if (!host_error_monitor::requestGPIOEvents(
1675 "SMI", host_error_monitor::smiHandler, host_error_monitor::smiLine,
1676 host_error_monitor::smiEvent))
1677 {
1678 return -1;
1679 }
1680
Jason M. Bills45e87e02019-09-09 14:45:38 -07001681 // Request CPU1_FIVR_FAULT GPIO input
1682 if (!host_error_monitor::requestGPIOInput(
1683 "CPU1_FIVR_FAULT", host_error_monitor::cpu1FIVRFaultLine))
1684 {
1685 return -1;
1686 }
1687
Jason M. Bills78c5eed2019-08-28 14:00:40 -07001688 // Request CPU1_THERMTRIP GPIO events
1689 if (!host_error_monitor::requestGPIOEvents(
1690 "CPU1_THERMTRIP", host_error_monitor::cpu1ThermtripHandler,
1691 host_error_monitor::cpu1ThermtripLine,
1692 host_error_monitor::cpu1ThermtripEvent))
1693 {
1694 return -1;
1695 }
1696
Jason M. Bills45e87e02019-09-09 14:45:38 -07001697 // Request CPU2_FIVR_FAULT GPIO input
1698 if (!host_error_monitor::requestGPIOInput(
1699 "CPU2_FIVR_FAULT", host_error_monitor::cpu2FIVRFaultLine))
1700 {
1701 return -1;
1702 }
1703
Jason M. Bills78c5eed2019-08-28 14:00:40 -07001704 // Request CPU2_THERMTRIP GPIO events
1705 if (!host_error_monitor::requestGPIOEvents(
1706 "CPU2_THERMTRIP", host_error_monitor::cpu2ThermtripHandler,
1707 host_error_monitor::cpu2ThermtripLine,
1708 host_error_monitor::cpu2ThermtripEvent))
1709 {
1710 return -1;
1711 }
1712
Jason M. Bills250fa632019-08-28 15:58:25 -07001713 // Request CPU1_VRHOT GPIO events
1714 if (!host_error_monitor::requestGPIOEvents(
1715 "CPU1_VRHOT", host_error_monitor::cpu1VRHotHandler,
1716 host_error_monitor::cpu1VRHotLine,
1717 host_error_monitor::cpu1VRHotEvent))
1718 {
1719 return -1;
1720 }
1721
Jason M. Bills9647ba72019-08-29 14:19:19 -07001722 // Request CPU1_MEM_ABCD_VRHOT GPIO events
1723 if (!host_error_monitor::requestGPIOEvents(
1724 "CPU1_MEM_ABCD_VRHOT", host_error_monitor::cpu1MemABCDVRHotHandler,
1725 host_error_monitor::cpu1MemABCDVRHotLine,
1726 host_error_monitor::cpu1MemABCDVRHotEvent))
1727 {
1728 return -1;
1729 }
1730
1731 // Request CPU1_MEM_EFGH_VRHOT GPIO events
1732 if (!host_error_monitor::requestGPIOEvents(
1733 "CPU1_MEM_EFGH_VRHOT", host_error_monitor::cpu1MemEFGHVRHotHandler,
1734 host_error_monitor::cpu1MemEFGHVRHotLine,
1735 host_error_monitor::cpu1MemEFGHVRHotEvent))
1736 {
1737 return -1;
1738 }
1739
Jason M. Bills250fa632019-08-28 15:58:25 -07001740 // Request CPU2_VRHOT GPIO events
1741 if (!host_error_monitor::requestGPIOEvents(
1742 "CPU2_VRHOT", host_error_monitor::cpu2VRHotHandler,
1743 host_error_monitor::cpu2VRHotLine,
1744 host_error_monitor::cpu2VRHotEvent))
1745 {
1746 return -1;
1747 }
1748
Jason M. Bills9647ba72019-08-29 14:19:19 -07001749 // Request CPU2_MEM_ABCD_VRHOT GPIO events
1750 if (!host_error_monitor::requestGPIOEvents(
1751 "CPU2_MEM_ABCD_VRHOT", host_error_monitor::cpu2MemABCDVRHotHandler,
1752 host_error_monitor::cpu2MemABCDVRHotLine,
1753 host_error_monitor::cpu2MemABCDVRHotEvent))
1754 {
1755 return -1;
1756 }
1757
1758 // Request CPU2_MEM_EFGH_VRHOT GPIO events
1759 if (!host_error_monitor::requestGPIOEvents(
1760 "CPU2_MEM_EFGH_VRHOT", host_error_monitor::cpu2MemEFGHVRHotHandler,
1761 host_error_monitor::cpu2MemEFGHVRHotLine,
1762 host_error_monitor::cpu2MemEFGHVRHotEvent))
1763 {
1764 return -1;
1765 }
1766
Chen Yugange6c0f1c2019-08-02 20:36:42 +08001767 // Request PCH_BMC_THERMTRIP GPIO events
1768 if (!host_error_monitor::requestGPIOEvents(
1769 "PCH_BMC_THERMTRIP", host_error_monitor::pchThermtripHandler,
1770 host_error_monitor::pchThermtripLine,
1771 host_error_monitor::pchThermtripEvent))
1772 {
1773 return -1;
1774 }
1775
jayaprakash Mutyala009adbc2019-12-24 22:08:07 +00001776 // Request CPU1_MEM_THERM_EVENT GPIO events
1777 if (!host_error_monitor::requestGPIOEvents(
1778 "CPU1_MEM_THERM_EVENT", host_error_monitor::cpu1MemtripHandler,
1779 host_error_monitor::cpu1MemtripLine,
1780 host_error_monitor::cpu1MemtripEvent))
1781 {
1782 return -1;
1783 }
1784
1785 // Request CPU2_MEM_THERM_EVENT GPIO events
1786 if (!host_error_monitor::requestGPIOEvents(
1787 "CPU2_MEM_THERM_EVENT", host_error_monitor::cpu2MemtripHandler,
1788 host_error_monitor::cpu2MemtripLine,
1789 host_error_monitor::cpu2MemtripEvent))
1790 {
1791 return -1;
1792 }
1793
Jason M. Bills1490b142019-07-01 15:48:43 -07001794 host_error_monitor::io.run();
1795
1796 return 0;
1797}