blob: b4a8c728587981b6e468832a335d0c534fe0df03 [file] [log] [blame]
/*
// Copyright (c) 2019 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
*/
Jason M. Bills6a2cb692019-08-06 11:03:49 -070016#include <peci.h>
Chen Yugange6c0f1c2019-08-02 20:36:42 +080017#include <systemd/sd-journal.h>
18
Jason M. Bills08b2c7a2020-08-28 15:39:14 -070019#include <boost/asio/io_service.hpp>
Jason M. Bills1490b142019-07-01 15:48:43 -070020#include <boost/asio/posix/stream_descriptor.hpp>
Jason M. Bills08b2c7a2020-08-28 15:39:14 -070021#include <boost/asio/steady_timer.hpp>
Jason M. Bills1490b142019-07-01 15:48:43 -070022#include <gpiod.hpp>
Jason M. Bills1490b142019-07-01 15:48:43 -070023#include <sdbusplus/asio/object_server.hpp>
Jason M. Bills48e5dff2020-06-10 13:47:47 -070024
25#include <bitset>
26#include <iostream>
Jason M. Billsd1a19f62019-08-06 11:52:58 -070027#include <variant>
Jason M. Bills1490b142019-07-01 15:48:43 -070028
29namespace host_error_monitor
30{
31static boost::asio::io_service io;
32static std::shared_ptr<sdbusplus::asio::connection> conn;
Jason M. Billsc4b91f22019-11-26 17:04:50 -080033static std::shared_ptr<sdbusplus::asio::dbus_interface> hostErrorTimeoutIface;
Jason M. Bills1490b142019-07-01 15:48:43 -070034
Yong Li1429ca82020-04-27 16:49:45 +080035using Association = std::tuple<std::string, std::string, std::string>;
36static std::shared_ptr<sdbusplus::asio::dbus_interface> associationSSBThermTrip;
37static std::shared_ptr<sdbusplus::asio::dbus_interface> associationCATAssert;
38
39static const constexpr char* rootPath = "/xyz/openbmc_project/CallbackManager";
40
Jason M. Bills1490b142019-07-01 15:48:43 -070041static bool hostOff = true;
42
Jason M. Billsc4b91f22019-11-26 17:04:50 -080043static size_t caterrTimeoutMs = 2000;
44const static constexpr size_t caterrTimeoutMsMax = 600000; // 10 minutes maximum
Jason M. Billscbf78532019-08-16 15:32:11 -070045const static constexpr size_t errTimeoutMs = 90000;
Jason M. Bills89922f82019-08-06 11:10:02 -070046const static constexpr size_t smiTimeoutMs = 90000;
Jason M. Bills1490b142019-07-01 15:48:43 -070047
48// Timers
49// Timer for CATERR asserted
50static boost::asio::steady_timer caterrAssertTimer(io);
Jason M. Bills8c584392019-08-19 11:05:51 -070051// Timer for ERR0 asserted
52static boost::asio::steady_timer err0AssertTimer(io);
Jason M. Bills75af3962019-08-19 11:07:17 -070053// Timer for ERR1 asserted
54static boost::asio::steady_timer err1AssertTimer(io);
Jason M. Bills6a2cb692019-08-06 11:03:49 -070055// Timer for ERR2 asserted
56static boost::asio::steady_timer err2AssertTimer(io);
Jason M. Bills89922f82019-08-06 11:10:02 -070057// Timer for SMI asserted
58static boost::asio::steady_timer smiAssertTimer(io);
Jason M. Bills1490b142019-07-01 15:48:43 -070059
60// GPIO Lines and Event Descriptors
61static gpiod::line caterrLine;
62static boost::asio::posix::stream_descriptor caterrEvent(io);
Jason M. Bills8c584392019-08-19 11:05:51 -070063static gpiod::line err0Line;
64static boost::asio::posix::stream_descriptor err0Event(io);
Jason M. Bills75af3962019-08-19 11:07:17 -070065static gpiod::line err1Line;
66static boost::asio::posix::stream_descriptor err1Event(io);
Jason M. Bills6a2cb692019-08-06 11:03:49 -070067static gpiod::line err2Line;
68static boost::asio::posix::stream_descriptor err2Event(io);
Jason M. Bills89922f82019-08-06 11:10:02 -070069static gpiod::line smiLine;
70static boost::asio::posix::stream_descriptor smiEvent(io);
Jason M. Bills45e87e02019-09-09 14:45:38 -070071static gpiod::line cpu1FIVRFaultLine;
Jason M. Bills78c5eed2019-08-28 14:00:40 -070072static gpiod::line cpu1ThermtripLine;
73static boost::asio::posix::stream_descriptor cpu1ThermtripEvent(io);
Jason M. Bills45e87e02019-09-09 14:45:38 -070074static gpiod::line cpu2FIVRFaultLine;
Jason M. Bills78c5eed2019-08-28 14:00:40 -070075static gpiod::line cpu2ThermtripLine;
76static boost::asio::posix::stream_descriptor cpu2ThermtripEvent(io);
Jason M. Bills250fa632019-08-28 15:58:25 -070077static gpiod::line cpu1VRHotLine;
78static boost::asio::posix::stream_descriptor cpu1VRHotEvent(io);
79static gpiod::line cpu2VRHotLine;
Jason M. Bills9647ba72019-08-29 14:19:19 -070080static boost::asio::posix::stream_descriptor cpu1MemABCDVRHotEvent(io);
81static gpiod::line cpu1MemEFGHVRHotLine;
82static boost::asio::posix::stream_descriptor cpu1MemEFGHVRHotEvent(io);
83static gpiod::line cpu2MemABCDVRHotLine;
Jason M. Bills250fa632019-08-28 15:58:25 -070084static boost::asio::posix::stream_descriptor cpu2VRHotEvent(io);
Jason M. Bills9647ba72019-08-29 14:19:19 -070085static gpiod::line cpu1MemABCDVRHotLine;
86static boost::asio::posix::stream_descriptor cpu2MemABCDVRHotEvent(io);
87static gpiod::line cpu2MemEFGHVRHotLine;
88static boost::asio::posix::stream_descriptor cpu2MemEFGHVRHotEvent(io);
Chen Yugange6c0f1c2019-08-02 20:36:42 +080089//----------------------------------
90// PCH_BMC_THERMTRIP function related definition
91//----------------------------------
Chen Yugange6c0f1c2019-08-02 20:36:42 +080092static gpiod::line pchThermtripLine;
93static boost::asio::posix::stream_descriptor pchThermtripEvent(io);
jayaprakash Mutyala009adbc2019-12-24 22:08:07 +000094//----------------------------------
95// CPU_MEM_THERM_EVENT function related definition
96//----------------------------------
97static gpiod::line cpu1MemtripLine;
98static boost::asio::posix::stream_descriptor cpu1MemtripEvent(io);
99static gpiod::line cpu2MemtripLine;
100static boost::asio::posix::stream_descriptor cpu2MemtripEvent(io);
jayaprakash Mutyala53099c42020-03-15 00:16:26 +0000101//---------------------------------
102// CPU_MISMATCH function related definition
103//---------------------------------
104static gpiod::line cpu1MismatchLine;
105static gpiod::line cpu2MismatchLine;
Jason M. Bills1490b142019-07-01 15:48:43 -0700106
Yong Li061eb032020-02-26 15:06:18 +0800107// beep function for CPU error
Yong Li8c798c72020-04-22 15:29:07 +0800108const static constexpr uint8_t beepCPUIERR = 4;
Yong Li061eb032020-02-26 15:06:18 +0800109const static constexpr uint8_t beepCPUErr2 = 5;
110
111static void beep(const uint8_t& beepPriority)
112{
113 conn->async_method_call(
114 [](boost::system::error_code ec) {
115 if (ec)
116 {
117 std::cerr << "beep returned error with "
118 "async_method_call (ec = "
119 << ec << ")\n";
120 return;
121 }
122 },
123 "xyz.openbmc_project.BeepCode", "/xyz/openbmc_project/BeepCode",
124 "xyz.openbmc_project.BeepCode", "Beep", uint8_t(beepPriority));
125}
126
Jason M. Billsa3397932019-08-06 11:07:21 -0700127static void cpuIERRLog()
128{
129 sd_journal_send("MESSAGE=HostError: IERR", "PRIORITY=%i", LOG_INFO,
130 "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError",
131 "REDFISH_MESSAGE_ARGS=%s", "IERR", NULL);
132}
133
134static void cpuIERRLog(const int cpuNum)
135{
136 std::string msg = "IERR on CPU " + std::to_string(cpuNum + 1);
137
138 sd_journal_send("MESSAGE=HostError: %s", msg.c_str(), "PRIORITY=%i",
139 LOG_INFO, "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError",
140 "REDFISH_MESSAGE_ARGS=%s", msg.c_str(), NULL);
141}
142
143static void cpuIERRLog(const int cpuNum, const std::string& type)
144{
145 std::string msg = type + " IERR on CPU " + std::to_string(cpuNum + 1);
146
147 sd_journal_send("MESSAGE=HostError: %s", msg.c_str(), "PRIORITY=%i",
148 LOG_INFO, "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError",
149 "REDFISH_MESSAGE_ARGS=%s", msg.c_str(), NULL);
150}
151
Jason M. Billscbf78532019-08-16 15:32:11 -0700152static void cpuERRXLog(const int errPin)
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700153{
Jason M. Billscbf78532019-08-16 15:32:11 -0700154 std::string msg = "ERR" + std::to_string(errPin) + " Timeout";
155
156 sd_journal_send("MESSAGE=HostError: %s", msg.c_str(), "PRIORITY=%i",
157 LOG_INFO, "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError",
158 "REDFISH_MESSAGE_ARGS=%s", msg.c_str(), NULL);
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700159}
160
Jason M. Billscbf78532019-08-16 15:32:11 -0700161static void cpuERRXLog(const int errPin, const int cpuNum)
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700162{
Jason M. Billscbf78532019-08-16 15:32:11 -0700163 std::string msg = "ERR" + std::to_string(errPin) + " Timeout on CPU " +
164 std::to_string(cpuNum + 1);
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700165
166 sd_journal_send("MESSAGE=HostError: %s", msg.c_str(), "PRIORITY=%i",
167 LOG_INFO, "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError",
168 "REDFISH_MESSAGE_ARGS=%s", msg.c_str(), NULL);
169}
170
Jason M. Bills89922f82019-08-06 11:10:02 -0700171static void smiTimeoutLog()
172{
173 sd_journal_send("MESSAGE=HostError: SMI Timeout", "PRIORITY=%i", LOG_INFO,
174 "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError",
175 "REDFISH_MESSAGE_ARGS=%s", "SMI Timeout", NULL);
176}
177
Jason M. Bills45e87e02019-09-09 14:45:38 -0700178static void cpuBootFIVRFaultLog(const int cpuNum)
179{
180 std::string msg = "Boot FIVR Fault on CPU " + std::to_string(cpuNum);
181
182 sd_journal_send("MESSAGE=HostError: %s", msg.c_str(), "PRIORITY=%i",
183 LOG_INFO, "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError",
184 "REDFISH_MESSAGE_ARGS=%s", msg.c_str(), NULL);
185}
186
Jason M. Bills78c5eed2019-08-28 14:00:40 -0700187static void cpuThermTripLog(const int cpuNum)
188{
189 std::string msg = "CPU " + std::to_string(cpuNum) + " thermal trip";
190
191 sd_journal_send("MESSAGE=HostError: %s", msg.c_str(), "PRIORITY=%i",
192 LOG_INFO, "REDFISH_MESSAGE_ID=%s",
193 "OpenBMC.0.1.CPUThermalTrip", "REDFISH_MESSAGE_ARGS=%d",
194 cpuNum, NULL);
195}
196
jayaprakash Mutyala009adbc2019-12-24 22:08:07 +0000197static void memThermTripLog(const int cpuNum)
198{
199 std::string cpuNumber = "CPU " + std::to_string(cpuNum);
200 std::string msg = cpuNumber + " Memory Thermal trip.";
201
202 sd_journal_send("MESSAGE=HostError: %s", msg.c_str(), "PRIORITY=%i",
203 LOG_ERR, "REDFISH_MESSAGE_ID=%s",
204 "OpenBMC.0.1.MemoryThermTrip", "REDFISH_MESSAGE_ARGS=%s",
205 cpuNumber.c_str(), NULL);
206}
207
jayaprakash Mutyala53099c42020-03-15 00:16:26 +0000208static void cpuMismatchLog(const int cpuNum)
209{
210 std::string msg = "CPU " + std::to_string(cpuNum) + " mismatch";
211
212 sd_journal_send("MESSAGE= %s", msg.c_str(), "PRIORITY=%i", LOG_ERR,
213 "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUMismatch",
214 "REDFISH_MESSAGE_ARGS=%d", cpuNum, NULL);
215}
216
Jason M. Bills250fa632019-08-28 15:58:25 -0700217static void cpuVRHotLog(const std::string& vr)
218{
219 std::string msg = vr + " Voltage Regulator Overheated.";
220
221 sd_journal_send("MESSAGE=HostError: %s", msg.c_str(), "PRIORITY=%i",
222 LOG_INFO, "REDFISH_MESSAGE_ID=%s",
223 "OpenBMC.0.1.VoltageRegulatorOverheated",
224 "REDFISH_MESSAGE_ARGS=%s", vr.c_str(), NULL);
225}
226
Jason M. Bills08866542019-08-16 12:04:19 -0700227static void ssbThermTripLog()
228{
229 sd_journal_send("MESSAGE=HostError: SSB thermal trip", "PRIORITY=%i",
230 LOG_INFO, "REDFISH_MESSAGE_ID=%s",
231 "OpenBMC.0.1.SsbThermalTrip", NULL);
232}
233
Jason M. Bills8064f8b2020-09-22 15:09:49 -0700234static inline bool peciError(EPECIStatus peciStatus, uint8_t cc)
235{
236 return (
237 peciStatus != PECI_CC_SUCCESS ||
238 (cc != PECI_DEV_CC_SUCCESS && cc != PECI_DEV_CC_FATAL_MCA_DETECTED));
239}
240
241static void printPECIError(const std::string& reg, const size_t addr,
242 const EPECIStatus peciStatus, const size_t cc)
243{
244 std::cerr << "Failed to read " << reg << " on CPU address " << addr
245 << ". Error: " << peciStatus << ": cc: 0x" << std::hex << cc
246 << "\n";
247}
248
Jason M. Billsa15c2522019-08-16 10:01:44 -0700249static void initializeErrorState();
Jason M. Bills1490b142019-07-01 15:48:43 -0700250static void initializeHostState()
251{
252 conn->async_method_call(
253 [](boost::system::error_code ec,
254 const std::variant<std::string>& property) {
255 if (ec)
256 {
257 return;
258 }
259 const std::string* state = std::get_if<std::string>(&property);
260 if (state == nullptr)
261 {
262 std::cerr << "Unable to read host state value\n";
263 return;
264 }
265 hostOff = *state == "xyz.openbmc_project.State.Host.HostState.Off";
Jason M. Billsa15c2522019-08-16 10:01:44 -0700266 // If the system is on, initialize the error state
267 if (!hostOff)
268 {
269 initializeErrorState();
270 }
Jason M. Bills1490b142019-07-01 15:48:43 -0700271 },
272 "xyz.openbmc_project.State.Host", "/xyz/openbmc_project/state/host0",
273 "org.freedesktop.DBus.Properties", "Get",
274 "xyz.openbmc_project.State.Host", "CurrentHostState");
275}
276
277static std::shared_ptr<sdbusplus::bus::match::match> startHostStateMonitor()
278{
279 return std::make_shared<sdbusplus::bus::match::match>(
280 *conn,
281 "type='signal',interface='org.freedesktop.DBus.Properties',"
Jason M. Bills2fbb9ea2020-06-19 14:46:54 -0700282 "member='PropertiesChanged',arg0='xyz.openbmc_project.State.Host'",
Jason M. Bills1490b142019-07-01 15:48:43 -0700283 [](sdbusplus::message::message& msg) {
284 std::string interfaceName;
285 boost::container::flat_map<std::string, std::variant<std::string>>
286 propertiesChanged;
Jason M. Bills1490b142019-07-01 15:48:43 -0700287 try
288 {
289 msg.read(interfaceName, propertiesChanged);
Jason M. Bills1490b142019-07-01 15:48:43 -0700290 }
291 catch (std::exception& e)
292 {
293 std::cerr << "Unable to read host state\n";
294 return;
295 }
Jason M. Bills566ccc42020-06-18 16:38:26 -0700296 // We only want to check for CurrentHostState
297 if (propertiesChanged.begin()->first != "CurrentHostState")
298 {
299 return;
300 }
301 std::string* state =
302 std::get_if<std::string>(&(propertiesChanged.begin()->second));
303 if (state == nullptr)
304 {
305 std::cerr << propertiesChanged.begin()->first
306 << " property invalid\n";
307 return;
308 }
309
310 hostOff = *state == "xyz.openbmc_project.State.Host.HostState.Off";
Jason M. Bills1490b142019-07-01 15:48:43 -0700311
Jason M. Bills1490b142019-07-01 15:48:43 -0700312 if (hostOff)
313 {
Jason M. Billse94f5e12019-09-13 11:11:34 -0700314 // No host events should fire while off, so cancel any pending
315 // timers
Jason M. Bills1490b142019-07-01 15:48:43 -0700316 caterrAssertTimer.cancel();
Jason M. Bills8c584392019-08-19 11:05:51 -0700317 err0AssertTimer.cancel();
Jason M. Bills75af3962019-08-19 11:07:17 -0700318 err1AssertTimer.cancel();
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700319 err2AssertTimer.cancel();
Jason M. Bills89922f82019-08-06 11:10:02 -0700320 smiAssertTimer.cancel();
Jason M. Bills1490b142019-07-01 15:48:43 -0700321 }
Jason M. Billse94f5e12019-09-13 11:11:34 -0700322 else
323 {
324 // Handle any initial errors when the host turns on
325 initializeErrorState();
326 }
Jason M. Bills1490b142019-07-01 15:48:43 -0700327 });
328}
329
330static bool requestGPIOEvents(
331 const std::string& name, const std::function<void()>& handler,
332 gpiod::line& gpioLine,
333 boost::asio::posix::stream_descriptor& gpioEventDescriptor)
334{
335 // Find the GPIO line
336 gpioLine = gpiod::find_line(name);
337 if (!gpioLine)
338 {
339 std::cerr << "Failed to find the " << name << " line\n";
340 return false;
341 }
342
343 try
344 {
345 gpioLine.request(
346 {"host-error-monitor", gpiod::line_request::EVENT_BOTH_EDGES});
347 }
348 catch (std::exception&)
349 {
350 std::cerr << "Failed to request events for " << name << "\n";
351 return false;
352 }
353
354 int gpioLineFd = gpioLine.event_get_fd();
355 if (gpioLineFd < 0)
356 {
357 std::cerr << "Failed to get " << name << " fd\n";
358 return false;
359 }
360
361 gpioEventDescriptor.assign(gpioLineFd);
362
363 gpioEventDescriptor.async_wait(
364 boost::asio::posix::stream_descriptor::wait_read,
365 [&name, handler](const boost::system::error_code ec) {
366 if (ec)
367 {
368 std::cerr << name << " fd handler error: " << ec.message()
369 << "\n";
370 return;
371 }
372 handler();
373 });
374 return true;
375}
376
Jason M. Bills45e87e02019-09-09 14:45:38 -0700377static bool requestGPIOInput(const std::string& name, gpiod::line& gpioLine)
378{
379 // Find the GPIO line
380 gpioLine = gpiod::find_line(name);
381 if (!gpioLine)
382 {
383 std::cerr << "Failed to find the " << name << " line.\n";
384 return false;
385 }
386
387 // Request GPIO input
388 try
389 {
390 gpioLine.request({__FUNCTION__, gpiod::line_request::DIRECTION_INPUT});
391 }
392 catch (std::exception&)
393 {
394 std::cerr << "Failed to request " << name << " input\n";
395 return false;
396 }
397
398 return true;
399}
400
Jason M. Bills1490b142019-07-01 15:48:43 -0700401static void startPowerCycle()
402{
403 conn->async_method_call(
404 [](boost::system::error_code ec) {
405 if (ec)
406 {
407 std::cerr << "failed to set Chassis State\n";
408 }
409 },
410 "xyz.openbmc_project.State.Chassis",
411 "/xyz/openbmc_project/state/chassis0",
412 "org.freedesktop.DBus.Properties", "Set",
413 "xyz.openbmc_project.State.Chassis", "RequestedPowerTransition",
414 std::variant<std::string>{
415 "xyz.openbmc_project.State.Chassis.Transition.PowerCycle"});
416}
417
Jason M. Bills9a9bf982020-08-10 11:58:18 -0700418static void startWarmReset()
419{
420 conn->async_method_call(
421 [](boost::system::error_code ec) {
422 if (ec)
423 {
424 std::cerr << "failed to set Host State\n";
425 }
426 },
427 "xyz.openbmc_project.State.Host", "/xyz/openbmc_project/state/host0",
428 "org.freedesktop.DBus.Properties", "Set",
429 "xyz.openbmc_project.State.Host", "RequestedHostTransition",
430 std::variant<std::string>{
431 "xyz.openbmc_project.State.Host.Transition.ForceWarmReboot"});
432}
433
Jason M. Billsb61766b2019-11-26 17:02:44 -0800434static void startCrashdumpAndRecovery(bool recoverSystem,
435 const std::string& triggerType)
Jason M. Bills1490b142019-07-01 15:48:43 -0700436{
Jason M. Billsd69549b2020-08-27 11:42:43 -0700437 std::cerr << "Starting crashdump\n";
Jason M. Bills1490b142019-07-01 15:48:43 -0700438 static std::shared_ptr<sdbusplus::bus::match::match> crashdumpCompleteMatch;
Jason M. Bills1490b142019-07-01 15:48:43 -0700439
440 crashdumpCompleteMatch = std::make_shared<sdbusplus::bus::match::match>(
441 *conn,
Jason M. Billsc015c552020-08-27 11:02:47 -0700442 "type='signal',interface='com.intel.crashdump.Stored',member='"
443 "CrashdumpComplete'",
Jason M. Bills1490b142019-07-01 15:48:43 -0700444 [recoverSystem](sdbusplus::message::message& msg) {
Jason M. Billsd69549b2020-08-27 11:42:43 -0700445 std::cerr << "Crashdump completed\n";
Jason M. Bills1490b142019-07-01 15:48:43 -0700446 if (recoverSystem)
447 {
Jason M. Billsd69549b2020-08-27 11:42:43 -0700448 std::cerr << "Recovering the system\n";
Jason M. Bills9a9bf982020-08-10 11:58:18 -0700449 startWarmReset();
Jason M. Bills1490b142019-07-01 15:48:43 -0700450 }
451 crashdumpCompleteMatch.reset();
452 });
453
Jason M. Bills1490b142019-07-01 15:48:43 -0700454 conn->async_method_call(
455 [](boost::system::error_code ec) {
456 if (ec)
457 {
458 std::cerr << "failed to start Crashdump\n";
Jason M. Bills1490b142019-07-01 15:48:43 -0700459 }
460 },
461 "com.intel.crashdump", "/com/intel/crashdump",
Jason M. Billsb61766b2019-11-26 17:02:44 -0800462 "com.intel.crashdump.Stored", "GenerateStoredLog", triggerType);
Jason M. Bills1490b142019-07-01 15:48:43 -0700463}
464
Jason M. Billsd1a19f62019-08-06 11:52:58 -0700465static void incrementCPUErrorCount(int cpuNum)
466{
467 std::string propertyName = "ErrorCountCPU" + std::to_string(cpuNum + 1);
468
469 // Get the current count
470 conn->async_method_call(
471 [propertyName](boost::system::error_code ec,
472 const std::variant<uint8_t>& property) {
473 if (ec)
474 {
475 std::cerr << "Failed to read " << propertyName << ": "
476 << ec.message() << "\n";
477 return;
478 }
479 const uint8_t* errorCountVariant = std::get_if<uint8_t>(&property);
480 if (errorCountVariant == nullptr)
481 {
482 std::cerr << propertyName << " invalid\n";
483 return;
484 }
485 uint8_t errorCount = *errorCountVariant;
486 if (errorCount == std::numeric_limits<uint8_t>::max())
487 {
488 std::cerr << "Maximum error count reached\n";
489 return;
490 }
491 // Increment the count
492 errorCount++;
493 conn->async_method_call(
494 [propertyName](boost::system::error_code ec) {
495 if (ec)
496 {
497 std::cerr << "Failed to set " << propertyName << ": "
498 << ec.message() << "\n";
499 }
500 },
501 "xyz.openbmc_project.Settings",
502 "/xyz/openbmc_project/control/processor_error_config",
503 "org.freedesktop.DBus.Properties", "Set",
504 "xyz.openbmc_project.Control.Processor.ErrConfig", propertyName,
505 std::variant<uint8_t>{errorCount});
506 },
507 "xyz.openbmc_project.Settings",
508 "/xyz/openbmc_project/control/processor_error_config",
509 "org.freedesktop.DBus.Properties", "Get",
510 "xyz.openbmc_project.Control.Processor.ErrConfig", propertyName);
511}
512
Jason M. Billsa3397932019-08-06 11:07:21 -0700513static bool checkIERRCPUs()
514{
515 bool cpuIERRFound = false;
Jason M. Bills8064f8b2020-09-22 15:09:49 -0700516 for (size_t cpu = 0, addr = MIN_CLIENT_ADDR; addr <= MAX_CLIENT_ADDR;
Jason M. Billsa3397932019-08-06 11:07:21 -0700517 cpu++, addr++)
518 {
Jason M. Bills8064f8b2020-09-22 15:09:49 -0700519 EPECIStatus peciStatus = PECI_CC_SUCCESS;
Jason M. Billsa3397932019-08-06 11:07:21 -0700520 uint8_t cc = 0;
521 CPUModel model{};
522 uint8_t stepping = 0;
523 if (peci_GetCPUID(addr, &model, &stepping, &cc) != PECI_CC_SUCCESS)
524 {
525 std::cerr << "Cannot get CPUID!\n";
526 continue;
527 }
528
529 switch (model)
530 {
531 case skx:
532 {
533 // First check the MCA_ERR_SRC_LOG to see if this is the CPU
534 // that caused the IERR
535 uint32_t mcaErrSrcLog = 0;
Jason M. Bills8064f8b2020-09-22 15:09:49 -0700536 peciStatus = peci_RdPkgConfig(addr, 0, 5, 4,
537 (uint8_t*)&mcaErrSrcLog, &cc);
538 if (peciError(peciStatus, cc))
Jason M. Billsa3397932019-08-06 11:07:21 -0700539 {
Jason M. Bills8064f8b2020-09-22 15:09:49 -0700540 printPECIError("MCA_ERR_SRC_LOG", addr, peciStatus, cc);
Jason M. Billsa3397932019-08-06 11:07:21 -0700541 continue;
542 }
543 // Check MSMI_INTERNAL (20) and IERR_INTERNAL (27)
544 if ((mcaErrSrcLog & (1 << 20)) || (mcaErrSrcLog & (1 << 27)))
545 {
546 // TODO: Light the CPU fault LED?
547 cpuIERRFound = true;
Jason M. Billsd1a19f62019-08-06 11:52:58 -0700548 incrementCPUErrorCount(cpu);
Jason M. Billsa3397932019-08-06 11:07:21 -0700549 // Next check if it's a CPU/VR mismatch by reading the
550 // IA32_MC4_STATUS MSR (0x411)
551 uint64_t mc4Status = 0;
Jason M. Bills8064f8b2020-09-22 15:09:49 -0700552 peciStatus = peci_RdIAMSR(addr, 0, 0x411, &mc4Status, &cc);
553 if (peciError(peciStatus, cc))
Jason M. Billsa3397932019-08-06 11:07:21 -0700554 {
Jason M. Bills8064f8b2020-09-22 15:09:49 -0700555 printPECIError("IA32_MC4_STATUS", addr, peciStatus, cc);
Jason M. Billsa3397932019-08-06 11:07:21 -0700556 continue;
557 }
558 // Check MSEC bits 31:24 for
559 // MCA_SVID_VCCIN_VR_ICC_MAX_FAILURE (0x40),
560 // MCA_SVID_VCCIN_VR_VOUT_FAILURE (0x42), or
561 // MCA_SVID_CPU_VR_CAPABILITY_ERROR (0x43)
562 if ((mc4Status & (0x40 << 24)) ||
563 (mc4Status & (0x42 << 24)) ||
564 (mc4Status & (0x43 << 24)))
565 {
566 cpuIERRLog(cpu, "CPU/VR Mismatch");
567 continue;
568 }
569
570 // Next check if it's a Core FIVR fault by looking for a
571 // non-zero value of CORE_FIVR_ERR_LOG (B(1) D30 F2 offset
572 // 80h)
573 uint32_t coreFIVRErrLog = 0;
Jason M. Bills8064f8b2020-09-22 15:09:49 -0700574 peciStatus = peci_RdPCIConfigLocal(
575 addr, 1, 30, 2, 0x80, sizeof(uint32_t),
576 (uint8_t*)&coreFIVRErrLog, &cc);
577 if (peciError(peciStatus, cc))
Jason M. Billsa3397932019-08-06 11:07:21 -0700578 {
Jason M. Bills8064f8b2020-09-22 15:09:49 -0700579 printPECIError("CORE_FIVR_ERR_LOG", addr, peciStatus,
580 cc);
Jason M. Billsa3397932019-08-06 11:07:21 -0700581 continue;
582 }
583 if (coreFIVRErrLog)
584 {
585 cpuIERRLog(cpu, "Core FIVR Fault");
586 continue;
587 }
588
589 // Next check if it's an Uncore FIVR fault by looking for a
590 // non-zero value of UNCORE_FIVR_ERR_LOG (B(1) D30 F2 offset
591 // 84h)
592 uint32_t uncoreFIVRErrLog = 0;
Jason M. Bills8064f8b2020-09-22 15:09:49 -0700593 peciStatus = peci_RdPCIConfigLocal(
594 addr, 1, 30, 2, 0x84, sizeof(uint32_t),
595 (uint8_t*)&uncoreFIVRErrLog, &cc);
596 if (peciError(peciStatus, cc))
Jason M. Billsa3397932019-08-06 11:07:21 -0700597 {
Jason M. Bills8064f8b2020-09-22 15:09:49 -0700598 printPECIError("UNCORE_FIVR_ERR_LOG", addr, peciStatus,
599 cc);
Jason M. Billsa3397932019-08-06 11:07:21 -0700600 continue;
601 }
602 if (uncoreFIVRErrLog)
603 {
604 cpuIERRLog(cpu, "Uncore FIVR Fault");
605 continue;
606 }
607
608 // Last if CORE_FIVR_ERR_LOG and UNCORE_FIVR_ERR_LOG are
609 // both zero, but MSEC bits 31:24 have either
610 // MCA_FIVR_CATAS_OVERVOL_FAULT (0x51) or
611 // MCA_FIVR_CATAS_OVERCUR_FAULT (0x52), then log it as an
612 // uncore FIVR fault
613 if (!coreFIVRErrLog && !uncoreFIVRErrLog &&
614 ((mc4Status & (0x51 << 24)) ||
615 (mc4Status & (0x52 << 24))))
616 {
617 cpuIERRLog(cpu, "Uncore FIVR Fault");
618 continue;
619 }
620 cpuIERRLog(cpu);
621 }
622 break;
623 }
624 case icx:
625 {
626 // First check the MCA_ERR_SRC_LOG to see if this is the CPU
627 // that caused the IERR
628 uint32_t mcaErrSrcLog = 0;
Jason M. Bills8064f8b2020-09-22 15:09:49 -0700629 peciStatus = peci_RdPkgConfig(addr, 0, 5, 4,
630 (uint8_t*)&mcaErrSrcLog, &cc);
631 if (peciError(peciStatus, cc))
Jason M. Billsa3397932019-08-06 11:07:21 -0700632 {
Jason M. Bills8064f8b2020-09-22 15:09:49 -0700633 printPECIError("MCA_ERR_SRC_LOG", addr, peciStatus, cc);
Jason M. Billsa3397932019-08-06 11:07:21 -0700634 continue;
635 }
636 // Check MSMI_INTERNAL (20) and IERR_INTERNAL (27)
637 if ((mcaErrSrcLog & (1 << 20)) || (mcaErrSrcLog & (1 << 27)))
638 {
639 // TODO: Light the CPU fault LED?
640 cpuIERRFound = true;
Jason M. Billsd1a19f62019-08-06 11:52:58 -0700641 incrementCPUErrorCount(cpu);
Jason M. Billsa3397932019-08-06 11:07:21 -0700642 // Next check if it's a CPU/VR mismatch by reading the
643 // IA32_MC4_STATUS MSR (0x411)
644 uint64_t mc4Status = 0;
Jason M. Bills8064f8b2020-09-22 15:09:49 -0700645 peciStatus = peci_RdIAMSR(addr, 0, 0x411, &mc4Status, &cc);
646 if (peciError(peciStatus, cc))
Jason M. Billsa3397932019-08-06 11:07:21 -0700647 {
Jason M. Bills8064f8b2020-09-22 15:09:49 -0700648 printPECIError("IA32_MC4_STATUS", addr, peciStatus, cc);
Jason M. Billsa3397932019-08-06 11:07:21 -0700649 continue;
650 }
651 // TODO: Update MSEC/MSCOD_31_24 check
652 // Check MSEC bits 31:24 for
653 // MCA_SVID_VCCIN_VR_ICC_MAX_FAILURE (0x40),
654 // MCA_SVID_VCCIN_VR_VOUT_FAILURE (0x42), or
655 // MCA_SVID_CPU_VR_CAPABILITY_ERROR (0x43)
656 if ((mc4Status & (0x40 << 24)) ||
657 (mc4Status & (0x42 << 24)) ||
658 (mc4Status & (0x43 << 24)))
659 {
660 cpuIERRLog(cpu, "CPU/VR Mismatch");
661 continue;
662 }
663
664 // Next check if it's a Core FIVR fault by looking for a
665 // non-zero value of CORE_FIVR_ERR_LOG (B(31) D30 F2 offsets
666 // C0h and C4h) (Note: Bus 31 is accessed on PECI as bus 14)
667 uint32_t coreFIVRErrLog0 = 0;
668 uint32_t coreFIVRErrLog1 = 0;
Jason M. Bills8064f8b2020-09-22 15:09:49 -0700669 peciStatus = peci_RdEndPointConfigPciLocal(
670 addr, 0, 14, 30, 2, 0xC0, sizeof(uint32_t),
671 (uint8_t*)&coreFIVRErrLog0, &cc);
672 if (peciError(peciStatus, cc))
Jason M. Billsa3397932019-08-06 11:07:21 -0700673 {
Jason M. Bills8064f8b2020-09-22 15:09:49 -0700674 printPECIError("CORE_FIVR_ERR_LOG_0", addr, peciStatus,
675 cc);
Jason M. Billsa3397932019-08-06 11:07:21 -0700676 continue;
677 }
Jason M. Bills8064f8b2020-09-22 15:09:49 -0700678 peciStatus = peci_RdEndPointConfigPciLocal(
679 addr, 0, 14, 30, 2, 0xC4, sizeof(uint32_t),
680 (uint8_t*)&coreFIVRErrLog1, &cc);
681 if (peciError(peciStatus, cc))
Jason M. Billsa3397932019-08-06 11:07:21 -0700682 {
Jason M. Bills8064f8b2020-09-22 15:09:49 -0700683 printPECIError("CORE_FIVR_ERR_LOG_1", addr, peciStatus,
684 cc);
Jason M. Billsa3397932019-08-06 11:07:21 -0700685 continue;
686 }
687 if (coreFIVRErrLog0 || coreFIVRErrLog1)
688 {
689 cpuIERRLog(cpu, "Core FIVR Fault");
690 continue;
691 }
692
693 // Next check if it's an Uncore FIVR fault by looking for a
694 // non-zero value of UNCORE_FIVR_ERR_LOG (B(31) D30 F2
695 // offset 84h) (Note: Bus 31 is accessed on PECI as bus 14)
696 uint32_t uncoreFIVRErrLog = 0;
Jason M. Bills8064f8b2020-09-22 15:09:49 -0700697 peciStatus = peci_RdEndPointConfigPciLocal(
698 addr, 0, 14, 30, 2, 0x84, sizeof(uint32_t),
699 (uint8_t*)&uncoreFIVRErrLog, &cc);
700 if (peciError(peciStatus, cc))
Jason M. Billsa3397932019-08-06 11:07:21 -0700701 {
Jason M. Bills8064f8b2020-09-22 15:09:49 -0700702 printPECIError("UNCORE_FIVR_ERR_LOG", addr, peciStatus,
703 cc);
Jason M. Billsa3397932019-08-06 11:07:21 -0700704 continue;
705 }
706 if (uncoreFIVRErrLog)
707 {
708 cpuIERRLog(cpu, "Uncore FIVR Fault");
709 continue;
710 }
711
712 // TODO: Update MSEC/MSCOD_31_24 check
713 // Last if CORE_FIVR_ERR_LOG and UNCORE_FIVR_ERR_LOG are
714 // both zero, but MSEC bits 31:24 have either
715 // MCA_FIVR_CATAS_OVERVOL_FAULT (0x51) or
716 // MCA_FIVR_CATAS_OVERCUR_FAULT (0x52), then log it as an
717 // uncore FIVR fault
718 if (!coreFIVRErrLog0 && !coreFIVRErrLog1 &&
719 !uncoreFIVRErrLog &&
720 ((mc4Status & (0x51 << 24)) ||
721 (mc4Status & (0x52 << 24))))
722 {
723 cpuIERRLog(cpu, "Uncore FIVR Fault");
724 continue;
725 }
726 cpuIERRLog(cpu);
727 }
728 break;
729 }
730 }
731 }
732 return cpuIERRFound;
733}
734
Jason M. Billsa15c2522019-08-16 10:01:44 -0700735static void caterrAssertHandler()
736{
Jason M. Billsa15c2522019-08-16 10:01:44 -0700737 caterrAssertTimer.expires_after(std::chrono::milliseconds(caterrTimeoutMs));
738 caterrAssertTimer.async_wait([](const boost::system::error_code ec) {
739 if (ec)
740 {
741 // operation_aborted is expected if timer is canceled
742 // before completion.
743 if (ec != boost::asio::error::operation_aborted)
744 {
745 std::cerr << "caterr timeout async_wait failed: "
746 << ec.message() << "\n";
747 }
Jason M. Billsa15c2522019-08-16 10:01:44 -0700748 return;
749 }
Jason M. Billsa3397932019-08-06 11:07:21 -0700750 std::cerr << "CATERR asserted for " << std::to_string(caterrTimeoutMs)
751 << " ms\n";
Yong Li8c798c72020-04-22 15:29:07 +0800752 beep(beepCPUIERR);
Jason M. Billsa3397932019-08-06 11:07:21 -0700753 if (!checkIERRCPUs())
754 {
755 cpuIERRLog();
756 }
Jason M. Billsa15c2522019-08-16 10:01:44 -0700757 conn->async_method_call(
758 [](boost::system::error_code ec,
759 const std::variant<bool>& property) {
760 if (ec)
761 {
762 return;
763 }
764 const bool* reset = std::get_if<bool>(&property);
765 if (reset == nullptr)
766 {
767 std::cerr << "Unable to read reset on CATERR value\n";
768 return;
769 }
Jason M. Billsb61766b2019-11-26 17:02:44 -0800770 startCrashdumpAndRecovery(*reset, "IERR");
Jason M. Billsa15c2522019-08-16 10:01:44 -0700771 },
772 "xyz.openbmc_project.Settings",
773 "/xyz/openbmc_project/control/processor_error_config",
774 "org.freedesktop.DBus.Properties", "Get",
775 "xyz.openbmc_project.Control.Processor.ErrConfig", "ResetOnCATERR");
776 });
777}
778
Jason M. Bills1490b142019-07-01 15:48:43 -0700779static void caterrHandler()
780{
781 if (!hostOff)
782 {
783 gpiod::line_event gpioLineEvent = caterrLine.event_read();
784
785 bool caterr =
786 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
Yong Li1429ca82020-04-27 16:49:45 +0800787
788 std::vector<Association> associations;
Jason M. Bills1490b142019-07-01 15:48:43 -0700789 if (caterr)
790 {
Jason M. Billsa15c2522019-08-16 10:01:44 -0700791 caterrAssertHandler();
Yong Li1429ca82020-04-27 16:49:45 +0800792 associations.emplace_back(
793 "", "critical",
794 "/xyz/openbmc_project/host_error_monitor/cat_error");
795 associations.emplace_back("", "critical",
796 host_error_monitor::rootPath);
Jason M. Bills1490b142019-07-01 15:48:43 -0700797 }
798 else
799 {
800 caterrAssertTimer.cancel();
Yong Li1429ca82020-04-27 16:49:45 +0800801 associations.emplace_back("", "", "");
Jason M. Bills1490b142019-07-01 15:48:43 -0700802 }
Yong Li1429ca82020-04-27 16:49:45 +0800803 host_error_monitor::associationCATAssert->set_property("Associations",
804 associations);
Jason M. Bills1490b142019-07-01 15:48:43 -0700805 }
806 caterrEvent.async_wait(boost::asio::posix::stream_descriptor::wait_read,
807 [](const boost::system::error_code ec) {
808 if (ec)
809 {
810 std::cerr << "caterr handler error: "
811 << ec.message() << "\n";
812 return;
813 }
814 caterrHandler();
815 });
816}
Jason M. Bills78c5eed2019-08-28 14:00:40 -0700817
Jason M. Billse94f5e12019-09-13 11:11:34 -0700818static void cpu1ThermtripAssertHandler()
819{
Jason M. Bills45e87e02019-09-09 14:45:38 -0700820 if (cpu1FIVRFaultLine.get_value() == 0)
821 {
822 cpuBootFIVRFaultLog(1);
823 }
824 else
825 {
826 cpuThermTripLog(1);
827 }
Jason M. Billse94f5e12019-09-13 11:11:34 -0700828}
829
Jason M. Bills78c5eed2019-08-28 14:00:40 -0700830static void cpu1ThermtripHandler()
831{
Jason M. Bills84951142020-04-17 15:57:11 -0700832 gpiod::line_event gpioLineEvent = cpu1ThermtripLine.event_read();
Jason M. Bills78c5eed2019-08-28 14:00:40 -0700833
Jason M. Bills84951142020-04-17 15:57:11 -0700834 bool cpu1Thermtrip =
835 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
836 if (cpu1Thermtrip)
837 {
838 cpu1ThermtripAssertHandler();
Jason M. Bills78c5eed2019-08-28 14:00:40 -0700839 }
Jason M. Bills84951142020-04-17 15:57:11 -0700840
Jason M. Bills78c5eed2019-08-28 14:00:40 -0700841 cpu1ThermtripEvent.async_wait(
842 boost::asio::posix::stream_descriptor::wait_read,
843 [](const boost::system::error_code ec) {
844 if (ec)
845 {
846 std::cerr << "CPU 1 Thermtrip handler error: " << ec.message()
847 << "\n";
848 return;
849 }
850 cpu1ThermtripHandler();
851 });
852}
853
jayaprakash Mutyala009adbc2019-12-24 22:08:07 +0000854static void cpu1MemtripHandler()
855{
Jason M. Bills5287c022020-05-19 11:16:09 -0700856 gpiod::line_event gpioLineEvent = cpu1MemtripLine.event_read();
jayaprakash Mutyala009adbc2019-12-24 22:08:07 +0000857
Jason M. Bills5287c022020-05-19 11:16:09 -0700858 bool cpu1Memtrip =
859 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
860 if (cpu1Memtrip)
861 {
862 memThermTripLog(1);
jayaprakash Mutyala009adbc2019-12-24 22:08:07 +0000863 }
Jason M. Bills5287c022020-05-19 11:16:09 -0700864
jayaprakash Mutyala009adbc2019-12-24 22:08:07 +0000865 cpu1MemtripEvent.async_wait(
866 boost::asio::posix::stream_descriptor::wait_read,
867 [](const boost::system::error_code ec) {
868 if (ec)
869 {
870 std::cerr << "CPU 1 Memory Thermaltrip handler error: "
871 << ec.message() << "\n";
872 return;
873 }
874 cpu1MemtripHandler();
875 });
876}
877
Jason M. Billse94f5e12019-09-13 11:11:34 -0700878static void cpu2ThermtripAssertHandler()
879{
Jason M. Bills45e87e02019-09-09 14:45:38 -0700880 if (cpu2FIVRFaultLine.get_value() == 0)
881 {
882 cpuBootFIVRFaultLog(2);
883 }
884 else
885 {
886 cpuThermTripLog(2);
887 }
Jason M. Billse94f5e12019-09-13 11:11:34 -0700888}
889
Jason M. Bills78c5eed2019-08-28 14:00:40 -0700890static void cpu2ThermtripHandler()
891{
Jason M. Bills84951142020-04-17 15:57:11 -0700892 gpiod::line_event gpioLineEvent = cpu2ThermtripLine.event_read();
Jason M. Bills78c5eed2019-08-28 14:00:40 -0700893
Jason M. Bills84951142020-04-17 15:57:11 -0700894 bool cpu2Thermtrip =
895 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
896 if (cpu2Thermtrip)
897 {
898 cpu2ThermtripAssertHandler();
Jason M. Bills78c5eed2019-08-28 14:00:40 -0700899 }
Jason M. Bills84951142020-04-17 15:57:11 -0700900
Jason M. Bills78c5eed2019-08-28 14:00:40 -0700901 cpu2ThermtripEvent.async_wait(
902 boost::asio::posix::stream_descriptor::wait_read,
903 [](const boost::system::error_code ec) {
904 if (ec)
905 {
906 std::cerr << "CPU 2 Thermtrip handler error: " << ec.message()
907 << "\n";
908 return;
909 }
910 cpu2ThermtripHandler();
911 });
912}
913
jayaprakash Mutyala009adbc2019-12-24 22:08:07 +0000914static void cpu2MemtripHandler()
915{
Jason M. Bills5287c022020-05-19 11:16:09 -0700916 gpiod::line_event gpioLineEvent = cpu2MemtripLine.event_read();
jayaprakash Mutyala009adbc2019-12-24 22:08:07 +0000917
Jason M. Bills5287c022020-05-19 11:16:09 -0700918 bool cpu2Memtrip =
919 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
920 if (cpu2Memtrip)
921 {
922 memThermTripLog(2);
jayaprakash Mutyala009adbc2019-12-24 22:08:07 +0000923 }
Jason M. Bills5287c022020-05-19 11:16:09 -0700924
jayaprakash Mutyala009adbc2019-12-24 22:08:07 +0000925 cpu2MemtripEvent.async_wait(
926 boost::asio::posix::stream_descriptor::wait_read,
927 [](const boost::system::error_code ec) {
928 if (ec)
929 {
930 std::cerr << "CPU 2 Memory Thermaltrip handler error: "
931 << ec.message() << "\n";
932 return;
933 }
934 cpu2MemtripHandler();
935 });
936}
937
Jason M. Billse94f5e12019-09-13 11:11:34 -0700938static void cpu1VRHotAssertHandler()
939{
940 cpuVRHotLog("CPU 1");
941}
942
Jason M. Bills250fa632019-08-28 15:58:25 -0700943static void cpu1VRHotHandler()
944{
Jason M. Bills84951142020-04-17 15:57:11 -0700945 gpiod::line_event gpioLineEvent = cpu1VRHotLine.event_read();
Jason M. Bills250fa632019-08-28 15:58:25 -0700946
Jason M. Bills84951142020-04-17 15:57:11 -0700947 bool cpu1VRHot =
948 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
949 if (cpu1VRHot)
950 {
951 cpu1VRHotAssertHandler();
Jason M. Bills250fa632019-08-28 15:58:25 -0700952 }
Jason M. Bills84951142020-04-17 15:57:11 -0700953
Jason M. Bills250fa632019-08-28 15:58:25 -0700954 cpu1VRHotEvent.async_wait(boost::asio::posix::stream_descriptor::wait_read,
955 [](const boost::system::error_code ec) {
956 if (ec)
957 {
958 std::cerr << "CPU 1 VRHot handler error: "
959 << ec.message() << "\n";
960 return;
961 }
962 cpu1VRHotHandler();
963 });
964}
965
Jason M. Billse94f5e12019-09-13 11:11:34 -0700966static void cpu1MemABCDVRHotAssertHandler()
967{
968 cpuVRHotLog("CPU 1 Memory ABCD");
969}
970
Jason M. Bills9647ba72019-08-29 14:19:19 -0700971static void cpu1MemABCDVRHotHandler()
972{
Jason M. Bills84951142020-04-17 15:57:11 -0700973 gpiod::line_event gpioLineEvent = cpu1MemABCDVRHotLine.event_read();
Jason M. Bills9647ba72019-08-29 14:19:19 -0700974
Jason M. Bills84951142020-04-17 15:57:11 -0700975 bool cpu1MemABCDVRHot =
976 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
977 if (cpu1MemABCDVRHot)
978 {
979 cpu1MemABCDVRHotAssertHandler();
Jason M. Bills9647ba72019-08-29 14:19:19 -0700980 }
Jason M. Bills84951142020-04-17 15:57:11 -0700981
Jason M. Bills9647ba72019-08-29 14:19:19 -0700982 cpu1MemABCDVRHotEvent.async_wait(
983 boost::asio::posix::stream_descriptor::wait_read,
984 [](const boost::system::error_code ec) {
985 if (ec)
986 {
987 std::cerr << "CPU 1 Memory ABCD VRHot handler error: "
988 << ec.message() << "\n";
989 return;
990 }
991 cpu1MemABCDVRHotHandler();
992 });
993}
994
Jason M. Billse94f5e12019-09-13 11:11:34 -0700995static void cpu1MemEFGHVRHotAssertHandler()
996{
997 cpuVRHotLog("CPU 1 Memory EFGH");
998}
999
Jason M. Bills9647ba72019-08-29 14:19:19 -07001000static void cpu1MemEFGHVRHotHandler()
1001{
Jason M. Bills84951142020-04-17 15:57:11 -07001002 gpiod::line_event gpioLineEvent = cpu1MemEFGHVRHotLine.event_read();
Jason M. Bills9647ba72019-08-29 14:19:19 -07001003
Jason M. Bills84951142020-04-17 15:57:11 -07001004 bool cpu1MemEFGHVRHot =
1005 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
1006 if (cpu1MemEFGHVRHot)
1007 {
1008 cpu1MemEFGHVRHotAssertHandler();
Jason M. Bills9647ba72019-08-29 14:19:19 -07001009 }
Jason M. Bills84951142020-04-17 15:57:11 -07001010
Jason M. Bills9647ba72019-08-29 14:19:19 -07001011 cpu1MemEFGHVRHotEvent.async_wait(
1012 boost::asio::posix::stream_descriptor::wait_read,
1013 [](const boost::system::error_code ec) {
1014 if (ec)
1015 {
1016 std::cerr << "CPU 1 Memory EFGH VRHot handler error: "
1017 << ec.message() << "\n";
1018 return;
1019 }
1020 cpu1MemEFGHVRHotHandler();
1021 });
1022}
1023
Jason M. Billse94f5e12019-09-13 11:11:34 -07001024static void cpu2VRHotAssertHandler()
1025{
1026 cpuVRHotLog("CPU 2");
1027}
1028
Jason M. Bills250fa632019-08-28 15:58:25 -07001029static void cpu2VRHotHandler()
1030{
Jason M. Bills84951142020-04-17 15:57:11 -07001031 gpiod::line_event gpioLineEvent = cpu2VRHotLine.event_read();
Jason M. Bills250fa632019-08-28 15:58:25 -07001032
Jason M. Bills84951142020-04-17 15:57:11 -07001033 bool cpu2VRHot =
1034 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
1035 if (cpu2VRHot)
1036 {
1037 cpu2VRHotAssertHandler();
Jason M. Bills250fa632019-08-28 15:58:25 -07001038 }
Jason M. Bills84951142020-04-17 15:57:11 -07001039
Jason M. Bills250fa632019-08-28 15:58:25 -07001040 cpu2VRHotEvent.async_wait(boost::asio::posix::stream_descriptor::wait_read,
1041 [](const boost::system::error_code ec) {
1042 if (ec)
1043 {
1044 std::cerr << "CPU 2 VRHot handler error: "
1045 << ec.message() << "\n";
1046 return;
1047 }
1048 cpu2VRHotHandler();
1049 });
1050}
1051
Jason M. Billse94f5e12019-09-13 11:11:34 -07001052static void cpu2MemABCDVRHotAssertHandler()
1053{
1054 cpuVRHotLog("CPU 2 Memory ABCD");
1055}
1056
Jason M. Bills9647ba72019-08-29 14:19:19 -07001057static void cpu2MemABCDVRHotHandler()
1058{
Jason M. Bills84951142020-04-17 15:57:11 -07001059 gpiod::line_event gpioLineEvent = cpu2MemABCDVRHotLine.event_read();
Jason M. Bills9647ba72019-08-29 14:19:19 -07001060
Jason M. Bills84951142020-04-17 15:57:11 -07001061 bool cpu2MemABCDVRHot =
1062 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
1063 if (cpu2MemABCDVRHot)
1064 {
1065 cpu2MemABCDVRHotAssertHandler();
Jason M. Bills9647ba72019-08-29 14:19:19 -07001066 }
Jason M. Bills84951142020-04-17 15:57:11 -07001067
Jason M. Bills9647ba72019-08-29 14:19:19 -07001068 cpu2MemABCDVRHotEvent.async_wait(
1069 boost::asio::posix::stream_descriptor::wait_read,
1070 [](const boost::system::error_code ec) {
1071 if (ec)
1072 {
1073 std::cerr << "CPU 2 Memory ABCD VRHot handler error: "
1074 << ec.message() << "\n";
1075 return;
1076 }
1077 cpu2MemABCDVRHotHandler();
1078 });
1079}
1080
Jason M. Billse94f5e12019-09-13 11:11:34 -07001081static void cpu2MemEFGHVRHotAssertHandler()
1082{
1083 cpuVRHotLog("CPU 2 Memory EFGH");
1084}
1085
Jason M. Bills9647ba72019-08-29 14:19:19 -07001086static void cpu2MemEFGHVRHotHandler()
1087{
Jason M. Bills84951142020-04-17 15:57:11 -07001088 gpiod::line_event gpioLineEvent = cpu2MemEFGHVRHotLine.event_read();
Jason M. Bills9647ba72019-08-29 14:19:19 -07001089
Jason M. Bills84951142020-04-17 15:57:11 -07001090 bool cpu2MemEFGHVRHot =
1091 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
1092 if (cpu2MemEFGHVRHot)
1093 {
1094 cpu2MemEFGHVRHotAssertHandler();
Jason M. Bills9647ba72019-08-29 14:19:19 -07001095 }
Jason M. Bills84951142020-04-17 15:57:11 -07001096
Jason M. Bills9647ba72019-08-29 14:19:19 -07001097 cpu2MemEFGHVRHotEvent.async_wait(
1098 boost::asio::posix::stream_descriptor::wait_read,
1099 [](const boost::system::error_code ec) {
1100 if (ec)
1101 {
1102 std::cerr << "CPU 2 Memory EFGH VRHot handler error: "
1103 << ec.message() << "\n";
1104 return;
1105 }
1106 cpu2MemEFGHVRHotHandler();
1107 });
1108}
1109
Chen Yugange6c0f1c2019-08-02 20:36:42 +08001110static void pchThermtripHandler()
1111{
Yong Li1429ca82020-04-27 16:49:45 +08001112 std::vector<Association> associations;
1113
Jason M. Bills84951142020-04-17 15:57:11 -07001114 gpiod::line_event gpioLineEvent = pchThermtripLine.event_read();
Chen Yugange6c0f1c2019-08-02 20:36:42 +08001115
Jason M. Bills84951142020-04-17 15:57:11 -07001116 bool pchThermtrip =
1117 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
1118 if (pchThermtrip)
1119 {
1120 ssbThermTripLog();
Yong Li1429ca82020-04-27 16:49:45 +08001121 associations.emplace_back(
1122 "", "critical",
1123 "/xyz/openbmc_project/host_error_monitor/ssb_thermal_trip");
1124 associations.emplace_back("", "critical", host_error_monitor::rootPath);
Chen Yugange6c0f1c2019-08-02 20:36:42 +08001125 }
Yong Li1429ca82020-04-27 16:49:45 +08001126 else
1127 {
1128 associations.emplace_back("", "", "");
1129 }
1130 host_error_monitor::associationSSBThermTrip->set_property("Associations",
1131 associations);
Jason M. Bills84951142020-04-17 15:57:11 -07001132
Chen Yugange6c0f1c2019-08-02 20:36:42 +08001133 pchThermtripEvent.async_wait(
1134 boost::asio::posix::stream_descriptor::wait_read,
1135 [](const boost::system::error_code ec) {
1136 if (ec)
1137 {
1138 std::cerr << "PCH Thermal trip handler error: " << ec.message()
1139 << "\n";
1140 return;
1141 }
1142 pchThermtripHandler();
1143 });
1144}
1145
Jason M. Billscbf78532019-08-16 15:32:11 -07001146static std::bitset<MAX_CPUS> checkERRPinCPUs(const int errPin)
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001147{
Jason M. Billscbf78532019-08-16 15:32:11 -07001148 int errPinSts = (1 << errPin);
1149 std::bitset<MAX_CPUS> errPinCPUs = 0;
Jason M. Bills8064f8b2020-09-22 15:09:49 -07001150 for (size_t cpu = 0, addr = MIN_CLIENT_ADDR; addr <= MAX_CLIENT_ADDR;
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001151 cpu++, addr++)
1152 {
1153 if (peci_Ping(addr) == PECI_CC_SUCCESS)
1154 {
Jason M. Bills8064f8b2020-09-22 15:09:49 -07001155 EPECIStatus peciStatus = PECI_CC_SUCCESS;
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001156 uint8_t cc = 0;
1157 CPUModel model{};
1158 uint8_t stepping = 0;
1159 if (peci_GetCPUID(addr, &model, &stepping, &cc) != PECI_CC_SUCCESS)
1160 {
1161 std::cerr << "Cannot get CPUID!\n";
1162 continue;
1163 }
1164
1165 switch (model)
1166 {
1167 case skx:
1168 {
1169 // Check the ERRPINSTS to see if this is the CPU that caused
Jason M. Billscbf78532019-08-16 15:32:11 -07001170 // the ERRx (B(0) D8 F0 offset 210h)
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001171 uint32_t errpinsts = 0;
Jason M. Bills8064f8b2020-09-22 15:09:49 -07001172 peciStatus = peci_RdPCIConfigLocal(
1173 addr, 0, 8, 0, 0x210, sizeof(uint32_t),
1174 (uint8_t*)&errpinsts, &cc);
1175 if (peciError(peciStatus, cc))
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001176 {
Jason M. Bills8064f8b2020-09-22 15:09:49 -07001177 printPECIError("ERRPINSTS", addr, peciStatus, cc);
1178 continue;
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001179 }
Jason M. Bills8064f8b2020-09-22 15:09:49 -07001180
1181 errPinCPUs[cpu] = (errpinsts & errPinSts) != 0;
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001182 break;
1183 }
1184 case icx:
1185 {
1186 // Check the ERRPINSTS to see if this is the CPU that caused
Jason M. Billscbf78532019-08-16 15:32:11 -07001187 // the ERRx (B(30) D0 F3 offset 274h) (Note: Bus 30 is
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001188 // accessed on PECI as bus 13)
1189 uint32_t errpinsts = 0;
Jason M. Bills8064f8b2020-09-22 15:09:49 -07001190 peciStatus = peci_RdEndPointConfigPciLocal(
1191 addr, 0, 13, 0, 3, 0x274, sizeof(uint32_t),
1192 (uint8_t*)&errpinsts, &cc);
1193 if (peciError(peciStatus, cc))
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001194 {
Jason M. Bills8064f8b2020-09-22 15:09:49 -07001195 printPECIError("ERRPINSTS", addr, peciStatus, cc);
1196 continue;
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001197 }
Jason M. Bills8064f8b2020-09-22 15:09:49 -07001198
1199 errPinCPUs[cpu] = (errpinsts & errPinSts) != 0;
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001200 break;
1201 }
1202 }
1203 }
1204 }
Jason M. Billscbf78532019-08-16 15:32:11 -07001205 return errPinCPUs;
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001206}
1207
Jason M. Billscbf78532019-08-16 15:32:11 -07001208static void errXAssertHandler(const int errPin,
1209 boost::asio::steady_timer& errXAssertTimer)
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001210{
Jason M. Billscbf78532019-08-16 15:32:11 -07001211 // ERRx status is not guaranteed through the timeout, so save which
1212 // CPUs have it asserted
1213 std::bitset<MAX_CPUS> errPinCPUs = checkERRPinCPUs(errPin);
1214 errXAssertTimer.expires_after(std::chrono::milliseconds(errTimeoutMs));
1215 errXAssertTimer.async_wait([errPin, errPinCPUs](
1216 const boost::system::error_code ec) {
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001217 if (ec)
1218 {
1219 // operation_aborted is expected if timer is canceled before
1220 // completion.
1221 if (ec != boost::asio::error::operation_aborted)
1222 {
1223 std::cerr << "err2 timeout async_wait failed: " << ec.message()
1224 << "\n";
1225 }
1226 return;
1227 }
Jason M. Billscbf78532019-08-16 15:32:11 -07001228 std::cerr << "ERR" << std::to_string(errPin) << " asserted for "
1229 << std::to_string(errTimeoutMs) << " ms\n";
1230 if (errPinCPUs.count())
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001231 {
Jason M. Billscbf78532019-08-16 15:32:11 -07001232 for (int i = 0; i < errPinCPUs.size(); i++)
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001233 {
Jason M. Billscbf78532019-08-16 15:32:11 -07001234 if (errPinCPUs[i])
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001235 {
Jason M. Billscbf78532019-08-16 15:32:11 -07001236 cpuERRXLog(errPin, i);
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001237 }
1238 }
1239 }
1240 else
1241 {
Jason M. Billscbf78532019-08-16 15:32:11 -07001242 cpuERRXLog(errPin);
1243 }
1244 });
1245}
1246
Jason M. Bills8c584392019-08-19 11:05:51 -07001247static void err0AssertHandler()
1248{
1249 // Handle the standard ERR0 detection and logging
1250 const static constexpr int err0 = 0;
1251 errXAssertHandler(err0, err0AssertTimer);
1252}
1253
1254static void err0Handler()
1255{
1256 if (!hostOff)
1257 {
1258 gpiod::line_event gpioLineEvent = err0Line.event_read();
1259
1260 bool err0 = gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
1261 if (err0)
1262 {
1263 err0AssertHandler();
1264 }
1265 else
1266 {
1267 err0AssertTimer.cancel();
1268 }
1269 }
1270 err0Event.async_wait(boost::asio::posix::stream_descriptor::wait_read,
1271 [](const boost::system::error_code ec) {
1272 if (ec)
1273 {
1274 std::cerr
1275 << "err0 handler error: " << ec.message()
1276 << "\n";
1277 return;
1278 }
1279 err0Handler();
1280 });
1281}
1282
Jason M. Bills75af3962019-08-19 11:07:17 -07001283static void err1AssertHandler()
1284{
1285 // Handle the standard ERR1 detection and logging
1286 const static constexpr int err1 = 1;
1287 errXAssertHandler(err1, err1AssertTimer);
1288}
1289
1290static void err1Handler()
1291{
1292 if (!hostOff)
1293 {
1294 gpiod::line_event gpioLineEvent = err1Line.event_read();
1295
1296 bool err1 = gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
1297 if (err1)
1298 {
1299 err1AssertHandler();
1300 }
1301 else
1302 {
1303 err1AssertTimer.cancel();
1304 }
1305 }
1306 err1Event.async_wait(boost::asio::posix::stream_descriptor::wait_read,
1307 [](const boost::system::error_code ec) {
1308 if (ec)
1309 {
1310 std::cerr
1311 << "err1 handler error: " << ec.message()
1312 << "\n";
1313 return;
1314 }
1315 err1Handler();
1316 });
1317}
1318
Jason M. Billscbf78532019-08-16 15:32:11 -07001319static void err2AssertHandler()
1320{
1321 // Handle the standard ERR2 detection and logging
1322 const static constexpr int err2 = 2;
1323 errXAssertHandler(err2, err2AssertTimer);
1324 // Also handle reset for ERR2
1325 err2AssertTimer.async_wait([](const boost::system::error_code ec) {
1326 if (ec)
1327 {
1328 // operation_aborted is expected if timer is canceled before
1329 // completion.
1330 if (ec != boost::asio::error::operation_aborted)
1331 {
1332 std::cerr << "err2 timeout async_wait failed: " << ec.message()
1333 << "\n";
1334 }
1335 return;
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001336 }
1337 conn->async_method_call(
1338 [](boost::system::error_code ec,
1339 const std::variant<bool>& property) {
1340 if (ec)
1341 {
1342 return;
1343 }
1344 const bool* reset = std::get_if<bool>(&property);
1345 if (reset == nullptr)
1346 {
1347 std::cerr << "Unable to read reset on ERR2 value\n";
1348 return;
1349 }
Jason M. Billsb61766b2019-11-26 17:02:44 -08001350 startCrashdumpAndRecovery(*reset, "ERR2 Timeout");
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001351 },
1352 "xyz.openbmc_project.Settings",
1353 "/xyz/openbmc_project/control/processor_error_config",
1354 "org.freedesktop.DBus.Properties", "Get",
1355 "xyz.openbmc_project.Control.Processor.ErrConfig", "ResetOnERR2");
Yong Li061eb032020-02-26 15:06:18 +08001356
1357 beep(beepCPUErr2);
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001358 });
1359}
1360
1361static void err2Handler()
1362{
1363 if (!hostOff)
1364 {
1365 gpiod::line_event gpioLineEvent = err2Line.event_read();
1366
1367 bool err2 = gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
1368 if (err2)
1369 {
1370 err2AssertHandler();
1371 }
1372 else
1373 {
1374 err2AssertTimer.cancel();
1375 }
1376 }
1377 err2Event.async_wait(boost::asio::posix::stream_descriptor::wait_read,
1378 [](const boost::system::error_code ec) {
1379 if (ec)
1380 {
1381 std::cerr
1382 << "err2 handler error: " << ec.message()
1383 << "\n";
1384 return;
1385 }
1386 err2Handler();
1387 });
1388}
1389
Jason M. Bills89922f82019-08-06 11:10:02 -07001390static void smiAssertHandler()
1391{
1392 smiAssertTimer.expires_after(std::chrono::milliseconds(smiTimeoutMs));
1393 smiAssertTimer.async_wait([](const boost::system::error_code ec) {
1394 if (ec)
1395 {
1396 // operation_aborted is expected if timer is canceled before
1397 // completion.
1398 if (ec != boost::asio::error::operation_aborted)
1399 {
1400 std::cerr << "smi timeout async_wait failed: " << ec.message()
1401 << "\n";
1402 }
1403 return;
1404 }
1405 std::cerr << "SMI asserted for " << std::to_string(smiTimeoutMs)
1406 << " ms\n";
1407 smiTimeoutLog();
1408 conn->async_method_call(
1409 [](boost::system::error_code ec,
1410 const std::variant<bool>& property) {
1411 if (ec)
1412 {
1413 return;
1414 }
1415 const bool* reset = std::get_if<bool>(&property);
1416 if (reset == nullptr)
1417 {
1418 std::cerr << "Unable to read reset on SMI value\n";
1419 return;
1420 }
Jason M. Bills94785442020-01-07 15:22:09 -08001421#ifdef HOST_ERROR_CRASHDUMP_ON_SMI_TIMEOUT
Jason M. Billsb61766b2019-11-26 17:02:44 -08001422 startCrashdumpAndRecovery(*reset, "SMI Timeout");
Jason M. Bills94785442020-01-07 15:22:09 -08001423#else
1424 if (*reset)
1425 {
Jason M. Billsd69549b2020-08-27 11:42:43 -07001426 std::cerr << "Recovering the system\n";
Jason M. Bills9a9bf982020-08-10 11:58:18 -07001427 startWarmReset();
Jason M. Bills94785442020-01-07 15:22:09 -08001428 }
1429#endif
Jason M. Bills89922f82019-08-06 11:10:02 -07001430 },
1431 "xyz.openbmc_project.Settings",
1432 "/xyz/openbmc_project/control/bmc_reset_disables",
1433 "org.freedesktop.DBus.Properties", "Get",
1434 "xyz.openbmc_project.Control.ResetDisables", "ResetOnSMI");
1435 });
1436}
1437
1438static void smiHandler()
1439{
1440 if (!hostOff)
1441 {
1442 gpiod::line_event gpioLineEvent = smiLine.event_read();
1443
1444 bool smi = gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
1445 if (smi)
1446 {
1447 smiAssertHandler();
1448 }
1449 else
1450 {
1451 smiAssertTimer.cancel();
1452 }
1453 }
1454 smiEvent.async_wait(boost::asio::posix::stream_descriptor::wait_read,
1455 [](const boost::system::error_code ec) {
1456 if (ec)
1457 {
1458 std::cerr
1459 << "smi handler error: " << ec.message()
1460 << "\n";
1461 return;
1462 }
1463 smiHandler();
1464 });
1465}
1466
Jason M. Billsa15c2522019-08-16 10:01:44 -07001467static void initializeErrorState()
1468{
jayaprakash Mutyala53099c42020-03-15 00:16:26 +00001469 // Handle CPU1_MISMATCH if it's asserted now
1470 if (cpu1MismatchLine.get_value() == 1)
1471 {
1472 cpuMismatchLog(1);
1473 }
1474
1475 // Handle CPU2_MISMATCH if it's asserted now
1476 if (cpu2MismatchLine.get_value() == 1)
1477 {
1478 cpuMismatchLog(2);
1479 }
1480
Jason M. Billsa15c2522019-08-16 10:01:44 -07001481 // Handle CPU_CATERR if it's asserted now
1482 if (caterrLine.get_value() == 0)
1483 {
1484 caterrAssertHandler();
Yong Li1429ca82020-04-27 16:49:45 +08001485 std::vector<Association> associations;
1486 associations.emplace_back(
1487 "", "critical", "/xyz/openbmc_project/host_error_monitor/cat_err");
1488 associations.emplace_back("", "critical", host_error_monitor::rootPath);
1489 host_error_monitor::associationCATAssert->set_property("Associations",
1490 associations);
Jason M. Billsa15c2522019-08-16 10:01:44 -07001491 }
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001492
Jason M. Bills8c584392019-08-19 11:05:51 -07001493 // Handle CPU_ERR0 if it's asserted now
1494 if (err0Line.get_value() == 0)
1495 {
1496 err0AssertHandler();
1497 }
1498
Jason M. Bills75af3962019-08-19 11:07:17 -07001499 // Handle CPU_ERR1 if it's asserted now
1500 if (err1Line.get_value() == 0)
1501 {
1502 err1AssertHandler();
1503 }
1504
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001505 // Handle CPU_ERR2 if it's asserted now
1506 if (err2Line.get_value() == 0)
1507 {
1508 err2AssertHandler();
1509 }
Jason M. Bills89922f82019-08-06 11:10:02 -07001510
1511 // Handle SMI if it's asserted now
1512 if (smiLine.get_value() == 0)
1513 {
1514 smiAssertHandler();
1515 }
Jason M. Bills08866542019-08-16 12:04:19 -07001516
Jason M. Billse94f5e12019-09-13 11:11:34 -07001517 // Handle CPU1_THERMTRIP if it's asserted now
1518 if (cpu1ThermtripLine.get_value() == 0)
1519 {
1520 cpu1ThermtripAssertHandler();
1521 }
1522
1523 // Handle CPU2_THERMTRIP if it's asserted now
1524 if (cpu2ThermtripLine.get_value() == 0)
1525 {
1526 cpu2ThermtripAssertHandler();
1527 }
1528
jayaprakash Mutyala009adbc2019-12-24 22:08:07 +00001529 // Handle CPU1_MEM_THERM_EVENT (CPU1 DIMM Thermal trip) if it's asserted now
1530 if (cpu1MemtripLine.get_value() == 0)
1531 {
1532 memThermTripLog(1);
1533 }
1534
1535 // Handle CPU2_MEM_THERM_EVENT (CPU2 DIMM Thermal trip) if it's asserted now
1536 if (cpu2MemtripLine.get_value() == 0)
1537 {
1538 memThermTripLog(2);
1539 }
1540
Jason M. Billse94f5e12019-09-13 11:11:34 -07001541 // Handle CPU1_VRHOT if it's asserted now
1542 if (cpu1VRHotLine.get_value() == 0)
1543 {
1544 cpu1VRHotAssertHandler();
1545 }
1546
1547 // Handle CPU1_MEM_ABCD_VRHOT if it's asserted now
1548 if (cpu1MemABCDVRHotLine.get_value() == 0)
1549 {
1550 cpu1MemABCDVRHotAssertHandler();
1551 }
1552
1553 // Handle CPU1_MEM_EFGH_VRHOT if it's asserted now
1554 if (cpu1MemEFGHVRHotLine.get_value() == 0)
1555 {
1556 cpu1MemEFGHVRHotAssertHandler();
1557 }
1558
1559 // Handle CPU2_VRHOT if it's asserted now
1560 if (cpu2VRHotLine.get_value() == 0)
1561 {
1562 cpu2VRHotAssertHandler();
1563 }
1564
1565 // Handle CPU2_MEM_ABCD_VRHOT if it's asserted now
1566 if (cpu2MemABCDVRHotLine.get_value() == 0)
1567 {
1568 cpu2MemABCDVRHotAssertHandler();
1569 }
1570
1571 // Handle CPU2_MEM_EFGH_VRHOT if it's asserted now
1572 if (cpu2MemEFGHVRHotLine.get_value() == 0)
1573 {
1574 cpu2MemEFGHVRHotAssertHandler();
1575 }
1576
Jason M. Bills08866542019-08-16 12:04:19 -07001577 // Handle PCH_BMC_THERMTRIP if it's asserted now
1578 if (pchThermtripLine.get_value() == 0)
1579 {
1580 ssbThermTripLog();
Yong Li1429ca82020-04-27 16:49:45 +08001581 std::vector<Association> associations;
1582 associations.emplace_back(
1583 "", "critical",
1584 "/xyz/openbmc_project/host_error_monitor/ssb_thermal_trip");
1585 associations.emplace_back("", "critical", host_error_monitor::rootPath);
1586 host_error_monitor::associationSSBThermTrip->set_property(
1587 "Associations", associations);
Jason M. Bills08866542019-08-16 12:04:19 -07001588 }
Jason M. Billsa15c2522019-08-16 10:01:44 -07001589}
Jason M. Bills1490b142019-07-01 15:48:43 -07001590} // namespace host_error_monitor
1591
1592int main(int argc, char* argv[])
1593{
1594 // setup connection to dbus
1595 host_error_monitor::conn =
1596 std::make_shared<sdbusplus::asio::connection>(host_error_monitor::io);
1597
Jason M. Billsc4b91f22019-11-26 17:04:50 -08001598 // Host Error Monitor Service
Jason M. Bills1490b142019-07-01 15:48:43 -07001599 host_error_monitor::conn->request_name(
1600 "xyz.openbmc_project.HostErrorMonitor");
1601 sdbusplus::asio::object_server server =
1602 sdbusplus::asio::object_server(host_error_monitor::conn);
1603
Yong Li1429ca82020-04-27 16:49:45 +08001604 // Associations interface for led status
1605 std::vector<host_error_monitor::Association> associations;
1606 associations.emplace_back("", "", "");
1607 host_error_monitor::associationSSBThermTrip = server.add_interface(
1608 "/xyz/openbmc_project/host_error_monitor/ssb_thermal_trip",
1609 "xyz.openbmc_project.Association.Definitions");
1610 host_error_monitor::associationSSBThermTrip->register_property(
1611 "Associations", associations);
1612 host_error_monitor::associationSSBThermTrip->initialize();
1613
1614 host_error_monitor::associationCATAssert = server.add_interface(
1615 "/xyz/openbmc_project/host_error_monitor/cat_assert",
1616 "xyz.openbmc_project.Association.Definitions");
1617 host_error_monitor::associationCATAssert->register_property("Associations",
1618 associations);
1619 host_error_monitor::associationCATAssert->initialize();
1620
Jason M. Billsc4b91f22019-11-26 17:04:50 -08001621 // Restart Cause Interface
1622 host_error_monitor::hostErrorTimeoutIface =
1623 server.add_interface("/xyz/openbmc_project/host_error_monitor",
1624 "xyz.openbmc_project.HostErrorMonitor.Timeout");
1625
1626 host_error_monitor::hostErrorTimeoutIface->register_property(
1627 "IERRTimeoutMs", host_error_monitor::caterrTimeoutMs,
1628 [](const std::size_t& requested, std::size_t& resp) {
1629 if (requested > host_error_monitor::caterrTimeoutMsMax)
1630 {
1631 std::cerr << "IERRTimeoutMs update to " << requested
1632 << "ms rejected. Cannot be greater than "
1633 << host_error_monitor::caterrTimeoutMsMax << "ms.\n";
1634 return 0;
1635 }
1636 std::cerr << "IERRTimeoutMs updated to " << requested << "ms\n";
1637 host_error_monitor::caterrTimeoutMs = requested;
1638 resp = requested;
1639 return 1;
1640 },
1641 [](std::size_t& resp) { return host_error_monitor::caterrTimeoutMs; });
1642 host_error_monitor::hostErrorTimeoutIface->initialize();
1643
Jason M. Bills1490b142019-07-01 15:48:43 -07001644 // Start tracking host state
1645 std::shared_ptr<sdbusplus::bus::match::match> hostStateMonitor =
1646 host_error_monitor::startHostStateMonitor();
1647
jayaprakash Mutyala53099c42020-03-15 00:16:26 +00001648 // Request CPU1_MISMATCH GPIO events
1649 if (!host_error_monitor::requestGPIOInput(
1650 "CPU1_MISMATCH", host_error_monitor::cpu1MismatchLine))
1651 {
1652 return -1;
1653 }
1654
1655 // Request CPU2_MISMATCH GPIO events
1656 if (!host_error_monitor::requestGPIOInput(
1657 "CPU2_MISMATCH", host_error_monitor::cpu2MismatchLine))
1658 {
1659 return -1;
1660 }
1661
Jason M. Bills1490b142019-07-01 15:48:43 -07001662 // Initialize the host state
1663 host_error_monitor::initializeHostState();
1664
1665 // Request CPU_CATERR GPIO events
1666 if (!host_error_monitor::requestGPIOEvents(
1667 "CPU_CATERR", host_error_monitor::caterrHandler,
1668 host_error_monitor::caterrLine, host_error_monitor::caterrEvent))
1669 {
1670 return -1;
1671 }
1672
Jason M. Bills8c584392019-08-19 11:05:51 -07001673 // Request CPU_ERR0 GPIO events
1674 if (!host_error_monitor::requestGPIOEvents(
1675 "CPU_ERR0", host_error_monitor::err0Handler,
1676 host_error_monitor::err0Line, host_error_monitor::err0Event))
1677 {
1678 return -1;
1679 }
1680
Jason M. Bills75af3962019-08-19 11:07:17 -07001681 // Request CPU_ERR1 GPIO events
1682 if (!host_error_monitor::requestGPIOEvents(
1683 "CPU_ERR1", host_error_monitor::err1Handler,
1684 host_error_monitor::err1Line, host_error_monitor::err1Event))
1685 {
1686 return -1;
1687 }
1688
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001689 // Request CPU_ERR2 GPIO events
1690 if (!host_error_monitor::requestGPIOEvents(
1691 "CPU_ERR2", host_error_monitor::err2Handler,
1692 host_error_monitor::err2Line, host_error_monitor::err2Event))
1693 {
1694 return -1;
1695 }
1696
Jason M. Bills89922f82019-08-06 11:10:02 -07001697 // Request SMI GPIO events
1698 if (!host_error_monitor::requestGPIOEvents(
1699 "SMI", host_error_monitor::smiHandler, host_error_monitor::smiLine,
1700 host_error_monitor::smiEvent))
1701 {
1702 return -1;
1703 }
1704
Jason M. Bills45e87e02019-09-09 14:45:38 -07001705 // Request CPU1_FIVR_FAULT GPIO input
1706 if (!host_error_monitor::requestGPIOInput(
1707 "CPU1_FIVR_FAULT", host_error_monitor::cpu1FIVRFaultLine))
1708 {
1709 return -1;
1710 }
1711
Jason M. Bills78c5eed2019-08-28 14:00:40 -07001712 // Request CPU1_THERMTRIP GPIO events
1713 if (!host_error_monitor::requestGPIOEvents(
1714 "CPU1_THERMTRIP", host_error_monitor::cpu1ThermtripHandler,
1715 host_error_monitor::cpu1ThermtripLine,
1716 host_error_monitor::cpu1ThermtripEvent))
1717 {
1718 return -1;
1719 }
1720
Jason M. Bills45e87e02019-09-09 14:45:38 -07001721 // Request CPU2_FIVR_FAULT GPIO input
1722 if (!host_error_monitor::requestGPIOInput(
1723 "CPU2_FIVR_FAULT", host_error_monitor::cpu2FIVRFaultLine))
1724 {
1725 return -1;
1726 }
1727
Jason M. Bills78c5eed2019-08-28 14:00:40 -07001728 // Request CPU2_THERMTRIP GPIO events
1729 if (!host_error_monitor::requestGPIOEvents(
1730 "CPU2_THERMTRIP", host_error_monitor::cpu2ThermtripHandler,
1731 host_error_monitor::cpu2ThermtripLine,
1732 host_error_monitor::cpu2ThermtripEvent))
1733 {
1734 return -1;
1735 }
1736
Jason M. Bills250fa632019-08-28 15:58:25 -07001737 // Request CPU1_VRHOT GPIO events
1738 if (!host_error_monitor::requestGPIOEvents(
1739 "CPU1_VRHOT", host_error_monitor::cpu1VRHotHandler,
1740 host_error_monitor::cpu1VRHotLine,
1741 host_error_monitor::cpu1VRHotEvent))
1742 {
1743 return -1;
1744 }
1745
Jason M. Bills9647ba72019-08-29 14:19:19 -07001746 // Request CPU1_MEM_ABCD_VRHOT GPIO events
1747 if (!host_error_monitor::requestGPIOEvents(
1748 "CPU1_MEM_ABCD_VRHOT", host_error_monitor::cpu1MemABCDVRHotHandler,
1749 host_error_monitor::cpu1MemABCDVRHotLine,
1750 host_error_monitor::cpu1MemABCDVRHotEvent))
1751 {
1752 return -1;
1753 }
1754
1755 // Request CPU1_MEM_EFGH_VRHOT GPIO events
1756 if (!host_error_monitor::requestGPIOEvents(
1757 "CPU1_MEM_EFGH_VRHOT", host_error_monitor::cpu1MemEFGHVRHotHandler,
1758 host_error_monitor::cpu1MemEFGHVRHotLine,
1759 host_error_monitor::cpu1MemEFGHVRHotEvent))
1760 {
1761 return -1;
1762 }
1763
Jason M. Bills250fa632019-08-28 15:58:25 -07001764 // Request CPU2_VRHOT GPIO events
1765 if (!host_error_monitor::requestGPIOEvents(
1766 "CPU2_VRHOT", host_error_monitor::cpu2VRHotHandler,
1767 host_error_monitor::cpu2VRHotLine,
1768 host_error_monitor::cpu2VRHotEvent))
1769 {
1770 return -1;
1771 }
1772
Jason M. Bills9647ba72019-08-29 14:19:19 -07001773 // Request CPU2_MEM_ABCD_VRHOT GPIO events
1774 if (!host_error_monitor::requestGPIOEvents(
1775 "CPU2_MEM_ABCD_VRHOT", host_error_monitor::cpu2MemABCDVRHotHandler,
1776 host_error_monitor::cpu2MemABCDVRHotLine,
1777 host_error_monitor::cpu2MemABCDVRHotEvent))
1778 {
1779 return -1;
1780 }
1781
1782 // Request CPU2_MEM_EFGH_VRHOT GPIO events
1783 if (!host_error_monitor::requestGPIOEvents(
1784 "CPU2_MEM_EFGH_VRHOT", host_error_monitor::cpu2MemEFGHVRHotHandler,
1785 host_error_monitor::cpu2MemEFGHVRHotLine,
1786 host_error_monitor::cpu2MemEFGHVRHotEvent))
1787 {
1788 return -1;
1789 }
1790
Chen Yugange6c0f1c2019-08-02 20:36:42 +08001791 // Request PCH_BMC_THERMTRIP GPIO events
1792 if (!host_error_monitor::requestGPIOEvents(
1793 "PCH_BMC_THERMTRIP", host_error_monitor::pchThermtripHandler,
1794 host_error_monitor::pchThermtripLine,
1795 host_error_monitor::pchThermtripEvent))
1796 {
1797 return -1;
1798 }
1799
jayaprakash Mutyala009adbc2019-12-24 22:08:07 +00001800 // Request CPU1_MEM_THERM_EVENT GPIO events
1801 if (!host_error_monitor::requestGPIOEvents(
1802 "CPU1_MEM_THERM_EVENT", host_error_monitor::cpu1MemtripHandler,
1803 host_error_monitor::cpu1MemtripLine,
1804 host_error_monitor::cpu1MemtripEvent))
1805 {
1806 return -1;
1807 }
1808
1809 // Request CPU2_MEM_THERM_EVENT GPIO events
1810 if (!host_error_monitor::requestGPIOEvents(
1811 "CPU2_MEM_THERM_EVENT", host_error_monitor::cpu2MemtripHandler,
1812 host_error_monitor::cpu2MemtripLine,
1813 host_error_monitor::cpu2MemtripEvent))
1814 {
1815 return -1;
1816 }
1817
Jason M. Bills1490b142019-07-01 15:48:43 -07001818 host_error_monitor::io.run();
1819
1820 return 0;
1821}