blob: d52a5dc6a2247ede632322fc1ca892f71493d5fe [file] [log] [blame]
Jason M. Bills1490b142019-07-01 15:48:43 -07001/*
2// Copyright (c) 2019 Intel Corporation
3//
4// Licensed under the Apache License, Version 2.0 (the "License");
5// you may not use this file except in compliance with the License.
6// You may obtain a copy of the License at
7//
8// http://www.apache.org/licenses/LICENSE-2.0
9//
10// Unless required by applicable law or agreed to in writing, software
11// distributed under the License is distributed on an "AS IS" BASIS,
12// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13// See the License for the specific language governing permissions and
14// limitations under the License.
15*/
Jason M. Bills6a2cb692019-08-06 11:03:49 -070016#include <peci.h>
Chen Yugange6c0f1c2019-08-02 20:36:42 +080017#include <systemd/sd-journal.h>
18
Jason M. Bills08b2c7a2020-08-28 15:39:14 -070019#include <boost/asio/io_service.hpp>
Jason M. Bills1490b142019-07-01 15:48:43 -070020#include <boost/asio/posix/stream_descriptor.hpp>
Jason M. Bills08b2c7a2020-08-28 15:39:14 -070021#include <boost/asio/steady_timer.hpp>
Jason M. Bills1490b142019-07-01 15:48:43 -070022#include <gpiod.hpp>
Jason M. Bills1490b142019-07-01 15:48:43 -070023#include <sdbusplus/asio/object_server.hpp>
Jason M. Bills48e5dff2020-06-10 13:47:47 -070024
25#include <bitset>
26#include <iostream>
Jason M. Billsd1a19f62019-08-06 11:52:58 -070027#include <variant>
Jason M. Bills1490b142019-07-01 15:48:43 -070028
29namespace host_error_monitor
30{
31static boost::asio::io_service io;
32static std::shared_ptr<sdbusplus::asio::connection> conn;
Jason M. Billsc4b91f22019-11-26 17:04:50 -080033static std::shared_ptr<sdbusplus::asio::dbus_interface> hostErrorTimeoutIface;
Jason M. Bills1490b142019-07-01 15:48:43 -070034
Yong Li1429ca82020-04-27 16:49:45 +080035using Association = std::tuple<std::string, std::string, std::string>;
36static std::shared_ptr<sdbusplus::asio::dbus_interface> associationSSBThermTrip;
37static std::shared_ptr<sdbusplus::asio::dbus_interface> associationCATAssert;
38
39static const constexpr char* rootPath = "/xyz/openbmc_project/CallbackManager";
40
Jason M. Bills1490b142019-07-01 15:48:43 -070041static bool hostOff = true;
42
Jason M. Billsc4b91f22019-11-26 17:04:50 -080043static size_t caterrTimeoutMs = 2000;
44const static constexpr size_t caterrTimeoutMsMax = 600000; // 10 minutes maximum
Jason M. Billscbf78532019-08-16 15:32:11 -070045const static constexpr size_t errTimeoutMs = 90000;
Jason M. Bills89922f82019-08-06 11:10:02 -070046const static constexpr size_t smiTimeoutMs = 90000;
Jason M. Bills1490b142019-07-01 15:48:43 -070047
48// Timers
49// Timer for CATERR asserted
50static boost::asio::steady_timer caterrAssertTimer(io);
Jason M. Bills8c584392019-08-19 11:05:51 -070051// Timer for ERR0 asserted
52static boost::asio::steady_timer err0AssertTimer(io);
Jason M. Bills75af3962019-08-19 11:07:17 -070053// Timer for ERR1 asserted
54static boost::asio::steady_timer err1AssertTimer(io);
Jason M. Bills6a2cb692019-08-06 11:03:49 -070055// Timer for ERR2 asserted
56static boost::asio::steady_timer err2AssertTimer(io);
Jason M. Bills89922f82019-08-06 11:10:02 -070057// Timer for SMI asserted
58static boost::asio::steady_timer smiAssertTimer(io);
Jason M. Bills1490b142019-07-01 15:48:43 -070059
60// GPIO Lines and Event Descriptors
61static gpiod::line caterrLine;
62static boost::asio::posix::stream_descriptor caterrEvent(io);
Jason M. Bills8c584392019-08-19 11:05:51 -070063static gpiod::line err0Line;
64static boost::asio::posix::stream_descriptor err0Event(io);
Jason M. Bills75af3962019-08-19 11:07:17 -070065static gpiod::line err1Line;
66static boost::asio::posix::stream_descriptor err1Event(io);
Jason M. Bills6a2cb692019-08-06 11:03:49 -070067static gpiod::line err2Line;
68static boost::asio::posix::stream_descriptor err2Event(io);
Jason M. Bills89922f82019-08-06 11:10:02 -070069static gpiod::line smiLine;
70static boost::asio::posix::stream_descriptor smiEvent(io);
Jason M. Bills45e87e02019-09-09 14:45:38 -070071static gpiod::line cpu1FIVRFaultLine;
Jason M. Bills78c5eed2019-08-28 14:00:40 -070072static gpiod::line cpu1ThermtripLine;
73static boost::asio::posix::stream_descriptor cpu1ThermtripEvent(io);
Jason M. Bills45e87e02019-09-09 14:45:38 -070074static gpiod::line cpu2FIVRFaultLine;
Jason M. Bills78c5eed2019-08-28 14:00:40 -070075static gpiod::line cpu2ThermtripLine;
76static boost::asio::posix::stream_descriptor cpu2ThermtripEvent(io);
Jason M. Bills250fa632019-08-28 15:58:25 -070077static gpiod::line cpu1VRHotLine;
78static boost::asio::posix::stream_descriptor cpu1VRHotEvent(io);
79static gpiod::line cpu2VRHotLine;
Jason M. Bills9647ba72019-08-29 14:19:19 -070080static boost::asio::posix::stream_descriptor cpu1MemABCDVRHotEvent(io);
81static gpiod::line cpu1MemEFGHVRHotLine;
82static boost::asio::posix::stream_descriptor cpu1MemEFGHVRHotEvent(io);
83static gpiod::line cpu2MemABCDVRHotLine;
Jason M. Bills250fa632019-08-28 15:58:25 -070084static boost::asio::posix::stream_descriptor cpu2VRHotEvent(io);
Jason M. Bills9647ba72019-08-29 14:19:19 -070085static gpiod::line cpu1MemABCDVRHotLine;
86static boost::asio::posix::stream_descriptor cpu2MemABCDVRHotEvent(io);
87static gpiod::line cpu2MemEFGHVRHotLine;
88static boost::asio::posix::stream_descriptor cpu2MemEFGHVRHotEvent(io);
Chen Yugange6c0f1c2019-08-02 20:36:42 +080089//----------------------------------
90// PCH_BMC_THERMTRIP function related definition
91//----------------------------------
Chen Yugange6c0f1c2019-08-02 20:36:42 +080092static gpiod::line pchThermtripLine;
93static boost::asio::posix::stream_descriptor pchThermtripEvent(io);
jayaprakash Mutyala009adbc2019-12-24 22:08:07 +000094//----------------------------------
95// CPU_MEM_THERM_EVENT function related definition
96//----------------------------------
97static gpiod::line cpu1MemtripLine;
98static boost::asio::posix::stream_descriptor cpu1MemtripEvent(io);
99static gpiod::line cpu2MemtripLine;
100static boost::asio::posix::stream_descriptor cpu2MemtripEvent(io);
jayaprakash Mutyala53099c42020-03-15 00:16:26 +0000101//---------------------------------
102// CPU_MISMATCH function related definition
103//---------------------------------
104static gpiod::line cpu1MismatchLine;
105static gpiod::line cpu2MismatchLine;
Jason M. Bills1490b142019-07-01 15:48:43 -0700106
Yong Li061eb032020-02-26 15:06:18 +0800107// beep function for CPU error
Yong Li8c798c72020-04-22 15:29:07 +0800108const static constexpr uint8_t beepCPUIERR = 4;
Yong Li061eb032020-02-26 15:06:18 +0800109const static constexpr uint8_t beepCPUErr2 = 5;
110
111static void beep(const uint8_t& beepPriority)
112{
113 conn->async_method_call(
114 [](boost::system::error_code ec) {
115 if (ec)
116 {
117 std::cerr << "beep returned error with "
118 "async_method_call (ec = "
119 << ec << ")\n";
120 return;
121 }
122 },
123 "xyz.openbmc_project.BeepCode", "/xyz/openbmc_project/BeepCode",
124 "xyz.openbmc_project.BeepCode", "Beep", uint8_t(beepPriority));
125}
126
Jason M. Billsa3397932019-08-06 11:07:21 -0700127static void cpuIERRLog()
128{
129 sd_journal_send("MESSAGE=HostError: IERR", "PRIORITY=%i", LOG_INFO,
130 "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError",
131 "REDFISH_MESSAGE_ARGS=%s", "IERR", NULL);
132}
133
134static void cpuIERRLog(const int cpuNum)
135{
136 std::string msg = "IERR on CPU " + std::to_string(cpuNum + 1);
137
138 sd_journal_send("MESSAGE=HostError: %s", msg.c_str(), "PRIORITY=%i",
139 LOG_INFO, "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError",
140 "REDFISH_MESSAGE_ARGS=%s", msg.c_str(), NULL);
141}
142
143static void cpuIERRLog(const int cpuNum, const std::string& type)
144{
145 std::string msg = type + " IERR on CPU " + std::to_string(cpuNum + 1);
146
147 sd_journal_send("MESSAGE=HostError: %s", msg.c_str(), "PRIORITY=%i",
148 LOG_INFO, "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError",
149 "REDFISH_MESSAGE_ARGS=%s", msg.c_str(), NULL);
150}
151
Jason M. Billscbf78532019-08-16 15:32:11 -0700152static void cpuERRXLog(const int errPin)
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700153{
Jason M. Billscbf78532019-08-16 15:32:11 -0700154 std::string msg = "ERR" + std::to_string(errPin) + " Timeout";
155
156 sd_journal_send("MESSAGE=HostError: %s", msg.c_str(), "PRIORITY=%i",
157 LOG_INFO, "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError",
158 "REDFISH_MESSAGE_ARGS=%s", msg.c_str(), NULL);
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700159}
160
Jason M. Billscbf78532019-08-16 15:32:11 -0700161static void cpuERRXLog(const int errPin, const int cpuNum)
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700162{
Jason M. Billscbf78532019-08-16 15:32:11 -0700163 std::string msg = "ERR" + std::to_string(errPin) + " Timeout on CPU " +
164 std::to_string(cpuNum + 1);
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700165
166 sd_journal_send("MESSAGE=HostError: %s", msg.c_str(), "PRIORITY=%i",
167 LOG_INFO, "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError",
168 "REDFISH_MESSAGE_ARGS=%s", msg.c_str(), NULL);
169}
170
Jason M. Bills89922f82019-08-06 11:10:02 -0700171static void smiTimeoutLog()
172{
173 sd_journal_send("MESSAGE=HostError: SMI Timeout", "PRIORITY=%i", LOG_INFO,
174 "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError",
175 "REDFISH_MESSAGE_ARGS=%s", "SMI Timeout", NULL);
176}
177
Jason M. Bills45e87e02019-09-09 14:45:38 -0700178static void cpuBootFIVRFaultLog(const int cpuNum)
179{
180 std::string msg = "Boot FIVR Fault on CPU " + std::to_string(cpuNum);
181
182 sd_journal_send("MESSAGE=HostError: %s", msg.c_str(), "PRIORITY=%i",
183 LOG_INFO, "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError",
184 "REDFISH_MESSAGE_ARGS=%s", msg.c_str(), NULL);
185}
186
Jason M. Bills78c5eed2019-08-28 14:00:40 -0700187static void cpuThermTripLog(const int cpuNum)
188{
189 std::string msg = "CPU " + std::to_string(cpuNum) + " thermal trip";
190
191 sd_journal_send("MESSAGE=HostError: %s", msg.c_str(), "PRIORITY=%i",
192 LOG_INFO, "REDFISH_MESSAGE_ID=%s",
193 "OpenBMC.0.1.CPUThermalTrip", "REDFISH_MESSAGE_ARGS=%d",
194 cpuNum, NULL);
195}
196
jayaprakash Mutyala009adbc2019-12-24 22:08:07 +0000197static void memThermTripLog(const int cpuNum)
198{
199 std::string cpuNumber = "CPU " + std::to_string(cpuNum);
200 std::string msg = cpuNumber + " Memory Thermal trip.";
201
202 sd_journal_send("MESSAGE=HostError: %s", msg.c_str(), "PRIORITY=%i",
203 LOG_ERR, "REDFISH_MESSAGE_ID=%s",
204 "OpenBMC.0.1.MemoryThermTrip", "REDFISH_MESSAGE_ARGS=%s",
205 cpuNumber.c_str(), NULL);
206}
207
jayaprakash Mutyala53099c42020-03-15 00:16:26 +0000208static void cpuMismatchLog(const int cpuNum)
209{
210 std::string msg = "CPU " + std::to_string(cpuNum) + " mismatch";
211
212 sd_journal_send("MESSAGE= %s", msg.c_str(), "PRIORITY=%i", LOG_ERR,
213 "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUMismatch",
214 "REDFISH_MESSAGE_ARGS=%d", cpuNum, NULL);
215}
216
Jason M. Bills250fa632019-08-28 15:58:25 -0700217static void cpuVRHotLog(const std::string& vr)
218{
219 std::string msg = vr + " Voltage Regulator Overheated.";
220
221 sd_journal_send("MESSAGE=HostError: %s", msg.c_str(), "PRIORITY=%i",
222 LOG_INFO, "REDFISH_MESSAGE_ID=%s",
223 "OpenBMC.0.1.VoltageRegulatorOverheated",
224 "REDFISH_MESSAGE_ARGS=%s", vr.c_str(), NULL);
225}
226
Jason M. Bills08866542019-08-16 12:04:19 -0700227static void ssbThermTripLog()
228{
229 sd_journal_send("MESSAGE=HostError: SSB thermal trip", "PRIORITY=%i",
230 LOG_INFO, "REDFISH_MESSAGE_ID=%s",
231 "OpenBMC.0.1.SsbThermalTrip", NULL);
232}
233
Jason M. Bills8064f8b2020-09-22 15:09:49 -0700234static inline bool peciError(EPECIStatus peciStatus, uint8_t cc)
235{
236 return (
237 peciStatus != PECI_CC_SUCCESS ||
238 (cc != PECI_DEV_CC_SUCCESS && cc != PECI_DEV_CC_FATAL_MCA_DETECTED));
239}
240
241static void printPECIError(const std::string& reg, const size_t addr,
242 const EPECIStatus peciStatus, const size_t cc)
243{
244 std::cerr << "Failed to read " << reg << " on CPU address " << addr
245 << ". Error: " << peciStatus << ": cc: 0x" << std::hex << cc
246 << "\n";
247}
248
Jason M. Billsa15c2522019-08-16 10:01:44 -0700249static void initializeErrorState();
Jason M. Bills1490b142019-07-01 15:48:43 -0700250static void initializeHostState()
251{
252 conn->async_method_call(
253 [](boost::system::error_code ec,
254 const std::variant<std::string>& property) {
255 if (ec)
256 {
257 return;
258 }
259 const std::string* state = std::get_if<std::string>(&property);
260 if (state == nullptr)
261 {
262 std::cerr << "Unable to read host state value\n";
263 return;
264 }
265 hostOff = *state == "xyz.openbmc_project.State.Host.HostState.Off";
Jason M. Billsa15c2522019-08-16 10:01:44 -0700266 // If the system is on, initialize the error state
267 if (!hostOff)
268 {
269 initializeErrorState();
270 }
Jason M. Bills1490b142019-07-01 15:48:43 -0700271 },
272 "xyz.openbmc_project.State.Host", "/xyz/openbmc_project/state/host0",
273 "org.freedesktop.DBus.Properties", "Get",
274 "xyz.openbmc_project.State.Host", "CurrentHostState");
275}
276
277static std::shared_ptr<sdbusplus::bus::match::match> startHostStateMonitor()
278{
279 return std::make_shared<sdbusplus::bus::match::match>(
280 *conn,
281 "type='signal',interface='org.freedesktop.DBus.Properties',"
Jason M. Bills2fbb9ea2020-06-19 14:46:54 -0700282 "member='PropertiesChanged',arg0='xyz.openbmc_project.State.Host'",
Jason M. Bills1490b142019-07-01 15:48:43 -0700283 [](sdbusplus::message::message& msg) {
284 std::string interfaceName;
285 boost::container::flat_map<std::string, std::variant<std::string>>
286 propertiesChanged;
Jason M. Bills1490b142019-07-01 15:48:43 -0700287 try
288 {
289 msg.read(interfaceName, propertiesChanged);
Jason M. Bills1490b142019-07-01 15:48:43 -0700290 }
291 catch (std::exception& e)
292 {
293 std::cerr << "Unable to read host state\n";
294 return;
295 }
Jason M. Bills566ccc42020-06-18 16:38:26 -0700296 // We only want to check for CurrentHostState
297 if (propertiesChanged.begin()->first != "CurrentHostState")
298 {
299 return;
300 }
301 std::string* state =
302 std::get_if<std::string>(&(propertiesChanged.begin()->second));
303 if (state == nullptr)
304 {
305 std::cerr << propertiesChanged.begin()->first
306 << " property invalid\n";
307 return;
308 }
309
310 hostOff = *state == "xyz.openbmc_project.State.Host.HostState.Off";
Jason M. Bills1490b142019-07-01 15:48:43 -0700311
Jason M. Bills1490b142019-07-01 15:48:43 -0700312 if (hostOff)
313 {
Jason M. Billse94f5e12019-09-13 11:11:34 -0700314 // No host events should fire while off, so cancel any pending
315 // timers
Jason M. Bills1490b142019-07-01 15:48:43 -0700316 caterrAssertTimer.cancel();
Jason M. Bills8c584392019-08-19 11:05:51 -0700317 err0AssertTimer.cancel();
Jason M. Bills75af3962019-08-19 11:07:17 -0700318 err1AssertTimer.cancel();
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700319 err2AssertTimer.cancel();
Jason M. Bills89922f82019-08-06 11:10:02 -0700320 smiAssertTimer.cancel();
Jason M. Bills1490b142019-07-01 15:48:43 -0700321 }
Jason M. Billse94f5e12019-09-13 11:11:34 -0700322 else
323 {
324 // Handle any initial errors when the host turns on
325 initializeErrorState();
326 }
Jason M. Bills1490b142019-07-01 15:48:43 -0700327 });
328}
329
330static bool requestGPIOEvents(
331 const std::string& name, const std::function<void()>& handler,
332 gpiod::line& gpioLine,
333 boost::asio::posix::stream_descriptor& gpioEventDescriptor)
334{
335 // Find the GPIO line
336 gpioLine = gpiod::find_line(name);
337 if (!gpioLine)
338 {
339 std::cerr << "Failed to find the " << name << " line\n";
340 return false;
341 }
342
343 try
344 {
345 gpioLine.request(
346 {"host-error-monitor", gpiod::line_request::EVENT_BOTH_EDGES});
347 }
348 catch (std::exception&)
349 {
350 std::cerr << "Failed to request events for " << name << "\n";
351 return false;
352 }
353
354 int gpioLineFd = gpioLine.event_get_fd();
355 if (gpioLineFd < 0)
356 {
357 std::cerr << "Failed to get " << name << " fd\n";
358 return false;
359 }
360
361 gpioEventDescriptor.assign(gpioLineFd);
362
363 gpioEventDescriptor.async_wait(
364 boost::asio::posix::stream_descriptor::wait_read,
365 [&name, handler](const boost::system::error_code ec) {
366 if (ec)
367 {
368 std::cerr << name << " fd handler error: " << ec.message()
369 << "\n";
370 return;
371 }
372 handler();
373 });
374 return true;
375}
376
Jason M. Bills45e87e02019-09-09 14:45:38 -0700377static bool requestGPIOInput(const std::string& name, gpiod::line& gpioLine)
378{
379 // Find the GPIO line
380 gpioLine = gpiod::find_line(name);
381 if (!gpioLine)
382 {
383 std::cerr << "Failed to find the " << name << " line.\n";
384 return false;
385 }
386
387 // Request GPIO input
388 try
389 {
390 gpioLine.request({__FUNCTION__, gpiod::line_request::DIRECTION_INPUT});
391 }
392 catch (std::exception&)
393 {
394 std::cerr << "Failed to request " << name << " input\n";
395 return false;
396 }
397
398 return true;
399}
400
Jason M. Bills1490b142019-07-01 15:48:43 -0700401static void startPowerCycle()
402{
403 conn->async_method_call(
404 [](boost::system::error_code ec) {
405 if (ec)
406 {
407 std::cerr << "failed to set Chassis State\n";
408 }
409 },
410 "xyz.openbmc_project.State.Chassis",
411 "/xyz/openbmc_project/state/chassis0",
412 "org.freedesktop.DBus.Properties", "Set",
413 "xyz.openbmc_project.State.Chassis", "RequestedPowerTransition",
414 std::variant<std::string>{
415 "xyz.openbmc_project.State.Chassis.Transition.PowerCycle"});
416}
417
Jason M. Bills9a9bf982020-08-10 11:58:18 -0700418static void startWarmReset()
419{
420 conn->async_method_call(
421 [](boost::system::error_code ec) {
422 if (ec)
423 {
424 std::cerr << "failed to set Host State\n";
425 }
426 },
427 "xyz.openbmc_project.State.Host", "/xyz/openbmc_project/state/host0",
428 "org.freedesktop.DBus.Properties", "Set",
429 "xyz.openbmc_project.State.Host", "RequestedHostTransition",
430 std::variant<std::string>{
431 "xyz.openbmc_project.State.Host.Transition.ForceWarmReboot"});
432}
433
Jason M. Billsb61766b2019-11-26 17:02:44 -0800434static void startCrashdumpAndRecovery(bool recoverSystem,
435 const std::string& triggerType)
Jason M. Bills1490b142019-07-01 15:48:43 -0700436{
Jason M. Billsd69549b2020-08-27 11:42:43 -0700437 std::cerr << "Starting crashdump\n";
Jason M. Bills1490b142019-07-01 15:48:43 -0700438 static std::shared_ptr<sdbusplus::bus::match::match> crashdumpCompleteMatch;
Jason M. Bills1490b142019-07-01 15:48:43 -0700439
440 crashdumpCompleteMatch = std::make_shared<sdbusplus::bus::match::match>(
441 *conn,
Jason M. Billsc015c552020-08-27 11:02:47 -0700442 "type='signal',interface='com.intel.crashdump.Stored',member='"
443 "CrashdumpComplete'",
Jason M. Bills1490b142019-07-01 15:48:43 -0700444 [recoverSystem](sdbusplus::message::message& msg) {
Jason M. Billsd69549b2020-08-27 11:42:43 -0700445 std::cerr << "Crashdump completed\n";
Jason M. Bills1490b142019-07-01 15:48:43 -0700446 if (recoverSystem)
447 {
Jason M. Billsd69549b2020-08-27 11:42:43 -0700448 std::cerr << "Recovering the system\n";
Jason M. Bills9a9bf982020-08-10 11:58:18 -0700449 startWarmReset();
Jason M. Bills1490b142019-07-01 15:48:43 -0700450 }
451 crashdumpCompleteMatch.reset();
452 });
453
Jason M. Bills1490b142019-07-01 15:48:43 -0700454 conn->async_method_call(
455 [](boost::system::error_code ec) {
456 if (ec)
457 {
458 std::cerr << "failed to start Crashdump\n";
Jason M. Bills1490b142019-07-01 15:48:43 -0700459 }
460 },
461 "com.intel.crashdump", "/com/intel/crashdump",
Jason M. Billsb61766b2019-11-26 17:02:44 -0800462 "com.intel.crashdump.Stored", "GenerateStoredLog", triggerType);
Jason M. Bills1490b142019-07-01 15:48:43 -0700463}
464
Jason M. Billsd1a19f62019-08-06 11:52:58 -0700465static void incrementCPUErrorCount(int cpuNum)
466{
467 std::string propertyName = "ErrorCountCPU" + std::to_string(cpuNum + 1);
468
469 // Get the current count
470 conn->async_method_call(
471 [propertyName](boost::system::error_code ec,
472 const std::variant<uint8_t>& property) {
473 if (ec)
474 {
475 std::cerr << "Failed to read " << propertyName << ": "
476 << ec.message() << "\n";
477 return;
478 }
479 const uint8_t* errorCountVariant = std::get_if<uint8_t>(&property);
480 if (errorCountVariant == nullptr)
481 {
482 std::cerr << propertyName << " invalid\n";
483 return;
484 }
485 uint8_t errorCount = *errorCountVariant;
486 if (errorCount == std::numeric_limits<uint8_t>::max())
487 {
488 std::cerr << "Maximum error count reached\n";
489 return;
490 }
491 // Increment the count
492 errorCount++;
493 conn->async_method_call(
494 [propertyName](boost::system::error_code ec) {
495 if (ec)
496 {
497 std::cerr << "Failed to set " << propertyName << ": "
498 << ec.message() << "\n";
499 }
500 },
501 "xyz.openbmc_project.Settings",
502 "/xyz/openbmc_project/control/processor_error_config",
503 "org.freedesktop.DBus.Properties", "Set",
504 "xyz.openbmc_project.Control.Processor.ErrConfig", propertyName,
505 std::variant<uint8_t>{errorCount});
506 },
507 "xyz.openbmc_project.Settings",
508 "/xyz/openbmc_project/control/processor_error_config",
509 "org.freedesktop.DBus.Properties", "Get",
510 "xyz.openbmc_project.Control.Processor.ErrConfig", propertyName);
511}
512
Jason M. Billsa3397932019-08-06 11:07:21 -0700513static bool checkIERRCPUs()
514{
515 bool cpuIERRFound = false;
Jason M. Bills8064f8b2020-09-22 15:09:49 -0700516 for (size_t cpu = 0, addr = MIN_CLIENT_ADDR; addr <= MAX_CLIENT_ADDR;
Jason M. Billsa3397932019-08-06 11:07:21 -0700517 cpu++, addr++)
518 {
Jason M. Bills8064f8b2020-09-22 15:09:49 -0700519 EPECIStatus peciStatus = PECI_CC_SUCCESS;
Jason M. Billsa3397932019-08-06 11:07:21 -0700520 uint8_t cc = 0;
521 CPUModel model{};
522 uint8_t stepping = 0;
523 if (peci_GetCPUID(addr, &model, &stepping, &cc) != PECI_CC_SUCCESS)
524 {
525 std::cerr << "Cannot get CPUID!\n";
526 continue;
527 }
528
529 switch (model)
530 {
531 case skx:
532 {
533 // First check the MCA_ERR_SRC_LOG to see if this is the CPU
534 // that caused the IERR
535 uint32_t mcaErrSrcLog = 0;
Jason M. Bills8064f8b2020-09-22 15:09:49 -0700536 peciStatus = peci_RdPkgConfig(addr, 0, 5, 4,
537 (uint8_t*)&mcaErrSrcLog, &cc);
538 if (peciError(peciStatus, cc))
Jason M. Billsa3397932019-08-06 11:07:21 -0700539 {
Jason M. Bills8064f8b2020-09-22 15:09:49 -0700540 printPECIError("MCA_ERR_SRC_LOG", addr, peciStatus, cc);
Jason M. Billsa3397932019-08-06 11:07:21 -0700541 continue;
542 }
543 // Check MSMI_INTERNAL (20) and IERR_INTERNAL (27)
544 if ((mcaErrSrcLog & (1 << 20)) || (mcaErrSrcLog & (1 << 27)))
545 {
546 // TODO: Light the CPU fault LED?
547 cpuIERRFound = true;
Jason M. Billsd1a19f62019-08-06 11:52:58 -0700548 incrementCPUErrorCount(cpu);
Jason M. Billsa3397932019-08-06 11:07:21 -0700549 // Next check if it's a CPU/VR mismatch by reading the
550 // IA32_MC4_STATUS MSR (0x411)
551 uint64_t mc4Status = 0;
Jason M. Bills8064f8b2020-09-22 15:09:49 -0700552 peciStatus = peci_RdIAMSR(addr, 0, 0x411, &mc4Status, &cc);
553 if (peciError(peciStatus, cc))
Jason M. Billsa3397932019-08-06 11:07:21 -0700554 {
Jason M. Bills8064f8b2020-09-22 15:09:49 -0700555 printPECIError("IA32_MC4_STATUS", addr, peciStatus, cc);
Jason M. Billsa3397932019-08-06 11:07:21 -0700556 continue;
557 }
558 // Check MSEC bits 31:24 for
559 // MCA_SVID_VCCIN_VR_ICC_MAX_FAILURE (0x40),
560 // MCA_SVID_VCCIN_VR_VOUT_FAILURE (0x42), or
561 // MCA_SVID_CPU_VR_CAPABILITY_ERROR (0x43)
Jason M. Billsc90570a2020-09-22 15:24:58 -0700562 uint64_t msec = (mc4Status >> 24) & 0xFF;
563 if (msec == 0x40 || msec == 0x42 || msec == 0x43)
Jason M. Billsa3397932019-08-06 11:07:21 -0700564 {
565 cpuIERRLog(cpu, "CPU/VR Mismatch");
566 continue;
567 }
568
569 // Next check if it's a Core FIVR fault by looking for a
570 // non-zero value of CORE_FIVR_ERR_LOG (B(1) D30 F2 offset
571 // 80h)
572 uint32_t coreFIVRErrLog = 0;
Jason M. Bills8064f8b2020-09-22 15:09:49 -0700573 peciStatus = peci_RdPCIConfigLocal(
574 addr, 1, 30, 2, 0x80, sizeof(uint32_t),
575 (uint8_t*)&coreFIVRErrLog, &cc);
576 if (peciError(peciStatus, cc))
Jason M. Billsa3397932019-08-06 11:07:21 -0700577 {
Jason M. Bills8064f8b2020-09-22 15:09:49 -0700578 printPECIError("CORE_FIVR_ERR_LOG", addr, peciStatus,
579 cc);
Jason M. Billsa3397932019-08-06 11:07:21 -0700580 continue;
581 }
582 if (coreFIVRErrLog)
583 {
584 cpuIERRLog(cpu, "Core FIVR Fault");
585 continue;
586 }
587
588 // Next check if it's an Uncore FIVR fault by looking for a
589 // non-zero value of UNCORE_FIVR_ERR_LOG (B(1) D30 F2 offset
590 // 84h)
591 uint32_t uncoreFIVRErrLog = 0;
Jason M. Bills8064f8b2020-09-22 15:09:49 -0700592 peciStatus = peci_RdPCIConfigLocal(
593 addr, 1, 30, 2, 0x84, sizeof(uint32_t),
594 (uint8_t*)&uncoreFIVRErrLog, &cc);
595 if (peciError(peciStatus, cc))
Jason M. Billsa3397932019-08-06 11:07:21 -0700596 {
Jason M. Bills8064f8b2020-09-22 15:09:49 -0700597 printPECIError("UNCORE_FIVR_ERR_LOG", addr, peciStatus,
598 cc);
Jason M. Billsa3397932019-08-06 11:07:21 -0700599 continue;
600 }
601 if (uncoreFIVRErrLog)
602 {
603 cpuIERRLog(cpu, "Uncore FIVR Fault");
604 continue;
605 }
606
607 // Last if CORE_FIVR_ERR_LOG and UNCORE_FIVR_ERR_LOG are
608 // both zero, but MSEC bits 31:24 have either
609 // MCA_FIVR_CATAS_OVERVOL_FAULT (0x51) or
610 // MCA_FIVR_CATAS_OVERCUR_FAULT (0x52), then log it as an
611 // uncore FIVR fault
612 if (!coreFIVRErrLog && !uncoreFIVRErrLog &&
Jason M. Billsc90570a2020-09-22 15:24:58 -0700613 (msec == 0x51 || msec == 0x52))
Jason M. Billsa3397932019-08-06 11:07:21 -0700614 {
615 cpuIERRLog(cpu, "Uncore FIVR Fault");
616 continue;
617 }
618 cpuIERRLog(cpu);
619 }
620 break;
621 }
622 case icx:
623 {
624 // First check the MCA_ERR_SRC_LOG to see if this is the CPU
625 // that caused the IERR
626 uint32_t mcaErrSrcLog = 0;
Jason M. Bills8064f8b2020-09-22 15:09:49 -0700627 peciStatus = peci_RdPkgConfig(addr, 0, 5, 4,
628 (uint8_t*)&mcaErrSrcLog, &cc);
629 if (peciError(peciStatus, cc))
Jason M. Billsa3397932019-08-06 11:07:21 -0700630 {
Jason M. Bills8064f8b2020-09-22 15:09:49 -0700631 printPECIError("MCA_ERR_SRC_LOG", addr, peciStatus, cc);
Jason M. Billsa3397932019-08-06 11:07:21 -0700632 continue;
633 }
634 // Check MSMI_INTERNAL (20) and IERR_INTERNAL (27)
635 if ((mcaErrSrcLog & (1 << 20)) || (mcaErrSrcLog & (1 << 27)))
636 {
637 // TODO: Light the CPU fault LED?
638 cpuIERRFound = true;
Jason M. Billsd1a19f62019-08-06 11:52:58 -0700639 incrementCPUErrorCount(cpu);
Jason M. Billsa3397932019-08-06 11:07:21 -0700640 // Next check if it's a CPU/VR mismatch by reading the
641 // IA32_MC4_STATUS MSR (0x411)
642 uint64_t mc4Status = 0;
Jason M. Bills8064f8b2020-09-22 15:09:49 -0700643 peciStatus = peci_RdIAMSR(addr, 0, 0x411, &mc4Status, &cc);
644 if (peciError(peciStatus, cc))
Jason M. Billsa3397932019-08-06 11:07:21 -0700645 {
Jason M. Bills8064f8b2020-09-22 15:09:49 -0700646 printPECIError("IA32_MC4_STATUS", addr, peciStatus, cc);
Jason M. Billsa3397932019-08-06 11:07:21 -0700647 continue;
648 }
Jason M. Billsa3397932019-08-06 11:07:21 -0700649 // Check MSEC bits 31:24 for
650 // MCA_SVID_VCCIN_VR_ICC_MAX_FAILURE (0x40),
651 // MCA_SVID_VCCIN_VR_VOUT_FAILURE (0x42), or
652 // MCA_SVID_CPU_VR_CAPABILITY_ERROR (0x43)
Jason M. Billsc90570a2020-09-22 15:24:58 -0700653 uint64_t msec = (mc4Status >> 24) & 0xFF;
654 if (msec == 0x40 || msec == 0x42 || msec == 0x43)
Jason M. Billsa3397932019-08-06 11:07:21 -0700655 {
656 cpuIERRLog(cpu, "CPU/VR Mismatch");
657 continue;
658 }
659
660 // Next check if it's a Core FIVR fault by looking for a
661 // non-zero value of CORE_FIVR_ERR_LOG (B(31) D30 F2 offsets
662 // C0h and C4h) (Note: Bus 31 is accessed on PECI as bus 14)
663 uint32_t coreFIVRErrLog0 = 0;
664 uint32_t coreFIVRErrLog1 = 0;
Jason M. Bills8064f8b2020-09-22 15:09:49 -0700665 peciStatus = peci_RdEndPointConfigPciLocal(
666 addr, 0, 14, 30, 2, 0xC0, sizeof(uint32_t),
667 (uint8_t*)&coreFIVRErrLog0, &cc);
668 if (peciError(peciStatus, cc))
Jason M. Billsa3397932019-08-06 11:07:21 -0700669 {
Jason M. Bills8064f8b2020-09-22 15:09:49 -0700670 printPECIError("CORE_FIVR_ERR_LOG_0", addr, peciStatus,
671 cc);
Jason M. Billsa3397932019-08-06 11:07:21 -0700672 continue;
673 }
Jason M. Bills8064f8b2020-09-22 15:09:49 -0700674 peciStatus = peci_RdEndPointConfigPciLocal(
675 addr, 0, 14, 30, 2, 0xC4, sizeof(uint32_t),
676 (uint8_t*)&coreFIVRErrLog1, &cc);
677 if (peciError(peciStatus, cc))
Jason M. Billsa3397932019-08-06 11:07:21 -0700678 {
Jason M. Bills8064f8b2020-09-22 15:09:49 -0700679 printPECIError("CORE_FIVR_ERR_LOG_1", addr, peciStatus,
680 cc);
Jason M. Billsa3397932019-08-06 11:07:21 -0700681 continue;
682 }
683 if (coreFIVRErrLog0 || coreFIVRErrLog1)
684 {
685 cpuIERRLog(cpu, "Core FIVR Fault");
686 continue;
687 }
688
689 // Next check if it's an Uncore FIVR fault by looking for a
690 // non-zero value of UNCORE_FIVR_ERR_LOG (B(31) D30 F2
691 // offset 84h) (Note: Bus 31 is accessed on PECI as bus 14)
692 uint32_t uncoreFIVRErrLog = 0;
Jason M. Bills8064f8b2020-09-22 15:09:49 -0700693 peciStatus = peci_RdEndPointConfigPciLocal(
694 addr, 0, 14, 30, 2, 0x84, sizeof(uint32_t),
695 (uint8_t*)&uncoreFIVRErrLog, &cc);
696 if (peciError(peciStatus, cc))
Jason M. Billsa3397932019-08-06 11:07:21 -0700697 {
Jason M. Bills8064f8b2020-09-22 15:09:49 -0700698 printPECIError("UNCORE_FIVR_ERR_LOG", addr, peciStatus,
699 cc);
Jason M. Billsa3397932019-08-06 11:07:21 -0700700 continue;
701 }
702 if (uncoreFIVRErrLog)
703 {
704 cpuIERRLog(cpu, "Uncore FIVR Fault");
705 continue;
706 }
707
708 // TODO: Update MSEC/MSCOD_31_24 check
709 // Last if CORE_FIVR_ERR_LOG and UNCORE_FIVR_ERR_LOG are
710 // both zero, but MSEC bits 31:24 have either
711 // MCA_FIVR_CATAS_OVERVOL_FAULT (0x51) or
712 // MCA_FIVR_CATAS_OVERCUR_FAULT (0x52), then log it as an
713 // uncore FIVR fault
714 if (!coreFIVRErrLog0 && !coreFIVRErrLog1 &&
Jason M. Billsc90570a2020-09-22 15:24:58 -0700715 !uncoreFIVRErrLog && (msec == 0x51 || msec == 0x52))
Jason M. Billsa3397932019-08-06 11:07:21 -0700716 {
717 cpuIERRLog(cpu, "Uncore FIVR Fault");
718 continue;
719 }
720 cpuIERRLog(cpu);
721 }
722 break;
723 }
724 }
725 }
726 return cpuIERRFound;
727}
728
Jason M. Billsa15c2522019-08-16 10:01:44 -0700729static void caterrAssertHandler()
730{
Jason M. Billsa15c2522019-08-16 10:01:44 -0700731 caterrAssertTimer.expires_after(std::chrono::milliseconds(caterrTimeoutMs));
732 caterrAssertTimer.async_wait([](const boost::system::error_code ec) {
733 if (ec)
734 {
735 // operation_aborted is expected if timer is canceled
736 // before completion.
737 if (ec != boost::asio::error::operation_aborted)
738 {
739 std::cerr << "caterr timeout async_wait failed: "
740 << ec.message() << "\n";
741 }
Jason M. Billsa15c2522019-08-16 10:01:44 -0700742 return;
743 }
Jason M. Billsa3397932019-08-06 11:07:21 -0700744 std::cerr << "CATERR asserted for " << std::to_string(caterrTimeoutMs)
745 << " ms\n";
Yong Li8c798c72020-04-22 15:29:07 +0800746 beep(beepCPUIERR);
Jason M. Billsa3397932019-08-06 11:07:21 -0700747 if (!checkIERRCPUs())
748 {
749 cpuIERRLog();
750 }
Jason M. Billsa15c2522019-08-16 10:01:44 -0700751 conn->async_method_call(
752 [](boost::system::error_code ec,
753 const std::variant<bool>& property) {
754 if (ec)
755 {
756 return;
757 }
758 const bool* reset = std::get_if<bool>(&property);
759 if (reset == nullptr)
760 {
761 std::cerr << "Unable to read reset on CATERR value\n";
762 return;
763 }
Jason M. Billsb61766b2019-11-26 17:02:44 -0800764 startCrashdumpAndRecovery(*reset, "IERR");
Jason M. Billsa15c2522019-08-16 10:01:44 -0700765 },
766 "xyz.openbmc_project.Settings",
767 "/xyz/openbmc_project/control/processor_error_config",
768 "org.freedesktop.DBus.Properties", "Get",
769 "xyz.openbmc_project.Control.Processor.ErrConfig", "ResetOnCATERR");
770 });
771}
772
Jason M. Bills1490b142019-07-01 15:48:43 -0700773static void caterrHandler()
774{
775 if (!hostOff)
776 {
777 gpiod::line_event gpioLineEvent = caterrLine.event_read();
778
779 bool caterr =
780 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
Yong Li1429ca82020-04-27 16:49:45 +0800781
782 std::vector<Association> associations;
Jason M. Bills1490b142019-07-01 15:48:43 -0700783 if (caterr)
784 {
Jason M. Billsa15c2522019-08-16 10:01:44 -0700785 caterrAssertHandler();
Yong Li1429ca82020-04-27 16:49:45 +0800786 associations.emplace_back(
787 "", "critical",
788 "/xyz/openbmc_project/host_error_monitor/cat_error");
789 associations.emplace_back("", "critical",
790 host_error_monitor::rootPath);
Jason M. Bills1490b142019-07-01 15:48:43 -0700791 }
792 else
793 {
794 caterrAssertTimer.cancel();
Yong Li1429ca82020-04-27 16:49:45 +0800795 associations.emplace_back("", "", "");
Jason M. Bills1490b142019-07-01 15:48:43 -0700796 }
Yong Li1429ca82020-04-27 16:49:45 +0800797 host_error_monitor::associationCATAssert->set_property("Associations",
798 associations);
Jason M. Bills1490b142019-07-01 15:48:43 -0700799 }
800 caterrEvent.async_wait(boost::asio::posix::stream_descriptor::wait_read,
801 [](const boost::system::error_code ec) {
802 if (ec)
803 {
804 std::cerr << "caterr handler error: "
805 << ec.message() << "\n";
806 return;
807 }
808 caterrHandler();
809 });
810}
Jason M. Bills78c5eed2019-08-28 14:00:40 -0700811
Jason M. Billse94f5e12019-09-13 11:11:34 -0700812static void cpu1ThermtripAssertHandler()
813{
Jason M. Bills45e87e02019-09-09 14:45:38 -0700814 if (cpu1FIVRFaultLine.get_value() == 0)
815 {
816 cpuBootFIVRFaultLog(1);
817 }
818 else
819 {
820 cpuThermTripLog(1);
821 }
Jason M. Billse94f5e12019-09-13 11:11:34 -0700822}
823
Jason M. Bills78c5eed2019-08-28 14:00:40 -0700824static void cpu1ThermtripHandler()
825{
Jason M. Bills84951142020-04-17 15:57:11 -0700826 gpiod::line_event gpioLineEvent = cpu1ThermtripLine.event_read();
Jason M. Bills78c5eed2019-08-28 14:00:40 -0700827
Jason M. Bills84951142020-04-17 15:57:11 -0700828 bool cpu1Thermtrip =
829 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
830 if (cpu1Thermtrip)
831 {
832 cpu1ThermtripAssertHandler();
Jason M. Bills78c5eed2019-08-28 14:00:40 -0700833 }
Jason M. Bills84951142020-04-17 15:57:11 -0700834
Jason M. Bills78c5eed2019-08-28 14:00:40 -0700835 cpu1ThermtripEvent.async_wait(
836 boost::asio::posix::stream_descriptor::wait_read,
837 [](const boost::system::error_code ec) {
838 if (ec)
839 {
840 std::cerr << "CPU 1 Thermtrip handler error: " << ec.message()
841 << "\n";
842 return;
843 }
844 cpu1ThermtripHandler();
845 });
846}
847
jayaprakash Mutyala009adbc2019-12-24 22:08:07 +0000848static void cpu1MemtripHandler()
849{
Jason M. Bills5287c022020-05-19 11:16:09 -0700850 gpiod::line_event gpioLineEvent = cpu1MemtripLine.event_read();
jayaprakash Mutyala009adbc2019-12-24 22:08:07 +0000851
Jason M. Bills5287c022020-05-19 11:16:09 -0700852 bool cpu1Memtrip =
853 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
854 if (cpu1Memtrip)
855 {
856 memThermTripLog(1);
jayaprakash Mutyala009adbc2019-12-24 22:08:07 +0000857 }
Jason M. Bills5287c022020-05-19 11:16:09 -0700858
jayaprakash Mutyala009adbc2019-12-24 22:08:07 +0000859 cpu1MemtripEvent.async_wait(
860 boost::asio::posix::stream_descriptor::wait_read,
861 [](const boost::system::error_code ec) {
862 if (ec)
863 {
864 std::cerr << "CPU 1 Memory Thermaltrip handler error: "
865 << ec.message() << "\n";
866 return;
867 }
868 cpu1MemtripHandler();
869 });
870}
871
Jason M. Billse94f5e12019-09-13 11:11:34 -0700872static void cpu2ThermtripAssertHandler()
873{
Jason M. Bills45e87e02019-09-09 14:45:38 -0700874 if (cpu2FIVRFaultLine.get_value() == 0)
875 {
876 cpuBootFIVRFaultLog(2);
877 }
878 else
879 {
880 cpuThermTripLog(2);
881 }
Jason M. Billse94f5e12019-09-13 11:11:34 -0700882}
883
Jason M. Bills78c5eed2019-08-28 14:00:40 -0700884static void cpu2ThermtripHandler()
885{
Jason M. Bills84951142020-04-17 15:57:11 -0700886 gpiod::line_event gpioLineEvent = cpu2ThermtripLine.event_read();
Jason M. Bills78c5eed2019-08-28 14:00:40 -0700887
Jason M. Bills84951142020-04-17 15:57:11 -0700888 bool cpu2Thermtrip =
889 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
890 if (cpu2Thermtrip)
891 {
892 cpu2ThermtripAssertHandler();
Jason M. Bills78c5eed2019-08-28 14:00:40 -0700893 }
Jason M. Bills84951142020-04-17 15:57:11 -0700894
Jason M. Bills78c5eed2019-08-28 14:00:40 -0700895 cpu2ThermtripEvent.async_wait(
896 boost::asio::posix::stream_descriptor::wait_read,
897 [](const boost::system::error_code ec) {
898 if (ec)
899 {
900 std::cerr << "CPU 2 Thermtrip handler error: " << ec.message()
901 << "\n";
902 return;
903 }
904 cpu2ThermtripHandler();
905 });
906}
907
jayaprakash Mutyala009adbc2019-12-24 22:08:07 +0000908static void cpu2MemtripHandler()
909{
Jason M. Bills5287c022020-05-19 11:16:09 -0700910 gpiod::line_event gpioLineEvent = cpu2MemtripLine.event_read();
jayaprakash Mutyala009adbc2019-12-24 22:08:07 +0000911
Jason M. Bills5287c022020-05-19 11:16:09 -0700912 bool cpu2Memtrip =
913 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
914 if (cpu2Memtrip)
915 {
916 memThermTripLog(2);
jayaprakash Mutyala009adbc2019-12-24 22:08:07 +0000917 }
Jason M. Bills5287c022020-05-19 11:16:09 -0700918
jayaprakash Mutyala009adbc2019-12-24 22:08:07 +0000919 cpu2MemtripEvent.async_wait(
920 boost::asio::posix::stream_descriptor::wait_read,
921 [](const boost::system::error_code ec) {
922 if (ec)
923 {
924 std::cerr << "CPU 2 Memory Thermaltrip handler error: "
925 << ec.message() << "\n";
926 return;
927 }
928 cpu2MemtripHandler();
929 });
930}
931
Jason M. Billse94f5e12019-09-13 11:11:34 -0700932static void cpu1VRHotAssertHandler()
933{
934 cpuVRHotLog("CPU 1");
935}
936
Jason M. Bills250fa632019-08-28 15:58:25 -0700937static void cpu1VRHotHandler()
938{
Jason M. Bills84951142020-04-17 15:57:11 -0700939 gpiod::line_event gpioLineEvent = cpu1VRHotLine.event_read();
Jason M. Bills250fa632019-08-28 15:58:25 -0700940
Jason M. Bills84951142020-04-17 15:57:11 -0700941 bool cpu1VRHot =
942 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
943 if (cpu1VRHot)
944 {
945 cpu1VRHotAssertHandler();
Jason M. Bills250fa632019-08-28 15:58:25 -0700946 }
Jason M. Bills84951142020-04-17 15:57:11 -0700947
Jason M. Bills250fa632019-08-28 15:58:25 -0700948 cpu1VRHotEvent.async_wait(boost::asio::posix::stream_descriptor::wait_read,
949 [](const boost::system::error_code ec) {
950 if (ec)
951 {
952 std::cerr << "CPU 1 VRHot handler error: "
953 << ec.message() << "\n";
954 return;
955 }
956 cpu1VRHotHandler();
957 });
958}
959
Jason M. Billse94f5e12019-09-13 11:11:34 -0700960static void cpu1MemABCDVRHotAssertHandler()
961{
962 cpuVRHotLog("CPU 1 Memory ABCD");
963}
964
Jason M. Bills9647ba72019-08-29 14:19:19 -0700965static void cpu1MemABCDVRHotHandler()
966{
Jason M. Bills84951142020-04-17 15:57:11 -0700967 gpiod::line_event gpioLineEvent = cpu1MemABCDVRHotLine.event_read();
Jason M. Bills9647ba72019-08-29 14:19:19 -0700968
Jason M. Bills84951142020-04-17 15:57:11 -0700969 bool cpu1MemABCDVRHot =
970 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
971 if (cpu1MemABCDVRHot)
972 {
973 cpu1MemABCDVRHotAssertHandler();
Jason M. Bills9647ba72019-08-29 14:19:19 -0700974 }
Jason M. Bills84951142020-04-17 15:57:11 -0700975
Jason M. Bills9647ba72019-08-29 14:19:19 -0700976 cpu1MemABCDVRHotEvent.async_wait(
977 boost::asio::posix::stream_descriptor::wait_read,
978 [](const boost::system::error_code ec) {
979 if (ec)
980 {
981 std::cerr << "CPU 1 Memory ABCD VRHot handler error: "
982 << ec.message() << "\n";
983 return;
984 }
985 cpu1MemABCDVRHotHandler();
986 });
987}
988
Jason M. Billse94f5e12019-09-13 11:11:34 -0700989static void cpu1MemEFGHVRHotAssertHandler()
990{
991 cpuVRHotLog("CPU 1 Memory EFGH");
992}
993
Jason M. Bills9647ba72019-08-29 14:19:19 -0700994static void cpu1MemEFGHVRHotHandler()
995{
Jason M. Bills84951142020-04-17 15:57:11 -0700996 gpiod::line_event gpioLineEvent = cpu1MemEFGHVRHotLine.event_read();
Jason M. Bills9647ba72019-08-29 14:19:19 -0700997
Jason M. Bills84951142020-04-17 15:57:11 -0700998 bool cpu1MemEFGHVRHot =
999 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
1000 if (cpu1MemEFGHVRHot)
1001 {
1002 cpu1MemEFGHVRHotAssertHandler();
Jason M. Bills9647ba72019-08-29 14:19:19 -07001003 }
Jason M. Bills84951142020-04-17 15:57:11 -07001004
Jason M. Bills9647ba72019-08-29 14:19:19 -07001005 cpu1MemEFGHVRHotEvent.async_wait(
1006 boost::asio::posix::stream_descriptor::wait_read,
1007 [](const boost::system::error_code ec) {
1008 if (ec)
1009 {
1010 std::cerr << "CPU 1 Memory EFGH VRHot handler error: "
1011 << ec.message() << "\n";
1012 return;
1013 }
1014 cpu1MemEFGHVRHotHandler();
1015 });
1016}
1017
Jason M. Billse94f5e12019-09-13 11:11:34 -07001018static void cpu2VRHotAssertHandler()
1019{
1020 cpuVRHotLog("CPU 2");
1021}
1022
Jason M. Bills250fa632019-08-28 15:58:25 -07001023static void cpu2VRHotHandler()
1024{
Jason M. Bills84951142020-04-17 15:57:11 -07001025 gpiod::line_event gpioLineEvent = cpu2VRHotLine.event_read();
Jason M. Bills250fa632019-08-28 15:58:25 -07001026
Jason M. Bills84951142020-04-17 15:57:11 -07001027 bool cpu2VRHot =
1028 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
1029 if (cpu2VRHot)
1030 {
1031 cpu2VRHotAssertHandler();
Jason M. Bills250fa632019-08-28 15:58:25 -07001032 }
Jason M. Bills84951142020-04-17 15:57:11 -07001033
Jason M. Bills250fa632019-08-28 15:58:25 -07001034 cpu2VRHotEvent.async_wait(boost::asio::posix::stream_descriptor::wait_read,
1035 [](const boost::system::error_code ec) {
1036 if (ec)
1037 {
1038 std::cerr << "CPU 2 VRHot handler error: "
1039 << ec.message() << "\n";
1040 return;
1041 }
1042 cpu2VRHotHandler();
1043 });
1044}
1045
Jason M. Billse94f5e12019-09-13 11:11:34 -07001046static void cpu2MemABCDVRHotAssertHandler()
1047{
1048 cpuVRHotLog("CPU 2 Memory ABCD");
1049}
1050
Jason M. Bills9647ba72019-08-29 14:19:19 -07001051static void cpu2MemABCDVRHotHandler()
1052{
Jason M. Bills84951142020-04-17 15:57:11 -07001053 gpiod::line_event gpioLineEvent = cpu2MemABCDVRHotLine.event_read();
Jason M. Bills9647ba72019-08-29 14:19:19 -07001054
Jason M. Bills84951142020-04-17 15:57:11 -07001055 bool cpu2MemABCDVRHot =
1056 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
1057 if (cpu2MemABCDVRHot)
1058 {
1059 cpu2MemABCDVRHotAssertHandler();
Jason M. Bills9647ba72019-08-29 14:19:19 -07001060 }
Jason M. Bills84951142020-04-17 15:57:11 -07001061
Jason M. Bills9647ba72019-08-29 14:19:19 -07001062 cpu2MemABCDVRHotEvent.async_wait(
1063 boost::asio::posix::stream_descriptor::wait_read,
1064 [](const boost::system::error_code ec) {
1065 if (ec)
1066 {
1067 std::cerr << "CPU 2 Memory ABCD VRHot handler error: "
1068 << ec.message() << "\n";
1069 return;
1070 }
1071 cpu2MemABCDVRHotHandler();
1072 });
1073}
1074
Jason M. Billse94f5e12019-09-13 11:11:34 -07001075static void cpu2MemEFGHVRHotAssertHandler()
1076{
1077 cpuVRHotLog("CPU 2 Memory EFGH");
1078}
1079
Jason M. Bills9647ba72019-08-29 14:19:19 -07001080static void cpu2MemEFGHVRHotHandler()
1081{
Jason M. Bills84951142020-04-17 15:57:11 -07001082 gpiod::line_event gpioLineEvent = cpu2MemEFGHVRHotLine.event_read();
Jason M. Bills9647ba72019-08-29 14:19:19 -07001083
Jason M. Bills84951142020-04-17 15:57:11 -07001084 bool cpu2MemEFGHVRHot =
1085 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
1086 if (cpu2MemEFGHVRHot)
1087 {
1088 cpu2MemEFGHVRHotAssertHandler();
Jason M. Bills9647ba72019-08-29 14:19:19 -07001089 }
Jason M. Bills84951142020-04-17 15:57:11 -07001090
Jason M. Bills9647ba72019-08-29 14:19:19 -07001091 cpu2MemEFGHVRHotEvent.async_wait(
1092 boost::asio::posix::stream_descriptor::wait_read,
1093 [](const boost::system::error_code ec) {
1094 if (ec)
1095 {
1096 std::cerr << "CPU 2 Memory EFGH VRHot handler error: "
1097 << ec.message() << "\n";
1098 return;
1099 }
1100 cpu2MemEFGHVRHotHandler();
1101 });
1102}
1103
Chen Yugange6c0f1c2019-08-02 20:36:42 +08001104static void pchThermtripHandler()
1105{
Yong Li1429ca82020-04-27 16:49:45 +08001106 std::vector<Association> associations;
1107
Jason M. Bills84951142020-04-17 15:57:11 -07001108 gpiod::line_event gpioLineEvent = pchThermtripLine.event_read();
Chen Yugange6c0f1c2019-08-02 20:36:42 +08001109
Jason M. Bills84951142020-04-17 15:57:11 -07001110 bool pchThermtrip =
1111 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
1112 if (pchThermtrip)
1113 {
1114 ssbThermTripLog();
Yong Li1429ca82020-04-27 16:49:45 +08001115 associations.emplace_back(
1116 "", "critical",
1117 "/xyz/openbmc_project/host_error_monitor/ssb_thermal_trip");
1118 associations.emplace_back("", "critical", host_error_monitor::rootPath);
Chen Yugange6c0f1c2019-08-02 20:36:42 +08001119 }
Yong Li1429ca82020-04-27 16:49:45 +08001120 else
1121 {
1122 associations.emplace_back("", "", "");
1123 }
1124 host_error_monitor::associationSSBThermTrip->set_property("Associations",
1125 associations);
Jason M. Bills84951142020-04-17 15:57:11 -07001126
Chen Yugange6c0f1c2019-08-02 20:36:42 +08001127 pchThermtripEvent.async_wait(
1128 boost::asio::posix::stream_descriptor::wait_read,
1129 [](const boost::system::error_code ec) {
1130 if (ec)
1131 {
1132 std::cerr << "PCH Thermal trip handler error: " << ec.message()
1133 << "\n";
1134 return;
1135 }
1136 pchThermtripHandler();
1137 });
1138}
1139
Jason M. Billscbf78532019-08-16 15:32:11 -07001140static std::bitset<MAX_CPUS> checkERRPinCPUs(const int errPin)
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001141{
Jason M. Billscbf78532019-08-16 15:32:11 -07001142 int errPinSts = (1 << errPin);
1143 std::bitset<MAX_CPUS> errPinCPUs = 0;
Jason M. Bills8064f8b2020-09-22 15:09:49 -07001144 for (size_t cpu = 0, addr = MIN_CLIENT_ADDR; addr <= MAX_CLIENT_ADDR;
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001145 cpu++, addr++)
1146 {
1147 if (peci_Ping(addr) == PECI_CC_SUCCESS)
1148 {
Jason M. Bills8064f8b2020-09-22 15:09:49 -07001149 EPECIStatus peciStatus = PECI_CC_SUCCESS;
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001150 uint8_t cc = 0;
1151 CPUModel model{};
1152 uint8_t stepping = 0;
1153 if (peci_GetCPUID(addr, &model, &stepping, &cc) != PECI_CC_SUCCESS)
1154 {
1155 std::cerr << "Cannot get CPUID!\n";
1156 continue;
1157 }
1158
1159 switch (model)
1160 {
1161 case skx:
1162 {
1163 // Check the ERRPINSTS to see if this is the CPU that caused
Jason M. Billscbf78532019-08-16 15:32:11 -07001164 // the ERRx (B(0) D8 F0 offset 210h)
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001165 uint32_t errpinsts = 0;
Jason M. Bills8064f8b2020-09-22 15:09:49 -07001166 peciStatus = peci_RdPCIConfigLocal(
1167 addr, 0, 8, 0, 0x210, sizeof(uint32_t),
1168 (uint8_t*)&errpinsts, &cc);
1169 if (peciError(peciStatus, cc))
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001170 {
Jason M. Bills8064f8b2020-09-22 15:09:49 -07001171 printPECIError("ERRPINSTS", addr, peciStatus, cc);
1172 continue;
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001173 }
Jason M. Bills8064f8b2020-09-22 15:09:49 -07001174
1175 errPinCPUs[cpu] = (errpinsts & errPinSts) != 0;
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001176 break;
1177 }
1178 case icx:
1179 {
1180 // Check the ERRPINSTS to see if this is the CPU that caused
Jason M. Billscbf78532019-08-16 15:32:11 -07001181 // the ERRx (B(30) D0 F3 offset 274h) (Note: Bus 30 is
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001182 // accessed on PECI as bus 13)
1183 uint32_t errpinsts = 0;
Jason M. Bills8064f8b2020-09-22 15:09:49 -07001184 peciStatus = peci_RdEndPointConfigPciLocal(
1185 addr, 0, 13, 0, 3, 0x274, sizeof(uint32_t),
1186 (uint8_t*)&errpinsts, &cc);
1187 if (peciError(peciStatus, cc))
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001188 {
Jason M. Bills8064f8b2020-09-22 15:09:49 -07001189 printPECIError("ERRPINSTS", addr, peciStatus, cc);
1190 continue;
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001191 }
Jason M. Bills8064f8b2020-09-22 15:09:49 -07001192
1193 errPinCPUs[cpu] = (errpinsts & errPinSts) != 0;
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001194 break;
1195 }
1196 }
1197 }
1198 }
Jason M. Billscbf78532019-08-16 15:32:11 -07001199 return errPinCPUs;
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001200}
1201
Jason M. Billscbf78532019-08-16 15:32:11 -07001202static void errXAssertHandler(const int errPin,
1203 boost::asio::steady_timer& errXAssertTimer)
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001204{
Jason M. Billscbf78532019-08-16 15:32:11 -07001205 // ERRx status is not guaranteed through the timeout, so save which
1206 // CPUs have it asserted
1207 std::bitset<MAX_CPUS> errPinCPUs = checkERRPinCPUs(errPin);
1208 errXAssertTimer.expires_after(std::chrono::milliseconds(errTimeoutMs));
1209 errXAssertTimer.async_wait([errPin, errPinCPUs](
1210 const boost::system::error_code ec) {
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001211 if (ec)
1212 {
1213 // operation_aborted is expected if timer is canceled before
1214 // completion.
1215 if (ec != boost::asio::error::operation_aborted)
1216 {
1217 std::cerr << "err2 timeout async_wait failed: " << ec.message()
1218 << "\n";
1219 }
1220 return;
1221 }
Jason M. Billscbf78532019-08-16 15:32:11 -07001222 std::cerr << "ERR" << std::to_string(errPin) << " asserted for "
1223 << std::to_string(errTimeoutMs) << " ms\n";
1224 if (errPinCPUs.count())
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001225 {
Jason M. Billscbf78532019-08-16 15:32:11 -07001226 for (int i = 0; i < errPinCPUs.size(); i++)
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001227 {
Jason M. Billscbf78532019-08-16 15:32:11 -07001228 if (errPinCPUs[i])
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001229 {
Jason M. Billscbf78532019-08-16 15:32:11 -07001230 cpuERRXLog(errPin, i);
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001231 }
1232 }
1233 }
1234 else
1235 {
Jason M. Billscbf78532019-08-16 15:32:11 -07001236 cpuERRXLog(errPin);
1237 }
1238 });
1239}
1240
Jason M. Bills8c584392019-08-19 11:05:51 -07001241static void err0AssertHandler()
1242{
1243 // Handle the standard ERR0 detection and logging
1244 const static constexpr int err0 = 0;
1245 errXAssertHandler(err0, err0AssertTimer);
1246}
1247
1248static void err0Handler()
1249{
1250 if (!hostOff)
1251 {
1252 gpiod::line_event gpioLineEvent = err0Line.event_read();
1253
1254 bool err0 = gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
1255 if (err0)
1256 {
1257 err0AssertHandler();
1258 }
1259 else
1260 {
1261 err0AssertTimer.cancel();
1262 }
1263 }
1264 err0Event.async_wait(boost::asio::posix::stream_descriptor::wait_read,
1265 [](const boost::system::error_code ec) {
1266 if (ec)
1267 {
1268 std::cerr
1269 << "err0 handler error: " << ec.message()
1270 << "\n";
1271 return;
1272 }
1273 err0Handler();
1274 });
1275}
1276
Jason M. Bills75af3962019-08-19 11:07:17 -07001277static void err1AssertHandler()
1278{
1279 // Handle the standard ERR1 detection and logging
1280 const static constexpr int err1 = 1;
1281 errXAssertHandler(err1, err1AssertTimer);
1282}
1283
1284static void err1Handler()
1285{
1286 if (!hostOff)
1287 {
1288 gpiod::line_event gpioLineEvent = err1Line.event_read();
1289
1290 bool err1 = gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
1291 if (err1)
1292 {
1293 err1AssertHandler();
1294 }
1295 else
1296 {
1297 err1AssertTimer.cancel();
1298 }
1299 }
1300 err1Event.async_wait(boost::asio::posix::stream_descriptor::wait_read,
1301 [](const boost::system::error_code ec) {
1302 if (ec)
1303 {
1304 std::cerr
1305 << "err1 handler error: " << ec.message()
1306 << "\n";
1307 return;
1308 }
1309 err1Handler();
1310 });
1311}
1312
Jason M. Billscbf78532019-08-16 15:32:11 -07001313static void err2AssertHandler()
1314{
1315 // Handle the standard ERR2 detection and logging
1316 const static constexpr int err2 = 2;
1317 errXAssertHandler(err2, err2AssertTimer);
1318 // Also handle reset for ERR2
1319 err2AssertTimer.async_wait([](const boost::system::error_code ec) {
1320 if (ec)
1321 {
1322 // operation_aborted is expected if timer is canceled before
1323 // completion.
1324 if (ec != boost::asio::error::operation_aborted)
1325 {
1326 std::cerr << "err2 timeout async_wait failed: " << ec.message()
1327 << "\n";
1328 }
1329 return;
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001330 }
1331 conn->async_method_call(
1332 [](boost::system::error_code ec,
1333 const std::variant<bool>& property) {
1334 if (ec)
1335 {
1336 return;
1337 }
1338 const bool* reset = std::get_if<bool>(&property);
1339 if (reset == nullptr)
1340 {
1341 std::cerr << "Unable to read reset on ERR2 value\n";
1342 return;
1343 }
Jason M. Billsb61766b2019-11-26 17:02:44 -08001344 startCrashdumpAndRecovery(*reset, "ERR2 Timeout");
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001345 },
1346 "xyz.openbmc_project.Settings",
1347 "/xyz/openbmc_project/control/processor_error_config",
1348 "org.freedesktop.DBus.Properties", "Get",
1349 "xyz.openbmc_project.Control.Processor.ErrConfig", "ResetOnERR2");
Yong Li061eb032020-02-26 15:06:18 +08001350
1351 beep(beepCPUErr2);
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001352 });
1353}
1354
1355static void err2Handler()
1356{
1357 if (!hostOff)
1358 {
1359 gpiod::line_event gpioLineEvent = err2Line.event_read();
1360
1361 bool err2 = gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
1362 if (err2)
1363 {
1364 err2AssertHandler();
1365 }
1366 else
1367 {
1368 err2AssertTimer.cancel();
1369 }
1370 }
1371 err2Event.async_wait(boost::asio::posix::stream_descriptor::wait_read,
1372 [](const boost::system::error_code ec) {
1373 if (ec)
1374 {
1375 std::cerr
1376 << "err2 handler error: " << ec.message()
1377 << "\n";
1378 return;
1379 }
1380 err2Handler();
1381 });
1382}
1383
Jason M. Bills89922f82019-08-06 11:10:02 -07001384static void smiAssertHandler()
1385{
1386 smiAssertTimer.expires_after(std::chrono::milliseconds(smiTimeoutMs));
1387 smiAssertTimer.async_wait([](const boost::system::error_code ec) {
1388 if (ec)
1389 {
1390 // operation_aborted is expected if timer is canceled before
1391 // completion.
1392 if (ec != boost::asio::error::operation_aborted)
1393 {
1394 std::cerr << "smi timeout async_wait failed: " << ec.message()
1395 << "\n";
1396 }
1397 return;
1398 }
1399 std::cerr << "SMI asserted for " << std::to_string(smiTimeoutMs)
1400 << " ms\n";
1401 smiTimeoutLog();
1402 conn->async_method_call(
1403 [](boost::system::error_code ec,
1404 const std::variant<bool>& property) {
1405 if (ec)
1406 {
1407 return;
1408 }
1409 const bool* reset = std::get_if<bool>(&property);
1410 if (reset == nullptr)
1411 {
1412 std::cerr << "Unable to read reset on SMI value\n";
1413 return;
1414 }
Jason M. Bills94785442020-01-07 15:22:09 -08001415#ifdef HOST_ERROR_CRASHDUMP_ON_SMI_TIMEOUT
Jason M. Billsb61766b2019-11-26 17:02:44 -08001416 startCrashdumpAndRecovery(*reset, "SMI Timeout");
Jason M. Bills94785442020-01-07 15:22:09 -08001417#else
1418 if (*reset)
1419 {
Jason M. Billsd69549b2020-08-27 11:42:43 -07001420 std::cerr << "Recovering the system\n";
Jason M. Bills9a9bf982020-08-10 11:58:18 -07001421 startWarmReset();
Jason M. Bills94785442020-01-07 15:22:09 -08001422 }
1423#endif
Jason M. Bills89922f82019-08-06 11:10:02 -07001424 },
1425 "xyz.openbmc_project.Settings",
1426 "/xyz/openbmc_project/control/bmc_reset_disables",
1427 "org.freedesktop.DBus.Properties", "Get",
1428 "xyz.openbmc_project.Control.ResetDisables", "ResetOnSMI");
1429 });
1430}
1431
1432static void smiHandler()
1433{
1434 if (!hostOff)
1435 {
1436 gpiod::line_event gpioLineEvent = smiLine.event_read();
1437
1438 bool smi = gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
1439 if (smi)
1440 {
1441 smiAssertHandler();
1442 }
1443 else
1444 {
1445 smiAssertTimer.cancel();
1446 }
1447 }
1448 smiEvent.async_wait(boost::asio::posix::stream_descriptor::wait_read,
1449 [](const boost::system::error_code ec) {
1450 if (ec)
1451 {
1452 std::cerr
1453 << "smi handler error: " << ec.message()
1454 << "\n";
1455 return;
1456 }
1457 smiHandler();
1458 });
1459}
1460
Jason M. Billsa15c2522019-08-16 10:01:44 -07001461static void initializeErrorState()
1462{
jayaprakash Mutyala53099c42020-03-15 00:16:26 +00001463 // Handle CPU1_MISMATCH if it's asserted now
1464 if (cpu1MismatchLine.get_value() == 1)
1465 {
1466 cpuMismatchLog(1);
1467 }
1468
1469 // Handle CPU2_MISMATCH if it's asserted now
1470 if (cpu2MismatchLine.get_value() == 1)
1471 {
1472 cpuMismatchLog(2);
1473 }
1474
Jason M. Billsa15c2522019-08-16 10:01:44 -07001475 // Handle CPU_CATERR if it's asserted now
1476 if (caterrLine.get_value() == 0)
1477 {
1478 caterrAssertHandler();
Yong Li1429ca82020-04-27 16:49:45 +08001479 std::vector<Association> associations;
1480 associations.emplace_back(
1481 "", "critical", "/xyz/openbmc_project/host_error_monitor/cat_err");
1482 associations.emplace_back("", "critical", host_error_monitor::rootPath);
1483 host_error_monitor::associationCATAssert->set_property("Associations",
1484 associations);
Jason M. Billsa15c2522019-08-16 10:01:44 -07001485 }
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001486
Jason M. Bills8c584392019-08-19 11:05:51 -07001487 // Handle CPU_ERR0 if it's asserted now
1488 if (err0Line.get_value() == 0)
1489 {
1490 err0AssertHandler();
1491 }
1492
Jason M. Bills75af3962019-08-19 11:07:17 -07001493 // Handle CPU_ERR1 if it's asserted now
1494 if (err1Line.get_value() == 0)
1495 {
1496 err1AssertHandler();
1497 }
1498
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001499 // Handle CPU_ERR2 if it's asserted now
1500 if (err2Line.get_value() == 0)
1501 {
1502 err2AssertHandler();
1503 }
Jason M. Bills89922f82019-08-06 11:10:02 -07001504
1505 // Handle SMI if it's asserted now
1506 if (smiLine.get_value() == 0)
1507 {
1508 smiAssertHandler();
1509 }
Jason M. Bills08866542019-08-16 12:04:19 -07001510
Jason M. Billse94f5e12019-09-13 11:11:34 -07001511 // Handle CPU1_THERMTRIP if it's asserted now
1512 if (cpu1ThermtripLine.get_value() == 0)
1513 {
1514 cpu1ThermtripAssertHandler();
1515 }
1516
1517 // Handle CPU2_THERMTRIP if it's asserted now
1518 if (cpu2ThermtripLine.get_value() == 0)
1519 {
1520 cpu2ThermtripAssertHandler();
1521 }
1522
jayaprakash Mutyala009adbc2019-12-24 22:08:07 +00001523 // Handle CPU1_MEM_THERM_EVENT (CPU1 DIMM Thermal trip) if it's asserted now
1524 if (cpu1MemtripLine.get_value() == 0)
1525 {
1526 memThermTripLog(1);
1527 }
1528
1529 // Handle CPU2_MEM_THERM_EVENT (CPU2 DIMM Thermal trip) if it's asserted now
1530 if (cpu2MemtripLine.get_value() == 0)
1531 {
1532 memThermTripLog(2);
1533 }
1534
Jason M. Billse94f5e12019-09-13 11:11:34 -07001535 // Handle CPU1_VRHOT if it's asserted now
1536 if (cpu1VRHotLine.get_value() == 0)
1537 {
1538 cpu1VRHotAssertHandler();
1539 }
1540
1541 // Handle CPU1_MEM_ABCD_VRHOT if it's asserted now
1542 if (cpu1MemABCDVRHotLine.get_value() == 0)
1543 {
1544 cpu1MemABCDVRHotAssertHandler();
1545 }
1546
1547 // Handle CPU1_MEM_EFGH_VRHOT if it's asserted now
1548 if (cpu1MemEFGHVRHotLine.get_value() == 0)
1549 {
1550 cpu1MemEFGHVRHotAssertHandler();
1551 }
1552
1553 // Handle CPU2_VRHOT if it's asserted now
1554 if (cpu2VRHotLine.get_value() == 0)
1555 {
1556 cpu2VRHotAssertHandler();
1557 }
1558
1559 // Handle CPU2_MEM_ABCD_VRHOT if it's asserted now
1560 if (cpu2MemABCDVRHotLine.get_value() == 0)
1561 {
1562 cpu2MemABCDVRHotAssertHandler();
1563 }
1564
1565 // Handle CPU2_MEM_EFGH_VRHOT if it's asserted now
1566 if (cpu2MemEFGHVRHotLine.get_value() == 0)
1567 {
1568 cpu2MemEFGHVRHotAssertHandler();
1569 }
1570
Jason M. Bills08866542019-08-16 12:04:19 -07001571 // Handle PCH_BMC_THERMTRIP if it's asserted now
1572 if (pchThermtripLine.get_value() == 0)
1573 {
1574 ssbThermTripLog();
Yong Li1429ca82020-04-27 16:49:45 +08001575 std::vector<Association> associations;
1576 associations.emplace_back(
1577 "", "critical",
1578 "/xyz/openbmc_project/host_error_monitor/ssb_thermal_trip");
1579 associations.emplace_back("", "critical", host_error_monitor::rootPath);
1580 host_error_monitor::associationSSBThermTrip->set_property(
1581 "Associations", associations);
Jason M. Bills08866542019-08-16 12:04:19 -07001582 }
Jason M. Billsa15c2522019-08-16 10:01:44 -07001583}
Jason M. Bills1490b142019-07-01 15:48:43 -07001584} // namespace host_error_monitor
1585
1586int main(int argc, char* argv[])
1587{
1588 // setup connection to dbus
1589 host_error_monitor::conn =
1590 std::make_shared<sdbusplus::asio::connection>(host_error_monitor::io);
1591
Jason M. Billsc4b91f22019-11-26 17:04:50 -08001592 // Host Error Monitor Service
Jason M. Bills1490b142019-07-01 15:48:43 -07001593 host_error_monitor::conn->request_name(
1594 "xyz.openbmc_project.HostErrorMonitor");
1595 sdbusplus::asio::object_server server =
1596 sdbusplus::asio::object_server(host_error_monitor::conn);
1597
Yong Li1429ca82020-04-27 16:49:45 +08001598 // Associations interface for led status
1599 std::vector<host_error_monitor::Association> associations;
1600 associations.emplace_back("", "", "");
1601 host_error_monitor::associationSSBThermTrip = server.add_interface(
1602 "/xyz/openbmc_project/host_error_monitor/ssb_thermal_trip",
1603 "xyz.openbmc_project.Association.Definitions");
1604 host_error_monitor::associationSSBThermTrip->register_property(
1605 "Associations", associations);
1606 host_error_monitor::associationSSBThermTrip->initialize();
1607
1608 host_error_monitor::associationCATAssert = server.add_interface(
1609 "/xyz/openbmc_project/host_error_monitor/cat_assert",
1610 "xyz.openbmc_project.Association.Definitions");
1611 host_error_monitor::associationCATAssert->register_property("Associations",
1612 associations);
1613 host_error_monitor::associationCATAssert->initialize();
1614
Jason M. Billsc4b91f22019-11-26 17:04:50 -08001615 // Restart Cause Interface
1616 host_error_monitor::hostErrorTimeoutIface =
1617 server.add_interface("/xyz/openbmc_project/host_error_monitor",
1618 "xyz.openbmc_project.HostErrorMonitor.Timeout");
1619
1620 host_error_monitor::hostErrorTimeoutIface->register_property(
1621 "IERRTimeoutMs", host_error_monitor::caterrTimeoutMs,
1622 [](const std::size_t& requested, std::size_t& resp) {
1623 if (requested > host_error_monitor::caterrTimeoutMsMax)
1624 {
1625 std::cerr << "IERRTimeoutMs update to " << requested
1626 << "ms rejected. Cannot be greater than "
1627 << host_error_monitor::caterrTimeoutMsMax << "ms.\n";
1628 return 0;
1629 }
1630 std::cerr << "IERRTimeoutMs updated to " << requested << "ms\n";
1631 host_error_monitor::caterrTimeoutMs = requested;
1632 resp = requested;
1633 return 1;
1634 },
1635 [](std::size_t& resp) { return host_error_monitor::caterrTimeoutMs; });
1636 host_error_monitor::hostErrorTimeoutIface->initialize();
1637
Jason M. Bills1490b142019-07-01 15:48:43 -07001638 // Start tracking host state
1639 std::shared_ptr<sdbusplus::bus::match::match> hostStateMonitor =
1640 host_error_monitor::startHostStateMonitor();
1641
jayaprakash Mutyala53099c42020-03-15 00:16:26 +00001642 // Request CPU1_MISMATCH GPIO events
1643 if (!host_error_monitor::requestGPIOInput(
1644 "CPU1_MISMATCH", host_error_monitor::cpu1MismatchLine))
1645 {
1646 return -1;
1647 }
1648
1649 // Request CPU2_MISMATCH GPIO events
1650 if (!host_error_monitor::requestGPIOInput(
1651 "CPU2_MISMATCH", host_error_monitor::cpu2MismatchLine))
1652 {
1653 return -1;
1654 }
1655
Jason M. Bills1490b142019-07-01 15:48:43 -07001656 // Initialize the host state
1657 host_error_monitor::initializeHostState();
1658
1659 // Request CPU_CATERR GPIO events
1660 if (!host_error_monitor::requestGPIOEvents(
1661 "CPU_CATERR", host_error_monitor::caterrHandler,
1662 host_error_monitor::caterrLine, host_error_monitor::caterrEvent))
1663 {
1664 return -1;
1665 }
1666
Jason M. Bills8c584392019-08-19 11:05:51 -07001667 // Request CPU_ERR0 GPIO events
1668 if (!host_error_monitor::requestGPIOEvents(
1669 "CPU_ERR0", host_error_monitor::err0Handler,
1670 host_error_monitor::err0Line, host_error_monitor::err0Event))
1671 {
1672 return -1;
1673 }
1674
Jason M. Bills75af3962019-08-19 11:07:17 -07001675 // Request CPU_ERR1 GPIO events
1676 if (!host_error_monitor::requestGPIOEvents(
1677 "CPU_ERR1", host_error_monitor::err1Handler,
1678 host_error_monitor::err1Line, host_error_monitor::err1Event))
1679 {
1680 return -1;
1681 }
1682
Jason M. Bills6a2cb692019-08-06 11:03:49 -07001683 // Request CPU_ERR2 GPIO events
1684 if (!host_error_monitor::requestGPIOEvents(
1685 "CPU_ERR2", host_error_monitor::err2Handler,
1686 host_error_monitor::err2Line, host_error_monitor::err2Event))
1687 {
1688 return -1;
1689 }
1690
Jason M. Bills89922f82019-08-06 11:10:02 -07001691 // Request SMI GPIO events
1692 if (!host_error_monitor::requestGPIOEvents(
1693 "SMI", host_error_monitor::smiHandler, host_error_monitor::smiLine,
1694 host_error_monitor::smiEvent))
1695 {
1696 return -1;
1697 }
1698
Jason M. Bills45e87e02019-09-09 14:45:38 -07001699 // Request CPU1_FIVR_FAULT GPIO input
1700 if (!host_error_monitor::requestGPIOInput(
1701 "CPU1_FIVR_FAULT", host_error_monitor::cpu1FIVRFaultLine))
1702 {
1703 return -1;
1704 }
1705
Jason M. Bills78c5eed2019-08-28 14:00:40 -07001706 // Request CPU1_THERMTRIP GPIO events
1707 if (!host_error_monitor::requestGPIOEvents(
1708 "CPU1_THERMTRIP", host_error_monitor::cpu1ThermtripHandler,
1709 host_error_monitor::cpu1ThermtripLine,
1710 host_error_monitor::cpu1ThermtripEvent))
1711 {
1712 return -1;
1713 }
1714
Jason M. Bills45e87e02019-09-09 14:45:38 -07001715 // Request CPU2_FIVR_FAULT GPIO input
1716 if (!host_error_monitor::requestGPIOInput(
1717 "CPU2_FIVR_FAULT", host_error_monitor::cpu2FIVRFaultLine))
1718 {
1719 return -1;
1720 }
1721
Jason M. Bills78c5eed2019-08-28 14:00:40 -07001722 // Request CPU2_THERMTRIP GPIO events
1723 if (!host_error_monitor::requestGPIOEvents(
1724 "CPU2_THERMTRIP", host_error_monitor::cpu2ThermtripHandler,
1725 host_error_monitor::cpu2ThermtripLine,
1726 host_error_monitor::cpu2ThermtripEvent))
1727 {
1728 return -1;
1729 }
1730
Jason M. Bills250fa632019-08-28 15:58:25 -07001731 // Request CPU1_VRHOT GPIO events
1732 if (!host_error_monitor::requestGPIOEvents(
1733 "CPU1_VRHOT", host_error_monitor::cpu1VRHotHandler,
1734 host_error_monitor::cpu1VRHotLine,
1735 host_error_monitor::cpu1VRHotEvent))
1736 {
1737 return -1;
1738 }
1739
Jason M. Bills9647ba72019-08-29 14:19:19 -07001740 // Request CPU1_MEM_ABCD_VRHOT GPIO events
1741 if (!host_error_monitor::requestGPIOEvents(
1742 "CPU1_MEM_ABCD_VRHOT", host_error_monitor::cpu1MemABCDVRHotHandler,
1743 host_error_monitor::cpu1MemABCDVRHotLine,
1744 host_error_monitor::cpu1MemABCDVRHotEvent))
1745 {
1746 return -1;
1747 }
1748
1749 // Request CPU1_MEM_EFGH_VRHOT GPIO events
1750 if (!host_error_monitor::requestGPIOEvents(
1751 "CPU1_MEM_EFGH_VRHOT", host_error_monitor::cpu1MemEFGHVRHotHandler,
1752 host_error_monitor::cpu1MemEFGHVRHotLine,
1753 host_error_monitor::cpu1MemEFGHVRHotEvent))
1754 {
1755 return -1;
1756 }
1757
Jason M. Bills250fa632019-08-28 15:58:25 -07001758 // Request CPU2_VRHOT GPIO events
1759 if (!host_error_monitor::requestGPIOEvents(
1760 "CPU2_VRHOT", host_error_monitor::cpu2VRHotHandler,
1761 host_error_monitor::cpu2VRHotLine,
1762 host_error_monitor::cpu2VRHotEvent))
1763 {
1764 return -1;
1765 }
1766
Jason M. Bills9647ba72019-08-29 14:19:19 -07001767 // Request CPU2_MEM_ABCD_VRHOT GPIO events
1768 if (!host_error_monitor::requestGPIOEvents(
1769 "CPU2_MEM_ABCD_VRHOT", host_error_monitor::cpu2MemABCDVRHotHandler,
1770 host_error_monitor::cpu2MemABCDVRHotLine,
1771 host_error_monitor::cpu2MemABCDVRHotEvent))
1772 {
1773 return -1;
1774 }
1775
1776 // Request CPU2_MEM_EFGH_VRHOT GPIO events
1777 if (!host_error_monitor::requestGPIOEvents(
1778 "CPU2_MEM_EFGH_VRHOT", host_error_monitor::cpu2MemEFGHVRHotHandler,
1779 host_error_monitor::cpu2MemEFGHVRHotLine,
1780 host_error_monitor::cpu2MemEFGHVRHotEvent))
1781 {
1782 return -1;
1783 }
1784
Chen Yugange6c0f1c2019-08-02 20:36:42 +08001785 // Request PCH_BMC_THERMTRIP GPIO events
1786 if (!host_error_monitor::requestGPIOEvents(
1787 "PCH_BMC_THERMTRIP", host_error_monitor::pchThermtripHandler,
1788 host_error_monitor::pchThermtripLine,
1789 host_error_monitor::pchThermtripEvent))
1790 {
1791 return -1;
1792 }
1793
jayaprakash Mutyala009adbc2019-12-24 22:08:07 +00001794 // Request CPU1_MEM_THERM_EVENT GPIO events
1795 if (!host_error_monitor::requestGPIOEvents(
1796 "CPU1_MEM_THERM_EVENT", host_error_monitor::cpu1MemtripHandler,
1797 host_error_monitor::cpu1MemtripLine,
1798 host_error_monitor::cpu1MemtripEvent))
1799 {
1800 return -1;
1801 }
1802
1803 // Request CPU2_MEM_THERM_EVENT GPIO events
1804 if (!host_error_monitor::requestGPIOEvents(
1805 "CPU2_MEM_THERM_EVENT", host_error_monitor::cpu2MemtripHandler,
1806 host_error_monitor::cpu2MemtripLine,
1807 host_error_monitor::cpu2MemtripEvent))
1808 {
1809 return -1;
1810 }
1811
Jason M. Bills1490b142019-07-01 15:48:43 -07001812 host_error_monitor::io.run();
1813
1814 return 0;
1815}