blob: 5babf5036becc5216170f4041cad9ac2a458a4d2 [file] [log] [blame]
/*
// Copyright (c) 2021 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
*/
16#pragma once
17#include <systemd/sd-journal.h>
18
19#include <error_monitors/base_gpio_poll_monitor.hpp>
20#include <host_error_monitor.hpp>
21#include <sdbusplus/asio/object_server.hpp>
22
23namespace host_error_monitor::ierr_monitor
24{
25static constexpr bool debug = true;
26
27class IERRMonitor :
28 public host_error_monitor::base_gpio_poll_monitor::BaseGPIOPollMonitor
29{
30 const static host_error_monitor::base_gpio_poll_monitor::AssertValue
31 assertValue =
32 host_error_monitor::base_gpio_poll_monitor::AssertValue::lowAssert;
JinFuLinfe641822022-11-16 09:23:10 +080033 std::shared_ptr<sdbusplus::asio::dbus_interface> assertIERR;
Jason M. Bills8fa1c962020-12-10 14:33:56 -080034 const static constexpr size_t ierrPollingTimeMs = 100;
35 const static constexpr size_t ierrTimeoutMs = 2000;
36 const static constexpr size_t ierrTimeoutMsMax =
37 600000; // 10 minutes maximum
38
39 const static constexpr uint8_t beepCPUIERR = 4;
40
41 std::shared_ptr<sdbusplus::asio::dbus_interface> associationIERR;
42 std::shared_ptr<sdbusplus::asio::dbus_interface> hostErrorTimeoutIface;
43
44 static const constexpr char* callbackMgrPath =
45 "/xyz/openbmc_project/CallbackManager";
46
47 void logEvent()
48 {
49 if (!checkIERRCPUs())
50 {
51 cpuIERRLog();
52 }
53 }
54
55 void cpuIERRLog()
56 {
57 sd_journal_send("MESSAGE=HostError: IERR", "PRIORITY=%i", LOG_INFO,
58 "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError",
59 "REDFISH_MESSAGE_ARGS=%s", "IERR", NULL);
60 }
61
62 void cpuIERRLog(const int cpuNum)
63 {
64 std::string msg = "IERR on CPU " + std::to_string(cpuNum + 1);
65
66 sd_journal_send("MESSAGE=HostError: %s", msg.c_str(), "PRIORITY=%i",
67 LOG_INFO, "REDFISH_MESSAGE_ID=%s",
68 "OpenBMC.0.1.CPUError", "REDFISH_MESSAGE_ARGS=%s",
69 msg.c_str(), NULL);
70 }
71
72 void cpuIERRLog(const int cpuNum, const std::string& type)
73 {
74 std::string msg = type + " IERR on CPU " + std::to_string(cpuNum + 1);
75
76 sd_journal_send("MESSAGE=HostError: %s", msg.c_str(), "PRIORITY=%i",
77 LOG_INFO, "REDFISH_MESSAGE_ID=%s",
78 "OpenBMC.0.1.CPUError", "REDFISH_MESSAGE_ARGS=%s",
79 msg.c_str(), NULL);
80 }
81
82 bool checkIERRCPUs()
83 {
84 bool cpuIERRFound = false;
Zev Weiss03ed41b2023-04-10 20:03:08 -070085#ifdef LIBPECI
Jason M. Bills8fa1c962020-12-10 14:33:56 -080086 for (size_t cpu = 0, addr = MIN_CLIENT_ADDR; addr <= MAX_CLIENT_ADDR;
87 cpu++, addr++)
88 {
89 EPECIStatus peciStatus = PECI_CC_SUCCESS;
90 uint8_t cc = 0;
91 CPUModel model{};
92 uint8_t stepping = 0;
93 if (peci_GetCPUID(addr, &model, &stepping, &cc) != PECI_CC_SUCCESS)
94 {
95 std::cerr << "Cannot get CPUID!\n";
96 continue;
97 }
98
99 switch (model)
100 {
101 case skx:
102 {
103 // First check the MCA_ERR_SRC_LOG to see if this is the CPU
104 // that caused the IERR
105 uint32_t mcaErrSrcLog = 0;
106 peciStatus = peci_RdPkgConfig(addr, 0, 5, 4,
107 (uint8_t*)&mcaErrSrcLog, &cc);
108 if (peciError(peciStatus, cc))
109 {
110 printPECIError("MCA_ERR_SRC_LOG", addr, peciStatus, cc);
111 continue;
112 }
113 // Check MSMI_INTERNAL (20) and IERR_INTERNAL (27)
114 if ((mcaErrSrcLog & (1 << 20)) ||
115 (mcaErrSrcLog & (1 << 27)))
116 {
117 // TODO: Light the CPU fault LED?
118 cpuIERRFound = true;
119 incrementCPUErrorCount(cpu);
120 // Next check if it's a CPU/VR mismatch by reading the
121 // IA32_MC4_STATUS MSR (0x411)
122 uint64_t mc4Status = 0;
123 peciStatus =
124 peci_RdIAMSR(addr, 0, 0x411, &mc4Status, &cc);
125 if (peciError(peciStatus, cc))
126 {
127 printPECIError("IA32_MC4_STATUS", addr, peciStatus,
128 cc);
129 continue;
130 }
131 // Check MSEC bits 31:24 for
132 // MCA_SVID_VCCIN_VR_ICC_MAX_FAILURE (0x40),
133 // MCA_SVID_VCCIN_VR_VOUT_FAILURE (0x42), or
134 // MCA_SVID_CPU_VR_CAPABILITY_ERROR (0x43)
135 uint64_t msec = (mc4Status >> 24) & 0xFF;
136 if (msec == 0x40 || msec == 0x42 || msec == 0x43)
137 {
138 cpuIERRLog(cpu, "CPU/VR Mismatch");
139 continue;
140 }
141
142 // Next check if it's a Core FIVR fault by looking for a
143 // non-zero value of CORE_FIVR_ERR_LOG (B(1) D30 F2
144 // offset 80h)
145 uint32_t coreFIVRErrLog = 0;
146 peciStatus = peci_RdPCIConfigLocal(
147 addr, 1, 30, 2, 0x80, sizeof(uint32_t),
148 (uint8_t*)&coreFIVRErrLog, &cc);
149 if (peciError(peciStatus, cc))
150 {
151 printPECIError("CORE_FIVR_ERR_LOG", addr,
152 peciStatus, cc);
153 continue;
154 }
155 if (coreFIVRErrLog)
156 {
157 cpuIERRLog(cpu, "Core FIVR Fault");
158 continue;
159 }
160
161 // Next check if it's an Uncore FIVR fault by looking
162 // for a non-zero value of UNCORE_FIVR_ERR_LOG (B(1) D30
163 // F2 offset 84h)
164 uint32_t uncoreFIVRErrLog = 0;
165 peciStatus = peci_RdPCIConfigLocal(
166 addr, 1, 30, 2, 0x84, sizeof(uint32_t),
167 (uint8_t*)&uncoreFIVRErrLog, &cc);
168 if (peciError(peciStatus, cc))
169 {
170 printPECIError("UNCORE_FIVR_ERR_LOG", addr,
171 peciStatus, cc);
172 continue;
173 }
174 if (uncoreFIVRErrLog)
175 {
176 cpuIERRLog(cpu, "Uncore FIVR Fault");
177 continue;
178 }
179
180 // Last if CORE_FIVR_ERR_LOG and UNCORE_FIVR_ERR_LOG are
181 // both zero, but MSEC bits 31:24 have either
182 // MCA_FIVR_CATAS_OVERVOL_FAULT (0x51) or
183 // MCA_FIVR_CATAS_OVERCUR_FAULT (0x52), then log it as
184 // an uncore FIVR fault
185 if (!coreFIVRErrLog && !uncoreFIVRErrLog &&
186 (msec == 0x51 || msec == 0x52))
187 {
188 cpuIERRLog(cpu, "Uncore FIVR Fault");
189 continue;
190 }
191 cpuIERRLog(cpu);
192 }
193 break;
194 }
195 case icx:
196 {
197 // First check the MCA_ERR_SRC_LOG to see if this is the CPU
198 // that caused the IERR
199 uint32_t mcaErrSrcLog = 0;
200 peciStatus = peci_RdPkgConfig(addr, 0, 5, 4,
201 (uint8_t*)&mcaErrSrcLog, &cc);
202 if (peciError(peciStatus, cc))
203 {
204 printPECIError("MCA_ERR_SRC_LOG", addr, peciStatus, cc);
205 continue;
206 }
207 // Check MSMI_INTERNAL (20) and IERR_INTERNAL (27)
208 if ((mcaErrSrcLog & (1 << 20)) ||
209 (mcaErrSrcLog & (1 << 27)))
210 {
211 // TODO: Light the CPU fault LED?
212 cpuIERRFound = true;
213 incrementCPUErrorCount(cpu);
214 // Next check if it's a CPU/VR mismatch by reading the
215 // IA32_MC4_STATUS MSR (0x411)
216 uint64_t mc4Status = 0;
217 peciStatus =
218 peci_RdIAMSR(addr, 0, 0x411, &mc4Status, &cc);
219 if (peciError(peciStatus, cc))
220 {
221 printPECIError("IA32_MC4_STATUS", addr, peciStatus,
222 cc);
223 continue;
224 }
225 // Check MSEC bits 31:24 for
226 // MCA_SVID_VCCIN_VR_ICC_MAX_FAILURE (0x40),
227 // MCA_SVID_VCCIN_VR_VOUT_FAILURE (0x42), or
228 // MCA_SVID_CPU_VR_CAPABILITY_ERROR (0x43)
229 uint64_t msec = (mc4Status >> 24) & 0xFF;
230 if (msec == 0x40 || msec == 0x42 || msec == 0x43)
231 {
232 cpuIERRLog(cpu, "CPU/VR Mismatch");
233 continue;
234 }
235
236 // Next check if it's a Core FIVR fault by looking for a
237 // non-zero value of CORE_FIVR_ERR_LOG (B(31) D30 F2
238 // offsets C0h and C4h) (Note: Bus 31 is accessed on
239 // PECI as bus 14)
240 uint32_t coreFIVRErrLog0 = 0;
241 uint32_t coreFIVRErrLog1 = 0;
242 peciStatus = peci_RdEndPointConfigPciLocal(
243 addr, 0, 14, 30, 2, 0xC0, sizeof(uint32_t),
244 (uint8_t*)&coreFIVRErrLog0, &cc);
245 if (peciError(peciStatus, cc))
246 {
247 printPECIError("CORE_FIVR_ERR_LOG_0", addr,
248 peciStatus, cc);
249 continue;
250 }
251 peciStatus = peci_RdEndPointConfigPciLocal(
252 addr, 0, 14, 30, 2, 0xC4, sizeof(uint32_t),
253 (uint8_t*)&coreFIVRErrLog1, &cc);
254 if (peciError(peciStatus, cc))
255 {
256 printPECIError("CORE_FIVR_ERR_LOG_1", addr,
257 peciStatus, cc);
258 continue;
259 }
260 if (coreFIVRErrLog0 || coreFIVRErrLog1)
261 {
262 cpuIERRLog(cpu, "Core FIVR Fault");
263 continue;
264 }
265
266 // Next check if it's an Uncore FIVR fault by looking
267 // for a non-zero value of UNCORE_FIVR_ERR_LOG (B(31)
268 // D30 F2 offset 84h) (Note: Bus 31 is accessed on PECI
269 // as bus 14)
270 uint32_t uncoreFIVRErrLog = 0;
271 peciStatus = peci_RdEndPointConfigPciLocal(
272 addr, 0, 14, 30, 2, 0x84, sizeof(uint32_t),
273 (uint8_t*)&uncoreFIVRErrLog, &cc);
274 if (peciError(peciStatus, cc))
275 {
276 printPECIError("UNCORE_FIVR_ERR_LOG", addr,
277 peciStatus, cc);
278 continue;
279 }
280 if (uncoreFIVRErrLog)
281 {
282 cpuIERRLog(cpu, "Uncore FIVR Fault");
283 continue;
284 }
285
286 // TODO: Update MSEC/MSCOD_31_24 check
287 // Last if CORE_FIVR_ERR_LOG and UNCORE_FIVR_ERR_LOG are
288 // both zero, but MSEC bits 31:24 have either
289 // MCA_FIVR_CATAS_OVERVOL_FAULT (0x51) or
290 // MCA_FIVR_CATAS_OVERCUR_FAULT (0x52), then log it as
291 // an uncore FIVR fault
292 if (!coreFIVRErrLog0 && !coreFIVRErrLog1 &&
293 !uncoreFIVRErrLog && (msec == 0x51 || msec == 0x52))
294 {
295 cpuIERRLog(cpu, "Uncore FIVR Fault");
296 continue;
297 }
298 cpuIERRLog(cpu);
299 }
300 break;
301 }
302 }
303 }
Zev Weiss03ed41b2023-04-10 20:03:08 -0700304#endif
Jason M. Bills8fa1c962020-12-10 14:33:56 -0800305 return cpuIERRFound;
306 }
307
308 void incrementCPUErrorCount(int cpuNum)
309 {
310 std::string propertyName = "ErrorCountCPU" + std::to_string(cpuNum + 1);
311
312 // Get the current count
313 conn->async_method_call(
314 [this, propertyName](boost::system::error_code ec,
315 const std::variant<uint8_t>& property) {
316 if (ec)
317 {
318 std::cerr << "Failed to read " << propertyName << ": "
319 << ec.message() << "\n";
320 return;
321 }
322 const uint8_t* errorCountVariant =
323 std::get_if<uint8_t>(&property);
324 if (errorCountVariant == nullptr)
325 {
326 std::cerr << propertyName << " invalid\n";
327 return;
328 }
329 uint8_t errorCount = *errorCountVariant;
330 if (errorCount == std::numeric_limits<uint8_t>::max())
331 {
332 std::cerr << "Maximum error count reached\n";
333 return;
334 }
335 // Increment the count
336 errorCount++;
337 conn->async_method_call(
338 [propertyName](boost::system::error_code ec) {
339 if (ec)
340 {
341 std::cerr << "Failed to set " << propertyName
342 << ": " << ec.message() << "\n";
343 }
344 },
345 "xyz.openbmc_project.Settings",
346 "/xyz/openbmc_project/control/processor_error_config",
347 "org.freedesktop.DBus.Properties", "Set",
348 "xyz.openbmc_project.Control.Processor.ErrConfig",
349 propertyName, std::variant<uint8_t>{errorCount});
350 },
351 "xyz.openbmc_project.Settings",
352 "/xyz/openbmc_project/control/processor_error_config",
353 "org.freedesktop.DBus.Properties", "Get",
354 "xyz.openbmc_project.Control.Processor.ErrConfig", propertyName);
355 }
356
357 void assertHandler() override
358 {
359 host_error_monitor::base_gpio_poll_monitor::BaseGPIOPollMonitor::
360 assertHandler();
361
362 setLED();
JinFuLinfe641822022-11-16 09:23:10 +0800363 assertIERR->set_property("Asserted", true);
Jason M. Bills8fa1c962020-12-10 14:33:56 -0800364
365 beep(conn, beepCPUIERR);
366
367 conn->async_method_call(
368 [this](boost::system::error_code ec,
369 const std::variant<bool>& property) {
JinFuLin519f2cd2022-11-08 10:48:56 +0800370 // Default to no reset after Crashdump
Jason M. Billsc4a241e2023-07-26 14:15:04 -0700371 RecoveryType recovery = RecoveryType::noRecovery;
JinFuLin519f2cd2022-11-08 10:48:56 +0800372 if (!ec)
Jason M. Bills8fa1c962020-12-10 14:33:56 -0800373 {
JinFuLin519f2cd2022-11-08 10:48:56 +0800374 const bool* resetPtr = std::get_if<bool>(&property);
375 if (resetPtr == nullptr)
376 {
377 std::cerr << "Unable to read reset on CATERR value\n";
378 }
Jason M. Billsc4a241e2023-07-26 14:15:04 -0700379 else if (*resetPtr)
JinFuLin519f2cd2022-11-08 10:48:56 +0800380 {
Jason M. Billsc4a241e2023-07-26 14:15:04 -0700381 recovery = RecoveryType::warmReset;
JinFuLin519f2cd2022-11-08 10:48:56 +0800382 }
Jason M. Bills8fa1c962020-12-10 14:33:56 -0800383 }
Jason M. Billsc4a241e2023-07-26 14:15:04 -0700384 startCrashdumpAndRecovery(conn, recovery, "IERR");
Jason M. Bills8fa1c962020-12-10 14:33:56 -0800385 },
386 "xyz.openbmc_project.Settings",
387 "/xyz/openbmc_project/control/processor_error_config",
388 "org.freedesktop.DBus.Properties", "Get",
389 "xyz.openbmc_project.Control.Processor.ErrConfig", "ResetOnCATERR");
390 }
391
392 void deassertHandler() override
393 {
394 unsetLED();
JinFuLinfe641822022-11-16 09:23:10 +0800395 assertIERR->set_property("Asserted", false);
Jason M. Bills8fa1c962020-12-10 14:33:56 -0800396 }
397
398 void setLED()
399 {
400 std::vector<Association> associations;
401
402 associations.emplace_back(
403 "", "critical", "/xyz/openbmc_project/host_error_monitor/ierr");
404 associations.emplace_back("", "critical", callbackMgrPath);
405
406 associationIERR->set_property("Associations", associations);
407 }
408
409 void unsetLED()
410 {
411 std::vector<Association> associations;
412
413 associations.emplace_back("", "", "");
414
415 associationIERR->set_property("Associations", associations);
416 }
417
418 public:
Ed Tanousee00ccc2023-03-01 10:37:43 -0800419 IERRMonitor(boost::asio::io_context& io,
Jason M. Bills8fa1c962020-12-10 14:33:56 -0800420 std::shared_ptr<sdbusplus::asio::connection> conn,
JinFuLinf8c0e1a2022-12-05 16:32:43 +0800421 const std::string& signalName,
422 const std::string& customName = std::string()) :
Jason M. Bills8fa1c962020-12-10 14:33:56 -0800423 BaseGPIOPollMonitor(io, conn, signalName, assertValue,
424 ierrPollingTimeMs, ierrTimeoutMs)
425 {
426 // Associations interface for led status
427 std::vector<host_error_monitor::Association> associations;
428 associations.emplace_back("", "", "");
429
430 sdbusplus::asio::object_server server =
431 sdbusplus::asio::object_server(conn);
432 associationIERR =
433 server.add_interface("/xyz/openbmc_project/host_error_monitor/ierr",
434 "xyz.openbmc_project.Association.Definitions");
435 associationIERR->register_property("Associations", associations);
436 associationIERR->initialize();
437
438 hostErrorTimeoutIface = server.add_interface(
439 "/xyz/openbmc_project/host_error_monitor",
440 "xyz.openbmc_project.HostErrorMonitor.Timeout");
441
442 hostErrorTimeoutIface->register_property(
443 "IERRTimeoutMs", ierrTimeoutMs,
444 [this](const std::size_t& requested, std::size_t& resp) {
445 if (requested > ierrTimeoutMsMax)
446 {
447 std::cerr << "IERRTimeoutMs update to " << requested
448 << "ms rejected. Cannot be greater than "
449 << ierrTimeoutMsMax << "ms.\n";
450 return 0;
451 }
452 std::cerr << "IERRTimeoutMs updated to " << requested << "ms\n";
453 setTimeoutMs(requested);
454 resp = requested;
455 return 1;
456 },
457 [this](std::size_t& resp) { return getTimeoutMs(); });
458 hostErrorTimeoutIface->initialize();
459
JinFuLinf8c0e1a2022-12-05 16:32:43 +0800460 std::string objectName = customName.empty() ? signalName : customName;
JinFuLinfe641822022-11-16 09:23:10 +0800461 assertIERR = server.add_interface(
JinFuLinf8c0e1a2022-12-05 16:32:43 +0800462 "/xyz/openbmc_project/host_error_monitor/processor/" + objectName,
463 "xyz.openbmc_project.HostErrorMonitor.Processor.IERR");
JinFuLinfe641822022-11-16 09:23:10 +0800464 assertIERR->register_property("Asserted", false);
465 assertIERR->initialize();
466
Jason M. Bills8fa1c962020-12-10 14:33:56 -0800467 if (valid)
468 {
469 startPolling();
470 }
471 }
472};
473} // namespace host_error_monitor::ierr_monitor