blob: 213eb2e0e622beb5517ad5cb593b2356ee1c561b [file] [log] [blame]
/*
// Copyright (c) 2021 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
*/
16#pragma once
17#include <systemd/sd-journal.h>
18
19#include <error_monitors/base_gpio_poll_monitor.hpp>
20#include <host_error_monitor.hpp>
21#include <sdbusplus/asio/object_server.hpp>
22
23namespace host_error_monitor::ierr_monitor
24{
// Compile-time debug switch for the IERR monitor namespace. It is set to true
// here; nothing in the visible part of this header reads it.
static constexpr bool debug{true};
26
27class IERRMonitor :
28 public host_error_monitor::base_gpio_poll_monitor::BaseGPIOPollMonitor
29{
30 const static host_error_monitor::base_gpio_poll_monitor::AssertValue
31 assertValue =
32 host_error_monitor::base_gpio_poll_monitor::AssertValue::lowAssert;
JinFuLinfe641822022-11-16 09:23:10 +080033 std::shared_ptr<sdbusplus::asio::dbus_interface> assertIERR;
Jason M. Bills8fa1c962020-12-10 14:33:56 -080034 const static constexpr size_t ierrPollingTimeMs = 100;
35 const static constexpr size_t ierrTimeoutMs = 2000;
36 const static constexpr size_t ierrTimeoutMsMax =
37 600000; // 10 minutes maximum
38
39 const static constexpr uint8_t beepCPUIERR = 4;
40
41 std::shared_ptr<sdbusplus::asio::dbus_interface> associationIERR;
42 std::shared_ptr<sdbusplus::asio::dbus_interface> hostErrorTimeoutIface;
43
44 static const constexpr char* callbackMgrPath =
45 "/xyz/openbmc_project/CallbackManager";
46
47 void logEvent()
48 {
49 if (!checkIERRCPUs())
50 {
51 cpuIERRLog();
52 }
53 }
54
55 void cpuIERRLog()
56 {
57 sd_journal_send("MESSAGE=HostError: IERR", "PRIORITY=%i", LOG_INFO,
58 "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError",
59 "REDFISH_MESSAGE_ARGS=%s", "IERR", NULL);
60 }
61
62 void cpuIERRLog(const int cpuNum)
63 {
64 std::string msg = "IERR on CPU " + std::to_string(cpuNum + 1);
65
66 sd_journal_send("MESSAGE=HostError: %s", msg.c_str(), "PRIORITY=%i",
67 LOG_INFO, "REDFISH_MESSAGE_ID=%s",
68 "OpenBMC.0.1.CPUError", "REDFISH_MESSAGE_ARGS=%s",
69 msg.c_str(), NULL);
70 }
71
72 void cpuIERRLog(const int cpuNum, const std::string& type)
73 {
74 std::string msg = type + " IERR on CPU " + std::to_string(cpuNum + 1);
75
76 sd_journal_send("MESSAGE=HostError: %s", msg.c_str(), "PRIORITY=%i",
77 LOG_INFO, "REDFISH_MESSAGE_ID=%s",
78 "OpenBMC.0.1.CPUError", "REDFISH_MESSAGE_ARGS=%s",
79 msg.c_str(), NULL);
80 }
81
82 bool checkIERRCPUs()
83 {
84 bool cpuIERRFound = false;
85 for (size_t cpu = 0, addr = MIN_CLIENT_ADDR; addr <= MAX_CLIENT_ADDR;
86 cpu++, addr++)
87 {
88 EPECIStatus peciStatus = PECI_CC_SUCCESS;
89 uint8_t cc = 0;
90 CPUModel model{};
91 uint8_t stepping = 0;
92 if (peci_GetCPUID(addr, &model, &stepping, &cc) != PECI_CC_SUCCESS)
93 {
94 std::cerr << "Cannot get CPUID!\n";
95 continue;
96 }
97
98 switch (model)
99 {
100 case skx:
101 {
102 // First check the MCA_ERR_SRC_LOG to see if this is the CPU
103 // that caused the IERR
104 uint32_t mcaErrSrcLog = 0;
105 peciStatus = peci_RdPkgConfig(addr, 0, 5, 4,
106 (uint8_t*)&mcaErrSrcLog, &cc);
107 if (peciError(peciStatus, cc))
108 {
109 printPECIError("MCA_ERR_SRC_LOG", addr, peciStatus, cc);
110 continue;
111 }
112 // Check MSMI_INTERNAL (20) and IERR_INTERNAL (27)
113 if ((mcaErrSrcLog & (1 << 20)) ||
114 (mcaErrSrcLog & (1 << 27)))
115 {
116 // TODO: Light the CPU fault LED?
117 cpuIERRFound = true;
118 incrementCPUErrorCount(cpu);
119 // Next check if it's a CPU/VR mismatch by reading the
120 // IA32_MC4_STATUS MSR (0x411)
121 uint64_t mc4Status = 0;
122 peciStatus =
123 peci_RdIAMSR(addr, 0, 0x411, &mc4Status, &cc);
124 if (peciError(peciStatus, cc))
125 {
126 printPECIError("IA32_MC4_STATUS", addr, peciStatus,
127 cc);
128 continue;
129 }
130 // Check MSEC bits 31:24 for
131 // MCA_SVID_VCCIN_VR_ICC_MAX_FAILURE (0x40),
132 // MCA_SVID_VCCIN_VR_VOUT_FAILURE (0x42), or
133 // MCA_SVID_CPU_VR_CAPABILITY_ERROR (0x43)
134 uint64_t msec = (mc4Status >> 24) & 0xFF;
135 if (msec == 0x40 || msec == 0x42 || msec == 0x43)
136 {
137 cpuIERRLog(cpu, "CPU/VR Mismatch");
138 continue;
139 }
140
141 // Next check if it's a Core FIVR fault by looking for a
142 // non-zero value of CORE_FIVR_ERR_LOG (B(1) D30 F2
143 // offset 80h)
144 uint32_t coreFIVRErrLog = 0;
145 peciStatus = peci_RdPCIConfigLocal(
146 addr, 1, 30, 2, 0x80, sizeof(uint32_t),
147 (uint8_t*)&coreFIVRErrLog, &cc);
148 if (peciError(peciStatus, cc))
149 {
150 printPECIError("CORE_FIVR_ERR_LOG", addr,
151 peciStatus, cc);
152 continue;
153 }
154 if (coreFIVRErrLog)
155 {
156 cpuIERRLog(cpu, "Core FIVR Fault");
157 continue;
158 }
159
160 // Next check if it's an Uncore FIVR fault by looking
161 // for a non-zero value of UNCORE_FIVR_ERR_LOG (B(1) D30
162 // F2 offset 84h)
163 uint32_t uncoreFIVRErrLog = 0;
164 peciStatus = peci_RdPCIConfigLocal(
165 addr, 1, 30, 2, 0x84, sizeof(uint32_t),
166 (uint8_t*)&uncoreFIVRErrLog, &cc);
167 if (peciError(peciStatus, cc))
168 {
169 printPECIError("UNCORE_FIVR_ERR_LOG", addr,
170 peciStatus, cc);
171 continue;
172 }
173 if (uncoreFIVRErrLog)
174 {
175 cpuIERRLog(cpu, "Uncore FIVR Fault");
176 continue;
177 }
178
179 // Last if CORE_FIVR_ERR_LOG and UNCORE_FIVR_ERR_LOG are
180 // both zero, but MSEC bits 31:24 have either
181 // MCA_FIVR_CATAS_OVERVOL_FAULT (0x51) or
182 // MCA_FIVR_CATAS_OVERCUR_FAULT (0x52), then log it as
183 // an uncore FIVR fault
184 if (!coreFIVRErrLog && !uncoreFIVRErrLog &&
185 (msec == 0x51 || msec == 0x52))
186 {
187 cpuIERRLog(cpu, "Uncore FIVR Fault");
188 continue;
189 }
190 cpuIERRLog(cpu);
191 }
192 break;
193 }
194 case icx:
195 {
196 // First check the MCA_ERR_SRC_LOG to see if this is the CPU
197 // that caused the IERR
198 uint32_t mcaErrSrcLog = 0;
199 peciStatus = peci_RdPkgConfig(addr, 0, 5, 4,
200 (uint8_t*)&mcaErrSrcLog, &cc);
201 if (peciError(peciStatus, cc))
202 {
203 printPECIError("MCA_ERR_SRC_LOG", addr, peciStatus, cc);
204 continue;
205 }
206 // Check MSMI_INTERNAL (20) and IERR_INTERNAL (27)
207 if ((mcaErrSrcLog & (1 << 20)) ||
208 (mcaErrSrcLog & (1 << 27)))
209 {
210 // TODO: Light the CPU fault LED?
211 cpuIERRFound = true;
212 incrementCPUErrorCount(cpu);
213 // Next check if it's a CPU/VR mismatch by reading the
214 // IA32_MC4_STATUS MSR (0x411)
215 uint64_t mc4Status = 0;
216 peciStatus =
217 peci_RdIAMSR(addr, 0, 0x411, &mc4Status, &cc);
218 if (peciError(peciStatus, cc))
219 {
220 printPECIError("IA32_MC4_STATUS", addr, peciStatus,
221 cc);
222 continue;
223 }
224 // Check MSEC bits 31:24 for
225 // MCA_SVID_VCCIN_VR_ICC_MAX_FAILURE (0x40),
226 // MCA_SVID_VCCIN_VR_VOUT_FAILURE (0x42), or
227 // MCA_SVID_CPU_VR_CAPABILITY_ERROR (0x43)
228 uint64_t msec = (mc4Status >> 24) & 0xFF;
229 if (msec == 0x40 || msec == 0x42 || msec == 0x43)
230 {
231 cpuIERRLog(cpu, "CPU/VR Mismatch");
232 continue;
233 }
234
235 // Next check if it's a Core FIVR fault by looking for a
236 // non-zero value of CORE_FIVR_ERR_LOG (B(31) D30 F2
237 // offsets C0h and C4h) (Note: Bus 31 is accessed on
238 // PECI as bus 14)
239 uint32_t coreFIVRErrLog0 = 0;
240 uint32_t coreFIVRErrLog1 = 0;
241 peciStatus = peci_RdEndPointConfigPciLocal(
242 addr, 0, 14, 30, 2, 0xC0, sizeof(uint32_t),
243 (uint8_t*)&coreFIVRErrLog0, &cc);
244 if (peciError(peciStatus, cc))
245 {
246 printPECIError("CORE_FIVR_ERR_LOG_0", addr,
247 peciStatus, cc);
248 continue;
249 }
250 peciStatus = peci_RdEndPointConfigPciLocal(
251 addr, 0, 14, 30, 2, 0xC4, sizeof(uint32_t),
252 (uint8_t*)&coreFIVRErrLog1, &cc);
253 if (peciError(peciStatus, cc))
254 {
255 printPECIError("CORE_FIVR_ERR_LOG_1", addr,
256 peciStatus, cc);
257 continue;
258 }
259 if (coreFIVRErrLog0 || coreFIVRErrLog1)
260 {
261 cpuIERRLog(cpu, "Core FIVR Fault");
262 continue;
263 }
264
265 // Next check if it's an Uncore FIVR fault by looking
266 // for a non-zero value of UNCORE_FIVR_ERR_LOG (B(31)
267 // D30 F2 offset 84h) (Note: Bus 31 is accessed on PECI
268 // as bus 14)
269 uint32_t uncoreFIVRErrLog = 0;
270 peciStatus = peci_RdEndPointConfigPciLocal(
271 addr, 0, 14, 30, 2, 0x84, sizeof(uint32_t),
272 (uint8_t*)&uncoreFIVRErrLog, &cc);
273 if (peciError(peciStatus, cc))
274 {
275 printPECIError("UNCORE_FIVR_ERR_LOG", addr,
276 peciStatus, cc);
277 continue;
278 }
279 if (uncoreFIVRErrLog)
280 {
281 cpuIERRLog(cpu, "Uncore FIVR Fault");
282 continue;
283 }
284
285 // TODO: Update MSEC/MSCOD_31_24 check
286 // Last if CORE_FIVR_ERR_LOG and UNCORE_FIVR_ERR_LOG are
287 // both zero, but MSEC bits 31:24 have either
288 // MCA_FIVR_CATAS_OVERVOL_FAULT (0x51) or
289 // MCA_FIVR_CATAS_OVERCUR_FAULT (0x52), then log it as
290 // an uncore FIVR fault
291 if (!coreFIVRErrLog0 && !coreFIVRErrLog1 &&
292 !uncoreFIVRErrLog && (msec == 0x51 || msec == 0x52))
293 {
294 cpuIERRLog(cpu, "Uncore FIVR Fault");
295 continue;
296 }
297 cpuIERRLog(cpu);
298 }
299 break;
300 }
301 }
302 }
303 return cpuIERRFound;
304 }
305
306 void incrementCPUErrorCount(int cpuNum)
307 {
308 std::string propertyName = "ErrorCountCPU" + std::to_string(cpuNum + 1);
309
310 // Get the current count
311 conn->async_method_call(
312 [this, propertyName](boost::system::error_code ec,
313 const std::variant<uint8_t>& property) {
314 if (ec)
315 {
316 std::cerr << "Failed to read " << propertyName << ": "
317 << ec.message() << "\n";
318 return;
319 }
320 const uint8_t* errorCountVariant =
321 std::get_if<uint8_t>(&property);
322 if (errorCountVariant == nullptr)
323 {
324 std::cerr << propertyName << " invalid\n";
325 return;
326 }
327 uint8_t errorCount = *errorCountVariant;
328 if (errorCount == std::numeric_limits<uint8_t>::max())
329 {
330 std::cerr << "Maximum error count reached\n";
331 return;
332 }
333 // Increment the count
334 errorCount++;
335 conn->async_method_call(
336 [propertyName](boost::system::error_code ec) {
337 if (ec)
338 {
339 std::cerr << "Failed to set " << propertyName
340 << ": " << ec.message() << "\n";
341 }
342 },
343 "xyz.openbmc_project.Settings",
344 "/xyz/openbmc_project/control/processor_error_config",
345 "org.freedesktop.DBus.Properties", "Set",
346 "xyz.openbmc_project.Control.Processor.ErrConfig",
347 propertyName, std::variant<uint8_t>{errorCount});
348 },
349 "xyz.openbmc_project.Settings",
350 "/xyz/openbmc_project/control/processor_error_config",
351 "org.freedesktop.DBus.Properties", "Get",
352 "xyz.openbmc_project.Control.Processor.ErrConfig", propertyName);
353 }
354
355 void assertHandler() override
356 {
357 host_error_monitor::base_gpio_poll_monitor::BaseGPIOPollMonitor::
358 assertHandler();
359
360 setLED();
JinFuLinfe641822022-11-16 09:23:10 +0800361 assertIERR->set_property("Asserted", true);
Jason M. Bills8fa1c962020-12-10 14:33:56 -0800362
363 beep(conn, beepCPUIERR);
364
365 conn->async_method_call(
366 [this](boost::system::error_code ec,
367 const std::variant<bool>& property) {
JinFuLin519f2cd2022-11-08 10:48:56 +0800368 // Default to no reset after Crashdump
369 bool reset = false;
370 if (!ec)
Jason M. Bills8fa1c962020-12-10 14:33:56 -0800371 {
JinFuLin519f2cd2022-11-08 10:48:56 +0800372 const bool* resetPtr = std::get_if<bool>(&property);
373 if (resetPtr == nullptr)
374 {
375 std::cerr << "Unable to read reset on CATERR value\n";
376 }
377 else
378 {
379 reset = *resetPtr;
380 }
Jason M. Bills8fa1c962020-12-10 14:33:56 -0800381 }
JinFuLin519f2cd2022-11-08 10:48:56 +0800382 startCrashdumpAndRecovery(conn, reset, "IERR");
Jason M. Bills8fa1c962020-12-10 14:33:56 -0800383 },
384 "xyz.openbmc_project.Settings",
385 "/xyz/openbmc_project/control/processor_error_config",
386 "org.freedesktop.DBus.Properties", "Get",
387 "xyz.openbmc_project.Control.Processor.ErrConfig", "ResetOnCATERR");
388 }
389
390 void deassertHandler() override
391 {
392 unsetLED();
JinFuLinfe641822022-11-16 09:23:10 +0800393 assertIERR->set_property("Asserted", false);
Jason M. Bills8fa1c962020-12-10 14:33:56 -0800394 }
395
396 void setLED()
397 {
398 std::vector<Association> associations;
399
400 associations.emplace_back(
401 "", "critical", "/xyz/openbmc_project/host_error_monitor/ierr");
402 associations.emplace_back("", "critical", callbackMgrPath);
403
404 associationIERR->set_property("Associations", associations);
405 }
406
407 void unsetLED()
408 {
409 std::vector<Association> associations;
410
411 associations.emplace_back("", "", "");
412
413 associationIERR->set_property("Associations", associations);
414 }
415
416 public:
Ed Tanousee00ccc2023-03-01 10:37:43 -0800417 IERRMonitor(boost::asio::io_context& io,
Jason M. Bills8fa1c962020-12-10 14:33:56 -0800418 std::shared_ptr<sdbusplus::asio::connection> conn,
JinFuLinf8c0e1a2022-12-05 16:32:43 +0800419 const std::string& signalName,
420 const std::string& customName = std::string()) :
Jason M. Bills8fa1c962020-12-10 14:33:56 -0800421 BaseGPIOPollMonitor(io, conn, signalName, assertValue,
422 ierrPollingTimeMs, ierrTimeoutMs)
423 {
424 // Associations interface for led status
425 std::vector<host_error_monitor::Association> associations;
426 associations.emplace_back("", "", "");
427
428 sdbusplus::asio::object_server server =
429 sdbusplus::asio::object_server(conn);
430 associationIERR =
431 server.add_interface("/xyz/openbmc_project/host_error_monitor/ierr",
432 "xyz.openbmc_project.Association.Definitions");
433 associationIERR->register_property("Associations", associations);
434 associationIERR->initialize();
435
436 hostErrorTimeoutIface = server.add_interface(
437 "/xyz/openbmc_project/host_error_monitor",
438 "xyz.openbmc_project.HostErrorMonitor.Timeout");
439
440 hostErrorTimeoutIface->register_property(
441 "IERRTimeoutMs", ierrTimeoutMs,
442 [this](const std::size_t& requested, std::size_t& resp) {
443 if (requested > ierrTimeoutMsMax)
444 {
445 std::cerr << "IERRTimeoutMs update to " << requested
446 << "ms rejected. Cannot be greater than "
447 << ierrTimeoutMsMax << "ms.\n";
448 return 0;
449 }
450 std::cerr << "IERRTimeoutMs updated to " << requested << "ms\n";
451 setTimeoutMs(requested);
452 resp = requested;
453 return 1;
454 },
455 [this](std::size_t& resp) { return getTimeoutMs(); });
456 hostErrorTimeoutIface->initialize();
457
JinFuLinf8c0e1a2022-12-05 16:32:43 +0800458 std::string objectName = customName.empty() ? signalName : customName;
JinFuLinfe641822022-11-16 09:23:10 +0800459 assertIERR = server.add_interface(
JinFuLinf8c0e1a2022-12-05 16:32:43 +0800460 "/xyz/openbmc_project/host_error_monitor/processor/" + objectName,
461 "xyz.openbmc_project.HostErrorMonitor.Processor.IERR");
JinFuLinfe641822022-11-16 09:23:10 +0800462 assertIERR->register_property("Asserted", false);
463 assertIERR->initialize();
464
Jason M. Bills8fa1c962020-12-10 14:33:56 -0800465 if (valid)
466 {
467 startPolling();
468 }
469 }
470};
471} // namespace host_error_monitor::ierr_monitor