/*
// Copyright (c) 2021 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
*/
#pragma once
#include <systemd/sd-journal.h>

#include <error_monitors/base_gpio_poll_monitor.hpp>
#include <host_error_monitor.hpp>
#include <sdbusplus/asio/object_server.hpp>

#include <cstddef>
#include <cstdint>
#include <iostream>
#include <limits>
#include <memory>
#include <string>
#include <variant>
#include <vector>

23namespace host_error_monitor::ierr_monitor
24{
25static constexpr bool debug = true;
26
27class IERRMonitor :
28 public host_error_monitor::base_gpio_poll_monitor::BaseGPIOPollMonitor
29{
30 const static host_error_monitor::base_gpio_poll_monitor::AssertValue
31 assertValue =
32 host_error_monitor::base_gpio_poll_monitor::AssertValue::lowAssert;
33 const static constexpr size_t ierrPollingTimeMs = 100;
34 const static constexpr size_t ierrTimeoutMs = 2000;
35 const static constexpr size_t ierrTimeoutMsMax =
36 600000; // 10 minutes maximum
37
38 const static constexpr uint8_t beepCPUIERR = 4;
39
40 std::shared_ptr<sdbusplus::asio::dbus_interface> associationIERR;
41 std::shared_ptr<sdbusplus::asio::dbus_interface> hostErrorTimeoutIface;
42
43 static const constexpr char* callbackMgrPath =
44 "/xyz/openbmc_project/CallbackManager";
45
46 void logEvent()
47 {
48 if (!checkIERRCPUs())
49 {
50 cpuIERRLog();
51 }
52 }
53
54 void cpuIERRLog()
55 {
56 sd_journal_send("MESSAGE=HostError: IERR", "PRIORITY=%i", LOG_INFO,
57 "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError",
58 "REDFISH_MESSAGE_ARGS=%s", "IERR", NULL);
59 }
60
61 void cpuIERRLog(const int cpuNum)
62 {
63 std::string msg = "IERR on CPU " + std::to_string(cpuNum + 1);
64
65 sd_journal_send("MESSAGE=HostError: %s", msg.c_str(), "PRIORITY=%i",
66 LOG_INFO, "REDFISH_MESSAGE_ID=%s",
67 "OpenBMC.0.1.CPUError", "REDFISH_MESSAGE_ARGS=%s",
68 msg.c_str(), NULL);
69 }
70
71 void cpuIERRLog(const int cpuNum, const std::string& type)
72 {
73 std::string msg = type + " IERR on CPU " + std::to_string(cpuNum + 1);
74
75 sd_journal_send("MESSAGE=HostError: %s", msg.c_str(), "PRIORITY=%i",
76 LOG_INFO, "REDFISH_MESSAGE_ID=%s",
77 "OpenBMC.0.1.CPUError", "REDFISH_MESSAGE_ARGS=%s",
78 msg.c_str(), NULL);
79 }
80
81 bool checkIERRCPUs()
82 {
83 bool cpuIERRFound = false;
84 for (size_t cpu = 0, addr = MIN_CLIENT_ADDR; addr <= MAX_CLIENT_ADDR;
85 cpu++, addr++)
86 {
87 EPECIStatus peciStatus = PECI_CC_SUCCESS;
88 uint8_t cc = 0;
89 CPUModel model{};
90 uint8_t stepping = 0;
91 if (peci_GetCPUID(addr, &model, &stepping, &cc) != PECI_CC_SUCCESS)
92 {
93 std::cerr << "Cannot get CPUID!\n";
94 continue;
95 }
96
97 switch (model)
98 {
99 case skx:
100 {
101 // First check the MCA_ERR_SRC_LOG to see if this is the CPU
102 // that caused the IERR
103 uint32_t mcaErrSrcLog = 0;
104 peciStatus = peci_RdPkgConfig(addr, 0, 5, 4,
105 (uint8_t*)&mcaErrSrcLog, &cc);
106 if (peciError(peciStatus, cc))
107 {
108 printPECIError("MCA_ERR_SRC_LOG", addr, peciStatus, cc);
109 continue;
110 }
111 // Check MSMI_INTERNAL (20) and IERR_INTERNAL (27)
112 if ((mcaErrSrcLog & (1 << 20)) ||
113 (mcaErrSrcLog & (1 << 27)))
114 {
115 // TODO: Light the CPU fault LED?
116 cpuIERRFound = true;
117 incrementCPUErrorCount(cpu);
118 // Next check if it's a CPU/VR mismatch by reading the
119 // IA32_MC4_STATUS MSR (0x411)
120 uint64_t mc4Status = 0;
121 peciStatus =
122 peci_RdIAMSR(addr, 0, 0x411, &mc4Status, &cc);
123 if (peciError(peciStatus, cc))
124 {
125 printPECIError("IA32_MC4_STATUS", addr, peciStatus,
126 cc);
127 continue;
128 }
129 // Check MSEC bits 31:24 for
130 // MCA_SVID_VCCIN_VR_ICC_MAX_FAILURE (0x40),
131 // MCA_SVID_VCCIN_VR_VOUT_FAILURE (0x42), or
132 // MCA_SVID_CPU_VR_CAPABILITY_ERROR (0x43)
133 uint64_t msec = (mc4Status >> 24) & 0xFF;
134 if (msec == 0x40 || msec == 0x42 || msec == 0x43)
135 {
136 cpuIERRLog(cpu, "CPU/VR Mismatch");
137 continue;
138 }
139
140 // Next check if it's a Core FIVR fault by looking for a
141 // non-zero value of CORE_FIVR_ERR_LOG (B(1) D30 F2
142 // offset 80h)
143 uint32_t coreFIVRErrLog = 0;
144 peciStatus = peci_RdPCIConfigLocal(
145 addr, 1, 30, 2, 0x80, sizeof(uint32_t),
146 (uint8_t*)&coreFIVRErrLog, &cc);
147 if (peciError(peciStatus, cc))
148 {
149 printPECIError("CORE_FIVR_ERR_LOG", addr,
150 peciStatus, cc);
151 continue;
152 }
153 if (coreFIVRErrLog)
154 {
155 cpuIERRLog(cpu, "Core FIVR Fault");
156 continue;
157 }
158
159 // Next check if it's an Uncore FIVR fault by looking
160 // for a non-zero value of UNCORE_FIVR_ERR_LOG (B(1) D30
161 // F2 offset 84h)
162 uint32_t uncoreFIVRErrLog = 0;
163 peciStatus = peci_RdPCIConfigLocal(
164 addr, 1, 30, 2, 0x84, sizeof(uint32_t),
165 (uint8_t*)&uncoreFIVRErrLog, &cc);
166 if (peciError(peciStatus, cc))
167 {
168 printPECIError("UNCORE_FIVR_ERR_LOG", addr,
169 peciStatus, cc);
170 continue;
171 }
172 if (uncoreFIVRErrLog)
173 {
174 cpuIERRLog(cpu, "Uncore FIVR Fault");
175 continue;
176 }
177
178 // Last if CORE_FIVR_ERR_LOG and UNCORE_FIVR_ERR_LOG are
179 // both zero, but MSEC bits 31:24 have either
180 // MCA_FIVR_CATAS_OVERVOL_FAULT (0x51) or
181 // MCA_FIVR_CATAS_OVERCUR_FAULT (0x52), then log it as
182 // an uncore FIVR fault
183 if (!coreFIVRErrLog && !uncoreFIVRErrLog &&
184 (msec == 0x51 || msec == 0x52))
185 {
186 cpuIERRLog(cpu, "Uncore FIVR Fault");
187 continue;
188 }
189 cpuIERRLog(cpu);
190 }
191 break;
192 }
193 case icx:
194 {
195 // First check the MCA_ERR_SRC_LOG to see if this is the CPU
196 // that caused the IERR
197 uint32_t mcaErrSrcLog = 0;
198 peciStatus = peci_RdPkgConfig(addr, 0, 5, 4,
199 (uint8_t*)&mcaErrSrcLog, &cc);
200 if (peciError(peciStatus, cc))
201 {
202 printPECIError("MCA_ERR_SRC_LOG", addr, peciStatus, cc);
203 continue;
204 }
205 // Check MSMI_INTERNAL (20) and IERR_INTERNAL (27)
206 if ((mcaErrSrcLog & (1 << 20)) ||
207 (mcaErrSrcLog & (1 << 27)))
208 {
209 // TODO: Light the CPU fault LED?
210 cpuIERRFound = true;
211 incrementCPUErrorCount(cpu);
212 // Next check if it's a CPU/VR mismatch by reading the
213 // IA32_MC4_STATUS MSR (0x411)
214 uint64_t mc4Status = 0;
215 peciStatus =
216 peci_RdIAMSR(addr, 0, 0x411, &mc4Status, &cc);
217 if (peciError(peciStatus, cc))
218 {
219 printPECIError("IA32_MC4_STATUS", addr, peciStatus,
220 cc);
221 continue;
222 }
223 // Check MSEC bits 31:24 for
224 // MCA_SVID_VCCIN_VR_ICC_MAX_FAILURE (0x40),
225 // MCA_SVID_VCCIN_VR_VOUT_FAILURE (0x42), or
226 // MCA_SVID_CPU_VR_CAPABILITY_ERROR (0x43)
227 uint64_t msec = (mc4Status >> 24) & 0xFF;
228 if (msec == 0x40 || msec == 0x42 || msec == 0x43)
229 {
230 cpuIERRLog(cpu, "CPU/VR Mismatch");
231 continue;
232 }
233
234 // Next check if it's a Core FIVR fault by looking for a
235 // non-zero value of CORE_FIVR_ERR_LOG (B(31) D30 F2
236 // offsets C0h and C4h) (Note: Bus 31 is accessed on
237 // PECI as bus 14)
238 uint32_t coreFIVRErrLog0 = 0;
239 uint32_t coreFIVRErrLog1 = 0;
240 peciStatus = peci_RdEndPointConfigPciLocal(
241 addr, 0, 14, 30, 2, 0xC0, sizeof(uint32_t),
242 (uint8_t*)&coreFIVRErrLog0, &cc);
243 if (peciError(peciStatus, cc))
244 {
245 printPECIError("CORE_FIVR_ERR_LOG_0", addr,
246 peciStatus, cc);
247 continue;
248 }
249 peciStatus = peci_RdEndPointConfigPciLocal(
250 addr, 0, 14, 30, 2, 0xC4, sizeof(uint32_t),
251 (uint8_t*)&coreFIVRErrLog1, &cc);
252 if (peciError(peciStatus, cc))
253 {
254 printPECIError("CORE_FIVR_ERR_LOG_1", addr,
255 peciStatus, cc);
256 continue;
257 }
258 if (coreFIVRErrLog0 || coreFIVRErrLog1)
259 {
260 cpuIERRLog(cpu, "Core FIVR Fault");
261 continue;
262 }
263
264 // Next check if it's an Uncore FIVR fault by looking
265 // for a non-zero value of UNCORE_FIVR_ERR_LOG (B(31)
266 // D30 F2 offset 84h) (Note: Bus 31 is accessed on PECI
267 // as bus 14)
268 uint32_t uncoreFIVRErrLog = 0;
269 peciStatus = peci_RdEndPointConfigPciLocal(
270 addr, 0, 14, 30, 2, 0x84, sizeof(uint32_t),
271 (uint8_t*)&uncoreFIVRErrLog, &cc);
272 if (peciError(peciStatus, cc))
273 {
274 printPECIError("UNCORE_FIVR_ERR_LOG", addr,
275 peciStatus, cc);
276 continue;
277 }
278 if (uncoreFIVRErrLog)
279 {
280 cpuIERRLog(cpu, "Uncore FIVR Fault");
281 continue;
282 }
283
284 // TODO: Update MSEC/MSCOD_31_24 check
285 // Last if CORE_FIVR_ERR_LOG and UNCORE_FIVR_ERR_LOG are
286 // both zero, but MSEC bits 31:24 have either
287 // MCA_FIVR_CATAS_OVERVOL_FAULT (0x51) or
288 // MCA_FIVR_CATAS_OVERCUR_FAULT (0x52), then log it as
289 // an uncore FIVR fault
290 if (!coreFIVRErrLog0 && !coreFIVRErrLog1 &&
291 !uncoreFIVRErrLog && (msec == 0x51 || msec == 0x52))
292 {
293 cpuIERRLog(cpu, "Uncore FIVR Fault");
294 continue;
295 }
296 cpuIERRLog(cpu);
297 }
298 break;
299 }
300 }
301 }
302 return cpuIERRFound;
303 }
304
305 void incrementCPUErrorCount(int cpuNum)
306 {
307 std::string propertyName = "ErrorCountCPU" + std::to_string(cpuNum + 1);
308
309 // Get the current count
310 conn->async_method_call(
311 [this, propertyName](boost::system::error_code ec,
312 const std::variant<uint8_t>& property) {
313 if (ec)
314 {
315 std::cerr << "Failed to read " << propertyName << ": "
316 << ec.message() << "\n";
317 return;
318 }
319 const uint8_t* errorCountVariant =
320 std::get_if<uint8_t>(&property);
321 if (errorCountVariant == nullptr)
322 {
323 std::cerr << propertyName << " invalid\n";
324 return;
325 }
326 uint8_t errorCount = *errorCountVariant;
327 if (errorCount == std::numeric_limits<uint8_t>::max())
328 {
329 std::cerr << "Maximum error count reached\n";
330 return;
331 }
332 // Increment the count
333 errorCount++;
334 conn->async_method_call(
335 [propertyName](boost::system::error_code ec) {
336 if (ec)
337 {
338 std::cerr << "Failed to set " << propertyName
339 << ": " << ec.message() << "\n";
340 }
341 },
342 "xyz.openbmc_project.Settings",
343 "/xyz/openbmc_project/control/processor_error_config",
344 "org.freedesktop.DBus.Properties", "Set",
345 "xyz.openbmc_project.Control.Processor.ErrConfig",
346 propertyName, std::variant<uint8_t>{errorCount});
347 },
348 "xyz.openbmc_project.Settings",
349 "/xyz/openbmc_project/control/processor_error_config",
350 "org.freedesktop.DBus.Properties", "Get",
351 "xyz.openbmc_project.Control.Processor.ErrConfig", propertyName);
352 }
353
354 void assertHandler() override
355 {
356 host_error_monitor::base_gpio_poll_monitor::BaseGPIOPollMonitor::
357 assertHandler();
358
359 setLED();
360
361 beep(conn, beepCPUIERR);
362
363 conn->async_method_call(
364 [this](boost::system::error_code ec,
365 const std::variant<bool>& property) {
366 if (ec)
367 {
368 return;
369 }
370 const bool* reset = std::get_if<bool>(&property);
371 if (reset == nullptr)
372 {
373 std::cerr << "Unable to read reset on CATERR value\n";
374 return;
375 }
376 startCrashdumpAndRecovery(conn, *reset, "IERR");
377 },
378 "xyz.openbmc_project.Settings",
379 "/xyz/openbmc_project/control/processor_error_config",
380 "org.freedesktop.DBus.Properties", "Get",
381 "xyz.openbmc_project.Control.Processor.ErrConfig", "ResetOnCATERR");
382 }
383
384 void deassertHandler() override
385 {
386 unsetLED();
387 }
388
389 void setLED()
390 {
391 std::vector<Association> associations;
392
393 associations.emplace_back(
394 "", "critical", "/xyz/openbmc_project/host_error_monitor/ierr");
395 associations.emplace_back("", "critical", callbackMgrPath);
396
397 associationIERR->set_property("Associations", associations);
398 }
399
400 void unsetLED()
401 {
402 std::vector<Association> associations;
403
404 associations.emplace_back("", "", "");
405
406 associationIERR->set_property("Associations", associations);
407 }
408
409 public:
410 IERRMonitor(boost::asio::io_service& io,
411 std::shared_ptr<sdbusplus::asio::connection> conn,
412 const std::string& signalName) :
413 BaseGPIOPollMonitor(io, conn, signalName, assertValue,
414 ierrPollingTimeMs, ierrTimeoutMs)
415 {
416 // Associations interface for led status
417 std::vector<host_error_monitor::Association> associations;
418 associations.emplace_back("", "", "");
419
420 sdbusplus::asio::object_server server =
421 sdbusplus::asio::object_server(conn);
422 associationIERR =
423 server.add_interface("/xyz/openbmc_project/host_error_monitor/ierr",
424 "xyz.openbmc_project.Association.Definitions");
425 associationIERR->register_property("Associations", associations);
426 associationIERR->initialize();
427
428 hostErrorTimeoutIface = server.add_interface(
429 "/xyz/openbmc_project/host_error_monitor",
430 "xyz.openbmc_project.HostErrorMonitor.Timeout");
431
432 hostErrorTimeoutIface->register_property(
433 "IERRTimeoutMs", ierrTimeoutMs,
434 [this](const std::size_t& requested, std::size_t& resp) {
435 if (requested > ierrTimeoutMsMax)
436 {
437 std::cerr << "IERRTimeoutMs update to " << requested
438 << "ms rejected. Cannot be greater than "
439 << ierrTimeoutMsMax << "ms.\n";
440 return 0;
441 }
442 std::cerr << "IERRTimeoutMs updated to " << requested << "ms\n";
443 setTimeoutMs(requested);
444 resp = requested;
445 return 1;
446 },
447 [this](std::size_t& resp) { return getTimeoutMs(); });
448 hostErrorTimeoutIface->initialize();
449
450 if (valid)
451 {
452 startPolling();
453 }
454 }
455};
456} // namespace host_error_monitor::ierr_monitor