blob: b2c485ca79718ad249ed0bf53eaa0c82c4fb8a88 [file] [log] [blame]
Jason M. Bills1490b142019-07-01 15:48:43 -07001/*
2// Copyright (c) 2019 Intel Corporation
3//
4// Licensed under the Apache License, Version 2.0 (the "License");
5// you may not use this file except in compliance with the License.
6// You may obtain a copy of the License at
7//
8// http://www.apache.org/licenses/LICENSE-2.0
9//
10// Unless required by applicable law or agreed to in writing, software
11// distributed under the License is distributed on an "AS IS" BASIS,
12// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13// See the License for the specific language governing permissions and
14// limitations under the License.
15*/
Jason M. Bills6a2cb692019-08-06 11:03:49 -070016#include <peci.h>
Chen Yugange6c0f1c2019-08-02 20:36:42 +080017#include <systemd/sd-journal.h>
18
Jason M. Bills6a2cb692019-08-06 11:03:49 -070019#include <bitset>
Jason M. Bills1490b142019-07-01 15:48:43 -070020#include <boost/asio/posix/stream_descriptor.hpp>
21#include <gpiod.hpp>
22#include <iostream>
23#include <sdbusplus/asio/object_server.hpp>
24
25namespace host_error_monitor
26{
27static boost::asio::io_service io;
28static std::shared_ptr<sdbusplus::asio::connection> conn;
29
30static bool hostOff = true;
31
32const static constexpr size_t caterrTimeoutMs = 2000;
Jason M. Bills6a2cb692019-08-06 11:03:49 -070033const static constexpr size_t err2TimeoutMs = 90000;
Jason M. Bills1490b142019-07-01 15:48:43 -070034const static constexpr size_t crashdumpTimeoutS = 300;
35
36// Timers
37// Timer for CATERR asserted
38static boost::asio::steady_timer caterrAssertTimer(io);
Jason M. Bills6a2cb692019-08-06 11:03:49 -070039// Timer for ERR2 asserted
40static boost::asio::steady_timer err2AssertTimer(io);
Jason M. Bills1490b142019-07-01 15:48:43 -070041
42// GPIO Lines and Event Descriptors
43static gpiod::line caterrLine;
44static boost::asio::posix::stream_descriptor caterrEvent(io);
Jason M. Bills6a2cb692019-08-06 11:03:49 -070045static gpiod::line err2Line;
46static boost::asio::posix::stream_descriptor err2Event(io);
Chen Yugange6c0f1c2019-08-02 20:36:42 +080047//----------------------------------
48// PCH_BMC_THERMTRIP function related definition
49//----------------------------------
50// GPIO Lines and Event Descriptors
51static gpiod::line pchThermtripLine;
52static boost::asio::posix::stream_descriptor pchThermtripEvent(io);
Jason M. Bills1490b142019-07-01 15:48:43 -070053
Jason M. Bills6a2cb692019-08-06 11:03:49 -070054static void cpuERR2Log()
55{
56 sd_journal_send("MESSAGE=HostError: ERR2 Timeout", "PRIORITY=%i", LOG_INFO,
57 "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError",
58 "REDFISH_MESSAGE_ARGS=%s", "ERR2 Timeout", NULL);
59}
60
61static void cpuERR2Log(const int cpuNum)
62{
63 std::string msg = "ERR2 Timeout on CPU " + std::to_string(cpuNum + 1);
64
65 sd_journal_send("MESSAGE=HostError: %s", msg.c_str(), "PRIORITY=%i",
66 LOG_INFO, "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError",
67 "REDFISH_MESSAGE_ARGS=%s", msg.c_str(), NULL);
68}
69
Jason M. Billsa15c2522019-08-16 10:01:44 -070070static void initializeErrorState();
Jason M. Bills1490b142019-07-01 15:48:43 -070071static void initializeHostState()
72{
73 conn->async_method_call(
74 [](boost::system::error_code ec,
75 const std::variant<std::string>& property) {
76 if (ec)
77 {
78 return;
79 }
80 const std::string* state = std::get_if<std::string>(&property);
81 if (state == nullptr)
82 {
83 std::cerr << "Unable to read host state value\n";
84 return;
85 }
86 hostOff = *state == "xyz.openbmc_project.State.Host.HostState.Off";
Jason M. Billsa15c2522019-08-16 10:01:44 -070087 // If the system is on, initialize the error state
88 if (!hostOff)
89 {
90 initializeErrorState();
91 }
Jason M. Bills1490b142019-07-01 15:48:43 -070092 },
93 "xyz.openbmc_project.State.Host", "/xyz/openbmc_project/state/host0",
94 "org.freedesktop.DBus.Properties", "Get",
95 "xyz.openbmc_project.State.Host", "CurrentHostState");
96}
97
98static std::shared_ptr<sdbusplus::bus::match::match> startHostStateMonitor()
99{
100 return std::make_shared<sdbusplus::bus::match::match>(
101 *conn,
102 "type='signal',interface='org.freedesktop.DBus.Properties',"
103 "member='PropertiesChanged',arg0namespace='xyz.openbmc_project.State."
104 "Host'",
105 [](sdbusplus::message::message& msg) {
106 std::string interfaceName;
107 boost::container::flat_map<std::string, std::variant<std::string>>
108 propertiesChanged;
109 std::string state;
110 try
111 {
112 msg.read(interfaceName, propertiesChanged);
113 state =
114 std::get<std::string>(propertiesChanged.begin()->second);
115 }
116 catch (std::exception& e)
117 {
118 std::cerr << "Unable to read host state\n";
119 return;
120 }
121 hostOff = state == "xyz.openbmc_project.State.Host.HostState.Off";
122
123 // No host events should fire while off, so cancel any pending
124 // timers
125 if (hostOff)
126 {
127 caterrAssertTimer.cancel();
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700128 err2AssertTimer.cancel();
Jason M. Bills1490b142019-07-01 15:48:43 -0700129 }
130 });
131}
132
133static bool requestGPIOEvents(
134 const std::string& name, const std::function<void()>& handler,
135 gpiod::line& gpioLine,
136 boost::asio::posix::stream_descriptor& gpioEventDescriptor)
137{
138 // Find the GPIO line
139 gpioLine = gpiod::find_line(name);
140 if (!gpioLine)
141 {
142 std::cerr << "Failed to find the " << name << " line\n";
143 return false;
144 }
145
146 try
147 {
148 gpioLine.request(
149 {"host-error-monitor", gpiod::line_request::EVENT_BOTH_EDGES});
150 }
151 catch (std::exception&)
152 {
153 std::cerr << "Failed to request events for " << name << "\n";
154 return false;
155 }
156
157 int gpioLineFd = gpioLine.event_get_fd();
158 if (gpioLineFd < 0)
159 {
160 std::cerr << "Failed to get " << name << " fd\n";
161 return false;
162 }
163
164 gpioEventDescriptor.assign(gpioLineFd);
165
166 gpioEventDescriptor.async_wait(
167 boost::asio::posix::stream_descriptor::wait_read,
168 [&name, handler](const boost::system::error_code ec) {
169 if (ec)
170 {
171 std::cerr << name << " fd handler error: " << ec.message()
172 << "\n";
173 return;
174 }
175 handler();
176 });
177 return true;
178}
179
180static void startPowerCycle()
181{
182 conn->async_method_call(
183 [](boost::system::error_code ec) {
184 if (ec)
185 {
186 std::cerr << "failed to set Chassis State\n";
187 }
188 },
189 "xyz.openbmc_project.State.Chassis",
190 "/xyz/openbmc_project/state/chassis0",
191 "org.freedesktop.DBus.Properties", "Set",
192 "xyz.openbmc_project.State.Chassis", "RequestedPowerTransition",
193 std::variant<std::string>{
194 "xyz.openbmc_project.State.Chassis.Transition.PowerCycle"});
195}
196
197static void startCrashdumpAndRecovery(bool recoverSystem)
198{
199 std::cout << "Starting crashdump\n";
200 static std::shared_ptr<sdbusplus::bus::match::match> crashdumpCompleteMatch;
201 static boost::asio::steady_timer crashdumpTimer(io);
202
203 crashdumpCompleteMatch = std::make_shared<sdbusplus::bus::match::match>(
204 *conn,
205 "type='signal',interface='org.freedesktop.DBus.Properties',"
206 "member='PropertiesChanged',arg0namespace='com.intel.crashdump'",
207 [recoverSystem](sdbusplus::message::message& msg) {
208 crashdumpTimer.cancel();
209 std::cout << "Crashdump completed\n";
210 if (recoverSystem)
211 {
212 std::cout << "Recovering the system\n";
213 startPowerCycle();
214 }
215 crashdumpCompleteMatch.reset();
216 });
217
218 crashdumpTimer.expires_after(std::chrono::seconds(crashdumpTimeoutS));
219 crashdumpTimer.async_wait([](const boost::system::error_code ec) {
220 if (ec)
221 {
222 // operation_aborted is expected if timer is canceled
223 if (ec != boost::asio::error::operation_aborted)
224 {
225 std::cerr << "Crashdump async_wait failed: " << ec.message()
226 << "\n";
227 }
228 std::cout << "Crashdump timer canceled\n";
229 return;
230 }
231 std::cerr << "Crashdump failed to complete before timeout\n";
232 crashdumpCompleteMatch.reset();
233 });
234
235 conn->async_method_call(
236 [](boost::system::error_code ec) {
237 if (ec)
238 {
239 std::cerr << "failed to start Crashdump\n";
240 crashdumpTimer.cancel();
241 crashdumpCompleteMatch.reset();
242 }
243 },
244 "com.intel.crashdump", "/com/intel/crashdump",
245 "com.intel.crashdump.Stored", "GenerateStoredLog");
246}
247
Jason M. Billsa15c2522019-08-16 10:01:44 -0700248static void caterrAssertHandler()
249{
250 std::cout << "CPU CATERR detected, starting timer\n";
251 caterrAssertTimer.expires_after(std::chrono::milliseconds(caterrTimeoutMs));
252 caterrAssertTimer.async_wait([](const boost::system::error_code ec) {
253 if (ec)
254 {
255 // operation_aborted is expected if timer is canceled
256 // before completion.
257 if (ec != boost::asio::error::operation_aborted)
258 {
259 std::cerr << "caterr timeout async_wait failed: "
260 << ec.message() << "\n";
261 }
262 std::cout << "CATERR assert timer canceled\n";
263 return;
264 }
265 std::cout << "CATERR asset timer completed\n";
266 conn->async_method_call(
267 [](boost::system::error_code ec,
268 const std::variant<bool>& property) {
269 if (ec)
270 {
271 return;
272 }
273 const bool* reset = std::get_if<bool>(&property);
274 if (reset == nullptr)
275 {
276 std::cerr << "Unable to read reset on CATERR value\n";
277 return;
278 }
279 startCrashdumpAndRecovery(*reset);
280 },
281 "xyz.openbmc_project.Settings",
282 "/xyz/openbmc_project/control/processor_error_config",
283 "org.freedesktop.DBus.Properties", "Get",
284 "xyz.openbmc_project.Control.Processor.ErrConfig", "ResetOnCATERR");
285 });
286}
287
Jason M. Bills1490b142019-07-01 15:48:43 -0700288static void caterrHandler()
289{
290 if (!hostOff)
291 {
292 gpiod::line_event gpioLineEvent = caterrLine.event_read();
293
294 bool caterr =
295 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
296 if (caterr)
297 {
Jason M. Billsa15c2522019-08-16 10:01:44 -0700298 caterrAssertHandler();
Jason M. Bills1490b142019-07-01 15:48:43 -0700299 }
300 else
301 {
302 caterrAssertTimer.cancel();
303 }
304 }
305 caterrEvent.async_wait(boost::asio::posix::stream_descriptor::wait_read,
306 [](const boost::system::error_code ec) {
307 if (ec)
308 {
309 std::cerr << "caterr handler error: "
310 << ec.message() << "\n";
311 return;
312 }
313 caterrHandler();
314 });
315}
Chen Yugange6c0f1c2019-08-02 20:36:42 +0800316static void pchThermtripHandler()
317{
318 if (!hostOff)
319 {
320 gpiod::line_event gpioLineEvent = pchThermtripLine.event_read();
321
322 bool pchThermtrip =
323 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
324 if (pchThermtrip)
325 {
326 std::cout << "PCH Thermal trip detected \n";
327 // log to redfish, call API
328 sd_journal_send("MESSAGE=SsbThermalTrip: SSB Thermal trip",
329 "PRIORITY=%i", LOG_INFO, "REDFISH_MESSAGE_ID=%s",
330 "OpenBMC.0.1.SsbThermalTrip", NULL);
331 }
332 }
333 pchThermtripEvent.async_wait(
334 boost::asio::posix::stream_descriptor::wait_read,
335 [](const boost::system::error_code ec) {
336 if (ec)
337 {
338 std::cerr << "PCH Thermal trip handler error: " << ec.message()
339 << "\n";
340 return;
341 }
342 pchThermtripHandler();
343 });
344}
345
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700346static std::bitset<MAX_CPUS> checkERR2CPUs()
347{
348 std::bitset<MAX_CPUS> err2CPUs = 0;
349 for (int cpu = 0, addr = MIN_CLIENT_ADDR; addr <= MAX_CLIENT_ADDR;
350 cpu++, addr++)
351 {
352 if (peci_Ping(addr) == PECI_CC_SUCCESS)
353 {
354 uint8_t cc = 0;
355 CPUModel model{};
356 uint8_t stepping = 0;
357 if (peci_GetCPUID(addr, &model, &stepping, &cc) != PECI_CC_SUCCESS)
358 {
359 std::cerr << "Cannot get CPUID!\n";
360 continue;
361 }
362
363 switch (model)
364 {
365 case skx:
366 {
367 // Check the ERRPINSTS to see if this is the CPU that caused
368 // the ERR2 (B(0) D8 F0 offset 210h)
369 uint32_t errpinsts = 0;
370 if (peci_RdPCIConfigLocal(
371 addr, 0, 8, 0, 0x210, sizeof(uint32_t),
372 (uint8_t*)&errpinsts, &cc) == PECI_CC_SUCCESS)
373 {
374 err2CPUs[cpu] = (errpinsts & err2Sts) != 0;
375 }
376 break;
377 }
378 case icx:
379 {
380 // Check the ERRPINSTS to see if this is the CPU that caused
381 // the ERR2 (B(30) D0 F3 offset 274h) (Note: Bus 30 is
382 // accessed on PECI as bus 13)
383 uint32_t errpinsts = 0;
384 if (peci_RdEndPointConfigPciLocal(
385 addr, 0, 13, 0, 3, 0x274, sizeof(uint32_t),
386 (uint8_t*)&errpinsts, &cc) == PECI_CC_SUCCESS)
387 {
388 err2CPUs[cpu] = (errpinsts & err2Sts) != 0;
389 }
390 break;
391 }
392 }
393 }
394 }
395 return err2CPUs;
396}
397
398static void err2AssertHandler()
399{
400 // ERR2 status is not guaranteed through the timeout, so save which
401 // CPUs have asserted ERR2 now
402 std::bitset<MAX_CPUS> err2CPUs = checkERR2CPUs();
403 err2AssertTimer.expires_after(std::chrono::milliseconds(err2TimeoutMs));
404 err2AssertTimer.async_wait([err2CPUs](const boost::system::error_code ec) {
405 if (ec)
406 {
407 // operation_aborted is expected if timer is canceled before
408 // completion.
409 if (ec != boost::asio::error::operation_aborted)
410 {
411 std::cerr << "err2 timeout async_wait failed: " << ec.message()
412 << "\n";
413 }
414 return;
415 }
416 std::cerr << "ERR2 asserted for " << std::to_string(err2TimeoutMs)
417 << " ms\n";
418 if (err2CPUs.count())
419 {
420 for (int i = 0; i < err2CPUs.size(); i++)
421 {
422 if (err2CPUs[i])
423 {
424 cpuERR2Log(i);
425 }
426 }
427 }
428 else
429 {
430 cpuERR2Log();
431 }
432 conn->async_method_call(
433 [](boost::system::error_code ec,
434 const std::variant<bool>& property) {
435 if (ec)
436 {
437 return;
438 }
439 const bool* reset = std::get_if<bool>(&property);
440 if (reset == nullptr)
441 {
442 std::cerr << "Unable to read reset on ERR2 value\n";
443 return;
444 }
445 startCrashdumpAndRecovery(*reset);
446 },
447 "xyz.openbmc_project.Settings",
448 "/xyz/openbmc_project/control/processor_error_config",
449 "org.freedesktop.DBus.Properties", "Get",
450 "xyz.openbmc_project.Control.Processor.ErrConfig", "ResetOnERR2");
451 });
452}
453
454static void err2Handler()
455{
456 if (!hostOff)
457 {
458 gpiod::line_event gpioLineEvent = err2Line.event_read();
459
460 bool err2 = gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
461 if (err2)
462 {
463 err2AssertHandler();
464 }
465 else
466 {
467 err2AssertTimer.cancel();
468 }
469 }
470 err2Event.async_wait(boost::asio::posix::stream_descriptor::wait_read,
471 [](const boost::system::error_code ec) {
472 if (ec)
473 {
474 std::cerr
475 << "err2 handler error: " << ec.message()
476 << "\n";
477 return;
478 }
479 err2Handler();
480 });
481}
482
Jason M. Billsa15c2522019-08-16 10:01:44 -0700483static void initializeErrorState()
484{
485 // Handle CPU_CATERR if it's asserted now
486 if (caterrLine.get_value() == 0)
487 {
488 caterrAssertHandler();
489 }
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700490
491 // Handle CPU_ERR2 if it's asserted now
492 if (err2Line.get_value() == 0)
493 {
494 err2AssertHandler();
495 }
Jason M. Billsa15c2522019-08-16 10:01:44 -0700496}
Jason M. Bills1490b142019-07-01 15:48:43 -0700497} // namespace host_error_monitor
498
499int main(int argc, char* argv[])
500{
501 // setup connection to dbus
502 host_error_monitor::conn =
503 std::make_shared<sdbusplus::asio::connection>(host_error_monitor::io);
504
505 // Host Error Monitor Object
506 host_error_monitor::conn->request_name(
507 "xyz.openbmc_project.HostErrorMonitor");
508 sdbusplus::asio::object_server server =
509 sdbusplus::asio::object_server(host_error_monitor::conn);
510
511 // Start tracking host state
512 std::shared_ptr<sdbusplus::bus::match::match> hostStateMonitor =
513 host_error_monitor::startHostStateMonitor();
514
515 // Initialize the host state
516 host_error_monitor::initializeHostState();
517
518 // Request CPU_CATERR GPIO events
519 if (!host_error_monitor::requestGPIOEvents(
520 "CPU_CATERR", host_error_monitor::caterrHandler,
521 host_error_monitor::caterrLine, host_error_monitor::caterrEvent))
522 {
523 return -1;
524 }
525
Jason M. Bills6a2cb692019-08-06 11:03:49 -0700526 // Request CPU_ERR2 GPIO events
527 if (!host_error_monitor::requestGPIOEvents(
528 "CPU_ERR2", host_error_monitor::err2Handler,
529 host_error_monitor::err2Line, host_error_monitor::err2Event))
530 {
531 return -1;
532 }
533
Chen Yugange6c0f1c2019-08-02 20:36:42 +0800534 // Request PCH_BMC_THERMTRIP GPIO events
535 if (!host_error_monitor::requestGPIOEvents(
536 "PCH_BMC_THERMTRIP", host_error_monitor::pchThermtripHandler,
537 host_error_monitor::pchThermtripLine,
538 host_error_monitor::pchThermtripEvent))
539 {
540 return -1;
541 }
542
Jason M. Bills1490b142019-07-01 15:48:43 -0700543 host_error_monitor::io.run();
544
545 return 0;
546}