blob: 459eb7a0fcb438f4a58040dc407e5e1300553e97 [file] [log] [blame]
/*
// Copyright (c) 2019 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
*/
Chen Yugange6c0f1c2019-08-02 20:36:42 +080016#include <systemd/sd-journal.h>
17
Jason M. Bills1490b142019-07-01 15:48:43 -070018#include <boost/asio/posix/stream_descriptor.hpp>
19#include <gpiod.hpp>
20#include <iostream>
21#include <sdbusplus/asio/object_server.hpp>
22
23namespace host_error_monitor
24{
25static boost::asio::io_service io;
26static std::shared_ptr<sdbusplus::asio::connection> conn;
27
28static bool hostOff = true;
29
30const static constexpr size_t caterrTimeoutMs = 2000;
31const static constexpr size_t crashdumpTimeoutS = 300;
32
33// Timers
34// Timer for CATERR asserted
35static boost::asio::steady_timer caterrAssertTimer(io);
36
37// GPIO Lines and Event Descriptors
38static gpiod::line caterrLine;
39static boost::asio::posix::stream_descriptor caterrEvent(io);
Chen Yugange6c0f1c2019-08-02 20:36:42 +080040//----------------------------------
41// PCH_BMC_THERMTRIP function related definition
42//----------------------------------
43// GPIO Lines and Event Descriptors
44static gpiod::line pchThermtripLine;
45static boost::asio::posix::stream_descriptor pchThermtripEvent(io);
Jason M. Bills1490b142019-07-01 15:48:43 -070046
47static void initializeHostState()
48{
49 conn->async_method_call(
50 [](boost::system::error_code ec,
51 const std::variant<std::string>& property) {
52 if (ec)
53 {
54 return;
55 }
56 const std::string* state = std::get_if<std::string>(&property);
57 if (state == nullptr)
58 {
59 std::cerr << "Unable to read host state value\n";
60 return;
61 }
62 hostOff = *state == "xyz.openbmc_project.State.Host.HostState.Off";
63 },
64 "xyz.openbmc_project.State.Host", "/xyz/openbmc_project/state/host0",
65 "org.freedesktop.DBus.Properties", "Get",
66 "xyz.openbmc_project.State.Host", "CurrentHostState");
67}
68
69static std::shared_ptr<sdbusplus::bus::match::match> startHostStateMonitor()
70{
71 return std::make_shared<sdbusplus::bus::match::match>(
72 *conn,
73 "type='signal',interface='org.freedesktop.DBus.Properties',"
74 "member='PropertiesChanged',arg0namespace='xyz.openbmc_project.State."
75 "Host'",
76 [](sdbusplus::message::message& msg) {
77 std::string interfaceName;
78 boost::container::flat_map<std::string, std::variant<std::string>>
79 propertiesChanged;
80 std::string state;
81 try
82 {
83 msg.read(interfaceName, propertiesChanged);
84 state =
85 std::get<std::string>(propertiesChanged.begin()->second);
86 }
87 catch (std::exception& e)
88 {
89 std::cerr << "Unable to read host state\n";
90 return;
91 }
92 hostOff = state == "xyz.openbmc_project.State.Host.HostState.Off";
93
94 // No host events should fire while off, so cancel any pending
95 // timers
96 if (hostOff)
97 {
98 caterrAssertTimer.cancel();
99 }
100 });
101}
102
103static bool requestGPIOEvents(
104 const std::string& name, const std::function<void()>& handler,
105 gpiod::line& gpioLine,
106 boost::asio::posix::stream_descriptor& gpioEventDescriptor)
107{
108 // Find the GPIO line
109 gpioLine = gpiod::find_line(name);
110 if (!gpioLine)
111 {
112 std::cerr << "Failed to find the " << name << " line\n";
113 return false;
114 }
115
116 try
117 {
118 gpioLine.request(
119 {"host-error-monitor", gpiod::line_request::EVENT_BOTH_EDGES});
120 }
121 catch (std::exception&)
122 {
123 std::cerr << "Failed to request events for " << name << "\n";
124 return false;
125 }
126
127 int gpioLineFd = gpioLine.event_get_fd();
128 if (gpioLineFd < 0)
129 {
130 std::cerr << "Failed to get " << name << " fd\n";
131 return false;
132 }
133
134 gpioEventDescriptor.assign(gpioLineFd);
135
136 gpioEventDescriptor.async_wait(
137 boost::asio::posix::stream_descriptor::wait_read,
138 [&name, handler](const boost::system::error_code ec) {
139 if (ec)
140 {
141 std::cerr << name << " fd handler error: " << ec.message()
142 << "\n";
143 return;
144 }
145 handler();
146 });
147 return true;
148}
149
150static void startPowerCycle()
151{
152 conn->async_method_call(
153 [](boost::system::error_code ec) {
154 if (ec)
155 {
156 std::cerr << "failed to set Chassis State\n";
157 }
158 },
159 "xyz.openbmc_project.State.Chassis",
160 "/xyz/openbmc_project/state/chassis0",
161 "org.freedesktop.DBus.Properties", "Set",
162 "xyz.openbmc_project.State.Chassis", "RequestedPowerTransition",
163 std::variant<std::string>{
164 "xyz.openbmc_project.State.Chassis.Transition.PowerCycle"});
165}
166
167static void startCrashdumpAndRecovery(bool recoverSystem)
168{
169 std::cout << "Starting crashdump\n";
170 static std::shared_ptr<sdbusplus::bus::match::match> crashdumpCompleteMatch;
171 static boost::asio::steady_timer crashdumpTimer(io);
172
173 crashdumpCompleteMatch = std::make_shared<sdbusplus::bus::match::match>(
174 *conn,
175 "type='signal',interface='org.freedesktop.DBus.Properties',"
176 "member='PropertiesChanged',arg0namespace='com.intel.crashdump'",
177 [recoverSystem](sdbusplus::message::message& msg) {
178 crashdumpTimer.cancel();
179 std::cout << "Crashdump completed\n";
180 if (recoverSystem)
181 {
182 std::cout << "Recovering the system\n";
183 startPowerCycle();
184 }
185 crashdumpCompleteMatch.reset();
186 });
187
188 crashdumpTimer.expires_after(std::chrono::seconds(crashdumpTimeoutS));
189 crashdumpTimer.async_wait([](const boost::system::error_code ec) {
190 if (ec)
191 {
192 // operation_aborted is expected if timer is canceled
193 if (ec != boost::asio::error::operation_aborted)
194 {
195 std::cerr << "Crashdump async_wait failed: " << ec.message()
196 << "\n";
197 }
198 std::cout << "Crashdump timer canceled\n";
199 return;
200 }
201 std::cerr << "Crashdump failed to complete before timeout\n";
202 crashdumpCompleteMatch.reset();
203 });
204
205 conn->async_method_call(
206 [](boost::system::error_code ec) {
207 if (ec)
208 {
209 std::cerr << "failed to start Crashdump\n";
210 crashdumpTimer.cancel();
211 crashdumpCompleteMatch.reset();
212 }
213 },
214 "com.intel.crashdump", "/com/intel/crashdump",
215 "com.intel.crashdump.Stored", "GenerateStoredLog");
216}
217
218static void caterrHandler()
219{
220 if (!hostOff)
221 {
222 gpiod::line_event gpioLineEvent = caterrLine.event_read();
223
224 bool caterr =
225 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
226 if (caterr)
227 {
228 std::cout << "CPU CATERR detected, starting timer\n";
229 caterrAssertTimer.expires_after(
230 std::chrono::milliseconds(caterrTimeoutMs));
231 caterrAssertTimer.async_wait(
232 [](const boost::system::error_code ec) {
233 if (ec)
234 {
235 // operation_aborted is expected if timer is canceled
236 // before completion.
237 if (ec != boost::asio::error::operation_aborted)
238 {
239 std::cerr << "caterr timeout async_wait failed: "
240 << ec.message() << "\n";
241 }
242 std::cout << "CATERR assert timer canceled\n";
243 return;
244 }
245 std::cout << "CATERR asset timer completed\n";
246 conn->async_method_call(
247 [](boost::system::error_code ec,
248 const std::variant<bool>& property) {
249 if (ec)
250 {
251 return;
252 }
253 const bool* reset = std::get_if<bool>(&property);
254 if (reset == nullptr)
255 {
256 std::cerr
257 << "Unable to read reset on CATERR value\n";
258 return;
259 }
260 startCrashdumpAndRecovery(*reset);
261 },
262 "xyz.openbmc_project.Settings",
263 "/xyz/openbmc_project/control/processor_error_config",
264 "org.freedesktop.DBus.Properties", "Get",
265 "xyz.openbmc_project.Control.Processor.ErrConfig",
266 "ResetOnCATERR");
267 });
268 }
269 else
270 {
271 caterrAssertTimer.cancel();
272 }
273 }
274 caterrEvent.async_wait(boost::asio::posix::stream_descriptor::wait_read,
275 [](const boost::system::error_code ec) {
276 if (ec)
277 {
278 std::cerr << "caterr handler error: "
279 << ec.message() << "\n";
280 return;
281 }
282 caterrHandler();
283 });
284}
Chen Yugange6c0f1c2019-08-02 20:36:42 +0800285static void pchThermtripHandler()
286{
287 if (!hostOff)
288 {
289 gpiod::line_event gpioLineEvent = pchThermtripLine.event_read();
290
291 bool pchThermtrip =
292 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
293 if (pchThermtrip)
294 {
295 std::cout << "PCH Thermal trip detected \n";
296 // log to redfish, call API
297 sd_journal_send("MESSAGE=SsbThermalTrip: SSB Thermal trip",
298 "PRIORITY=%i", LOG_INFO, "REDFISH_MESSAGE_ID=%s",
299 "OpenBMC.0.1.SsbThermalTrip", NULL);
300 }
301 }
302 pchThermtripEvent.async_wait(
303 boost::asio::posix::stream_descriptor::wait_read,
304 [](const boost::system::error_code ec) {
305 if (ec)
306 {
307 std::cerr << "PCH Thermal trip handler error: " << ec.message()
308 << "\n";
309 return;
310 }
311 pchThermtripHandler();
312 });
313}
314
Jason M. Bills1490b142019-07-01 15:48:43 -0700315} // namespace host_error_monitor
316
317int main(int argc, char* argv[])
318{
319 // setup connection to dbus
320 host_error_monitor::conn =
321 std::make_shared<sdbusplus::asio::connection>(host_error_monitor::io);
322
323 // Host Error Monitor Object
324 host_error_monitor::conn->request_name(
325 "xyz.openbmc_project.HostErrorMonitor");
326 sdbusplus::asio::object_server server =
327 sdbusplus::asio::object_server(host_error_monitor::conn);
328
329 // Start tracking host state
330 std::shared_ptr<sdbusplus::bus::match::match> hostStateMonitor =
331 host_error_monitor::startHostStateMonitor();
332
333 // Initialize the host state
334 host_error_monitor::initializeHostState();
335
336 // Request CPU_CATERR GPIO events
337 if (!host_error_monitor::requestGPIOEvents(
338 "CPU_CATERR", host_error_monitor::caterrHandler,
339 host_error_monitor::caterrLine, host_error_monitor::caterrEvent))
340 {
341 return -1;
342 }
343
Chen Yugange6c0f1c2019-08-02 20:36:42 +0800344 // Request PCH_BMC_THERMTRIP GPIO events
345 if (!host_error_monitor::requestGPIOEvents(
346 "PCH_BMC_THERMTRIP", host_error_monitor::pchThermtripHandler,
347 host_error_monitor::pchThermtripLine,
348 host_error_monitor::pchThermtripEvent))
349 {
350 return -1;
351 }
352
Jason M. Bills1490b142019-07-01 15:48:43 -0700353 host_error_monitor::io.run();
354
355 return 0;
356}