/*
// Copyright (c) 2019 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
*/
Chen Yugange6c0f1c2019-08-02 20:36:42 +080016#include <systemd/sd-journal.h>
17
Jason M. Bills1490b142019-07-01 15:48:43 -070018#include <boost/asio/posix/stream_descriptor.hpp>
19#include <gpiod.hpp>
20#include <iostream>
21#include <sdbusplus/asio/object_server.hpp>
22
23namespace host_error_monitor
24{
25static boost::asio::io_service io;
26static std::shared_ptr<sdbusplus::asio::connection> conn;
27
28static bool hostOff = true;
29
30const static constexpr size_t caterrTimeoutMs = 2000;
31const static constexpr size_t crashdumpTimeoutS = 300;
32
33// Timers
34// Timer for CATERR asserted
35static boost::asio::steady_timer caterrAssertTimer(io);
36
37// GPIO Lines and Event Descriptors
38static gpiod::line caterrLine;
39static boost::asio::posix::stream_descriptor caterrEvent(io);
Chen Yugange6c0f1c2019-08-02 20:36:42 +080040//----------------------------------
41// PCH_BMC_THERMTRIP function related definition
42//----------------------------------
43// GPIO Lines and Event Descriptors
44static gpiod::line pchThermtripLine;
45static boost::asio::posix::stream_descriptor pchThermtripEvent(io);
Jason M. Bills1490b142019-07-01 15:48:43 -070046
Jason M. Billsa15c2522019-08-16 10:01:44 -070047static void initializeErrorState();
Jason M. Bills1490b142019-07-01 15:48:43 -070048static void initializeHostState()
49{
50 conn->async_method_call(
51 [](boost::system::error_code ec,
52 const std::variant<std::string>& property) {
53 if (ec)
54 {
55 return;
56 }
57 const std::string* state = std::get_if<std::string>(&property);
58 if (state == nullptr)
59 {
60 std::cerr << "Unable to read host state value\n";
61 return;
62 }
63 hostOff = *state == "xyz.openbmc_project.State.Host.HostState.Off";
Jason M. Billsa15c2522019-08-16 10:01:44 -070064 // If the system is on, initialize the error state
65 if (!hostOff)
66 {
67 initializeErrorState();
68 }
Jason M. Bills1490b142019-07-01 15:48:43 -070069 },
70 "xyz.openbmc_project.State.Host", "/xyz/openbmc_project/state/host0",
71 "org.freedesktop.DBus.Properties", "Get",
72 "xyz.openbmc_project.State.Host", "CurrentHostState");
73}
74
75static std::shared_ptr<sdbusplus::bus::match::match> startHostStateMonitor()
76{
77 return std::make_shared<sdbusplus::bus::match::match>(
78 *conn,
79 "type='signal',interface='org.freedesktop.DBus.Properties',"
80 "member='PropertiesChanged',arg0namespace='xyz.openbmc_project.State."
81 "Host'",
82 [](sdbusplus::message::message& msg) {
83 std::string interfaceName;
84 boost::container::flat_map<std::string, std::variant<std::string>>
85 propertiesChanged;
86 std::string state;
87 try
88 {
89 msg.read(interfaceName, propertiesChanged);
90 state =
91 std::get<std::string>(propertiesChanged.begin()->second);
92 }
93 catch (std::exception& e)
94 {
95 std::cerr << "Unable to read host state\n";
96 return;
97 }
98 hostOff = state == "xyz.openbmc_project.State.Host.HostState.Off";
99
100 // No host events should fire while off, so cancel any pending
101 // timers
102 if (hostOff)
103 {
104 caterrAssertTimer.cancel();
105 }
106 });
107}
108
109static bool requestGPIOEvents(
110 const std::string& name, const std::function<void()>& handler,
111 gpiod::line& gpioLine,
112 boost::asio::posix::stream_descriptor& gpioEventDescriptor)
113{
114 // Find the GPIO line
115 gpioLine = gpiod::find_line(name);
116 if (!gpioLine)
117 {
118 std::cerr << "Failed to find the " << name << " line\n";
119 return false;
120 }
121
122 try
123 {
124 gpioLine.request(
125 {"host-error-monitor", gpiod::line_request::EVENT_BOTH_EDGES});
126 }
127 catch (std::exception&)
128 {
129 std::cerr << "Failed to request events for " << name << "\n";
130 return false;
131 }
132
133 int gpioLineFd = gpioLine.event_get_fd();
134 if (gpioLineFd < 0)
135 {
136 std::cerr << "Failed to get " << name << " fd\n";
137 return false;
138 }
139
140 gpioEventDescriptor.assign(gpioLineFd);
141
142 gpioEventDescriptor.async_wait(
143 boost::asio::posix::stream_descriptor::wait_read,
144 [&name, handler](const boost::system::error_code ec) {
145 if (ec)
146 {
147 std::cerr << name << " fd handler error: " << ec.message()
148 << "\n";
149 return;
150 }
151 handler();
152 });
153 return true;
154}
155
156static void startPowerCycle()
157{
158 conn->async_method_call(
159 [](boost::system::error_code ec) {
160 if (ec)
161 {
162 std::cerr << "failed to set Chassis State\n";
163 }
164 },
165 "xyz.openbmc_project.State.Chassis",
166 "/xyz/openbmc_project/state/chassis0",
167 "org.freedesktop.DBus.Properties", "Set",
168 "xyz.openbmc_project.State.Chassis", "RequestedPowerTransition",
169 std::variant<std::string>{
170 "xyz.openbmc_project.State.Chassis.Transition.PowerCycle"});
171}
172
173static void startCrashdumpAndRecovery(bool recoverSystem)
174{
175 std::cout << "Starting crashdump\n";
176 static std::shared_ptr<sdbusplus::bus::match::match> crashdumpCompleteMatch;
177 static boost::asio::steady_timer crashdumpTimer(io);
178
179 crashdumpCompleteMatch = std::make_shared<sdbusplus::bus::match::match>(
180 *conn,
181 "type='signal',interface='org.freedesktop.DBus.Properties',"
182 "member='PropertiesChanged',arg0namespace='com.intel.crashdump'",
183 [recoverSystem](sdbusplus::message::message& msg) {
184 crashdumpTimer.cancel();
185 std::cout << "Crashdump completed\n";
186 if (recoverSystem)
187 {
188 std::cout << "Recovering the system\n";
189 startPowerCycle();
190 }
191 crashdumpCompleteMatch.reset();
192 });
193
194 crashdumpTimer.expires_after(std::chrono::seconds(crashdumpTimeoutS));
195 crashdumpTimer.async_wait([](const boost::system::error_code ec) {
196 if (ec)
197 {
198 // operation_aborted is expected if timer is canceled
199 if (ec != boost::asio::error::operation_aborted)
200 {
201 std::cerr << "Crashdump async_wait failed: " << ec.message()
202 << "\n";
203 }
204 std::cout << "Crashdump timer canceled\n";
205 return;
206 }
207 std::cerr << "Crashdump failed to complete before timeout\n";
208 crashdumpCompleteMatch.reset();
209 });
210
211 conn->async_method_call(
212 [](boost::system::error_code ec) {
213 if (ec)
214 {
215 std::cerr << "failed to start Crashdump\n";
216 crashdumpTimer.cancel();
217 crashdumpCompleteMatch.reset();
218 }
219 },
220 "com.intel.crashdump", "/com/intel/crashdump",
221 "com.intel.crashdump.Stored", "GenerateStoredLog");
222}
223
Jason M. Billsa15c2522019-08-16 10:01:44 -0700224static void caterrAssertHandler()
225{
226 std::cout << "CPU CATERR detected, starting timer\n";
227 caterrAssertTimer.expires_after(std::chrono::milliseconds(caterrTimeoutMs));
228 caterrAssertTimer.async_wait([](const boost::system::error_code ec) {
229 if (ec)
230 {
231 // operation_aborted is expected if timer is canceled
232 // before completion.
233 if (ec != boost::asio::error::operation_aborted)
234 {
235 std::cerr << "caterr timeout async_wait failed: "
236 << ec.message() << "\n";
237 }
238 std::cout << "CATERR assert timer canceled\n";
239 return;
240 }
241 std::cout << "CATERR asset timer completed\n";
242 conn->async_method_call(
243 [](boost::system::error_code ec,
244 const std::variant<bool>& property) {
245 if (ec)
246 {
247 return;
248 }
249 const bool* reset = std::get_if<bool>(&property);
250 if (reset == nullptr)
251 {
252 std::cerr << "Unable to read reset on CATERR value\n";
253 return;
254 }
255 startCrashdumpAndRecovery(*reset);
256 },
257 "xyz.openbmc_project.Settings",
258 "/xyz/openbmc_project/control/processor_error_config",
259 "org.freedesktop.DBus.Properties", "Get",
260 "xyz.openbmc_project.Control.Processor.ErrConfig", "ResetOnCATERR");
261 });
262}
263
Jason M. Bills1490b142019-07-01 15:48:43 -0700264static void caterrHandler()
265{
266 if (!hostOff)
267 {
268 gpiod::line_event gpioLineEvent = caterrLine.event_read();
269
270 bool caterr =
271 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
272 if (caterr)
273 {
Jason M. Billsa15c2522019-08-16 10:01:44 -0700274 caterrAssertHandler();
Jason M. Bills1490b142019-07-01 15:48:43 -0700275 }
276 else
277 {
278 caterrAssertTimer.cancel();
279 }
280 }
281 caterrEvent.async_wait(boost::asio::posix::stream_descriptor::wait_read,
282 [](const boost::system::error_code ec) {
283 if (ec)
284 {
285 std::cerr << "caterr handler error: "
286 << ec.message() << "\n";
287 return;
288 }
289 caterrHandler();
290 });
291}
Chen Yugange6c0f1c2019-08-02 20:36:42 +0800292static void pchThermtripHandler()
293{
294 if (!hostOff)
295 {
296 gpiod::line_event gpioLineEvent = pchThermtripLine.event_read();
297
298 bool pchThermtrip =
299 gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
300 if (pchThermtrip)
301 {
302 std::cout << "PCH Thermal trip detected \n";
303 // log to redfish, call API
304 sd_journal_send("MESSAGE=SsbThermalTrip: SSB Thermal trip",
305 "PRIORITY=%i", LOG_INFO, "REDFISH_MESSAGE_ID=%s",
306 "OpenBMC.0.1.SsbThermalTrip", NULL);
307 }
308 }
309 pchThermtripEvent.async_wait(
310 boost::asio::posix::stream_descriptor::wait_read,
311 [](const boost::system::error_code ec) {
312 if (ec)
313 {
314 std::cerr << "PCH Thermal trip handler error: " << ec.message()
315 << "\n";
316 return;
317 }
318 pchThermtripHandler();
319 });
320}
321
Jason M. Billsa15c2522019-08-16 10:01:44 -0700322static void initializeErrorState()
323{
324 // Handle CPU_CATERR if it's asserted now
325 if (caterrLine.get_value() == 0)
326 {
327 caterrAssertHandler();
328 }
329}
Jason M. Bills1490b142019-07-01 15:48:43 -0700330} // namespace host_error_monitor
331
332int main(int argc, char* argv[])
333{
334 // setup connection to dbus
335 host_error_monitor::conn =
336 std::make_shared<sdbusplus::asio::connection>(host_error_monitor::io);
337
338 // Host Error Monitor Object
339 host_error_monitor::conn->request_name(
340 "xyz.openbmc_project.HostErrorMonitor");
341 sdbusplus::asio::object_server server =
342 sdbusplus::asio::object_server(host_error_monitor::conn);
343
344 // Start tracking host state
345 std::shared_ptr<sdbusplus::bus::match::match> hostStateMonitor =
346 host_error_monitor::startHostStateMonitor();
347
348 // Initialize the host state
349 host_error_monitor::initializeHostState();
350
351 // Request CPU_CATERR GPIO events
352 if (!host_error_monitor::requestGPIOEvents(
353 "CPU_CATERR", host_error_monitor::caterrHandler,
354 host_error_monitor::caterrLine, host_error_monitor::caterrEvent))
355 {
356 return -1;
357 }
358
Chen Yugange6c0f1c2019-08-02 20:36:42 +0800359 // Request PCH_BMC_THERMTRIP GPIO events
360 if (!host_error_monitor::requestGPIOEvents(
361 "PCH_BMC_THERMTRIP", host_error_monitor::pchThermtripHandler,
362 host_error_monitor::pchThermtripLine,
363 host_error_monitor::pchThermtripEvent))
364 {
365 return -1;
366 }
367
Jason M. Bills1490b142019-07-01 15:48:43 -0700368 host_error_monitor::io.run();
369
370 return 0;
371}