Add support for systemd service watchdog
Systemd supports a service-level watchdog. This change enables that
support for the bmcweb daemon. The watchdog is requested from systemd
in bmcweb.service.in, and a timer registered on the event loop kicks
the watchdog periodically.
The default watchdog timeout is set at 120 seconds and the timer is set
to kick it at a quarter of the interval (every 30 seconds).
This timeout is set somewhat arbitrarily, based on the longest blocking
call that could occur and still give a valid HTTP response. We suspect
lower values could work equally well.
Benefits of Service Watchdog
- Bmcweb route handlers should not make any blocking IO calls that
block the event loop for a considerable amount of time and slow down the
response of other URI requests in the queue. The watchdog can help to
detect such issues.
- Watchdog can help restart the service if any route handler code has
uncaught bugs resulting from system API errors (this is in theory,
currently we don't have any use case).
Tested
1. UT is passing
2. Service validator is passing
3. Fw upgrade POST requests are working
Change-Id: If62397d8836c942fdcbc0618810fe82a8b248df8
Signed-off-by: rohitpai <ropai@nvidia.com>
Signed-off-by: Ed Tanous <etanous@nvidia.com>
diff --git a/config/bmcweb.service.in b/config/bmcweb.service.in
index 07e2ad6..78a92fe 100644
--- a/config/bmcweb.service.in
+++ b/config/bmcweb.service.in
@@ -10,6 +10,7 @@
Type=simple
WorkingDirectory=/home/root
SyslogLevelPrefix=true
+WatchdogSec=@WATCHDOG_TIMEOUT_SECONDS@s
[Install]
WantedBy=network.target
diff --git a/config/meson.build b/config/meson.build
index 7a3e17d..e681a5f 100644
--- a/config/meson.build
+++ b/config/meson.build
@@ -50,7 +50,7 @@
'redfish-system-uri-name',
]
-int_options = ['http-body-limit']
+int_options = ['http-body-limit', 'watchdog-timeout-seconds']
feature_options_string = '\n//Feature options\n'
string_options_string = '\n// String options\n'
@@ -157,7 +157,10 @@
install_dir: systemd_system_unit_dir,
install: true,
configuration: configuration_data(
- {'MESON_INSTALL_PREFIX': get_option('prefix')},
+ {
+ 'MESON_INSTALL_PREFIX': get_option('prefix'),
+ 'WATCHDOG_TIMEOUT_SECONDS': get_option('watchdog-timeout-seconds'),
+ },
),
)
diff --git a/include/watchdog.hpp b/include/watchdog.hpp
new file mode 100644
index 0000000..918885c
--- /dev/null
+++ b/include/watchdog.hpp
@@ -0,0 +1,76 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright OpenBMC Authors
+#pragma once
+
+#include "bmcweb_config.h"
+
+#include "io_context_singleton.hpp"
+#include "logging.hpp"
+
+#include <systemd/sd-daemon.h>
+
+#include <boost/asio/steady_timer.hpp>
+#include <boost/system/error_code.hpp>
+
+#include <chrono>
+#include <cstdint>
+#include <functional>
+#include <ratio>
+
+namespace bmcweb
+{
+
+class ServiceWatchdog
+{
+ public:
+ ServiceWatchdog() : timer(getIoContext())
+ {
+ uint64_t usecondTimeout = 0;
+ if (sd_watchdog_enabled(0, &usecondTimeout) <= 0)
+ {
+ if (BMCWEB_WATCHDOG_TIMEOUT_SECONDS > 0)
+ {
+ BMCWEB_LOG_WARNING(
+ "Watchdog timeout was enabled at compile time, but disabled at runtime");
+ }
+ return;
+ }
+ // Pet the watchdog N times faster than required.
+ uint64_t petRatio = 4;
+ watchdogTime = std::chrono::duration<uint64_t, std::micro>(
+ usecondTimeout / petRatio);
+ startTimer();
+ }
+
+ private:
+ void startTimer()
+ {
+ timer.expires_after(watchdogTime);
+ timer.async_wait(
+ std::bind_front(&ServiceWatchdog::handleTimeout, this));
+ }
+
+ void handleTimeout(const boost::system::error_code& ec)
+ {
+ if (ec)
+ {
+ BMCWEB_LOG_ERROR("Watchdog timer async_wait failed: {}",
+ ec.message());
+ return;
+ }
+
+ int rc = sd_notify(0, "WATCHDOG=1");
+ if (rc < 0)
+ {
+ BMCWEB_LOG_ERROR("sd_notify failed: {}", -rc);
+ return;
+ }
+
+ startTimer();
+ }
+
+ boost::asio::steady_timer timer;
+ std::chrono::duration<uint64_t, std::micro> watchdogTime{};
+};
+
+} // namespace bmcweb
diff --git a/meson.options b/meson.options
index 3d988b6..981fa8b 100644
--- a/meson.options
+++ b/meson.options
@@ -461,6 +461,17 @@
behavior changes or be removed at any time.''',
)
+# BMCWEB_WATCHDOG_TIMEOUT_SECONDS
+option(
+ 'watchdog-timeout-seconds',
+ type: 'integer',
+ min: 0,
+ max: 600,
+ value: 120,
+ description: '''Specifies the systemd watchdog timeout interval in seconds.
+ Set to 0 to disable the watchdog.''',
+)
+
# Insecure options. Every option that starts with a `insecure` flag should
# not be enabled by default for any platform, unless the author fully comprehends
# the implications of doing so.In general, enabling these options will cause security
diff --git a/src/webserver_run.cpp b/src/webserver_run.cpp
index bf1168e..12c82fe 100644
--- a/src/webserver_run.cpp
+++ b/src/webserver_run.cpp
@@ -22,6 +22,7 @@
#include "redfish_aggregator.hpp"
#include "user_monitor.hpp"
#include "vm_websocket.hpp"
+#include "watchdog.hpp"
#include "webassets.hpp"
#include <boost/asio/io_context.hpp>
@@ -126,6 +127,8 @@
bmcweb::registerUserRemovedSignal();
+ bmcweb::ServiceWatchdog watchdog;
+
app.run();
systemBus->request_name("xyz.openbmc_project.bmcweb");