diff options
author | Djalal Harouni <tixxdz@opendz.org> | 2014-05-24 14:58:55 +0100 |
---|---|---|
committer | Lennart Poettering <lennart@poettering.net> | 2014-05-25 11:23:35 +0800 |
commit | e866af3acc30fcd1183a028ea3ef552b7237cc55 (patch) | |
tree | d13ad460ed1206420bb7e3e7dcbda67b30c052b9 /src/nspawn/nspawn.c | |
parent | 113cea802db444beab4783538d39966f707be788 (diff) |
nspawn: make nspawn robust to container failure
nspawn and the container child use eventfd to wait and notify each other
that they are ready so the container setup can be completed.
However, in its current form the wait/notify mechanism ignores errors that
may especially affect the child (container).
On errors the child will jump to the "child_fail" label and terminate
with _exit(EXIT_FAILURE) without notifying the parent. Since the eventfd
is created without the "EFD_NONBLOCK" flag, this leaves the parent
blocking on the eventfd_read() call. The container can also be killed
at any moment before execv() and the parent will not receive
notifications.
We can fix this cheaply by using the new high-level eventfd
API and handling SIGCHLD signals:
* Keep the cheap eventfd and EFD_NONBLOCK flag.
* Introduce eventfd states for the parent and the child to sync.
The child notifies the parent with EVENTFD_CHILD_SUCCEEDED on success, or
with EVENTFD_CHILD_FAILED on failure, before calling _exit(). This prevents
the parent from waiting on an event that will never come.
* In case the child is killed before execv() or before notifying the parent,
we install a NOP handler for SIGCHLD, so that the signal interrupts the
parent's blocking calls with EINTR. This gives the parent a chance to call
wait() and terminate in main().
* If there are no errors, parent will block SIGCHLD, restore default
handler and notify child which will do execv(), then parent will pass
control to process_pty() to do its magic.
This was exposed in part by:
https://bugs.freedesktop.org/show_bug.cgi?id=76193
Reported-by: Tobias Hunger <tobias.hunger@gmail.com>
Diffstat (limited to 'src/nspawn/nspawn.c')
-rw-r--r-- | src/nspawn/nspawn.c | 92 |
1 files changed, 67 insertions, 25 deletions
diff --git a/src/nspawn/nspawn.c b/src/nspawn/nspawn.c index 0cd476cd9e..a1d77244f8 100644 --- a/src/nspawn/nspawn.c +++ b/src/nspawn/nspawn.c @@ -84,6 +84,7 @@ #include "def.h" #include "rtnl-util.h" #include "udev-util.h" +#include "eventfd-util.h" #include "blkid-util.h" #include "gpt.h" #include "siphash24.h" @@ -2642,6 +2643,8 @@ static int wait_for_container(pid_t pid, ContainerStatus *container) { return r; } +static void nop_handler(int sig) {} + int main(int argc, char *argv[]) { _cleanup_free_ char *kdbus_domain = NULL, *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL; @@ -2653,8 +2656,8 @@ int main(int argc, char *argv[]) { const char *console = NULL; char veth_name[IFNAMSIZ]; bool secondary = false; + sigset_t mask, mask_chld; pid_t pid = 0; - sigset_t mask; log_parse_environment(); log_open(); @@ -2816,36 +2819,44 @@ int main(int argc, char *argv[]) { sd_notify(0, "READY=1"); assert_se(sigemptyset(&mask) == 0); + assert_se(sigemptyset(&mask_chld) == 0); + sigaddset(&mask_chld, SIGCHLD); sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1); assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0); for (;;) { ContainerStatus container_status; - int parent_ready_fd = -1, child_ready_fd = -1; - eventfd_t x; - - parent_ready_fd = eventfd(0, EFD_CLOEXEC); - if (parent_ready_fd < 0) { - log_error("Failed to create event fd: %m"); + int eventfds[2] = { -1, -1 }; + struct sigaction sa = { + .sa_handler = nop_handler, + .sa_flags = SA_NOCLDSTOP, + }; + + /* Child can be killed before execv(), so handle SIGCHLD + * in order to interrupt parent's blocking calls and + * give it a chance to call wait() and terminate. 
*/ + r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL); + if (r < 0) { + log_error("Failed to change the signal mask: %m"); goto finish; } - child_ready_fd = eventfd(0, EFD_CLOEXEC); - if (child_ready_fd < 0) { - log_error("Failed to create event fd: %m"); + r = sigaction(SIGCHLD, &sa, NULL); + if (r < 0) { + log_error("Failed to install SIGCHLD handler: %m"); goto finish; } - pid = syscall(__NR_clone, - SIGCHLD|CLONE_NEWNS| - (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS)| - (arg_private_network ? CLONE_NEWNET : 0), NULL); + pid = clone_with_eventfd(SIGCHLD|CLONE_NEWNS| + (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS)| + (arg_private_network ? CLONE_NEWNET : 0), eventfds); if (pid < 0) { if (errno == EINVAL) log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m"); else log_error("clone() failed: %m"); + r = pid; goto finish; } @@ -2986,8 +2997,11 @@ int main(int argc, char *argv[]) { /* Tell the parent that we are ready, and that * it can cgroupify us to that we lack access * to certain devices and resources. */ - eventfd_write(child_ready_fd, 1); - child_ready_fd = safe_close(child_ready_fd); + r = eventfd_send_state(eventfds[1], + EVENTFD_CHILD_SUCCEEDED); + eventfds[1] = safe_close(eventfds[1]); + if (r < 0) + goto child_fail; if (chdir(arg_directory) < 0) { log_error("chdir(%s) failed: %m", arg_directory); @@ -3089,8 +3103,10 @@ int main(int argc, char *argv[]) { env_use = (char**) envp; /* Wait until the parent is ready with the setup, too... 
*/ - eventfd_read(parent_ready_fd, &x); - parent_ready_fd = safe_close(parent_ready_fd); + r = eventfd_parent_succeeded(eventfds[0]); + eventfds[0] = safe_close(eventfds[0]); + if (r < 0) + goto child_fail; if (arg_boot) { char **a; @@ -3121,17 +3137,27 @@ int main(int argc, char *argv[]) { log_error("execv() failed: %m"); child_fail: + /* Tell the parent that the setup failed, so he + * can clean up resources and terminate. */ + if (eventfds[1] != -1) + eventfd_send_state(eventfds[1], + EVENTFD_CHILD_FAILED); _exit(EXIT_FAILURE); } fdset_free(fds); fds = NULL; - /* Wait until the child reported that it is ready with - * all it needs to do with privileges. After we got - * the notification we can make the process join its - * cgroup which might limit what it can do */ - eventfd_read(child_ready_fd, &x); + /* Wait for the child event: + * If EVENTFD_CHILD_FAILED, the child will terminate soon. + * If EVENTFD_CHILD_SUCCEEDED, the child is reporting that + * it is ready with all it needs to do with priviliges. + * After we got the notification we can make the process + * join its cgroup which might limit what it can do */ + r = eventfd_child_succeeded(eventfds[1]); + eventfds[1] = safe_close(eventfds[1]); + if (r < 0) + goto check_container_status; r = register_machine(pid); if (r < 0) @@ -3153,10 +3179,25 @@ int main(int argc, char *argv[]) { if (r < 0) goto finish; + /* Block SIGCHLD here, before notifying child. + * process_pty() will handle it with the other signals. */ + r = sigprocmask(SIG_BLOCK, &mask_chld, NULL); + if (r < 0) + goto finish; + + /* Reset signal to default */ + r = default_signals(SIGCHLD, -1); + if (r < 0) + goto finish; + /* Notify the child that the parent is ready with all - * its setup, and thtat the child can now hand over + * its setup, and that the child can now hand over * control to the code to run inside the container. 
*/ - eventfd_write(parent_ready_fd, 1); + r = eventfd_send_state(eventfds[0], + EVENTFD_PARENT_SUCCEEDED); + eventfds[0] = safe_close(eventfds[0]); + if (r < 0) + goto finish; k = process_pty(master, &mask, arg_boot ? pid : 0, SIGRTMIN+3); if (k < 0) { @@ -3170,6 +3211,7 @@ int main(int argc, char *argv[]) { /* Kill if it is not dead yet anyway */ terminate_machine(pid); +check_container_status: /* Redundant, but better safe than sorry */ kill(pid, SIGKILL); |