summaryrefslogtreecommitdiff
path: root/src/nspawn
diff options
context:
space:
mode:
authorAlessandro Puccetti <alessandro@kinvolk.io>2016-06-10 13:09:06 +0200
committerLennart Poettering <lennart@poettering.net>2016-06-10 13:09:06 +0200
commit9c1e04d0fa80c73ef0dd4647c103cdb7edb7f580 (patch)
tree26b3aad4edcbcb923a19e5904b928d3e559ce23d /src/nspawn
parent1edce01965b4da12dbf4363e77b62471ac664fa1 (diff)
nspawn: introduce --notify-ready=[no|yes] (#3474)
This the patch implements a notificaiton mechanism from the init process in the container to systemd-nspawn. The switch --notify-ready=yes configures systemd-nspawn to wait the "READY=1" message from the init process in the container to send its own to systemd. --notify-ready=no is equivalent to the previous behavior before this patch, systemd-nspawn notifies systemd with a "READY=1" message when the container is created. This notificaiton mechanism uses socket file with path relative to the contanier "/run/systemd/nspawn/notify". The default values it --notify-ready=no. It is also possible to configure this mechanism from the .nspawn files using NotifyReady. This parameter takes the same options of the command line switch. Before this patch, systemd-nspawn notifies "ready" after the inner child was created, regardless the status of the service running inside it. Now, with --notify-ready=yes, systemd-nspawn notifies when the service is ready. This is really useful when there are dependencies between different contaniers. Fixes https://github.com/systemd/systemd/issues/1369 Based on the work from https://github.com/systemd/systemd/pull/3022 Testing: Boot a OS inside a container with systemd-nspawn. Note: modify the commands accordingly with your filesystem. 1. Create a filesystem where you can boot an OS. 2. sudo systemd-nspawn -D ${HOME}/distros/fedora-23/ sh 2.1. Create the unit file /etc/systemd/system/sleep.service inside the container (You can use the example below) 2.2. systemdctl enable sleep 2.3 exit 3. sudo systemd-run --service-type=notify --unit=notify-test ${HOME}/systemd/systemd-nspawn --notify-ready=yes -D ${HOME}/distros/fedora-23/ -b 4. In a different shell run "systemctl status notify-test" When using --notify-ready=yes the service status is "activating" for 20 seconds before being set to "active (running)". Instead, using --notify-ready=no the service status is marked "active (running)" quickly, without waiting for the 20 seconds. This patch was also test with --private-users=yes, you can test it just adding it at the end of the command at point 3. ------ sleep.service ------ [Unit] Description=sleep After=network.target [Service] Type=oneshot ExecStart=/bin/sleep 20 [Install] WantedBy=multi-user.target ------------ end ------------
Diffstat (limited to 'src/nspawn')
-rw-r--r--src/nspawn/nspawn-gperf.gperf1
-rw-r--r--src/nspawn/nspawn-settings.h4
-rw-r--r--src/nspawn/nspawn.c195
3 files changed, 192 insertions, 8 deletions
diff --git a/src/nspawn/nspawn-gperf.gperf b/src/nspawn/nspawn-gperf.gperf
index 2b5d452662..3231a48d5a 100644
--- a/src/nspawn/nspawn-gperf.gperf
+++ b/src/nspawn/nspawn-gperf.gperf
@@ -27,6 +27,7 @@ Exec.Personality, config_parse_personality, 0, offsetof(Settings,
Exec.MachineID, config_parse_id128, 0, offsetof(Settings, machine_id)
Exec.WorkingDirectory, config_parse_path, 0, offsetof(Settings, working_directory)
Exec.PrivateUsers, config_parse_private_users, 0, 0
+Exec.NotifyReady, config_parse_bool, 0, offsetof(Settings, notify_ready)
Files.ReadOnly, config_parse_tristate, 0, offsetof(Settings, read_only)
Files.Volatile, config_parse_volatile_mode, 0, offsetof(Settings, volatile_mode)
Files.Bind, config_parse_bind, 0, 0
diff --git a/src/nspawn/nspawn-settings.h b/src/nspawn/nspawn-settings.h
index 1c47e37912..231e6d7266 100644
--- a/src/nspawn/nspawn-settings.h
+++ b/src/nspawn/nspawn-settings.h
@@ -56,7 +56,8 @@ typedef enum SettingsMask {
SETTING_CUSTOM_MOUNTS = 1 << 11,
SETTING_WORKING_DIRECTORY = 1 << 12,
SETTING_USERNS = 1 << 13,
- _SETTINGS_MASK_ALL = (1 << 14) -1
+ SETTING_NOTIFY_READY = 1 << 14,
+ _SETTINGS_MASK_ALL = (1 << 15) -1
} SettingsMask;
typedef struct Settings {
@@ -73,6 +74,7 @@ typedef struct Settings {
char *working_directory;
UserNamespaceMode userns_mode;
uid_t uid_shift, uid_range;
+ bool notify_ready;
/* [Image] */
int read_only;
diff --git a/src/nspawn/nspawn.c b/src/nspawn/nspawn.c
index d1c65e8b0b..ea24de7608 100644
--- a/src/nspawn/nspawn.c
+++ b/src/nspawn/nspawn.c
@@ -104,6 +104,10 @@
* UID range here */
#define UID_SHIFT_PICK_MIN ((uid_t) UINT32_C(0x00080000))
#define UID_SHIFT_PICK_MAX ((uid_t) UINT32_C(0x6FFF0000))
+/* nspawn is listening on the socket at the path in the constant nspawn_notify_socket_path
+ * nspawn_notify_socket_path is relative to the container
+ * the init process in the container pid can send messages to nspawn following the sd_notify(3) protocol */
+#define NSPAWN_NOTIFY_SOCKET_PATH "/run/systemd/nspawn/notify"
typedef enum ContainerStatus {
CONTAINER_TERMINATED,
@@ -187,6 +191,7 @@ static SettingsMask arg_settings_mask = 0;
static int arg_settings_trusted = -1;
static char **arg_parameters = NULL;
static const char *arg_container_service_name = "systemd-nspawn";
+static bool arg_notify_ready = false;
static void help(void) {
printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
@@ -267,6 +272,8 @@ static void help(void) {
" the service unit nspawn is running in\n"
" --volatile[=MODE] Run the system in volatile mode\n"
" --settings=BOOLEAN Load additional settings from .nspawn file\n"
+ " --notify-ready=BOOLEAN Receive notifications from the container's init process,\n"
+ " accepted values: yes and no\n"
, program_invocation_short_name);
}
@@ -367,6 +374,7 @@ static int parse_argv(int argc, char *argv[]) {
ARG_SETTINGS,
ARG_CHDIR,
ARG_PRIVATE_USERS_CHOWN,
+ ARG_NOTIFY_READY,
};
static const struct option options[] = {
@@ -415,6 +423,7 @@ static int parse_argv(int argc, char *argv[]) {
{ "kill-signal", required_argument, NULL, ARG_KILL_SIGNAL },
{ "settings", required_argument, NULL, ARG_SETTINGS },
{ "chdir", required_argument, NULL, ARG_CHDIR },
+ { "notify-ready", required_argument, NULL, ARG_NOTIFY_READY },
{}
};
@@ -987,6 +996,16 @@ static int parse_argv(int argc, char *argv[]) {
arg_settings_mask |= SETTING_WORKING_DIRECTORY;
break;
+ case ARG_NOTIFY_READY:
+ r = parse_boolean(optarg);
+ if (r < 0) {
+ log_error("%s is not a valid notify mode. Valid modes are: yes, no, and ready.", optarg);
+ return -EINVAL;
+ }
+ arg_notify_ready = r;
+ arg_settings_mask |= SETTING_NOTIFY_READY;
+ break;
+
case '?':
return -EINVAL;
@@ -2529,6 +2548,7 @@ static int inner_child(
NULL, /* container_uuid */
NULL, /* LISTEN_FDS */
NULL, /* LISTEN_PID */
+ NULL, /* NOTIFY_SOCKET */
NULL
};
@@ -2656,6 +2676,8 @@ static int inner_child(
(asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0))
return log_oom();
}
+ if (asprintf((char **)(envp + n_env++), "NOTIFY_SOCKET=%s", NSPAWN_NOTIFY_SOCKET_PATH) < 0)
+ return log_oom();
env_use = strv_env_merge(2, envp, arg_setenv);
if (!env_use)
@@ -2725,6 +2747,37 @@ static int inner_child(
return log_error_errno(r, "execv() failed: %m");
}
+static int setup_sd_notify_child(void) {
+ static const int one = 1;
+ int fd = -1;
+ union sockaddr_union sa = {
+ .sa.sa_family = AF_UNIX,
+ };
+ int r;
+
+ fd = socket(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0);
+ if (fd < 0)
+ return log_error_errno(errno, "Failed to allocate notification socket: %m");
+
+ (void) mkdir_parents(NSPAWN_NOTIFY_SOCKET_PATH, 0755);
+ (void) unlink(NSPAWN_NOTIFY_SOCKET_PATH);
+
+ strncpy(sa.un.sun_path, NSPAWN_NOTIFY_SOCKET_PATH, sizeof(sa.un.sun_path)-1);
+ r = bind(fd, &sa.sa, SOCKADDR_UN_LEN(sa.un));
+ if (r < 0) {
+ safe_close(fd);
+ return log_error_errno(errno, "bind(%s) failed: %m", sa.un.sun_path);
+ }
+
+ r = setsockopt(fd, SOL_SOCKET, SO_PASSCRED, &one, sizeof(one));
+ if (r < 0) {
+ safe_close(fd);
+ return log_error_errno(errno, "SO_PASSCRED failed: %m");
+ }
+
+ return fd;
+}
+
static int outer_child(
Barrier *barrier,
const char *directory,
@@ -2736,6 +2789,7 @@ static int outer_child(
bool secondary,
int pid_socket,
int uuid_socket,
+ int notify_socket,
int kmsg_socket,
int rtnl_socket,
int uid_shift_socket,
@@ -2744,12 +2798,14 @@ static int outer_child(
pid_t pid;
ssize_t l;
int r;
+ _cleanup_close_ int fd = -1;
assert(barrier);
assert(directory);
assert(console);
assert(pid_socket >= 0);
assert(uuid_socket >= 0);
+ assert(notify_socket >= 0);
assert(kmsg_socket >= 0);
cg_unified_flush();
@@ -2936,6 +2992,10 @@ static int outer_child(
if (r < 0)
return log_error_errno(r, "Failed to move root directory: %m");
+ fd = setup_sd_notify_child();
+ if (fd < 0)
+ return fd;
+
pid = raw_clone(SIGCHLD|CLONE_NEWNS|
(arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS) |
(arg_private_network ? CLONE_NEWNET : 0) |
@@ -2945,6 +3005,7 @@ static int outer_child(
if (pid == 0) {
pid_socket = safe_close(pid_socket);
uuid_socket = safe_close(uuid_socket);
+ notify_socket = safe_close(notify_socket);
uid_shift_socket = safe_close(uid_shift_socket);
/* The inner child has all namespaces that are
@@ -2974,8 +3035,13 @@ static int outer_child(
return -EIO;
}
+ l = send_one_fd(notify_socket, fd, 0);
+ if (l < 0)
+ return log_error_errno(errno, "Failed to send notify fd: %m");
+
pid_socket = safe_close(pid_socket);
uuid_socket = safe_close(uuid_socket);
+ notify_socket = safe_close(notify_socket);
kmsg_socket = safe_close(kmsg_socket);
rtnl_socket = safe_close(rtnl_socket);
@@ -3058,6 +3124,96 @@ static int setup_uid_map(pid_t pid) {
return 0;
}
+static int nspawn_dispatch_notify_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata) {
+ _cleanup_fdset_free_ FDSet *fds = NULL;
+ char buf[NOTIFY_BUFFER_MAX+1];
+ char *p = NULL;
+ struct iovec iovec = {
+ .iov_base = buf,
+ .iov_len = sizeof(buf)-1,
+ };
+ union {
+ struct cmsghdr cmsghdr;
+ uint8_t buf[CMSG_SPACE(sizeof(struct ucred)) +
+ CMSG_SPACE(sizeof(int) * NOTIFY_FD_MAX)];
+ } control = {};
+ struct msghdr msghdr = {
+ .msg_iov = &iovec,
+ .msg_iovlen = 1,
+ .msg_control = &control,
+ .msg_controllen = sizeof(control),
+ };
+ struct cmsghdr *cmsg;
+ struct ucred *ucred = NULL;
+ ssize_t n;
+ pid_t inner_child_pid;
+ _cleanup_strv_free_ char **tags = NULL;
+
+ assert(userdata);
+
+ inner_child_pid = PTR_TO_PID(userdata);
+
+ if (revents != EPOLLIN) {
+ log_warning("Got unexpected poll event for notify fd.");
+ return 0;
+ }
+
+ n = recvmsg(fd, &msghdr, MSG_DONTWAIT|MSG_CMSG_CLOEXEC);
+ if (n < 0) {
+ if (errno == EAGAIN || errno == EINTR)
+ return 0;
+
+ return log_warning_errno(errno, "Couldn't read notification socket: %m");
+ }
+ cmsg_close_all(&msghdr);
+
+ CMSG_FOREACH(cmsg, &msghdr) {
+ if (cmsg->cmsg_level == SOL_SOCKET &&
+ cmsg->cmsg_type == SCM_CREDENTIALS &&
+ cmsg->cmsg_len == CMSG_LEN(sizeof(struct ucred))) {
+
+ ucred = (struct ucred*) CMSG_DATA(cmsg);
+ }
+ }
+
+ if (!ucred || ucred->pid != inner_child_pid) {
+ log_warning("Received notify message without valid credentials. Ignoring.");
+ return 0;
+ }
+
+ if ((size_t) n >= sizeof(buf)) {
+ log_warning("Received notify message exceeded maximum size. Ignoring.");
+ return 0;
+ }
+
+ buf[n] = 0;
+ tags = strv_split(buf, "\n\r");
+ if (!tags)
+ return log_oom();
+
+ if (strv_find(tags, "READY=1"))
+ sd_notifyf(false, "READY=1\n");
+
+ p = strv_find_startswith(tags, "STATUS=");
+ if (p)
+ sd_notifyf(false, "STATUS=Container running: %s", p);
+
+ return 0;
+}
+
+static int setup_sd_notify_parent(sd_event *event, int fd, pid_t *inner_child_pid) {
+ int r;
+ sd_event_source *notify_event_source;
+
+ r = sd_event_add_io(event, &notify_event_source, fd, EPOLLIN, nspawn_dispatch_notify_fd, inner_child_pid);
+ if (r < 0)
+ return log_error_errno(r, "Failed to allocate notify event source: %m");
+
+ (void) sd_event_source_set_description(notify_event_source, "nspawn-notify");
+
+ return 0;
+}
+
static int load_settings(void) {
_cleanup_(settings_freep) Settings *settings = NULL;
_cleanup_fclose_ FILE *f = NULL;
@@ -3286,6 +3442,9 @@ static int load_settings(void) {
}
}
+ if ((arg_settings_mask & SETTING_NOTIFY_READY) == 0)
+ arg_notify_ready = settings->notify_ready;
+
return 0;
}
@@ -3536,7 +3695,9 @@ int main(int argc, char *argv[]) {
rtnl_socket_pair[2] = { -1, -1 },
pid_socket_pair[2] = { -1, -1 },
uuid_socket_pair[2] = { -1, -1 },
+ notify_socket_pair[2] = { -1, -1 },
uid_shift_socket_pair[2] = { -1, -1 };
+ _cleanup_close_ int notify_socket= -1;
_cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
_cleanup_(sd_event_unrefp) sd_event *event = NULL;
_cleanup_(pty_forward_freep) PTYForward *forward = NULL;
@@ -3587,6 +3748,11 @@ int main(int argc, char *argv[]) {
goto finish;
}
+ if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, notify_socket_pair) < 0) {
+ r = log_error_errno(errno, "Failed to create notify socket pair: %m");
+ goto finish;
+ }
+
if (arg_userns_mode != USER_NAMESPACE_NO)
if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, uid_shift_socket_pair) < 0) {
r = log_error_errno(errno, "Failed to create uid shift socket pair: %m");
@@ -3628,6 +3794,7 @@ int main(int argc, char *argv[]) {
rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
pid_socket_pair[0] = safe_close(pid_socket_pair[0]);
uuid_socket_pair[0] = safe_close(uuid_socket_pair[0]);
+ notify_socket_pair[0] = safe_close(notify_socket_pair[0]);
uid_shift_socket_pair[0] = safe_close(uid_shift_socket_pair[0]);
(void) reset_all_signal_handlers();
@@ -3643,6 +3810,7 @@ int main(int argc, char *argv[]) {
secondary,
pid_socket_pair[1],
uuid_socket_pair[1],
+ notify_socket_pair[1],
kmsg_socket_pair[1],
rtnl_socket_pair[1],
uid_shift_socket_pair[1],
@@ -3661,6 +3829,7 @@ int main(int argc, char *argv[]) {
rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
pid_socket_pair[1] = safe_close(pid_socket_pair[1]);
uuid_socket_pair[1] = safe_close(uuid_socket_pair[1]);
+ notify_socket_pair[1] = safe_close(notify_socket_pair[1]);
uid_shift_socket_pair[1] = safe_close(uid_shift_socket_pair[1]);
if (arg_userns_mode != USER_NAMESPACE_NO) {
@@ -3734,6 +3903,13 @@ int main(int argc, char *argv[]) {
goto finish;
}
+ /* We also retrieve the socket used for notifications generated by outer child */
+ notify_socket = receive_one_fd(notify_socket_pair[0], 0);
+ if (notify_socket < 0) {
+ r = log_error_errno(errno, "Failed to receive notification socket from the outer child: %m");
+ goto finish;
+ }
+
log_debug("Init process invoked as PID " PID_FMT, pid);
if (arg_userns_mode != USER_NAMESPACE_NO) {
@@ -3848,6 +4024,16 @@ int main(int argc, char *argv[]) {
goto finish;
}
+ r = sd_event_new(&event);
+ if (r < 0) {
+ log_error_errno(r, "Failed to get default event source: %m");
+ goto finish;
+ }
+
+ r = setup_sd_notify_parent(event, notify_socket, PID_TO_PTR(pid));
+ if (r < 0)
+ goto finish;
+
/* Let the child know that we are ready and wait that the child is completely ready now. */
if (!barrier_place_and_sync(&barrier)) { /* #4 */
log_error("Child died too early.");
@@ -3860,15 +4046,10 @@ int main(int argc, char *argv[]) {
etc_passwd_lock = safe_close(etc_passwd_lock);
sd_notifyf(false,
- "READY=1\n"
"STATUS=Container running.\n"
"X_NSPAWN_LEADER_PID=" PID_FMT, pid);
-
- r = sd_event_new(&event);
- if (r < 0) {
- log_error_errno(r, "Failed to get default event source: %m");
- goto finish;
- }
+ if (!arg_notify_ready)
+ sd_notify(false, "READY=1\n");
if (arg_kill_signal > 0) {
/* Try to kill the init system on SIGINT or SIGTERM */