diff options
Diffstat (limited to 'src/nspawn/nspawn.c')
-rw-r--r-- | src/nspawn/nspawn.c | 451 |
1 files changed, 356 insertions, 95 deletions
diff --git a/src/nspawn/nspawn.c b/src/nspawn/nspawn.c index d1c65e8b0b..fcf14bba4c 100644 --- a/src/nspawn/nspawn.c +++ b/src/nspawn/nspawn.c @@ -61,9 +61,9 @@ #include "fs-util.h" #include "gpt.h" #include "hostname-util.h" +#include "id128-util.h" #include "log.h" #include "loopback-setup.h" -#include "machine-id-setup.h" #include "machine-image.h" #include "macro.h" #include "missing.h" @@ -76,10 +76,10 @@ #include "nspawn-network.h" #include "nspawn-patch-uid.h" #include "nspawn-register.h" +#include "nspawn-seccomp.h" #include "nspawn-settings.h" #include "nspawn-setuid.h" #include "nspawn-stub-pid1.h" -#include "nspawn-seccomp.h" #include "parse-util.h" #include "path-util.h" #include "process-util.h" @@ -101,10 +101,16 @@ #include "util.h" /* Note that devpts's gid= parameter parses GIDs as signed values, hence we stay away from the upper half of the 32bit - * UID range here */ + * UID range here. We leave a bit of room at the lower end and a lot of room at the upper end, so that other subsystems + * may have their own allocation ranges too. */ #define UID_SHIFT_PICK_MIN ((uid_t) UINT32_C(0x00080000)) #define UID_SHIFT_PICK_MAX ((uid_t) UINT32_C(0x6FFF0000)) +/* nspawn is listening on the socket at the path in the constant nspawn_notify_socket_path + * nspawn_notify_socket_path is relative to the container + * the init process in the container pid can send messages to nspawn following the sd_notify(3) protocol */ +#define NSPAWN_NOTIFY_SOCKET_PATH "/run/systemd/nspawn/notify" + typedef enum ContainerStatus { CONTAINER_TERMINATED, CONTAINER_REBOOTED @@ -133,6 +139,8 @@ static bool arg_ephemeral = false; static LinkJournal arg_link_journal = LINK_AUTO; static bool arg_link_journal_try = false; static uint64_t arg_caps_retain = + (1ULL << CAP_AUDIT_CONTROL) | + (1ULL << CAP_AUDIT_WRITE) | (1ULL << CAP_CHOWN) | (1ULL << CAP_DAC_OVERRIDE) | (1ULL << CAP_DAC_READ_SEARCH) | @@ -142,23 +150,21 @@ static uint64_t arg_caps_retain = (1ULL << CAP_KILL) | (1ULL << CAP_LEASE) | (1ULL << CAP_LINUX_IMMUTABLE) | + (1ULL << CAP_MKNOD) | (1ULL << CAP_NET_BIND_SERVICE) | (1ULL << CAP_NET_BROADCAST) | (1ULL << CAP_NET_RAW) | - (1ULL << CAP_SETGID) | (1ULL << CAP_SETFCAP) | + (1ULL << CAP_SETGID) | (1ULL << CAP_SETPCAP) | (1ULL << CAP_SETUID) | (1ULL << CAP_SYS_ADMIN) | + (1ULL << CAP_SYS_BOOT) | (1ULL << CAP_SYS_CHROOT) | (1ULL << CAP_SYS_NICE) | (1ULL << CAP_SYS_PTRACE) | - (1ULL << CAP_SYS_TTY_CONFIG) | (1ULL << CAP_SYS_RESOURCE) | - (1ULL << CAP_SYS_BOOT) | - (1ULL << CAP_AUDIT_WRITE) | - (1ULL << CAP_AUDIT_CONTROL) | - (1ULL << CAP_MKNOD); + (1ULL << CAP_SYS_TTY_CONFIG); static CustomMount *arg_custom_mounts = NULL; static unsigned arg_n_custom_mounts = 0; static char **arg_setenv = NULL; @@ -187,6 +193,8 @@ static SettingsMask arg_settings_mask = 0; static int arg_settings_trusted = -1; static char **arg_parameters = NULL; static const char *arg_container_service_name = "systemd-nspawn"; +static bool arg_notify_ready = false; +static bool arg_use_cgns = true; static void help(void) { printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n" @@ -208,10 +216,10 @@ static void help(void) { " --uuid=UUID Set a specific machine UUID for the container\n" " -S --slice=SLICE Place the container in the specified slice\n" " --property=NAME=VALUE Set scope unit property\n" - " -U --private-users=pick Run within user namespace, pick UID/GID range automatically\n" + " -U --private-users=pick Run within user namespace, autoselect UID/GID range\n" " --private-users[=UIDBASE[:NUIDS]]\n" - " Run within user namespace, user configured UID/GID range\n" - " --private-user-chown Adjust OS tree file ownership for private UID/GID range\n" + " Similar, but with user configured UID/GID range\n" + " --private-user-chown Adjust OS tree ownership to private UID/GID range\n" " --private-network Disable network in container\n" " --network-interface=INTERFACE\n" " Assign an existing network interface to the\n" @@ -228,11 +236,10 @@ static void help(void) { " Add an additional virtual Ethernet link between\n" " host and container\n" " --network-bridge=INTERFACE\n" - " Add a virtual Ethernet connection between host\n" - " and container and add it to an existing bridge on\n" - " the host\n" - " --network-zone=NAME Add a virtual Ethernet connection to the container,\n" - " and add it to an automatically managed bridge interface\n" + " Add a virtual Ethernet connection to the container\n" + " and attach it to an existing bridge on the host\n" + " --network-zone=NAME Similar, but attach the new interface to an\n" + " an automatically managed bridge interface\n" " -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n" " Expose a container IP port on the host\n" " -Z --selinux-context=SECLABEL\n" @@ -261,16 +268,15 @@ static void help(void) { " --overlay-ro=PATH[:PATH...]:PATH\n" " Similar, but creates a read-only overlay mount\n" " -E --setenv=NAME=VALUE Pass an environment variable to PID 1\n" - " --share-system Share system namespaces with host\n" " --register=BOOLEAN Register container as machine\n" " --keep-unit Do not register a scope for the machine, reuse\n" " the service unit nspawn is running in\n" " --volatile[=MODE] Run the system in volatile mode\n" " --settings=BOOLEAN Load additional settings from .nspawn file\n" + " --notify-ready=BOOLEAN Receive notifications from the child init process\n" , program_invocation_short_name); } - static int custom_mounts_prepare(void) { unsigned i; int r; @@ -367,6 +373,7 @@ static int parse_argv(int argc, char *argv[]) { ARG_SETTINGS, ARG_CHDIR, ARG_PRIVATE_USERS_CHOWN, + ARG_NOTIFY_READY, }; static const struct option options[] = { @@ -395,7 +402,7 @@ static int parse_argv(int argc, char *argv[]) { { "selinux-context", required_argument, NULL, 'Z' }, { "selinux-apifs-context", required_argument, NULL, 'L' }, { "quiet", no_argument, NULL, 'q' }, - { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM }, + { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM }, /* not documented */ { "register", required_argument, NULL, ARG_REGISTER }, { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT }, { "network-interface", required_argument, NULL, ARG_NETWORK_INTERFACE }, @@ -415,6 +422,7 @@ static int parse_argv(int argc, char *argv[]) { { "kill-signal", required_argument, NULL, ARG_KILL_SIGNAL }, { "settings", required_argument, NULL, ARG_SETTINGS }, { "chdir", required_argument, NULL, ARG_CHDIR }, + { "notify-ready", required_argument, NULL, ARG_NOTIFY_READY }, {} }; @@ -584,9 +592,12 @@ static int parse_argv(int argc, char *argv[]) { case ARG_UUID: r = sd_id128_from_string(optarg, &arg_uuid); - if (r < 0) { - log_error("Invalid UUID: %s", optarg); - return r; + if (r < 0) + return log_error_errno(r, "Invalid UUID: %s", optarg); + + if (sd_id128_is_null(arg_uuid)) { + log_error("Machine UUID may not be all zeroes."); + return -EINVAL; } arg_settings_mask |= SETTING_MACHINE_ID; @@ -800,6 +811,8 @@ static int parse_argv(int argc, char *argv[]) { break; case ARG_SHARE_SYSTEM: + /* We don't officially support this anymore, except for compat reasons. People should use the + * $SYSTEMD_NSPAWN_SHARE_SYSTEM environment variable instead. */ arg_share_system = true; break; @@ -987,6 +1000,16 @@ static int parse_argv(int argc, char *argv[]) { arg_settings_mask |= SETTING_WORKING_DIRECTORY; break; + case ARG_NOTIFY_READY: + r = parse_boolean(optarg); + if (r < 0) { + log_error("%s is not a valid notify mode. Valid modes are: yes, no, and ready.", optarg); + return -EINVAL; + } + arg_notify_ready = r; + arg_settings_mask |= SETTING_NOTIFY_READY; + break; + case '?': return -EINVAL; @@ -994,6 +1017,9 @@ static int parse_argv(int argc, char *argv[]) { assert_not_reached("Unhandled option"); } + if (getenv_bool("SYSTEMD_NSPAWN_SHARE_SYSTEM") > 0) + arg_share_system = true; + if (arg_share_system) arg_register = false; @@ -1001,7 +1027,7 @@ static int parse_argv(int argc, char *argv[]) { arg_userns_chown = true; if (arg_start_mode != START_PID1 && arg_share_system) { - log_error("--boot and --share-system may not be combined."); + log_error("--boot and SYSTEMD_NSPAWN_SHARE_SYSTEM=1 may not be combined."); return -EINVAL; } @@ -1081,6 +1107,12 @@ static int parse_argv(int argc, char *argv[]) { if (e) arg_container_service_name = e; + r = getenv_bool("SYSTEMD_NSPAWN_USE_CGNS"); + if (r < 0) + arg_use_cgns = cg_ns_supported(); + else + arg_use_cgns = r; + return 1; } @@ -1224,42 +1256,46 @@ static int setup_resolv_conf(const char *dest) { /* Fix resolv.conf, if possible */ where = prefix_roota(dest, "/etc/resolv.conf"); + if (access("/usr/lib/systemd/resolv.conf", F_OK) >= 0) { + /* resolved is enabled on the host. In this, case bind mount its static resolv.conf file into the + * container, so that the container can use the host's resolver. Given that network namespacing is + * disabled it's only natural of the container also uses the host's resolver. It also has the big + * advantage that the container will be able to follow the host's DNS server configuration changes + * transparently. */ + + if (mount("/usr/lib/systemd/resolv.conf", where, NULL, MS_BIND, NULL) < 0) + log_warning_errno(errno, "Failed to mount /etc/resolv.conf in the container, ignoring: %m"); + else { + if (mount(NULL, where, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL) < 0) + return log_error_errno(errno, "Failed to remount /etc/resolv.conf read-only: %m"); + + return 0; + } + } + + /* If that didn't work, let's copy the file */ r = copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW, 0644, 0); if (r < 0) { - /* If the file already exists as symlink, let's - * suppress the warning, under the assumption that - * resolved or something similar runs inside and the - * symlink points there. + /* If the file already exists as symlink, let's suppress the warning, under the assumption that + * resolved or something similar runs inside and the symlink points there. * - * If the disk image is read-only, there's also no - * point in complaining. + * If the disk image is read-only, there's also no point in complaining. */ log_full_errno(IN_SET(r, -ELOOP, -EROFS) ? LOG_DEBUG : LOG_WARNING, r, - "Failed to copy /etc/resolv.conf to %s: %m", where); + "Failed to copy /etc/resolv.conf to %s, ignoring: %m", where); return 0; } r = userns_lchown(where, 0, 0); if (r < 0) - log_warning_errno(r, "Failed to chown /etc/resolv.conf: %m"); + log_warning_errno(r, "Failed to chown /etc/resolv.conf, ignoring: %m"); return 0; } -static char* id128_format_as_uuid(sd_id128_t id, char s[37]) { - assert(s); - - snprintf(s, 37, - "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x", - SD_ID128_FORMAT_VAL(id)); - - return s; -} - static int setup_boot_id(const char *dest) { + sd_id128_t rnd = SD_ID128_NULL; const char *from, *to; - sd_id128_t rnd = {}; - char as_uuid[37]; int r; if (arg_share_system) @@ -1275,18 +1311,16 @@ static int setup_boot_id(const char *dest) { if (r < 0) return log_error_errno(r, "Failed to generate random boot id: %m"); - id128_format_as_uuid(rnd, as_uuid); - - r = write_string_file(from, as_uuid, WRITE_STRING_FILE_CREATE); + r = id128_write(from, ID128_UUID, rnd, false); if (r < 0) return log_error_errno(r, "Failed to write boot id: %m"); if (mount(from, to, NULL, MS_BIND, NULL) < 0) r = log_error_errno(errno, "Failed to bind mount boot id: %m"); else if (mount(NULL, to, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL) < 0) - log_warning_errno(errno, "Failed to make boot id read-only: %m"); + r = log_error_errno(errno, "Failed to make boot id read-only: %m"); - unlink(from); + (void) unlink(from); return r; } @@ -1784,17 +1818,18 @@ static int dissect_image( char **root_device, bool *root_device_rw, char **home_device, bool *home_device_rw, char **srv_device, bool *srv_device_rw, + char **esp_device, bool *secondary) { #ifdef HAVE_BLKID - int home_nr = -1, srv_nr = -1; + int home_nr = -1, srv_nr = -1, esp_nr = -1; #ifdef GPT_ROOT_NATIVE int root_nr = -1; #endif #ifdef GPT_ROOT_SECONDARY int secondary_root_nr = -1; #endif - _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL, *generic = NULL; + _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL, *esp = NULL, *generic = NULL; _cleanup_udev_enumerate_unref_ struct udev_enumerate *e = NULL; _cleanup_udev_device_unref_ struct udev_device *d = NULL; _cleanup_blkid_free_probe_ blkid_probe b = NULL; @@ -1812,6 +1847,7 @@ static int dissect_image( assert(root_device); assert(home_device); assert(srv_device); + assert(esp_device); assert(secondary); assert(arg_image); @@ -2025,6 +2061,16 @@ static int dissect_image( r = free_and_strdup(&srv, node); if (r < 0) return log_oom(); + } else if (sd_id128_equal(type_id, GPT_ESP)) { + + if (esp && nr >= esp_nr) + continue; + + esp_nr = nr; + + r = free_and_strdup(&esp, node); + if (r < 0) + return log_oom(); } #ifdef GPT_ROOT_NATIVE else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE)) { @@ -2142,6 +2188,11 @@ static int dissect_image( *srv_device_rw = srv_rw; } + if (esp) { + *esp_device = esp; + esp = NULL; + } + return 0; #else log_error("--image= is not supported, compiled without blkid support."); @@ -2212,33 +2263,37 @@ static int mount_device(const char *what, const char *where, const char *directo } static int setup_machine_id(const char *directory) { + const char *etc_machine_id; + sd_id128_t id; int r; - const char *etc_machine_id, *t; - _cleanup_free_ char *s = NULL; - etc_machine_id = prefix_roota(directory, "/etc/machine-id"); + /* If the UUID in the container is already set, then that's what counts, and we use. If it isn't set, and the + * caller passed --uuid=, then we'll pass it in the $container_uuid env var to PID 1 of the container. The + * assumption is that PID 1 will then write it to /etc/machine-id to make it persistent. If --uuid= is not + * passed we generate a random UUID, and pass it via $container_uuid. In effect this means that /etc/machine-id + * in the container and our idea of the container UUID will always be in sync (at least if PID 1 in the + * container behaves nicely). */ - r = read_one_line_file(etc_machine_id, &s); - if (r < 0) - return log_error_errno(r, "Failed to read machine ID from %s: %m", etc_machine_id); + etc_machine_id = prefix_roota(directory, "/etc/machine-id"); - t = strstrip(s); + r = id128_read(etc_machine_id, ID128_PLAIN, &id); + if (r < 0) { + if (!IN_SET(r, -ENOENT, -ENOMEDIUM)) /* If the file is missing or empty, we don't mind */ + return log_error_errno(r, "Failed to read machine ID from container image: %m"); - if (!isempty(t)) { - r = sd_id128_from_string(t, &arg_uuid); - if (r < 0) - return log_error_errno(r, "Failed to parse machine ID from %s: %m", etc_machine_id); - } else { if (sd_id128_is_null(arg_uuid)) { r = sd_id128_randomize(&arg_uuid); if (r < 0) - return log_error_errno(r, "Failed to generate random machine ID: %m"); + return log_error_errno(r, "Failed to acquire randomized machine UUID: %m"); + } + } else { + if (sd_id128_is_null(id)) { + log_error("Machine ID in container image is zero, refusing."); + return -EINVAL; } - } - r = machine_id_setup(directory, arg_uuid); - if (r < 0) - return log_error_errno(r, "Failed to setup machine ID: %m"); + arg_uuid = id; + } return 0; } @@ -2270,7 +2325,8 @@ static int mount_devices( const char *where, const char *root_device, bool root_device_rw, const char *home_device, bool home_device_rw, - const char *srv_device, bool srv_device_rw) { + const char *srv_device, bool srv_device_rw, + const char *esp_device) { int r; assert(where); @@ -2293,6 +2349,27 @@ static int mount_devices( return log_error_errno(r, "Failed to mount server data directory: %m"); } + if (esp_device) { + const char *mp, *x; + + /* Mount the ESP to /efi if it exists and is empty. If it doesn't exist, use /boot instead. */ + + mp = "/efi"; + x = strjoina(arg_directory, mp); + r = dir_is_empty(x); + if (r == -ENOENT) { + mp = "/boot"; + x = strjoina(arg_directory, mp); + r = dir_is_empty(x); + } + + if (r > 0) { + r = mount_device(esp_device, arg_directory, mp, true); + if (r < 0) + return log_error_errno(r, "Failed to mount ESP: %m"); + } + } + return 0; } @@ -2529,6 +2606,7 @@ static int inner_child( NULL, /* container_uuid */ NULL, /* LISTEN_FDS */ NULL, /* LISTEN_PID */ + NULL, /* NOTIFY_SOCKET */ NULL }; @@ -2574,9 +2652,25 @@ static int inner_child( return -ESRCH; } - r = mount_systemd_cgroup_writable("", arg_unified_cgroup_hierarchy); - if (r < 0) - return r; + if (arg_use_cgns && cg_ns_supported()) { + r = unshare(CLONE_NEWCGROUP); + if (r < 0) + return log_error_errno(errno, "Failed to unshare cgroup namespace"); + r = mount_cgroups( + "", + arg_unified_cgroup_hierarchy, + arg_userns_mode != USER_NAMESPACE_NO, + arg_uid_shift, + arg_uid_range, + arg_selinux_apifs_context, + arg_use_cgns); + if (r < 0) + return r; + } else { + r = mount_systemd_cgroup_writable("", arg_unified_cgroup_hierarchy); + if (r < 0) + return r; + } r = reset_uid_gid(); if (r < 0) @@ -2622,7 +2716,7 @@ static int inner_child( #ifdef HAVE_SELINUX if (arg_selinux_context) - if (setexeccon((security_context_t) arg_selinux_context) < 0) + if (setexeccon(arg_selinux_context) < 0) return log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context); #endif @@ -2642,9 +2736,9 @@ static int inner_child( (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) return log_oom(); - assert(!sd_id128_equal(arg_uuid, SD_ID128_NULL)); + assert(!sd_id128_is_null(arg_uuid)); - if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_format_as_uuid(arg_uuid, as_uuid)) < 0) + if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_to_uuid_string(arg_uuid, as_uuid)) < 0) return log_oom(); if (fdset_size(fds) > 0) { @@ -2656,6 +2750,8 @@ static int inner_child( (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0)) return log_oom(); } + if (asprintf((char **)(envp + n_env++), "NOTIFY_SOCKET=%s", NSPAWN_NOTIFY_SOCKET_PATH) < 0) + return log_oom(); env_use = strv_env_merge(2, envp, arg_setenv); if (!env_use) @@ -2725,6 +2821,37 @@ static int inner_child( return log_error_errno(r, "execv() failed: %m"); } +static int setup_sd_notify_child(void) { + static const int one = 1; + int fd = -1; + union sockaddr_union sa = { + .sa.sa_family = AF_UNIX, + }; + int r; + + fd = socket(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0); + if (fd < 0) + return log_error_errno(errno, "Failed to allocate notification socket: %m"); + + (void) mkdir_parents(NSPAWN_NOTIFY_SOCKET_PATH, 0755); + (void) unlink(NSPAWN_NOTIFY_SOCKET_PATH); + + strncpy(sa.un.sun_path, NSPAWN_NOTIFY_SOCKET_PATH, sizeof(sa.un.sun_path)-1); + r = bind(fd, &sa.sa, SOCKADDR_UN_LEN(sa.un)); + if (r < 0) { + safe_close(fd); + return log_error_errno(errno, "bind(%s) failed: %m", sa.un.sun_path); + } + + r = setsockopt(fd, SOL_SOCKET, SO_PASSCRED, &one, sizeof(one)); + if (r < 0) { + safe_close(fd); + return log_error_errno(errno, "SO_PASSCRED failed: %m"); + } + + return fd; +} + static int outer_child( Barrier *barrier, const char *directory, @@ -2732,10 +2859,12 @@ static int outer_child( const char *root_device, bool root_device_rw, const char *home_device, bool home_device_rw, const char *srv_device, bool srv_device_rw, + const char *esp_device, bool interactive, bool secondary, int pid_socket, int uuid_socket, + int notify_socket, int kmsg_socket, int rtnl_socket, int uid_shift_socket, @@ -2744,12 +2873,14 @@ static int outer_child( pid_t pid; ssize_t l; int r; + _cleanup_close_ int fd = -1; assert(barrier); assert(directory); assert(console); assert(pid_socket >= 0); assert(uuid_socket >= 0); + assert(notify_socket >= 0); assert(kmsg_socket >= 0); cg_unified_flush(); @@ -2790,7 +2921,8 @@ static int outer_child( r = mount_devices(directory, root_device, root_device_rw, home_device, home_device_rw, - srv_device, srv_device_rw); + srv_device, srv_device_rw, + esp_device); if (r < 0) return r; @@ -2817,7 +2949,7 @@ static int outer_child( if (l < 0) return log_error_errno(errno, "Failed to recv UID shift: %m"); if (l != sizeof(arg_uid_shift)) { - log_error("Short read while recieving UID shift."); + log_error("Short read while receiving UID shift."); return -EIO; } } @@ -2922,20 +3054,27 @@ static int outer_child( if (r < 0) return r; - r = mount_cgroups( - directory, - arg_unified_cgroup_hierarchy, - arg_userns_mode != USER_NAMESPACE_NO, - arg_uid_shift, - arg_uid_range, - arg_selinux_apifs_context); - if (r < 0) - return r; + if (!arg_use_cgns || !cg_ns_supported()) { + r = mount_cgroups( + directory, + arg_unified_cgroup_hierarchy, + arg_userns_mode != USER_NAMESPACE_NO, + arg_uid_shift, + arg_uid_range, + arg_selinux_apifs_context, + arg_use_cgns); + if (r < 0) + return r; + } r = mount_move_root(directory); if (r < 0) return log_error_errno(r, "Failed to move root directory: %m"); + fd = setup_sd_notify_child(); + if (fd < 0) + return fd; + pid = raw_clone(SIGCHLD|CLONE_NEWNS| (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS) | (arg_private_network ? CLONE_NEWNET : 0) | @@ -2945,6 +3084,7 @@ static int outer_child( if (pid == 0) { pid_socket = safe_close(pid_socket); uuid_socket = safe_close(uuid_socket); + notify_socket = safe_close(notify_socket); uid_shift_socket = safe_close(uid_shift_socket); /* The inner child has all namespaces that are @@ -2974,8 +3114,13 @@ static int outer_child( return -EIO; } + l = send_one_fd(notify_socket, fd, 0); + if (l < 0) + return log_error_errno(errno, "Failed to send notify fd: %m"); + pid_socket = safe_close(pid_socket); uuid_socket = safe_close(uuid_socket); + notify_socket = safe_close(notify_socket); kmsg_socket = safe_close(kmsg_socket); rtnl_socket = safe_close(rtnl_socket); @@ -3058,6 +3203,95 @@ static int setup_uid_map(pid_t pid) { return 0; } +static int nspawn_dispatch_notify_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata) { + char buf[NOTIFY_BUFFER_MAX+1]; + char *p = NULL; + struct iovec iovec = { + .iov_base = buf, + .iov_len = sizeof(buf)-1, + }; + union { + struct cmsghdr cmsghdr; + uint8_t buf[CMSG_SPACE(sizeof(struct ucred)) + + CMSG_SPACE(sizeof(int) * NOTIFY_FD_MAX)]; + } control = {}; + struct msghdr msghdr = { + .msg_iov = &iovec, + .msg_iovlen = 1, + .msg_control = &control, + .msg_controllen = sizeof(control), + }; + struct cmsghdr *cmsg; + struct ucred *ucred = NULL; + ssize_t n; + pid_t inner_child_pid; + _cleanup_strv_free_ char **tags = NULL; + + assert(userdata); + + inner_child_pid = PTR_TO_PID(userdata); + + if (revents != EPOLLIN) { + log_warning("Got unexpected poll event for notify fd."); + return 0; + } + + n = recvmsg(fd, &msghdr, MSG_DONTWAIT|MSG_CMSG_CLOEXEC); + if (n < 0) { + if (errno == EAGAIN || errno == EINTR) + return 0; + + return log_warning_errno(errno, "Couldn't read notification socket: %m"); + } + cmsg_close_all(&msghdr); + + CMSG_FOREACH(cmsg, &msghdr) { + if (cmsg->cmsg_level == SOL_SOCKET && + cmsg->cmsg_type == SCM_CREDENTIALS && + cmsg->cmsg_len == CMSG_LEN(sizeof(struct ucred))) { + + ucred = (struct ucred*) CMSG_DATA(cmsg); + } + } + + if (!ucred || ucred->pid != inner_child_pid) { + log_warning("Received notify message without valid credentials. Ignoring."); + return 0; + } + + if ((size_t) n >= sizeof(buf)) { + log_warning("Received notify message exceeded maximum size. Ignoring."); + return 0; + } + + buf[n] = 0; + tags = strv_split(buf, "\n\r"); + if (!tags) + return log_oom(); + + if (strv_find(tags, "READY=1")) + sd_notifyf(false, "READY=1\n"); + + p = strv_find_startswith(tags, "STATUS="); + if (p) + sd_notifyf(false, "STATUS=Container running: %s", p); + + return 0; +} + +static int setup_sd_notify_parent(sd_event *event, int fd, pid_t *inner_child_pid) { + int r; + sd_event_source *notify_event_source; + + r = sd_event_add_io(event, ¬ify_event_source, fd, EPOLLIN, nspawn_dispatch_notify_fd, inner_child_pid); + if (r < 0) + return log_error_errno(r, "Failed to allocate notify event source: %m"); + + (void) sd_event_source_set_description(notify_event_source, "nspawn-notify"); + + return 0; +} + static int load_settings(void) { _cleanup_(settings_freep) Settings *settings = NULL; _cleanup_fclose_ FILE *f = NULL; @@ -3286,12 +3520,15 @@ static int load_settings(void) { } } + if ((arg_settings_mask & SETTING_NOTIFY_READY) == 0) + arg_notify_ready = settings->notify_ready; + return 0; } int main(int argc, char *argv[]) { - _cleanup_free_ char *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL, *console = NULL; + _cleanup_free_ char *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL, *esp_device = NULL, *console = NULL; bool root_device_rw = true, home_device_rw = true, srv_device_rw = true; _cleanup_close_ int master = -1, image_fd = -1; _cleanup_fdset_free_ FDSet *fds = NULL; @@ -3400,7 +3637,7 @@ int main(int argc, char *argv[]) { } if (r < 0) { log_error_errno(r, "Failed to lock %s: %m", arg_directory); - return r; + goto finish; } if (arg_template) { @@ -3473,6 +3710,7 @@ int main(int argc, char *argv[]) { &root_device, &root_device_rw, &home_device, &home_device_rw, &srv_device, &srv_device_rw, + &esp_device, &secondary); if (r < 0) goto finish; @@ -3536,7 +3774,9 @@ int main(int argc, char *argv[]) { rtnl_socket_pair[2] = { -1, -1 }, pid_socket_pair[2] = { -1, -1 }, uuid_socket_pair[2] = { -1, -1 }, + notify_socket_pair[2] = { -1, -1 }, uid_shift_socket_pair[2] = { -1, -1 }; + _cleanup_close_ int notify_socket= -1; _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL; _cleanup_(sd_event_unrefp) sd_event *event = NULL; _cleanup_(pty_forward_freep) PTYForward *forward = NULL; @@ -3587,6 +3827,11 @@ int main(int argc, char *argv[]) { goto finish; } + if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, notify_socket_pair) < 0) { + r = log_error_errno(errno, "Failed to create notify socket pair: %m"); + goto finish; + } + if (arg_userns_mode != USER_NAMESPACE_NO) if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, uid_shift_socket_pair) < 0) { r = log_error_errno(errno, "Failed to create uid shift socket pair: %m"); @@ -3628,6 +3873,7 @@ int main(int argc, char *argv[]) { rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]); pid_socket_pair[0] = safe_close(pid_socket_pair[0]); uuid_socket_pair[0] = safe_close(uuid_socket_pair[0]); + notify_socket_pair[0] = safe_close(notify_socket_pair[0]); uid_shift_socket_pair[0] = safe_close(uid_shift_socket_pair[0]); (void) reset_all_signal_handlers(); @@ -3639,10 +3885,12 @@ int main(int argc, char *argv[]) { root_device, root_device_rw, home_device, home_device_rw, srv_device, srv_device_rw, + esp_device, interactive, secondary, pid_socket_pair[1], uuid_socket_pair[1], + notify_socket_pair[1], kmsg_socket_pair[1], rtnl_socket_pair[1], uid_shift_socket_pair[1], @@ -3661,6 +3909,7 @@ int main(int argc, char *argv[]) { rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]); pid_socket_pair[1] = safe_close(pid_socket_pair[1]); uuid_socket_pair[1] = safe_close(uuid_socket_pair[1]); + notify_socket_pair[1] = safe_close(notify_socket_pair[1]); uid_shift_socket_pair[1] = safe_close(uid_shift_socket_pair[1]); if (arg_userns_mode != USER_NAMESPACE_NO) { @@ -3734,6 +3983,13 @@ int main(int argc, char *argv[]) { goto finish; } + /* We also retrieve the socket used for notifications generated by outer child */ + notify_socket = receive_one_fd(notify_socket_pair[0], 0); + if (notify_socket < 0) { + r = log_error_errno(errno, "Failed to receive notification socket from the outer child: %m"); + goto finish; + } + log_debug("Init process invoked as PID " PID_FMT, pid); if (arg_userns_mode != USER_NAMESPACE_NO) { @@ -3848,6 +4104,16 @@ int main(int argc, char *argv[]) { goto finish; } + r = sd_event_new(&event); + if (r < 0) { + log_error_errno(r, "Failed to get default event source: %m"); + goto finish; + } + + r = setup_sd_notify_parent(event, notify_socket, PID_TO_PTR(pid)); + if (r < 0) + goto finish; + /* Let the child know that we are ready and wait that the child is completely ready now. */ if (!barrier_place_and_sync(&barrier)) { /* #4 */ log_error("Child died too early."); @@ -3860,15 +4126,10 @@ int main(int argc, char *argv[]) { etc_passwd_lock = safe_close(etc_passwd_lock); sd_notifyf(false, - "READY=1\n" "STATUS=Container running.\n" "X_NSPAWN_LEADER_PID=" PID_FMT, pid); - - r = sd_event_new(&event); - if (r < 0) { - log_error_errno(r, "Failed to get default event source: %m"); - goto finish; - } + if (!arg_notify_ready) + sd_notify(false, "READY=1\n"); if (arg_kill_signal > 0) { /* Try to kill the init system on SIGINT or SIGTERM */ |