diff options
Diffstat (limited to 'src/nspawn/nspawn.c')
-rw-r--r-- | src/nspawn/nspawn.c | 1339 |
1 files changed, 734 insertions, 605 deletions
diff --git a/src/nspawn/nspawn.c b/src/nspawn/nspawn.c index b1c012a9e4..dea54c70b4 100644 --- a/src/nspawn/nspawn.c +++ b/src/nspawn/nspawn.c @@ -169,7 +169,6 @@ static CustomMount *arg_custom_mounts = NULL; static unsigned arg_n_custom_mounts = 0; static char **arg_setenv = NULL; static bool arg_quiet = false; -static bool arg_share_system = false; static bool arg_register = true; static bool arg_keep_unit = false; static char **arg_network_interfaces = NULL; @@ -188,12 +187,14 @@ static UserNamespaceMode arg_userns_mode = USER_NAMESPACE_NO; static uid_t arg_uid_shift = UID_INVALID, arg_uid_range = 0x10000U; static bool arg_userns_chown = false; static int arg_kill_signal = 0; -static bool arg_unified_cgroup_hierarchy = false; +static CGroupUnified arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_UNKNOWN; static SettingsMask arg_settings_mask = 0; static int arg_settings_trusted = -1; static char **arg_parameters = NULL; static const char *arg_container_service_name = "systemd-nspawn"; static bool arg_notify_ready = false; +static bool arg_use_cgns = true; +static unsigned long arg_clone_ns_flags = CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS; static void help(void) { printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n" @@ -215,10 +216,10 @@ static void help(void) { " --uuid=UUID Set a specific machine UUID for the container\n" " -S --slice=SLICE Place the container in the specified slice\n" " --property=NAME=VALUE Set scope unit property\n" - " -U --private-users=pick Run within user namespace, pick UID/GID range automatically\n" + " -U --private-users=pick Run within user namespace, autoselect UID/GID range\n" " --private-users[=UIDBASE[:NUIDS]]\n" - " Run within user namespace, user configured UID/GID range\n" - " --private-user-chown Adjust OS tree file ownership for private UID/GID range\n" + " Similar, but with user configured UID/GID range\n" + " --private-users-chown Adjust OS tree ownership to private UID/GID range\n" " --private-network Disable network in container\n" " --network-interface=INTERFACE\n" " Assign an existing network interface to the\n" @@ -235,11 +236,10 @@ static void help(void) { " Add an additional virtual Ethernet link between\n" " host and container\n" " --network-bridge=INTERFACE\n" - " Add a virtual Ethernet connection between host\n" - " and container and add it to an existing bridge on\n" - " the host\n" - " --network-zone=NAME Add a virtual Ethernet connection to the container,\n" - " and add it to an automatically managed bridge interface\n" + " Add a virtual Ethernet connection to the container\n" + " and attach it to an existing bridge on the host\n" + " --network-zone=NAME Similar, but attach the new interface to an\n" + " an automatically managed bridge interface\n" " -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n" " Expose a container IP port on the host\n" " -Z --selinux-context=SECLABEL\n" @@ -268,14 +268,12 @@ static void help(void) { " --overlay-ro=PATH[:PATH...]:PATH\n" " Similar, but creates a read-only overlay mount\n" " -E --setenv=NAME=VALUE Pass an environment variable to PID 1\n" - " --share-system Share system namespaces with host\n" " --register=BOOLEAN Register container as machine\n" " --keep-unit Do not register a scope for the machine, reuse\n" " the service unit nspawn is running in\n" " --volatile[=MODE] Run the system in volatile mode\n" " --settings=BOOLEAN Load additional settings from .nspawn file\n" - " --notify-ready=BOOLEAN Receive notifications from the container's init process,\n" - " accepted values: yes and no\n" + " --notify-ready=BOOLEAN Receive notifications from the child init process\n" , program_invocation_short_name); } @@ -318,9 +316,9 @@ static int custom_mounts_prepare(void) { return 0; } -static int detect_unified_cgroup_hierarchy(void) { +static int detect_unified_cgroup_hierarchy(const char *directory) { const char *e; - int r; + int r, all_unified, systemd_unified; /* Allow the user to control whether the unified hierarchy is used */ e = getenv("UNIFIED_CGROUP_HIERARCHY"); @@ -328,20 +326,58 @@ static int detect_unified_cgroup_hierarchy(void) { r = parse_boolean(e); if (r < 0) return log_error_errno(r, "Failed to parse $UNIFIED_CGROUP_HIERARCHY."); + if (r > 0) + arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_ALL; + else + arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE; - arg_unified_cgroup_hierarchy = r; return 0; } + all_unified = cg_all_unified(); + systemd_unified = cg_unified(SYSTEMD_CGROUP_CONTROLLER); + + if (all_unified < 0 || systemd_unified < 0) + return log_error_errno(all_unified < 0 ? all_unified : systemd_unified, + "Failed to determine whether the unified cgroups hierarchy is used: %m"); + /* Otherwise inherit the default from the host system */ - r = cg_unified(); - if (r < 0) - return log_error_errno(r, "Failed to determine whether the unified cgroups hierarchy is used: %m"); + if (all_unified > 0) { + /* Unified cgroup hierarchy support was added in 230. Unfortunately the detection + * routine only detects 231, so we'll have a false negative here for 230. */ + r = systemd_installation_has_version(directory, 230); + if (r < 0) + return log_error_errno(r, "Failed to determine systemd version in container: %m"); + if (r > 0) + arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_ALL; + else + arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE; + } else if (systemd_unified > 0) { + /* Mixed cgroup hierarchy support was added in 232 */ + r = systemd_installation_has_version(directory, 232); + if (r < 0) + return log_error_errno(r, "Failed to determine systemd version in container: %m"); + if (r > 0) + arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_SYSTEMD; + else + arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE; + } else + arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE; - arg_unified_cgroup_hierarchy = r; return 0; } +static void parse_share_ns_env(const char *name, unsigned long ns_flag) { + int r; + + r = getenv_bool(name); + if (r == -ENXIO) + return; + if (r < 0) + log_warning_errno(r, "Failed to parse %s from environment, defaulting to false.", name); + arg_clone_ns_flags = (arg_clone_ns_flags & ~ns_flag) | (r > 0 ? 0 : ns_flag); +} + static int parse_argv(int argc, char *argv[]) { enum { @@ -379,52 +415,52 @@ static int parse_argv(int argc, char *argv[]) { }; static const struct option options[] = { - { "help", no_argument, NULL, 'h' }, - { "version", no_argument, NULL, ARG_VERSION }, - { "directory", required_argument, NULL, 'D' }, - { "template", required_argument, NULL, ARG_TEMPLATE }, - { "ephemeral", no_argument, NULL, 'x' }, - { "user", required_argument, NULL, 'u' }, - { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK }, - { "as-pid2", no_argument, NULL, 'a' }, - { "boot", no_argument, NULL, 'b' }, - { "uuid", required_argument, NULL, ARG_UUID }, - { "read-only", no_argument, NULL, ARG_READ_ONLY }, - { "capability", required_argument, NULL, ARG_CAPABILITY }, - { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY }, - { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL }, - { "bind", required_argument, NULL, ARG_BIND }, - { "bind-ro", required_argument, NULL, ARG_BIND_RO }, - { "tmpfs", required_argument, NULL, ARG_TMPFS }, - { "overlay", required_argument, NULL, ARG_OVERLAY }, - { "overlay-ro", required_argument, NULL, ARG_OVERLAY_RO }, - { "machine", required_argument, NULL, 'M' }, - { "slice", required_argument, NULL, 'S' }, - { "setenv", required_argument, NULL, 'E' }, - { "selinux-context", required_argument, NULL, 'Z' }, - { "selinux-apifs-context", required_argument, NULL, 'L' }, - { "quiet", no_argument, NULL, 'q' }, - { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM }, - { "register", required_argument, NULL, ARG_REGISTER }, - { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT }, - { "network-interface", required_argument, NULL, ARG_NETWORK_INTERFACE }, - { "network-macvlan", required_argument, NULL, ARG_NETWORK_MACVLAN }, - { "network-ipvlan", required_argument, NULL, ARG_NETWORK_IPVLAN }, - { "network-veth", no_argument, NULL, 'n' }, - { "network-veth-extra", required_argument, NULL, ARG_NETWORK_VETH_EXTRA}, - { "network-bridge", required_argument, NULL, ARG_NETWORK_BRIDGE }, - { "network-zone", required_argument, NULL, ARG_NETWORK_ZONE }, - { "personality", required_argument, NULL, ARG_PERSONALITY }, - { "image", required_argument, NULL, 'i' }, - { "volatile", optional_argument, NULL, ARG_VOLATILE }, - { "port", required_argument, NULL, 'p' }, - { "property", required_argument, NULL, ARG_PROPERTY }, - { "private-users", optional_argument, NULL, ARG_PRIVATE_USERS }, - { "private-users-chown", optional_argument, NULL, ARG_PRIVATE_USERS_CHOWN}, - { "kill-signal", required_argument, NULL, ARG_KILL_SIGNAL }, - { "settings", required_argument, NULL, ARG_SETTINGS }, - { "chdir", required_argument, NULL, ARG_CHDIR }, - { "notify-ready", required_argument, NULL, ARG_NOTIFY_READY }, + { "help", no_argument, NULL, 'h' }, + { "version", no_argument, NULL, ARG_VERSION }, + { "directory", required_argument, NULL, 'D' }, + { "template", required_argument, NULL, ARG_TEMPLATE }, + { "ephemeral", no_argument, NULL, 'x' }, + { "user", required_argument, NULL, 'u' }, + { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK }, + { "as-pid2", no_argument, NULL, 'a' }, + { "boot", no_argument, NULL, 'b' }, + { "uuid", required_argument, NULL, ARG_UUID }, + { "read-only", no_argument, NULL, ARG_READ_ONLY }, + { "capability", required_argument, NULL, ARG_CAPABILITY }, + { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY }, + { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL }, + { "bind", required_argument, NULL, ARG_BIND }, + { "bind-ro", required_argument, NULL, ARG_BIND_RO }, + { "tmpfs", required_argument, NULL, ARG_TMPFS }, + { "overlay", required_argument, NULL, ARG_OVERLAY }, + { "overlay-ro", required_argument, NULL, ARG_OVERLAY_RO }, + { "machine", required_argument, NULL, 'M' }, + { "slice", required_argument, NULL, 'S' }, + { "setenv", required_argument, NULL, 'E' }, + { "selinux-context", required_argument, NULL, 'Z' }, + { "selinux-apifs-context", required_argument, NULL, 'L' }, + { "quiet", no_argument, NULL, 'q' }, + { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM }, /* not documented */ + { "register", required_argument, NULL, ARG_REGISTER }, + { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT }, + { "network-interface", required_argument, NULL, ARG_NETWORK_INTERFACE }, + { "network-macvlan", required_argument, NULL, ARG_NETWORK_MACVLAN }, + { "network-ipvlan", required_argument, NULL, ARG_NETWORK_IPVLAN }, + { "network-veth", no_argument, NULL, 'n' }, + { "network-veth-extra", required_argument, NULL, ARG_NETWORK_VETH_EXTRA }, + { "network-bridge", required_argument, NULL, ARG_NETWORK_BRIDGE }, + { "network-zone", required_argument, NULL, ARG_NETWORK_ZONE }, + { "personality", required_argument, NULL, ARG_PERSONALITY }, + { "image", required_argument, NULL, 'i' }, + { "volatile", optional_argument, NULL, ARG_VOLATILE }, + { "port", required_argument, NULL, 'p' }, + { "property", required_argument, NULL, ARG_PROPERTY }, + { "private-users", optional_argument, NULL, ARG_PRIVATE_USERS }, + { "private-users-chown", optional_argument, NULL, ARG_PRIVATE_USERS_CHOWN }, + { "kill-signal", required_argument, NULL, ARG_KILL_SIGNAL }, + { "settings", required_argument, NULL, ARG_SETTINGS }, + { "chdir", required_argument, NULL, ARG_CHDIR }, + { "notify-ready", required_argument, NULL, ARG_NOTIFY_READY }, {} }; @@ -813,7 +849,9 @@ static int parse_argv(int argc, char *argv[]) { break; case ARG_SHARE_SYSTEM: - arg_share_system = true; + /* We don't officially support this anymore, except for compat reasons. People should use the + * $SYSTEMD_NSPAWN_SHARE_* environment variables instead. */ + arg_clone_ns_flags = 0; break; case ARG_REGISTER: @@ -875,15 +913,21 @@ static int parse_argv(int argc, char *argv[]) { break; - case ARG_PRIVATE_USERS: + case ARG_PRIVATE_USERS: { + int boolean = -1; - r = optarg ? parse_boolean(optarg) : 1; - if (r == 0) { + if (!optarg) + boolean = true; + else if (!in_charset(optarg, DIGITS)) + /* do *not* parse numbers as booleans */ + boolean = parse_boolean(optarg); + + if (boolean == false) { /* no: User namespacing off */ arg_userns_mode = USER_NAMESPACE_NO; arg_uid_shift = UID_INVALID; arg_uid_range = UINT32_C(0x10000); - } else if (r > 0) { + } else if (boolean == true) { /* yes: User namespacing on, UID range is read from root dir */ arg_userns_mode = USER_NAMESPACE_FIXED; arg_uid_shift = UID_INVALID; @@ -907,23 +951,27 @@ static int parse_argv(int argc, char *argv[]) { shift = buffer; range++; - if (safe_atou32(range, &arg_uid_range) < 0 || arg_uid_range <= 0) { - log_error("Failed to parse UID range: %s", range); - return -EINVAL; - } + r = safe_atou32(range, &arg_uid_range); + if (r < 0) + return log_error_errno(r, "Failed to parse UID range \"%s\": %m", range); } else shift = optarg; - if (parse_uid(shift, &arg_uid_shift) < 0) { - log_error("Failed to parse UID: %s", optarg); - return -EINVAL; - } + r = parse_uid(shift, &arg_uid_shift); + if (r < 0) + return log_error_errno(r, "Failed to parse UID \"%s\": %m", optarg); arg_userns_mode = USER_NAMESPACE_FIXED; } + if (arg_uid_range <= 0) { + log_error("UID range cannot be 0."); + return -EINVAL; + } + arg_settings_mask |= SETTING_USERNS; break; + } case 'U': if (userns_supported()) { @@ -1017,17 +1065,23 @@ static int parse_argv(int argc, char *argv[]) { assert_not_reached("Unhandled option"); } - if (arg_share_system) + parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_IPC", CLONE_NEWIPC); + parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_PID", CLONE_NEWPID); + parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_UTS", CLONE_NEWUTS); + parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_SYSTEM", CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS); + + if (!(arg_clone_ns_flags & CLONE_NEWPID) || + !(arg_clone_ns_flags & CLONE_NEWUTS)) { arg_register = false; + if (arg_start_mode != START_PID1) { + log_error("--boot cannot be used without namespacing."); + return -EINVAL; + } + } if (arg_userns_mode == USER_NAMESPACE_PICK) arg_userns_chown = true; - if (arg_start_mode != START_PID1 && arg_share_system) { - log_error("--boot and --share-system may not be combined."); - return -EINVAL; - } - if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) { log_error("--keep-unit may not be used when invoked from a user session."); return -EINVAL; @@ -1096,14 +1150,16 @@ static int parse_argv(int argc, char *argv[]) { arg_caps_retain = (arg_caps_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus; - r = detect_unified_cgroup_hierarchy(); - if (r < 0) - return r; - e = getenv("SYSTEMD_NSPAWN_CONTAINER_SERVICE"); if (e) arg_container_service_name = e; + r = getenv_bool("SYSTEMD_NSPAWN_USE_CGNS"); + if (r < 0) + arg_use_cgns = cg_ns_supported(); + else + arg_use_cgns = r; + return 1; } @@ -1185,7 +1241,13 @@ static int setup_timezone(const char *dest) { /* Fix the timezone, if possible */ r = readlink_malloc("/etc/localtime", &p); if (r < 0) { - log_warning("/etc/localtime is not a symlink, not updating container timezone."); + log_warning("host's /etc/localtime is not a symlink, not updating container timezone."); + /* to handle warning, delete /etc/localtime and replace it + * with a symbolic link to a time zone data file. + * + * Example: + * ln -s /usr/share/zoneinfo/UTC /etc/localtime + */ return 0; } @@ -1274,9 +1336,6 @@ static int setup_boot_id(const char *dest) { const char *from, *to; int r; - if (arg_share_system) - return 0; - /* Generate a new randomized boot ID, so that each boot-up of * the container gets a new one */ @@ -1291,10 +1350,10 @@ static int setup_boot_id(const char *dest) { if (r < 0) return log_error_errno(r, "Failed to write boot id: %m"); - if (mount(from, to, NULL, MS_BIND, NULL) < 0) - r = log_error_errno(errno, "Failed to bind mount boot id: %m"); - else if (mount(NULL, to, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL) < 0) - log_warning_errno(errno, "Failed to make boot id read-only, ignoring: %m"); + r = mount_verbose(LOG_ERR, from, to, NULL, MS_BIND, NULL); + if (r >= 0) + r = mount_verbose(LOG_ERR, NULL, to, NULL, + MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL); (void) unlink(from); return r; @@ -1342,6 +1401,12 @@ static int copy_devnodes(const char *dest) { } else { if (mknod(to, st.st_mode, st.st_rdev) < 0) { + /* + * This is some sort of protection too against + * recursive userns chown on shared /dev/ + */ + if (errno == EEXIST) + log_notice("%s/dev/ should be an empty directory", dest); if (errno != EPERM) return log_error_errno(errno, "mknod(%s) failed: %m", to); @@ -1350,8 +1415,9 @@ static int copy_devnodes(const char *dest) { r = touch(to); if (r < 0) return log_error_errno(r, "touch (%s) failed: %m", to); - if (mount(from, to, NULL, MS_BIND, NULL) < 0) - return log_error_errno(errno, "Both mknod and bind mount (%s) failed: %m", to); + r = mount_verbose(LOG_DEBUG, from, to, NULL, MS_BIND, NULL); + if (r < 0) + return log_error_errno(r, "Both mknod and bind mount (%s) failed: %m", to); } r = userns_lchown(to, 0, 0); @@ -1387,8 +1453,9 @@ static int setup_pts(const char *dest) { p = prefix_roota(dest, "/dev/pts"); if (mkdir(p, 0755) < 0) return log_error_errno(errno, "Failed to create /dev/pts: %m"); - if (mount("devpts", p, "devpts", MS_NOSUID|MS_NOEXEC, options) < 0) - return log_error_errno(errno, "Failed to mount /dev/pts: %m"); + r = mount_verbose(LOG_ERR, "devpts", p, "devpts", MS_NOSUID|MS_NOEXEC, options); + if (r < 0) + return r; r = userns_lchown(p, 0, 0); if (r < 0) return log_error_errno(r, "Failed to chown /dev/pts: %m"); @@ -1433,10 +1500,7 @@ static int setup_dev_console(const char *dest, const char *console) { if (r < 0) return log_error_errno(r, "touch() for /dev/console failed: %m"); - if (mount(console, to, NULL, MS_BIND, NULL) < 0) - return log_error_errno(errno, "Bind mount for /dev/console failed: %m"); - - return 0; + return mount_verbose(LOG_ERR, console, to, NULL, MS_BIND, NULL); } static int setup_kmsg(const char *dest, int kmsg_socket) { @@ -1460,8 +1524,9 @@ static int setup_kmsg(const char *dest, int kmsg_socket) { if (mkfifo(from, 0600) < 0) return log_error_errno(errno, "mkfifo() for /run/kmsg failed: %m"); - if (mount(from, to, NULL, MS_BIND, NULL) < 0) - return log_error_errno(errno, "Bind mount for /proc/kmsg failed: %m"); + r = mount_verbose(LOG_ERR, from, to, NULL, MS_BIND, NULL); + if (r < 0) + return r; fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC); if (fd < 0) @@ -1494,7 +1559,7 @@ static int on_address_change(sd_netlink *rtnl, sd_netlink_message *m, void *user static int setup_hostname(void) { - if (arg_share_system) + if ((arg_clone_ns_flags & CLONE_NEWUTS) == 0) return 0; if (sethostname_idempotent(arg_machine) < 0) @@ -1631,7 +1696,8 @@ static int setup_journal(const char *directory) { if (r < 0) return log_error_errno(r, "Failed to create %s: %m", q); - if (mount(p, q, NULL, MS_BIND, NULL) < 0) + r = mount_verbose(LOG_DEBUG, p, q, NULL, MS_BIND, NULL); + if (r < 0) return log_error_errno(errno, "Failed to bind mount journal from host into guest: %m"); return 0; @@ -1645,7 +1711,7 @@ static int reset_audit_loginuid(void) { _cleanup_free_ char *p = NULL; int r; - if (arg_share_system) + if ((arg_clone_ns_flags & CLONE_NEWPID) == 0) return 0; r = read_one_line_file("/proc/self/loginuid", &p); @@ -1696,13 +1762,17 @@ static int setup_propagate(const char *root) { return log_error_errno(r, "Failed to create /run/systemd/nspawn/incoming: %m"); q = prefix_roota(root, "/run/systemd/nspawn/incoming"); - if (mount(p, q, NULL, MS_BIND, NULL) < 0) - return log_error_errno(errno, "Failed to install propagation bind mount."); + r = mount_verbose(LOG_ERR, p, q, NULL, MS_BIND, NULL); + if (r < 0) + return r; - if (mount(NULL, q, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL) < 0) - return log_error_errno(errno, "Failed to make propagation mount read-only"); + r = mount_verbose(LOG_ERR, NULL, q, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL); + if (r < 0) + return r; - return 0; + /* machined will MS_MOVE into that directory, and that's only + * supported for non-shared mounts. */ + return mount_verbose(LOG_ERR, NULL, q, NULL, MS_SLAVE, NULL); } static int setup_image(char **device_path, int *loop_nr) { @@ -1794,17 +1864,18 @@ static int dissect_image( char **root_device, bool *root_device_rw, char **home_device, bool *home_device_rw, char **srv_device, bool *srv_device_rw, + char **esp_device, bool *secondary) { #ifdef HAVE_BLKID - int home_nr = -1, srv_nr = -1; + int home_nr = -1, srv_nr = -1, esp_nr = -1; #ifdef GPT_ROOT_NATIVE int root_nr = -1; #endif #ifdef GPT_ROOT_SECONDARY int secondary_root_nr = -1; #endif - _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL, *generic = NULL; + _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL, *esp = NULL, *generic = NULL; _cleanup_udev_enumerate_unref_ struct udev_enumerate *e = NULL; _cleanup_udev_device_unref_ struct udev_device *d = NULL; _cleanup_blkid_free_probe_ blkid_probe b = NULL; @@ -1822,6 +1893,7 @@ static int dissect_image( assert(root_device); assert(home_device); assert(srv_device); + assert(esp_device); assert(secondary); assert(arg_image); @@ -2035,6 +2107,16 @@ static int dissect_image( r = free_and_strdup(&srv, node); if (r < 0) return log_oom(); + } else if (sd_id128_equal(type_id, GPT_ESP)) { + + if (esp && nr >= esp_nr) + continue; + + esp_nr = nr; + + r = free_and_strdup(&esp, node); + if (r < 0) + return log_oom(); } #ifdef GPT_ROOT_NATIVE else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE)) { @@ -2152,6 +2234,11 @@ static int dissect_image( *srv_device_rw = srv_rw; } + if (esp) { + *esp_device = esp; + esp = NULL; + } + return 0; #else log_error("--image= is not supported, compiled without blkid support."); @@ -2162,7 +2249,7 @@ static int dissect_image( static int mount_device(const char *what, const char *where, const char *directory, bool rw) { #ifdef HAVE_BLKID _cleanup_blkid_free_probe_ blkid_probe b = NULL; - const char *fstype, *p; + const char *fstype, *p, *options; int r; assert(what); @@ -2211,10 +2298,17 @@ static int mount_device(const char *what, const char *where, const char *directo return -EOPNOTSUPP; } - if (mount(what, p, fstype, MS_NODEV|(rw ? 0 : MS_RDONLY), NULL) < 0) - return log_error_errno(errno, "Failed to mount %s: %m", what); + /* If this is a loopback device then let's mount the image with discard, so that the underlying file remains + * sparse when possible. */ + if (STR_IN_SET(fstype, "btrfs", "ext4", "vfat", "xfs")) { + const char *l; - return 0; + l = path_startswith(what, "/dev"); + if (l && startswith(l, "loop")) + options = "discard"; + } + + return mount_verbose(LOG_ERR, what, p, fstype, MS_NODEV|(rw ? 0 : MS_RDONLY), options); #else log_error("--image= is not supported, compiled without blkid support."); return -EOPNOTSUPP; @@ -2284,7 +2378,8 @@ static int mount_devices( const char *where, const char *root_device, bool root_device_rw, const char *home_device, bool home_device_rw, - const char *srv_device, bool srv_device_rw) { + const char *srv_device, bool srv_device_rw, + const char *esp_device) { int r; assert(where); @@ -2307,6 +2402,27 @@ static int mount_devices( return log_error_errno(r, "Failed to mount server data directory: %m"); } + if (esp_device) { + const char *mp, *x; + + /* Mount the ESP to /efi if it exists and is empty. If it doesn't exist, use /boot instead. */ + + mp = "/efi"; + x = strjoina(arg_directory, mp); + r = dir_is_empty(x); + if (r == -ENOENT) { + mp = "/boot"; + x = strjoina(arg_directory, mp); + r = dir_is_empty(x); + } + + if (r > 0) { + r = mount_device(esp_device, arg_directory, mp, true); + if (r < 0) + return log_error_errno(r, "Failed to mount ESP: %m"); + } + } + return 0; } @@ -2567,6 +2683,10 @@ static int inner_child( } } + r = reset_uid_gid(); + if (r < 0) + return log_error_errno(r, "Couldn't become new root: %m"); + r = mount_all(NULL, arg_userns_mode != USER_NAMESPACE_NO, true, @@ -2589,13 +2709,25 @@ static int inner_child( return -ESRCH; } - r = mount_systemd_cgroup_writable("", arg_unified_cgroup_hierarchy); - if (r < 0) - return r; - - r = reset_uid_gid(); - if (r < 0) - return log_error_errno(r, "Couldn't become new root: %m"); + if (arg_use_cgns && cg_ns_supported()) { + r = unshare(CLONE_NEWCGROUP); + if (r < 0) + return log_error_errno(errno, "Failed to unshare cgroup namespace"); + r = mount_cgroups( + "", + arg_unified_cgroup_hierarchy, + arg_userns_mode != USER_NAMESPACE_NO, + arg_uid_shift, + arg_uid_range, + arg_selinux_apifs_context, + true); + if (r < 0) + return r; + } else { + r = mount_systemd_cgroup_writable("", arg_unified_cgroup_hierarchy); + if (r < 0) + return r; + } r = setup_boot_id(NULL); if (r < 0) @@ -2780,6 +2912,7 @@ static int outer_child( const char *root_device, bool root_device_rw, const char *home_device, bool home_device_rw, const char *srv_device, bool srv_device_rw, + const char *esp_device, bool interactive, bool secondary, int pid_socket, @@ -2835,13 +2968,15 @@ static int outer_child( /* Mark everything as slave, so that we still * receive mounts from the real root, but don't * propagate mounts to the real root. */ - if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) - return log_error_errno(errno, "MS_SLAVE|MS_REC failed: %m"); + r = mount_verbose(LOG_ERR, NULL, "/", NULL, MS_SLAVE|MS_REC, NULL); + if (r < 0) + return r; r = mount_devices(directory, root_device, root_device_rw, home_device, home_device_rw, - srv_device, srv_device_rw); + srv_device, srv_device_rw, + esp_device); if (r < 0) return r; @@ -2849,6 +2984,10 @@ static int outer_child( if (r < 0) return r; + r = detect_unified_cgroup_hierarchy(directory); + if (r < 0) + return r; + if (arg_userns_mode != USER_NAMESPACE_NO) { /* Let the parent know which UID shift we read from the image */ l = send(uid_shift_socket, &arg_uid_shift, sizeof(arg_uid_shift), MSG_NOSIGNAL); @@ -2877,8 +3016,19 @@ static int outer_child( } /* Turn directory into bind mount */ - if (mount(directory, directory, NULL, MS_BIND|MS_REC, NULL) < 0) - return log_error_errno(errno, "Failed to make bind mount: %m"); + r = mount_verbose(LOG_ERR, directory, directory, NULL, MS_BIND|MS_REC, NULL); + if (r < 0) + return r; + + /* Mark everything as shared so our mounts get propagated down. This is + * required to make new bind mounts available in systemd services + * inside the containter that create a new mount namespace. + * See https://github.com/systemd/systemd/issues/3860 + * Further submounts (such as /dev) done after this will inherit the + * shared propagation mode.*/ + r = mount_verbose(LOG_ERR, NULL, directory, NULL, MS_SHARED|MS_REC, NULL); + if (r < 0) + return r; r = recursive_chown(directory, arg_uid_shift, arg_uid_range); if (r < 0) @@ -2909,7 +3059,7 @@ static int outer_child( return r; if (arg_read_only) { - r = bind_remount_recursive(directory, true); + r = bind_remount_recursive(directory, true, NULL); if (r < 0) return log_error_errno(r, "Failed to make tree read-only: %m"); } @@ -2973,15 +3123,18 @@ static int outer_child( if (r < 0) return r; - r = mount_cgroups( - directory, - arg_unified_cgroup_hierarchy, - arg_userns_mode != USER_NAMESPACE_NO, - arg_uid_shift, - arg_uid_range, - arg_selinux_apifs_context); - if (r < 0) - return r; + if (!arg_use_cgns || !cg_ns_supported()) { + r = mount_cgroups( + directory, + arg_unified_cgroup_hierarchy, + arg_userns_mode != USER_NAMESPACE_NO, + arg_uid_shift, + arg_uid_range, + arg_selinux_apifs_context, + false); + if (r < 0) + return r; + } r = mount_move_root(directory); if (r < 0) @@ -2992,7 +3145,7 @@ static int outer_child( return fd; pid = raw_clone(SIGCHLD|CLONE_NEWNS| - (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS) | + arg_clone_ns_flags | (arg_private_network ? CLONE_NEWNET : 0) | (arg_userns_mode != USER_NAMESPACE_NO ? CLONE_NEWUSER : 0)); if (pid < 0) @@ -3442,18 +3595,437 @@ static int load_settings(void) { return 0; } +static int run(int master, + const char* console, + const char *root_device, bool root_device_rw, + const char *home_device, bool home_device_rw, + const char *srv_device, bool srv_device_rw, + const char *esp_device, + bool interactive, + bool secondary, + FDSet *fds, + char veth_name[IFNAMSIZ], bool *veth_created, + union in_addr_union *exposed, + pid_t *pid, int *ret) { + + static const struct sigaction sa = { + .sa_handler = nop_signal_handler, + .sa_flags = SA_NOCLDSTOP, + }; + + _cleanup_release_lock_file_ LockFile uid_shift_lock = LOCK_FILE_INIT; + _cleanup_close_ int etc_passwd_lock = -1; + _cleanup_close_pair_ int + kmsg_socket_pair[2] = { -1, -1 }, + rtnl_socket_pair[2] = { -1, -1 }, + pid_socket_pair[2] = { -1, -1 }, + uuid_socket_pair[2] = { -1, -1 }, + notify_socket_pair[2] = { -1, -1 }, + uid_shift_socket_pair[2] = { -1, -1 }; + _cleanup_close_ int notify_socket= -1; + _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL; + _cleanup_(sd_event_unrefp) sd_event *event = NULL; + _cleanup_(pty_forward_freep) PTYForward *forward = NULL; + _cleanup_(sd_netlink_unrefp) sd_netlink *rtnl = NULL; + ContainerStatus container_status = 0; + char last_char = 0; + int ifi = 0, r; + ssize_t l; + sigset_t mask_chld; + + assert_se(sigemptyset(&mask_chld) == 0); + assert_se(sigaddset(&mask_chld, SIGCHLD) == 0); + + if (arg_userns_mode == USER_NAMESPACE_PICK) { + /* When we shall pick the UID/GID range, let's first lock /etc/passwd, so that we can safely + * check with getpwuid() if the specific user already exists. Note that /etc might be + * read-only, in which case this will fail with EROFS. But that's really OK, as in that case we + * can be reasonably sure that no users are going to be added. Note that getpwuid() checks are + * really just an extra safety net. We kinda assume that the UID range we allocate from is + * really ours. */ + + etc_passwd_lock = take_etc_passwd_lock(NULL); + if (etc_passwd_lock < 0 && etc_passwd_lock != -EROFS) + return log_error_errno(etc_passwd_lock, "Failed to take /etc/passwd lock: %m"); + } + + r = barrier_create(&barrier); + if (r < 0) + return log_error_errno(r, "Cannot initialize IPC barrier: %m"); + + if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) + return log_error_errno(errno, "Failed to create kmsg socket pair: %m"); + + if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, rtnl_socket_pair) < 0) + return log_error_errno(errno, "Failed to create rtnl socket pair: %m"); + + if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, pid_socket_pair) < 0) + return log_error_errno(errno, "Failed to create pid socket pair: %m"); + + if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, uuid_socket_pair) < 0) + return log_error_errno(errno, "Failed to create id socket pair: %m"); + + if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, notify_socket_pair) < 0) + return log_error_errno(errno, "Failed to create notify socket pair: %m"); + + if (arg_userns_mode != USER_NAMESPACE_NO) + if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, uid_shift_socket_pair) < 0) + return log_error_errno(errno, "Failed to create uid shift socket pair: %m"); + + /* Child can be killed before execv(), so handle SIGCHLD in order to interrupt + * parent's blocking calls and give it a chance to call wait() and terminate. */ + r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL); + if (r < 0) + return log_error_errno(errno, "Failed to change the signal mask: %m"); + + r = sigaction(SIGCHLD, &sa, NULL); + if (r < 0) + return log_error_errno(errno, "Failed to install SIGCHLD handler: %m"); + + *pid = raw_clone(SIGCHLD|CLONE_NEWNS); + if (*pid < 0) + return log_error_errno(errno, "clone() failed%s: %m", + errno == EINVAL ? + ", do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in)" : ""); + + if (*pid == 0) { + /* The outer child only has a file system namespace. */ + barrier_set_role(&barrier, BARRIER_CHILD); + + master = safe_close(master); + + kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]); + rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]); + pid_socket_pair[0] = safe_close(pid_socket_pair[0]); + uuid_socket_pair[0] = safe_close(uuid_socket_pair[0]); + notify_socket_pair[0] = safe_close(notify_socket_pair[0]); + uid_shift_socket_pair[0] = safe_close(uid_shift_socket_pair[0]); + + (void) reset_all_signal_handlers(); + (void) reset_signal_mask(); + + r = outer_child(&barrier, + arg_directory, + console, + root_device, root_device_rw, + home_device, home_device_rw, + srv_device, srv_device_rw, + esp_device, + interactive, + secondary, + pid_socket_pair[1], + uuid_socket_pair[1], + notify_socket_pair[1], + kmsg_socket_pair[1], + rtnl_socket_pair[1], + uid_shift_socket_pair[1], + fds); + if (r < 0) + _exit(EXIT_FAILURE); + + _exit(EXIT_SUCCESS); + } + + barrier_set_role(&barrier, BARRIER_PARENT); + + fds = fdset_free(fds); + + kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]); + rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]); + pid_socket_pair[1] = safe_close(pid_socket_pair[1]); + uuid_socket_pair[1] = safe_close(uuid_socket_pair[1]); + notify_socket_pair[1] = safe_close(notify_socket_pair[1]); + uid_shift_socket_pair[1] = safe_close(uid_shift_socket_pair[1]); + + if (arg_userns_mode != USER_NAMESPACE_NO) { + /* The child just let us know the UID shift it might have read from the image. */ + l = recv(uid_shift_socket_pair[0], &arg_uid_shift, sizeof arg_uid_shift, 0); + if (l < 0) + return log_error_errno(errno, "Failed to read UID shift: %m"); + + if (l != sizeof arg_uid_shift) { + log_error("Short read while reading UID shift."); + return -EIO; + } + + if (arg_userns_mode == USER_NAMESPACE_PICK) { + /* If we are supposed to pick the UID shift, let's try to use the shift read from the + * image, but if that's already in use, pick a new one, and report back to the child, + * which one we now picked. */ + + r = uid_shift_pick(&arg_uid_shift, &uid_shift_lock); + if (r < 0) + return log_error_errno(r, "Failed to pick suitable UID/GID range: %m"); + + l = send(uid_shift_socket_pair[0], &arg_uid_shift, sizeof arg_uid_shift, MSG_NOSIGNAL); + if (l < 0) + return log_error_errno(errno, "Failed to send UID shift: %m"); + if (l != sizeof arg_uid_shift) { + log_error("Short write while writing UID shift."); + return -EIO; + } + } + } + + /* Wait for the outer child. */ + r = wait_for_terminate_and_warn("namespace helper", *pid, NULL); + if (r != 0) + return r < 0 ? r : -EIO; + + /* And now retrieve the PID of the inner child. */ + l = recv(pid_socket_pair[0], pid, sizeof *pid, 0); + if (l < 0) + return log_error_errno(errno, "Failed to read inner child PID: %m"); + if (l != sizeof *pid) { + log_error("Short read while reading inner child PID."); + return -EIO; + } + + /* We also retrieve container UUID in case it was generated by outer child */ + l = recv(uuid_socket_pair[0], &arg_uuid, sizeof arg_uuid, 0); + if (l < 0) + return log_error_errno(errno, "Failed to read container machine ID: %m"); + if (l != sizeof(arg_uuid)) { + log_error("Short read while reading container machined ID."); + return -EIO; + } + + /* We also retrieve the socket used for notifications generated by outer child */ + notify_socket = receive_one_fd(notify_socket_pair[0], 0); + if (notify_socket < 0) + return log_error_errno(notify_socket, + "Failed to receive notification socket from the outer child: %m"); + + log_debug("Init process invoked as PID "PID_FMT, *pid); + + if (arg_userns_mode != USER_NAMESPACE_NO) { + if (!barrier_place_and_sync(&barrier)) { /* #1 */ + log_error("Child died too early."); + return -ESRCH; + } + + r = setup_uid_map(*pid); + if (r < 0) + return r; + + (void) barrier_place(&barrier); /* #2 */ + } + + if (arg_private_network) { + + r = move_network_interfaces(*pid, arg_network_interfaces); + if (r < 0) + return r; + + if (arg_network_veth) { + r = setup_veth(arg_machine, *pid, veth_name, + arg_network_bridge || arg_network_zone); + if (r < 0) + return r; + else if (r > 0) + ifi = r; + + if (arg_network_bridge) { + /* Add the interface to a bridge */ + r = setup_bridge(veth_name, arg_network_bridge, false); + if (r < 0) + return r; + if (r > 0) + ifi = r; + } else if (arg_network_zone) { + /* Add the interface to a bridge, possibly creating it */ + r = setup_bridge(veth_name, arg_network_zone, true); + if (r < 0) + return r; + if (r > 0) + ifi = r; + } + } + + r = setup_veth_extra(arg_machine, *pid, arg_network_veth_extra); + if (r < 0) + return r; + + /* We created the primary and extra veth links now; let's remember this, so that we know to + remove them later on. Note that we don't bother with removing veth links that were created + here when their setup failed half-way, because in that case the kernel should be able to + remove them on its own, since they cannot be referenced by anything yet. */ + *veth_created = true; + + r = setup_macvlan(arg_machine, *pid, arg_network_macvlan); + if (r < 0) + return r; + + r = setup_ipvlan(arg_machine, *pid, arg_network_ipvlan); + if (r < 0) + return r; + } + + if (arg_register) { + r = register_machine( + arg_machine, + *pid, + arg_directory, + arg_uuid, + ifi, + arg_slice, + arg_custom_mounts, arg_n_custom_mounts, + arg_kill_signal, + arg_property, + arg_keep_unit, + arg_container_service_name); + if (r < 0) + return r; + } + + r = sync_cgroup(*pid, arg_unified_cgroup_hierarchy, arg_uid_shift); + if (r < 0) + return r; + + if (arg_keep_unit) { + r = create_subcgroup(*pid, arg_unified_cgroup_hierarchy); + if (r < 0) + return r; + } + + r = chown_cgroup(*pid, arg_uid_shift); + if (r < 0) + return r; + + /* Notify the child that the parent is ready with all + * its setup (including cgroup-ification), and that + * the child can now hand over control to the code to + * run inside the container. */ + (void) barrier_place(&barrier); /* #3 */ + + /* Block SIGCHLD here, before notifying child. + * process_pty() will handle it with the other signals. */ + assert_se(sigprocmask(SIG_BLOCK, &mask_chld, NULL) >= 0); + + /* Reset signal to default */ + r = default_signals(SIGCHLD, -1); + if (r < 0) + return log_error_errno(r, "Failed to reset SIGCHLD: %m"); + + r = sd_event_new(&event); + if (r < 0) + return log_error_errno(r, "Failed to get default event source: %m"); + + r = setup_sd_notify_parent(event, notify_socket, PID_TO_PTR(*pid)); + if (r < 0) + return r; + + /* Let the child know that we are ready and wait that the child is completely ready now. */ + if (!barrier_place_and_sync(&barrier)) { /* #4 */ + log_error("Child died too early."); + return -ESRCH; + } + + /* At this point we have made use of the UID we picked, and thus nss-mymachines + * will make them appear in getpwuid(), thus we can release the /etc/passwd lock. */ + etc_passwd_lock = safe_close(etc_passwd_lock); + + sd_notifyf(false, + "STATUS=Container running.\n" + "X_NSPAWN_LEADER_PID=" PID_FMT, *pid); + if (!arg_notify_ready) + sd_notify(false, "READY=1\n"); + + if (arg_kill_signal > 0) { + /* Try to kill the init system on SIGINT or SIGTERM */ + sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, PID_TO_PTR(*pid)); + sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, PID_TO_PTR(*pid)); + } else { + /* Immediately exit */ + sd_event_add_signal(event, NULL, SIGINT, NULL, NULL); + sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL); + } + + /* simply exit on sigchld */ + sd_event_add_signal(event, NULL, SIGCHLD, NULL, NULL); + + if (arg_expose_ports) { + r = expose_port_watch_rtnl(event, rtnl_socket_pair[0], on_address_change, exposed, &rtnl); + if (r < 0) + return r; + + (void) expose_port_execute(rtnl, arg_expose_ports, exposed); + } + + rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]); + + r = pty_forward_new(event, master, + PTY_FORWARD_IGNORE_VHANGUP | (interactive ? 0 : PTY_FORWARD_READ_ONLY), + &forward); + if (r < 0) + return log_error_errno(r, "Failed to create PTY forwarder: %m"); + + r = sd_event_loop(event); + if (r < 0) + return log_error_errno(r, "Failed to run event loop: %m"); + + pty_forward_get_last_char(forward, &last_char); + + forward = pty_forward_free(forward); + + if (!arg_quiet && last_char != '\n') + putc('\n', stdout); + + /* Kill if it is not dead yet anyway */ + if (arg_register && !arg_keep_unit) + terminate_machine(*pid); + + /* Normally redundant, but better safe than sorry */ + kill(*pid, SIGKILL); + + r = wait_for_container(*pid, &container_status); + *pid = 0; + + if (r < 0) + /* We failed to wait for the container, or the container exited abnormally. */ + return r; + if (r > 0 || container_status == CONTAINER_TERMINATED) { + /* r > 0 → The container exited with a non-zero status. + * As a special case, we need to replace 133 with a different value, + * because 133 is special-cased in the service file to reboot the container. + * otherwise → The container exited with zero status and a reboot was not requested. + */ + if (r == 133) + r = EXIT_FAILURE; /* replace 133 with the general failure code */ + *ret = r; + return 0; /* finito */ + } + + /* CONTAINER_REBOOTED, loop again */ + + if (arg_keep_unit) { + /* Special handling if we are running as a service: instead of simply + * restarting the machine we want to restart the entire service, so let's + * inform systemd about this with the special exit code 133. The service + * file uses RestartForceExitStatus=133 so that this results in a full + * nspawn restart. This is necessary since we might have cgroup parameters + * set we want to have flushed out. */ + *ret = 0; + return 133; + } + + expose_port_flush(arg_expose_ports, exposed); + + (void) remove_veth_links(veth_name, arg_network_veth_extra); + *veth_created = false; + return 1; /* loop again */ +} + int main(int argc, char *argv[]) { - _cleanup_free_ char *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL, *console = NULL; + _cleanup_free_ char *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL, *esp_device = NULL, *console = NULL; bool root_device_rw = true, home_device_rw = true, srv_device_rw = true; _cleanup_close_ int master = -1, image_fd = -1; _cleanup_fdset_free_ FDSet *fds = NULL; - int r, n_fd_passed, loop_nr = -1; + int r, n_fd_passed, loop_nr = -1, ret = EXIT_FAILURE; char veth_name[IFNAMSIZ] = ""; bool secondary = false, remove_subvol = false; - sigset_t mask_chld; pid_t pid = 0; - int ret = EXIT_SUCCESS; union in_addr_union exposed = {}; _cleanup_release_lock_file_ LockFile tree_global_lock = LOCK_FILE_INIT, tree_local_lock = LOCK_FILE_INIT; bool interactive, veth_created = false; @@ -3626,6 +4198,7 @@ int main(int argc, char *argv[]) { &root_device, &root_device_rw, &home_device, &home_device_rw, &srv_device, &srv_device_rw, + &esp_device, &secondary); if (r < 0) goto finish; @@ -3668,469 +4241,25 @@ int main(int argc, char *argv[]) { assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1) >= 0); - assert_se(sigemptyset(&mask_chld) == 0); - assert_se(sigaddset(&mask_chld, SIGCHLD) == 0); - if (prctl(PR_SET_CHILD_SUBREAPER, 1) < 0) { r = log_error_errno(errno, "Failed to become subreaper: %m"); goto finish; } for (;;) { - static const struct sigaction sa = { - .sa_handler = nop_signal_handler, - .sa_flags = SA_NOCLDSTOP, - }; - - _cleanup_release_lock_file_ LockFile uid_shift_lock = LOCK_FILE_INIT; - _cleanup_close_ int etc_passwd_lock = -1; - _cleanup_close_pair_ int - kmsg_socket_pair[2] = { -1, -1 }, - rtnl_socket_pair[2] = { -1, -1 }, - pid_socket_pair[2] = { -1, -1 }, - uuid_socket_pair[2] = { -1, -1 }, - notify_socket_pair[2] = { -1, -1 }, - uid_shift_socket_pair[2] = { -1, -1 }; - _cleanup_close_ int notify_socket= -1; - _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL; - _cleanup_(sd_event_unrefp) sd_event *event = NULL; - _cleanup_(pty_forward_freep) PTYForward *forward = NULL; - _cleanup_(sd_netlink_unrefp) sd_netlink *rtnl = NULL; - ContainerStatus container_status; - char last_char = 0; - int ifi = 0; - ssize_t l; - - if (arg_userns_mode == USER_NAMESPACE_PICK) { - /* When we shall pick the UID/GID range, let's first lock /etc/passwd, so that we can safely - * check with getpwuid() if the specific user already exists. Note that /etc might be - * read-only, in which case this will fail with EROFS. But that's really OK, as in that case we - * can be reasonably sure that no users are going to be added. Note that getpwuid() checks are - * really just an extra safety net. We kinda assume that the UID range we allocate from is - * really ours. */ - - etc_passwd_lock = take_etc_passwd_lock(NULL); - if (etc_passwd_lock < 0 && etc_passwd_lock != -EROFS) { - log_error_errno(r, "Failed to take /etc/passwd lock: %m"); - goto finish; - } - } - - r = barrier_create(&barrier); - if (r < 0) { - log_error_errno(r, "Cannot initialize IPC barrier: %m"); - goto finish; - } - - if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) { - r = log_error_errno(errno, "Failed to create kmsg socket pair: %m"); - goto finish; - } - - if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, rtnl_socket_pair) < 0) { - r = log_error_errno(errno, "Failed to create rtnl socket pair: %m"); - goto finish; - } - - if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, pid_socket_pair) < 0) { - r = log_error_errno(errno, "Failed to create pid socket pair: %m"); - goto finish; - } - - if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, uuid_socket_pair) < 0) { - r = log_error_errno(errno, "Failed to create id socket pair: %m"); - goto finish; - } - - if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, notify_socket_pair) < 0) { - r = log_error_errno(errno, "Failed to create notify socket pair: %m"); - goto finish; - } - - if (arg_userns_mode != USER_NAMESPACE_NO) - if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, uid_shift_socket_pair) < 0) { - r = log_error_errno(errno, "Failed to create uid shift socket pair: %m"); - goto finish; - } - - /* Child can be killed before execv(), so handle SIGCHLD - * in order to interrupt parent's blocking calls and - * give it a chance to call wait() and terminate. */ - r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL); - if (r < 0) { - r = log_error_errno(errno, "Failed to change the signal mask: %m"); - goto finish; - } - - r = sigaction(SIGCHLD, &sa, NULL); - if (r < 0) { - r = log_error_errno(errno, "Failed to install SIGCHLD handler: %m"); - goto finish; - } - - pid = raw_clone(SIGCHLD|CLONE_NEWNS); - if (pid < 0) { - if (errno == EINVAL) - r = log_error_errno(errno, "clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m"); - else - r = log_error_errno(errno, "clone() failed: %m"); - - goto finish; - } - - if (pid == 0) { - /* The outer child only has a file system namespace. */ - barrier_set_role(&barrier, BARRIER_CHILD); - - master = safe_close(master); - - kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]); - rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]); - pid_socket_pair[0] = safe_close(pid_socket_pair[0]); - uuid_socket_pair[0] = safe_close(uuid_socket_pair[0]); - notify_socket_pair[0] = safe_close(notify_socket_pair[0]); - uid_shift_socket_pair[0] = safe_close(uid_shift_socket_pair[0]); - - (void) reset_all_signal_handlers(); - (void) reset_signal_mask(); - - r = outer_child(&barrier, - arg_directory, - console, - root_device, root_device_rw, - home_device, home_device_rw, - srv_device, srv_device_rw, - interactive, - secondary, - pid_socket_pair[1], - uuid_socket_pair[1], - notify_socket_pair[1], - kmsg_socket_pair[1], - rtnl_socket_pair[1], - uid_shift_socket_pair[1], - fds); - if (r < 0) - _exit(EXIT_FAILURE); - - _exit(EXIT_SUCCESS); - } - - barrier_set_role(&barrier, BARRIER_PARENT); - - fds = fdset_free(fds); - - kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]); - rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]); - pid_socket_pair[1] = safe_close(pid_socket_pair[1]); - uuid_socket_pair[1] = safe_close(uuid_socket_pair[1]); - notify_socket_pair[1] = safe_close(notify_socket_pair[1]); - uid_shift_socket_pair[1] = safe_close(uid_shift_socket_pair[1]); - - if (arg_userns_mode != USER_NAMESPACE_NO) { - /* The child just let us know the UID shift it might have read from the image. */ - l = recv(uid_shift_socket_pair[0], &arg_uid_shift, sizeof(arg_uid_shift), 0); - if (l < 0) { - r = log_error_errno(errno, "Failed to read UID shift: %m"); - goto finish; - } - if (l != sizeof(arg_uid_shift)) { - log_error("Short read while reading UID shift."); - r = EIO; - goto finish; - } - - if (arg_userns_mode == USER_NAMESPACE_PICK) { - /* If we are supposed to pick the UID shift, let's try to use the shift read from the - * image, but if that's already in use, pick a new one, and report back to the child, - * which one we now picked. */ - - r = uid_shift_pick(&arg_uid_shift, &uid_shift_lock); - if (r < 0) { - log_error_errno(r, "Failed to pick suitable UID/GID range: %m"); - goto finish; - } - - l = send(uid_shift_socket_pair[0], &arg_uid_shift, sizeof(arg_uid_shift), MSG_NOSIGNAL); - if (l < 0) { - r = log_error_errno(errno, "Failed to send UID shift: %m"); - goto finish; - } - if (l != sizeof(arg_uid_shift)) { - log_error("Short write while writing UID shift."); - r = -EIO; - goto finish; - } - } - } - - /* Wait for the outer child. */ - r = wait_for_terminate_and_warn("namespace helper", pid, NULL); - if (r < 0) - goto finish; - if (r != 0) { - r = -EIO; - goto finish; - } - pid = 0; - - /* And now retrieve the PID of the inner child. */ - l = recv(pid_socket_pair[0], &pid, sizeof(pid), 0); - if (l < 0) { - r = log_error_errno(errno, "Failed to read inner child PID: %m"); - goto finish; - } - if (l != sizeof(pid)) { - log_error("Short read while reading inner child PID."); - r = EIO; - goto finish; - } - - /* We also retrieve container UUID in case it was generated by outer child */ - l = recv(uuid_socket_pair[0], &arg_uuid, sizeof(arg_uuid), 0); - if (l < 0) { - r = log_error_errno(errno, "Failed to read container machine ID: %m"); - goto finish; - } - if (l != sizeof(arg_uuid)) { - log_error("Short read while reading container machined ID."); - r = EIO; - goto finish; - } - - /* We also retrieve the socket used for notifications generated by outer child */ - notify_socket = receive_one_fd(notify_socket_pair[0], 0); - if (notify_socket < 0) { - r = log_error_errno(errno, "Failed to receive notification socket from the outer child: %m"); - goto finish; - } - - log_debug("Init process invoked as PID " PID_FMT, pid); - - if (arg_userns_mode != USER_NAMESPACE_NO) { - if (!barrier_place_and_sync(&barrier)) { /* #1 */ - log_error("Child died too early."); - r = -ESRCH; - goto finish; - } - - r = setup_uid_map(pid); - if (r < 0) - goto finish; - - (void) barrier_place(&barrier); /* #2 */ - } - - if (arg_private_network) { - - r = move_network_interfaces(pid, arg_network_interfaces); - if (r < 0) - goto finish; - - if (arg_network_veth) { - r = setup_veth(arg_machine, pid, veth_name, - arg_network_bridge || arg_network_zone); - if (r < 0) - goto finish; - else if (r > 0) - ifi = r; - - if (arg_network_bridge) { - /* Add the interface to a bridge */ - r = setup_bridge(veth_name, arg_network_bridge, false); - if (r < 0) - goto finish; - if (r > 0) - ifi = r; - } else if (arg_network_zone) { - /* Add the interface to a bridge, possibly creating it */ - r = setup_bridge(veth_name, arg_network_zone, true); - if (r < 0) - goto finish; - if (r > 0) - ifi = r; - } - } - - r = setup_veth_extra(arg_machine, pid, arg_network_veth_extra); - if (r < 0) - goto finish; - - /* We created the primary and extra veth links now; let's remember this, so that we know to - remove them later on. Note that we don't bother with removing veth links that were created - here when their setup failed half-way, because in that case the kernel should be able to - remove them on its own, since they cannot be referenced by anything yet. */ - veth_created = true; - - r = setup_macvlan(arg_machine, pid, arg_network_macvlan); - if (r < 0) - goto finish; - - r = setup_ipvlan(arg_machine, pid, arg_network_ipvlan); - if (r < 0) - goto finish; - } - - if (arg_register) { - r = register_machine( - arg_machine, - pid, - arg_directory, - arg_uuid, - ifi, - arg_slice, - arg_custom_mounts, arg_n_custom_mounts, - arg_kill_signal, - arg_property, - arg_keep_unit, - arg_container_service_name); - if (r < 0) - goto finish; - } - - r = sync_cgroup(pid, arg_unified_cgroup_hierarchy); - if (r < 0) - goto finish; - - if (arg_keep_unit) { - r = create_subcgroup(pid, arg_unified_cgroup_hierarchy); - if (r < 0) - goto finish; - } - - r = chown_cgroup(pid, arg_uid_shift); - if (r < 0) - goto finish; - - /* Notify the child that the parent is ready with all - * its setup (including cgroup-ification), and that - * the child can now hand over control to the code to - * run inside the container. */ - (void) barrier_place(&barrier); /* #3 */ - - /* Block SIGCHLD here, before notifying child. - * process_pty() will handle it with the other signals. */ - assert_se(sigprocmask(SIG_BLOCK, &mask_chld, NULL) >= 0); - - /* Reset signal to default */ - r = default_signals(SIGCHLD, -1); - if (r < 0) { - log_error_errno(r, "Failed to reset SIGCHLD: %m"); - goto finish; - } - - r = sd_event_new(&event); - if (r < 0) { - log_error_errno(r, "Failed to get default event source: %m"); - goto finish; - } - - r = setup_sd_notify_parent(event, notify_socket, PID_TO_PTR(pid)); - if (r < 0) - goto finish; - - /* Let the child know that we are ready and wait that the child is completely ready now. */ - if (!barrier_place_and_sync(&barrier)) { /* #4 */ - log_error("Child died too early."); - r = -ESRCH; - goto finish; - } - - /* At this point we have made use of the UID we picked, and thus nss-mymachines will make them appear - * in getpwuid(), thus we can release the /etc/passwd lock. */ - etc_passwd_lock = safe_close(etc_passwd_lock); - - sd_notifyf(false, - "STATUS=Container running.\n" - "X_NSPAWN_LEADER_PID=" PID_FMT, pid); - if (!arg_notify_ready) - sd_notify(false, "READY=1\n"); - - if (arg_kill_signal > 0) { - /* Try to kill the init system on SIGINT or SIGTERM */ - sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, PID_TO_PTR(pid)); - sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, PID_TO_PTR(pid)); - } else { - /* Immediately exit */ - sd_event_add_signal(event, NULL, SIGINT, NULL, NULL); - sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL); - } - - /* simply exit on sigchld */ - sd_event_add_signal(event, NULL, SIGCHLD, NULL, NULL); - - if (arg_expose_ports) { - r = expose_port_watch_rtnl(event, rtnl_socket_pair[0], on_address_change, &exposed, &rtnl); - if (r < 0) - goto finish; - - (void) expose_port_execute(rtnl, arg_expose_ports, &exposed); - } - - rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]); - - r = pty_forward_new(event, master, PTY_FORWARD_IGNORE_VHANGUP | (interactive ? 0 : PTY_FORWARD_READ_ONLY), &forward); - if (r < 0) { - log_error_errno(r, "Failed to create PTY forwarder: %m"); - goto finish; - } - - r = sd_event_loop(event); - if (r < 0) { - log_error_errno(r, "Failed to run event loop: %m"); - goto finish; - } - - pty_forward_get_last_char(forward, &last_char); - - forward = pty_forward_free(forward); - - if (!arg_quiet && last_char != '\n') - putc('\n', stdout); - - /* Kill if it is not dead yet anyway */ - if (arg_register && !arg_keep_unit) - terminate_machine(pid); - - /* Normally redundant, but better safe than sorry */ - kill(pid, SIGKILL); - - r = wait_for_container(pid, &container_status); - pid = 0; - - if (r < 0) - /* We failed to wait for the container, or the - * container exited abnormally */ - goto finish; - else if (r > 0 || container_status == CONTAINER_TERMINATED) { - /* The container exited with a non-zero - * status, or with zero status and no reboot - * was requested. */ - ret = r; + r = run(master, + console, + root_device, root_device_rw, + home_device, home_device_rw, + srv_device, srv_device_rw, + esp_device, + interactive, secondary, + fds, + veth_name, &veth_created, + &exposed, + &pid, &ret); + if (r <= 0) break; - } - - /* CONTAINER_REBOOTED, loop again */ - - if (arg_keep_unit) { - /* Special handling if we are running as a - * service: instead of simply restarting the - * machine we want to restart the entire - * service, so let's inform systemd about this - * with the special exit code 133. The service - * file uses RestartForceExitStatus=133 so - * that this results in a full nspawn - * restart. This is necessary since we might - * have cgroup parameters set we want to have - * flushed out. */ - ret = 133; - r = 0; - break; - } - - expose_port_flush(arg_expose_ports, &exposed); - - (void) remove_veth_links(veth_name, arg_network_veth_extra); - veth_created = false; } finish: |