diff options
Diffstat (limited to 'src/nspawn')
-rw-r--r-- | src/nspawn/nspawn.c | 233 |
1 files changed, 220 insertions, 13 deletions
diff --git a/src/nspawn/nspawn.c b/src/nspawn/nspawn.c index 8039847a72..a56960506c 100644 --- a/src/nspawn/nspawn.c +++ b/src/nspawn/nspawn.c @@ -204,6 +204,7 @@ static char **arg_property = NULL; static uid_t arg_uid_shift = UID_INVALID, arg_uid_range = 0x10000U; static bool arg_userns = false; static int arg_kill_signal = 0; +static bool arg_unified_cgroup_hierarchy = false; static void help(void) { printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n" @@ -385,6 +386,30 @@ static int set_sanitized_path(char **b, const char *path) { return 0; } +static int detect_unified_cgroup_hierarchy(void) { + const char *e; + int r; + + /* Allow the user to control whether the unified hierarchy is used */ + e = getenv("UNIFIED_CGROUP_HIERARCHY"); + if (e) { + r = parse_boolean(e); + if (r < 0) + return log_error_errno(r, "Failed to parse $UNIFIED_CGROUP_HIERARCHY."); + + arg_unified_cgroup_hierarchy = r; + return 0; + } + + /* Otherwise inherit the default from the host system */ + r = cg_unified(); + if (r < 0) + return log_error_errno(r, "Failed to determine whether the unified cgroups hierarchy is used: %m"); + + arg_unified_cgroup_hierarchy = r; + return 0; +} + static int parse_argv(int argc, char *argv[]) { enum { @@ -1037,6 +1062,10 @@ static int parse_argv(int argc, char *argv[]) { if (arg_boot && arg_kill_signal <= 0) arg_kill_signal = SIGRTMIN+3; + r = detect_unified_cgroup_hierarchy(); + if (r < 0) + return r; + return 1; } @@ -1095,7 +1124,6 @@ static int mount_all(const char *dest, bool userns) { { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, true, true }, /* Bind mount first */ { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, true, true }, /* Then, make it r/o */ { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true, false }, - { "tmpfs", "/sys/fs/cgroup", "tmpfs", "mode=755", MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME, true, false }, { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true, false }, { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true, false }, { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true, false }, @@ -1381,7 +1409,7 @@ static int mount_custom(const char *dest) { return 0; } -static int mount_cgroup_hierarchy(const char *dest, const char *controller, const char *hierarchy, bool read_only) { +static int mount_legacy_cgroup_hierarchy(const char *dest, const char *controller, const char *hierarchy, bool read_only) { char *to; int r; @@ -1409,11 +1437,31 @@ static int mount_cgroup_hierarchy(const char *dest, const char *controller, cons return 1; } -static int mount_cgroup(const char *dest) { +static int mount_legacy_cgroups(const char *dest) { _cleanup_set_free_free_ Set *controllers = NULL; const char *cgroup_root; int r; + cgroup_root = prefix_roota(dest, "/sys/fs/cgroup"); + + /* Mount a tmpfs to /sys/fs/cgroup if it's not mounted there yet. */ + r = path_is_mount_point(cgroup_root, AT_SYMLINK_FOLLOW); + if (r < 0) + return log_error_errno(r, "Failed to determine if /sys/fs/cgroup is already mounted: %m"); + if (r == 0) { + _cleanup_free_ char *options = NULL; + + r = tmpfs_patch_options("mode=755", &options); + if (r < 0) + return log_oom(); + + if (mount("tmpfs", cgroup_root, "tmpfs", MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME, options) < 0) + return log_error_errno(errno, "Failed to mount /sys/fs/cgroup: %m"); + } + + if (cg_unified() > 0) + goto skip_controllers; + controllers = set_new(&string_hash_ops); if (!controllers) return log_oom(); @@ -1437,7 +1485,7 @@ static int mount_cgroup(const char *dest) { if (r == -EINVAL) { /* Not a symbolic link, but directly a single cgroup hierarchy */ - r = mount_cgroup_hierarchy(dest, controller, controller, true); + r = mount_legacy_cgroup_hierarchy(dest, controller, controller, true); if (r < 0) return r; @@ -1457,7 +1505,7 @@ static int mount_cgroup(const char *dest) { continue; } - r = mount_cgroup_hierarchy(dest, combined, combined, true); + r = mount_legacy_cgroup_hierarchy(dest, combined, combined, true); if (r < 0) return r; @@ -1471,17 +1519,52 @@ static int mount_cgroup(const char *dest) { } } - r = mount_cgroup_hierarchy(dest, "name=systemd,xattr", "systemd", false); +skip_controllers: + r = mount_legacy_cgroup_hierarchy(dest, "none,name=systemd,xattr", "systemd", false); if (r < 0) return r; - cgroup_root = prefix_roota(dest, "/sys/fs/cgroup"); if (mount(NULL, cgroup_root, NULL, MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME|MS_RDONLY, "mode=755") < 0) return log_error_errno(errno, "Failed to remount %s read-only: %m", cgroup_root); return 0; } +static int mount_unified_cgroups(const char *dest) { + const char *p; + int r; + + assert(dest); + + p = strjoina(dest, "/sys/fs/cgroup"); + + r = path_is_mount_point(p, AT_SYMLINK_FOLLOW); + if (r < 0) + return log_error_errno(r, "Failed to determine if %s is mounted already: %m", p); + if (r > 0) { + p = strjoina(dest, "/sys/fs/cgroup/cgroup.procs"); + if (access(p, F_OK) >= 0) + return 0; + if (errno != ENOENT) + return log_error_errno(errno, "Failed to determine if mount point %s contains the unified cgroup hierarchy: %m", p); + + log_error("%s is already mounted but not a unified cgroup hierarchy. Refusing.", p); + return -EINVAL; + } + + if (mount("cgroup", p, "cgroup", MS_NOSUID|MS_NOEXEC|MS_NODEV, "__DEVEL__sane_behavior") < 0) + return log_error_errno(errno, "Failed to mount unified cgroup hierarchy to %s: %m", p); + + return 0; +} + +static int mount_cgroups(const char *dest) { + if (arg_unified_cgroup_hierarchy) + return mount_unified_cgroups(dest); + else + return mount_legacy_cgroups(dest); +} + static int mount_systemd_cgroup_writable(const char *dest) { _cleanup_free_ char *own_cgroup_path = NULL; const char *systemd_root, *systemd_own; @@ -1493,13 +1576,23 @@ static int mount_systemd_cgroup_writable(const char *dest) { if (r < 0) return log_error_errno(r, "Failed to determine our own cgroup path: %m"); + /* If we are living in the top-level, then there's nothing to do... */ + if (path_equal(own_cgroup_path, "/")) + return 0; + + if (arg_unified_cgroup_hierarchy) { + systemd_own = strjoina(dest, "/sys/fs/cgroup", own_cgroup_path); + systemd_root = prefix_roota(dest, "/sys/fs/cgroup"); + } else { + systemd_own = strjoina(dest, "/sys/fs/cgroup/systemd", own_cgroup_path); + systemd_root = prefix_roota(dest, "/sys/fs/cgroup/systemd"); + } + /* Make our own cgroup a (writable) bind mount */ - systemd_own = strjoina(dest, "/sys/fs/cgroup/systemd", own_cgroup_path); if (mount(systemd_own, systemd_own, NULL, MS_BIND, NULL) < 0) return log_error_errno(errno, "Failed to turn %s into a bind mount: %m", own_cgroup_path); /* And then remount the systemd cgroup root read-only */ - systemd_root = prefix_roota(dest, "/sys/fs/cgroup/systemd"); if (mount(NULL, systemd_root, NULL, MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL) < 0) return log_error_errno(errno, "Failed to mount cgroup root read-only: %m"); @@ -4187,6 +4280,8 @@ static int inner_child( assert(directory); assert(kmsg_socket >= 0); + cg_unified_flush(); + if (arg_userns) { /* Tell the parent, that it now can write the UID map. */ (void) barrier_place(barrier); /* #1 */ @@ -4368,6 +4463,8 @@ static int outer_child( assert(pid_socket >= 0); assert(kmsg_socket >= 0); + cg_unified_flush(); + if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) return log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m"); @@ -4484,7 +4581,7 @@ static int outer_child( if (r < 0) return r; - r = mount_cgroup(directory); + r = mount_cgroups(directory); if (r < 0) return r; @@ -4499,7 +4596,6 @@ static int outer_child( NULL); if (pid < 0) return log_error_errno(errno, "Failed to fork inner child: %m"); - if (pid == 0) { pid_socket = safe_close(pid_socket); uid_shift_socket = safe_close(uid_shift_socket); @@ -4567,9 +4663,112 @@ static int chown_cgroup(pid_t pid) { if (fd < 0) return log_error_errno(errno, "Failed to open %s: %m", fs); - FOREACH_STRING(fn, ".", "tasks", "notify_on_release", "cgroup.procs", "cgroup.clone_children") + FOREACH_STRING(fn, + ".", + "tasks", + "notify_on_release", + "cgroup.procs", + "cgroup.clone_children", + "cgroup.controllers", + "cgroup.subtree_control", + "cgroup.populated") if (fchownat(fd, fn, arg_uid_shift, arg_uid_shift, 0) < 0) - log_warning_errno(errno, "Failed to chown() cgroup file %s, ignoring: %m", fn); + log_full_errno(errno == ENOENT ? LOG_DEBUG : LOG_WARNING, errno, + "Failed to chown() cgroup file %s, ignoring: %m", fn); + + return 0; +} + +static int sync_cgroup(pid_t pid) { + _cleanup_free_ char *cgroup = NULL; + char tree[] = "/tmp/unifiedXXXXXX", pid_string[DECIMAL_STR_MAX(pid) + 1]; + bool undo_mount = false; + const char *fn; + int unified, r; + + unified = cg_unified(); + if (unified < 0) + return log_error_errno(unified, "Failed to determine whether the unified hierachy is used: %m"); + + if ((unified > 0) == arg_unified_cgroup_hierarchy) + return 0; + + /* When the host uses the legacy cgroup setup, but the + * container shall use the unified hierarchy, let's make sure + * we copy the path from the name=systemd hierarchy into the + * unified hierarchy. Similar for the reverse situation. */ + + r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, pid, &cgroup); + if (r < 0) + return log_error_errno(r, "Failed to get control group of " PID_FMT ": %m", pid); + + /* In order to access the unified hierarchy we need to mount it */ + if (!mkdtemp(tree)) + return log_error_errno(errno, "Failed to generate temporary mount point for unified hierarchy: %m"); + + if (unified) + r = mount("cgroup", tree, "cgroup", MS_NOSUID|MS_NOEXEC|MS_NODEV, "none,name=systemd,xattr"); + else + r = mount("cgroup", tree, "cgroup", MS_NOSUID|MS_NOEXEC|MS_NODEV, "__DEVEL__sane_behavior"); + if (r < 0) { + r = log_error_errno(errno, "Failed to mount unified hierarchy: %m"); + goto finish; + } + + undo_mount = true; + + fn = strjoina(tree, cgroup, "/cgroup.procs"); + (void) mkdir_parents(fn, 0755); + + sprintf(pid_string, PID_FMT, pid); + r = write_string_file(fn, pid_string, 0); + if (r < 0) + log_error_errno(r, "Failed to move process: %m"); + +finish: + if (undo_mount) + (void) umount(tree); + + (void) rmdir(tree); + return r; +} + +static int create_subcgroup(pid_t pid) { + _cleanup_free_ char *cgroup = NULL; + const char *child; + int unified, r; + + /* In the unified hierarchy inner nodes may only only contain + * subgroups, but not processes. Hence, if we running in the + * unified hierarchy and the container does the same, and we + * did not create a scope unit for the container move us and + * the container into two separate subcgroups. */ + + if (!arg_keep_unit) + return 0; + + if (!arg_unified_cgroup_hierarchy) + return 0; + + unified = cg_unified(); + if (unified < 0) + return log_error_errno(unified, "Failed to determine whether the unified hierachy is used: %m"); + if (unified == 0) + return 0; + + r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 0, &cgroup); + if (r < 0) + return log_error_errno(r, "Failed to get our control group: %m"); + + child = strjoina(cgroup, "/payload"); + r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, child, pid); + if (r < 0) + return log_error_errno(r, "Failed to create %s subcgroup: %m", child); + + child = strjoina(cgroup, "/supervisor"); + r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, child, 0); + if (r < 0) + return log_error_errno(r, "Failed to create %s subcgroup: %m", child); return 0; } @@ -4976,6 +5175,14 @@ int main(int argc, char *argv[]) { if (r < 0) goto finish; + r = sync_cgroup(pid); + if (r < 0) + goto finish; + + r = create_subcgroup(pid); + if (r < 0) + goto finish; + r = chown_cgroup(pid); if (r < 0) goto finish; |