diff options
author | Luke Shumaker <lukeshu@lukeshu.com> | 2017-06-14 13:32:18 -0400 |
---|---|---|
committer | Luke Shumaker <lukeshu@lukeshu.com> | 2017-06-16 18:52:04 -0400 |
commit | a2c5a0f3e0f005899fd6de214ff0525b7415cb8a (patch) | |
tree | 8394b4d9e63233ec428d5d77a798feeda6658c23 | |
parent | 76b7be5e7e8bd15b549aa12eabbe3a36fdc4e924 (diff) |
nspawn: Divorce the code deciding the cgroup mounts from the code performing them
TODO: a better commit message
-rw-r--r-- | src/basic/cgroup-util.c | 37 | ||||
-rw-r--r-- | src/basic/cgroup-util.h | 1 | ||||
-rw-r--r-- | src/nspawn/nspawn-cgroup.c | 450 | ||||
-rw-r--r-- | src/nspawn/nspawn-cgroup.h | 12 | ||||
-rw-r--r-- | src/nspawn/nspawn.c | 100 |
5 files changed, 363 insertions, 237 deletions
diff --git a/src/basic/cgroup-util.c b/src/basic/cgroup-util.c index 5bf68105f2..624626c867 100644 --- a/src/basic/cgroup-util.c +++ b/src/basic/cgroup-util.c @@ -921,11 +921,18 @@ int cg_get_xattr(const char *controller, const char *path, const char *name, voi return (int) n; } +/** + * Returns the cgroup path of the process under the hierarchy specified by @controller: + * + * controller : whichever hierarchy has @controller bound to it (with the special case that + * SYSTEMD_CGROUP_CONTROLLER selects whichever hierarchy systemd is using, even if it is the + * v2 (unified) hierarchy and thus the SYSTEMD_CGROUP_CONTROLLER doesn't actually exist) + * + * controller == NULL : equivalent to SYSTEMD_CGROUP_CONTROLLER + */ int cg_pid_get_path(const char *controller, pid_t pid, char **path) { _cleanup_fclose_ FILE *f = NULL; - char line[LINE_MAX]; const char *fs; - size_t cs = 0; int unified; assert(path); @@ -940,20 +947,40 @@ int cg_pid_get_path(const char *controller, pid_t pid, char **path) { unified = cg_unified(controller); if (unified < 0) return unified; - if (unified == 0) - cs = strlen(controller); fs = procfs_file_alloca(pid, "cgroup"); f = fopen(fs, "re"); if (!f) return errno == ENOENT ? -ESRCH : -errno; + return cg_pid_get_path_internal(unified ? NULL : controller, f, path); +} + +/** + * NB: The meaning of @controller is different here than for cg_pid_get_path(): + * + * controller : the cgroup v1 hierarchy with this controller bound to it + * controller == NULL : the cgroup v2 (unified) hierarchy + */ +int cg_pid_get_path_internal(const char *controller, FILE *f, char **path) { + char line[LINE_MAX]; + size_t cs = 0; + + assert(path); + assert(f); + + if (controller && !cg_controller_is_valid(controller)) + return -EINVAL; + + if (controller) + cs = strlen(controller); + FOREACH_LINE(line, f, return -errno) { char *e, *p; truncate_nl(line); - if (unified) { + if (!controller) { e = startswith(line, "0:"); if (!e) continue; diff --git a/src/basic/cgroup-util.h b/src/basic/cgroup-util.h index d730f3490c..0e5b41103e 100644 --- a/src/basic/cgroup-util.h +++ b/src/basic/cgroup-util.h @@ -169,6 +169,7 @@ int cg_get_path(const char *controller, const char *path, const char *suffix, ch int cg_get_path_and_check(const char *controller, const char *path, const char *suffix, char **fs); int cg_pid_get_path(const char *controller, pid_t pid, char **path); +int cg_pid_get_path_internal(const char *controller, FILE *f, char **path); int cg_trim(const char *controller, const char *path, bool delete_root); diff --git a/src/nspawn/nspawn-cgroup.c b/src/nspawn/nspawn-cgroup.c index 795ad5f4ae..dafeb4a1ce 100644 --- a/src/nspawn/nspawn-cgroup.c +++ b/src/nspawn/nspawn-cgroup.c @@ -37,6 +37,66 @@ #include "nspawn-cgroup.h" #include "nspawn-mount.h" +/* Code for managing the list of CGMounts ***************************/ + +typedef enum CGMountType { + CGMOUNT_SYMLINK, + CGMOUNT_TMPFS, + CGMOUNT_CGROUP1, + CGMOUNT_CGROUP2, + _CGMOUNT_MAX +} CGMountType; + +struct CGMount { + CGMountType type; + char *src; + char *dst; +}; + +static CGMount *cgmount_add(CGMounts *mounts, CGMountType type, const char *src, const char *dst) { + char *hsrc = NULL, *hdst = NULL; + CGMount *c, *ret; + + assert(mounts); + assert(type >= 0 && type < _CGMOUNT_MAX); + assert(src); + assert(dst); + + hsrc = strdup(src); + hdst = strdup(dst); + if (!hsrc || !hdst) { + free(hsrc); + free(hdst); + return NULL; + } + + c = realloc_multiply(mounts->mounts, sizeof(CGMount), mounts->n + 1); + if (!c) + return NULL; + + mounts->mounts = c; + ret = &(mounts->mounts)[mounts->n]; + (mounts->n)++; + + *ret = (CGMount) { + .type = type, + .src = hsrc, + .dst = hdst, + }; + return ret; +} + +void cgroup_free_mounts(CGMounts *mounts) { + for (size_t i = 0; i < mounts->n; i++) { + free(mounts->mounts[i].src); + free(mounts->mounts[i].dst); + } + mounts->mounts = mfree(mounts->mounts); + mounts->n = 0; +} + +/********************************************************************/ + static int chown_cgroup_path(const char *path, uid_t uid_shift) { _cleanup_close_ int fd = -1; const char *fn; @@ -293,84 +353,18 @@ static int get_v1_hierarchies(Set *subsystems) { return 0; } -static int mount_legacy_cgroup_hierarchy(const char *dest, const char *controller, const char *hierarchy, - CGroupUnified inner_cgver, bool read_only) { - const char *to, *fstype, *opts; - int r; - - to = strjoina(strempty(dest), "/sys/fs/cgroup/", hierarchy); - - r = path_is_mount_point(to, 0); - if (r < 0 && r != -ENOENT) - return log_error_errno(r, "Failed to determine if %s is mounted already: %m", to); - if (r > 0) - return 0; - - mkdir_p(to, 0755); - - /* The superblock mount options of the mount point need to be - * identical to the hosts', and hence writable... */ - if (streq(controller, SYSTEMD_CGROUP_CONTROLLER)) { - if (inner_cgver >= CGROUP_UNIFIED_SYSTEMD) { - fstype = "cgroup2"; - opts = NULL; - } else { - fstype = "cgroup"; - opts = "none,name=systemd,xattr"; - } - } else { - fstype = "cgroup"; - opts = controller; - } - - r = mount_verbose(LOG_ERR, "cgroup", to, fstype, MS_NOSUID|MS_NOEXEC|MS_NODEV, opts); - if (r < 0) - return r; - - /* ... hence let's only make the bind mount read-only, not the superblock. */ - if (read_only) { - r = mount_verbose(LOG_ERR, NULL, to, NULL, - MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL); - if (r < 0) - return r; - } - - return 1; -} - -/* Mount a legacy cgroup hierarchy when cgroup namespaces are supported. */ -static int mount_legacy_cgns_supported( - CGroupUnified outer_cgver, CGroupUnified inner_cgver, bool userns, uid_t uid_shift, - uid_t uid_range, const char *selinux_apifs_context) { +/* Decide the legacy cgroup mounts when cgroup namespaces are used. */ +static int cgroup_decide_mounts_sd_y_cgns( + CGMounts *ret_mounts, + CGroupUnified outer_cgver, CGroupUnified inner_cgver) { + _cleanup_(cgroup_free_mounts) CGMounts mounts = {}; _cleanup_set_free_free_ Set *hierarchies = NULL; - const char *cgroup_root = "/sys/fs/cgroup", *c; + const char *c; int r; - (void) mkdir_p(cgroup_root, 0755); - /* Mount a tmpfs to /sys/fs/cgroup if it's not mounted there yet. */ - r = path_is_mount_point(cgroup_root, AT_SYMLINK_FOLLOW); - if (r < 0) - return log_error_errno(r, "Failed to determine if /sys/fs/cgroup is already mounted: %m"); - if (r == 0) { - _cleanup_free_ char *options = NULL; - - /* When cgroup namespaces are enabled and user namespaces are - * used then the mount of the cgroupfs is done *inside* the new - * user namespace. We're root in the new user namespace and the - * kernel will happily translate our uid/gid to the correct - * uid/gid as seen from e.g. /proc/1/mountinfo. So we simply - * pass uid 0 and not uid_shift to tmpfs_patch_options(). - */ - r = tmpfs_patch_options("mode=755", 0, selinux_apifs_context, &options); - if (r < 0) - return log_oom(); - - r = mount_verbose(LOG_ERR, "tmpfs", cgroup_root, "tmpfs", - MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME, options); - if (r < 0) - return r; - } + if (!cgmount_add(&mounts, CGMOUNT_TMPFS, "mode=755", "")) + return log_oom(); if (outer_cgver >= CGROUP_UNIFIED_ALL) goto skip_controllers; @@ -393,9 +387,8 @@ static int mount_legacy_cgns_supported( if (streq(hierarchy, "name=systemd")) continue; - r = mount_legacy_cgroup_hierarchy("", hierarchy, hierarchy, inner_cgver, !userns); - if (r < 0) - return r; + if (!cgmount_add(&mounts, CGMOUNT_CGROUP1, hierarchy, hierarchy)) + return log_oom(); /* When multiple hierarchies are co-mounted, make their * constituting individual hierarchies a symlink to the @@ -411,62 +404,43 @@ static int mount_legacy_cgns_supported( if (r == 0) break; - target = prefix_root("/sys/fs/cgroup", controller); - if (!target) + if (!cgmount_add(&mounts, CGMOUNT_SYMLINK, hierarchy, controller)) return log_oom(); - - if (streq(hierarchy, controller)) - break; - - r = symlink_idempotent(hierarchy, target); - if (r == -EINVAL) - return log_error_errno(r, "Invalid existing symlink for combined hierarchy: %m"); - if (r < 0) - return log_error_errno(r, "Failed to create symlink for combined hierarchy: %m"); } } skip_controllers: - r = mount_legacy_cgroup_hierarchy("", SYSTEMD_CGROUP_CONTROLLER, "systemd", inner_cgver, false); - if (r < 0) - return r; + switch (inner_cgver) { + case CGROUP_UNIFIED_NONE: + if (!cgmount_add(&mounts, CGMOUNT_CGROUP1, "name=systmed", "systemd")) + return log_oom(); + break; + case CGROUP_UNIFIED_ALL: + if (!cgmount_add(&mounts, CGMOUNT_CGROUP2, "", "systemd")) + return log_oom(); + break; + default: + assert_not_reached("non-legacy cgroup version desired in legacy setup function"); + return -EINVAL; + } - if (!userns) - return mount_verbose(LOG_ERR, NULL, cgroup_root, NULL, - MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME|MS_RDONLY, "mode=755"); + *ret_mounts = mounts; + mounts = (CGMounts){}; return 0; } -/* Mount legacy cgroup hierarchy when cgroup namespaces are unsupported. */ -static int mount_legacy_cgns_unsupported( - const char *dest, - CGroupUnified outer_cgver, CGroupUnified inner_cgver, bool userns, uid_t uid_shift, uid_t uid_range, - const char *selinux_apifs_context) { +/* Decide the legacy cgroup mounts when cgroup namespaces are not used. */ +static int cgroup_decide_mounts_sd_n_cgns( + CGMounts *ret_mounts, + CGroupUnified outer_cgver, CGroupUnified inner_cgver) { + _cleanup_(cgroup_free_mounts) CGMounts mounts = {}; _cleanup_set_free_free_ Set *controllers = NULL; - const char *cgroup_root; int r; - cgroup_root = prefix_roota(dest, "/sys/fs/cgroup"); - - (void) mkdir_p(cgroup_root, 0755); - /* Mount a tmpfs to /sys/fs/cgroup if it's not mounted there yet. */ - r = path_is_mount_point(cgroup_root, AT_SYMLINK_FOLLOW); - if (r < 0) - return log_error_errno(r, "Failed to determine if /sys/fs/cgroup is already mounted: %m"); - if (r == 0) { - _cleanup_free_ char *options = NULL; - - r = tmpfs_patch_options("mode=755", uid_shift == 0 ? UID_INVALID : uid_shift, selinux_apifs_context, &options); - if (r < 0) - return log_oom(); - - r = mount_verbose(LOG_ERR, "tmpfs", cgroup_root, "tmpfs", - MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME, options); - if (r < 0) - return r; - } + if (!cgmount_add(&mounts, CGMOUNT_TMPFS, "mode=755", "")) + return log_oom(); if (outer_cgver >= CGROUP_UNIFIED_ALL) goto skip_controllers; @@ -494,19 +468,12 @@ static int mount_legacy_cgns_unsupported( if (r == -EINVAL) { /* Not a symbolic link, but directly a single cgroup hierarchy */ - r = mount_legacy_cgroup_hierarchy(dest, controller, controller, inner_cgver, true); - if (r < 0) - return r; + if (!cgmount_add(&mounts, CGMOUNT_CGROUP1, controller, controller)) + return log_oom(); } else if (r < 0) return log_error_errno(r, "Failed to read link %s: %m", origin); else { - _cleanup_free_ char *target = NULL; - - target = prefix_root(dest, origin); - if (!target) - return log_oom(); - /* A symbolic link, a combination of controllers in one hierarchy */ if (!filename_is_valid(combined)) { @@ -514,101 +481,184 @@ static int mount_legacy_cgns_unsupported( continue; } - r = mount_legacy_cgroup_hierarchy(dest, combined, combined, inner_cgver, true); - if (r < 0) - return r; + if (!cgmount_add(&mounts, CGMOUNT_CGROUP1, combined, combined)) + return log_oom(); - r = symlink_idempotent(combined, target); - if (r == -EINVAL) - return log_error_errno(r, "Invalid existing symlink for combined hierarchy: %m"); - if (r < 0) - return log_error_errno(r, "Failed to create symlink for combined hierarchy: %m"); + if (!cgmount_add(&mounts, CGMOUNT_SYMLINK, combined, controller)) + return log_oom(); } } skip_controllers: - r = mount_legacy_cgroup_hierarchy(dest, SYSTEMD_CGROUP_CONTROLLER, "systemd", inner_cgver, false); - if (r < 0) - return r; + switch (inner_cgver) { + case CGROUP_UNIFIED_NONE: + if (!cgmount_add(&mounts, CGMOUNT_CGROUP1, "name=systmed", "systemd")) + return log_oom(); + break; + case CGROUP_UNIFIED_ALL: + if (!cgmount_add(&mounts, CGMOUNT_CGROUP2, "", "systemd")) + return log_oom(); + break; + default: + assert_not_reached("non-legacy cgroup version desired in legacy setup function"); + return -EINVAL; + } - return mount_verbose(LOG_ERR, NULL, cgroup_root, NULL, - MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME|MS_RDONLY, "mode=755"); -} + *ret_mounts = mounts; + mounts = (CGMounts){}; -static int mount_unified_cgroups(const char *dest) { - const char *p; - int r; + return 0; +} - assert(dest); +int cgroup_decide_mounts( + CGMounts *ret_mounts, + CGroupUnified outer_cgver, CGroupUnified inner_cgver, + bool use_cgns) { + switch (inner_cgver) { + case CGROUP_UNIFIED_NONE: + case CGROUP_UNIFIED_SYSTEMD: + if (use_cgns) + return cgroup_decide_mounts_sd_y_cgns(ret_mounts, outer_cgver, inner_cgver); + else + return cgroup_decide_mounts_sd_n_cgns(ret_mounts, outer_cgver, inner_cgver); + case CGROUP_UNIFIED_ALL: + if (!cgmount_add(ret_mounts, CGMOUNT_CGROUP2, "cgroup", "")) + return log_oom(); + return 0; + default: + assert_not_reached("Invalid cgroup ver requested"); + return -EINVAL; + } +} - p = prefix_roota(dest, "/sys/fs/cgroup"); +/********************************************************************/ - (void) mkdir_p(p, 0755); +static int cgroup_mount_cg( + const char *mountpoint, const char *opts, CGMountType fstype, + FILE *cgfile, bool use_userns) { + const bool use_cgns = cgfile == NULL; + /* If we are using userns and cgns, then we always let it be RW, because we can count on the shifted root user + * to not have access to the things that would make us want to mount it RO. Otherwise, we give the container + * RW access to its unified or name=systemd cgroup. */ + const bool rw = (use_userns && use_cgns) || fstype == CGMOUNT_CGROUP2 || streq(mountpoint, "/sys/fs/cgroup/systemd"); + int r; - r = path_is_mount_point(p, AT_SYMLINK_FOLLOW); + /* First the base mount; this is always RW, as to not change the superblock settings */ + r = mount_verbose(LOG_ERR, "cgroup", mountpoint, fstype == CGMOUNT_CGROUP1 ? "cgroup" : "cgroup2", + MS_NOSUID|MS_NOEXEC|MS_NODEV, opts); if (r < 0) - return log_error_errno(r, "Failed to determine if %s is mounted already: %m", p); - if (r > 0) { - p = prefix_roota(dest, "/sys/fs/cgroup/cgroup.procs"); - if (access(p, F_OK) >= 0) - return 0; - if (errno != ENOENT) - return log_error_errno(errno, "Failed to determine if mount point %s contains the unified cgroup hierarchy: %m", p); - - log_error("%s is already mounted but not a unified cgroup hierarchy. Refusing.", p); - return -EINVAL; - } - - return mount_verbose(LOG_ERR, "cgroup", p, "cgroup2", MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL); -} + return r; -int mount_cgroups( - const char *dest, - CGroupUnified outer_cgver, CGroupUnified inner_cgver, - bool userns, uid_t uid_shift, uid_t uid_range, - const char *selinux_apifs_context, - bool use_cgns) { + /* Now, if nescessary, we remount RO */ + if (rw) { + if (!use_cgns) { + /* emulate cgns by mounting everything but our subcgroup RO */ + const char *cgpath; + char *cgroup = NULL; + if (fstype == CGMOUNT_CGROUP2) { + rewind(cgfile); + r = cg_pid_get_path_internal(NULL, cgfile, &cgroup); + if (r < 0) + return log_error_errno(r, "Failed to get child's cgroup v2 path"); + } else { + const char *scontroller, *state; + size_t controller_len; + FOREACH_WORD_SEPARATOR(scontroller, controller_len, opts, ",", state) { + _cleanup_free_ const char *controller = strndup(scontroller, controller_len); + rewind(cgfile); + if (cg_pid_get_path_internal(controller, cgfile, &cgroup) == 0) + break; + } + if (!cgroup) + return log_error_errno(EBADMSG, "Failed to associate mounted cgroup hierarchy %s with numbered cgroup hierarchy", mountpoint); + } + cgpath = prefix_roota(mountpoint, cgroup); - if (inner_cgver >= CGROUP_UNIFIED_ALL) - return mount_unified_cgroups(dest); - else if (use_cgns) - return mount_legacy_cgns_supported(outer_cgver, inner_cgver, userns, uid_shift, uid_range, selinux_apifs_context); + r = mount_verbose(LOG_ERR, cgpath, cgpath, NULL, MS_BIND, NULL); + if (r < 0) + return r; + r = mount_verbose(LOG_ERR, NULL, mountpoint, NULL, + MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL); + if (r < 0) + return r; + } + } else { + r = mount_verbose(LOG_ERR, NULL, mountpoint, NULL, + MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL); + if (r < 0) + return r; + } - return mount_legacy_cgns_unsupported(dest, outer_cgver, inner_cgver, userns, uid_shift, uid_range, selinux_apifs_context); + return 0; } -int mount_systemd_cgroup_writable( - const char *dest, - CGroupUnified inner_cgver) { +int cgroup_mount_mounts(CGMounts m, FILE *cgfile, uid_t uid_shift, const char *selinux_apifs_context) { + const bool use_cgns = cgfile == NULL; + const bool use_userns = uid_shift != UID_INVALID; - _cleanup_free_ char *own_cgroup_path = NULL; - const char *systemd_root, *systemd_own; - int r; + bool used_tmpfs = false; - assert(dest); + for (size_t i = 0; i < m.n; i++) { + _cleanup_free_ char *options = NULL; + const char *dst; + int r; - r = cg_pid_get_path(NULL, 0, &own_cgroup_path); - if (r < 0) - return log_error_errno(r, "Failed to determine our own cgroup path: %m"); + dst = prefix_roota("/sys/fs/cgroup", m.mounts[i].dst); - /* If we are living in the top-level, then there's nothing to do... */ - if (path_equal(own_cgroup_path, "/")) - return 0; + /* The checks here to see if things are already mounted are kind of primative. Perhaps they should + * actually check the statfs() f_type to verify that the thing mounted is what we want to be mounted + * (similar to cgroup-util's detection logic)? But I don't really understand the use-case for having + * any of these already mounted, so I'm not sure if such increased strictness would be unwelcome. */ - if (inner_cgver >= CGROUP_UNIFIED_ALL) { - systemd_own = strjoina(dest, "/sys/fs/cgroup", own_cgroup_path); - systemd_root = prefix_roota(dest, "/sys/fs/cgroup"); - } else { - systemd_own = strjoina(dest, "/sys/fs/cgroup/systemd", own_cgroup_path); - systemd_root = prefix_roota(dest, "/sys/fs/cgroup/systemd"); + switch (m.mounts[i].type) { + case CGMOUNT_SYMLINK: + (void) mkdir_parents(dst, 0755); + r = symlink_idempotent(m.mounts[i].src, dst); + if (r < 0) + return r; + break; + case CGMOUNT_TMPFS: + used_tmpfs = true; + r = path_is_mount_point(dst, AT_SYMLINK_FOLLOW); + if (r < 0) + return log_error_errno(r, "Failed to determine if %s is mounted already: %m", dst); + if (r > 0) + continue; + r = tmpfs_patch_options(m.mounts[i].src, uid_shift, selinux_apifs_context, &options); + if (r < 0) + return log_oom(); + r = mount_verbose(LOG_ERR, /*name*/"tmpfs", dst, /*fstype*/"tmpfs", + MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME, options); + if (r < 0) + return r; + break; + case CGMOUNT_CGROUP1: + case CGMOUNT_CGROUP2: + r = path_is_mount_point(dst, 0); + if (r < 0 && r != -ENOENT) + return log_error_errno(r, "Failed to determine if %s is mounted already: %m", dst); + if (r > 0) { + if (access(prefix_roota(dst, "cgroup.procs"), F_OK) >= 0) + continue; + if (errno != ENOENT) + return log_error_errno(errno, "Failed to determine if mount point %s is a cgroup hierarchy: %m", dst); + return log_error_errno(EINVAL, "%s is already mounted but not a cgroup hierarchy. Refusing.", dst); + } + (void) mkdir_p(dst, 0755); + r = cgroup_mount_cg(dst, m.mounts[i].src, m.mounts[i].type, cgfile, use_userns); + if (r < 0) + return r; + break; + default: + assert_not_reached("Invalid CGMount type"); + return -EINVAL; + } } - /* Make our own cgroup a (writable) bind mount */ - r = mount_verbose(LOG_ERR, systemd_own, systemd_own, NULL, MS_BIND, NULL); - if (r < 0) - return r; + /* I'm going to be honest: I don't understand why we don't do this if we're using both userns and cgns. */ + if (used_tmpfs && (!use_userns || !use_cgns)) + return mount_verbose(LOG_ERR, NULL, "/sys/fs/cgroup", NULL, + MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME|MS_RDONLY, "mode=755"); - /* And then remount the systemd cgroup root read-only */ - return mount_verbose(LOG_ERR, NULL, systemd_root, NULL, - MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL); + return 0; } diff --git a/src/nspawn/nspawn-cgroup.h b/src/nspawn/nspawn-cgroup.h index e677766726..46c669c446 100644 --- a/src/nspawn/nspawn-cgroup.h +++ b/src/nspawn/nspawn-cgroup.h @@ -24,7 +24,13 @@ #include "cgroup-util.h" -int cgroup_setup(pid_t pid, CGroupUnified outer_cgver, CGroupUnified inner_cgver, uid_t uid_shift); +typedef struct CGMount CGMount; +typedef struct CGMounts { + CGMount *mounts; + size_t n; +} CGMounts; -int mount_cgroups(const char *dest, CGroupUnified outer_cgver, CGroupUnified inner_cgver, bool userns, uid_t uid_shift, uid_t uid_range, const char *selinux_apifs_context, bool use_cgns); -int mount_systemd_cgroup_writable(const char *dest, CGroupUnified inner_cgver); +int cgroup_setup(pid_t pid, CGroupUnified outer_cgver, CGroupUnified inner_cgver, uid_t uid_shift); +int cgroup_decide_mounts(CGMounts *ret_mounts, CGroupUnified outer_cgver, CGroupUnified inner_cgver, bool use_cgns); +int cgroup_mount_mounts(CGMounts mounts, FILE *cgfile, uid_t uid_shift, const char *selinux_apifs_context); +void cgroup_free_mounts(CGMounts *mounts); diff --git a/src/nspawn/nspawn.c b/src/nspawn/nspawn.c index 5c4341e0ee..94c7eea9b7 100644 --- a/src/nspawn/nspawn.c +++ b/src/nspawn/nspawn.c @@ -2648,7 +2648,7 @@ static int inner_child( int kmsg_socket, int rtnl_socket, FDSet *fds, - CGroupUnified outer_cgver) { + CGMounts cgmounts) { _cleanup_free_ char *home = NULL; char as_uuid[37]; @@ -2706,19 +2706,11 @@ static int inner_child( r = unshare(CLONE_NEWCGROUP); if (r < 0) return log_error_errno(errno, "Failed to unshare cgroup namespace"); - r = mount_cgroups( - "", - outer_cgver, - arg_unified_cgroup_hierarchy, - arg_userns_mode != USER_NAMESPACE_NO, - arg_uid_shift, - arg_uid_range, - arg_selinux_apifs_context, - true); - if (r < 0) - return r; - } else { - r = mount_systemd_cgroup_writable("", arg_unified_cgroup_hierarchy); + r = cgroup_mount_mounts(cgmounts, + NULL, + arg_userns_mode == USER_NAMESPACE_NO ? UID_INVALID : 0, + arg_selinux_apifs_context); + cgroup_free_mounts(&cgmounts); if (r < 0) return r; } @@ -2915,6 +2907,7 @@ static int outer_child( int kmsg_socket, int rtnl_socket, int uid_shift_socket, + int cgroup_socket, FDSet *fds, CGroupUnified outer_cgver) { @@ -2922,6 +2915,7 @@ static int outer_child( ssize_t l; int r; _cleanup_close_ int fd = -1; + _cleanup_(cgroup_free_mounts) CGMounts cgmounts = {}; assert(barrier); assert(directory); @@ -3110,19 +3104,11 @@ static int outer_child( if (r < 0) return r; - if (!arg_use_cgns) { - r = mount_cgroups( - directory, - outer_cgver, - arg_unified_cgroup_hierarchy, - arg_userns_mode != USER_NAMESPACE_NO, - arg_uid_shift, - arg_uid_range, - arg_selinux_apifs_context, - false); - if (r < 0) - return r; - } + r = cgroup_decide_mounts(&cgmounts, + outer_cgver, arg_unified_cgroup_hierarchy, + arg_use_cgns); + if (r < 0) + return r; r = mount_move_root(directory); if (r < 0) @@ -3143,12 +3129,13 @@ static int outer_child( uuid_socket = safe_close(uuid_socket); notify_socket = safe_close(notify_socket); uid_shift_socket = safe_close(uid_shift_socket); + cgroup_socket = safe_close(cgroup_socket); /* The inner child has all namespaces that are * requested, so that we all are owned by the user if * user namespaces are turned on. */ - r = inner_child(barrier, directory, secondary, kmsg_socket, rtnl_socket, fds, outer_cgver); + r = inner_child(barrier, directory, secondary, kmsg_socket, rtnl_socket, fds, cgmounts); if (r < 0) _exit(EXIT_FAILURE); @@ -3175,11 +3162,42 @@ static int outer_child( if (r < 0) return log_error_errno(r, "Failed to send notify fd: %m"); + /* If !use_cgns, then we need to do this here because without cgns cgroups can't be mounted inside of a + * less privileged mountns (and using userns causes the mountns to be less privileged). */ + if (!arg_use_cgns) { + /* If !use_cgns, then cgroup_mount_mounts() needs to look at /proc/pid/cgroup; but because we've + * already chroot()ed, we don't have access to /proc. So the parent opens the file and sends it to + * us. */ + int cgfd; + _cleanup_fclose_ FILE *cgfile = NULL; + + assert(cgroup_socket); + + cgfd = receive_one_fd(cgroup_socket, 0); + if (cgfd < 0) + return log_error_errno(cgfd, "Failed to recv cgroup fd: %m"); + + cgfile = fdopen(cgfd, "re"); + if (!cgfile) { + r = -errno; /* in case safe_close sets errno */ + cgfd = safe_close(cgfd); + return log_error_errno(r, "Failed to create a stream object for cgroup fd: %m"); + } + + r = cgroup_mount_mounts(cgmounts, + cgfile, + arg_userns_mode == USER_NAMESPACE_NO ? UID_INVALID : arg_uid_shift, + arg_selinux_apifs_context); + if (r < 0) + return r; + } + pid_socket = safe_close(pid_socket); uuid_socket = safe_close(uuid_socket); notify_socket = safe_close(notify_socket); kmsg_socket = safe_close(kmsg_socket); rtnl_socket = safe_close(rtnl_socket); + cgroup_socket = safe_close(cgroup_socket); return 0; } @@ -3611,7 +3629,8 @@ static int run(int master, pid_socket_pair[2] = { -1, -1 }, uuid_socket_pair[2] = { -1, -1 }, notify_socket_pair[2] = { -1, -1 }, - uid_shift_socket_pair[2] = { -1, -1 }; + uid_shift_socket_pair[2] = { -1, -1 }, + cgroup_socket_pair[2] = {-1, -1 }; _cleanup_close_ int notify_socket= -1; _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL; _cleanup_(sd_event_unrefp) sd_event *event = NULL; @@ -3662,6 +3681,10 @@ static int run(int master, if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, uid_shift_socket_pair) < 0) return log_error_errno(errno, "Failed to create uid shift socket pair: %m"); + if (!arg_use_cgns) + if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, cgroup_socket_pair) < 0) + return log_error_errno(errno, "Failed to create cgroup socket pair: %m"); + /* Child can be killed before execv(), so handle SIGCHLD in order to interrupt * parent's blocking calls and give it a chance to call wait() and terminate. */ r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL); @@ -3690,6 +3713,7 @@ static int run(int master, uuid_socket_pair[0] = safe_close(uuid_socket_pair[0]); notify_socket_pair[0] = safe_close(notify_socket_pair[0]); uid_shift_socket_pair[0] = safe_close(uid_shift_socket_pair[0]); + cgroup_socket_pair[0] = safe_close(cgroup_socket_pair[0]); (void) reset_all_signal_handlers(); (void) reset_signal_mask(); @@ -3709,6 +3733,7 @@ static int run(int master, kmsg_socket_pair[1], rtnl_socket_pair[1], uid_shift_socket_pair[1], + cgroup_socket_pair[1], fds, outer_cgver); if (r < 0) @@ -3727,6 +3752,7 @@ static int run(int master, uuid_socket_pair[1] = safe_close(uuid_socket_pair[1]); notify_socket_pair[1] = safe_close(notify_socket_pair[1]); uid_shift_socket_pair[1] = safe_close(uid_shift_socket_pair[1]); + cgroup_socket_pair[1] = safe_close(cgroup_socket_pair[1]); if (arg_userns_mode != USER_NAMESPACE_NO) { /* The child just let us know the UID shift it might have read from the image. */ @@ -3847,6 +3873,8 @@ static int run(int master, } if (arg_register) { + /* If the child is to be placed into a different cgroup, + * this is what does it. */ r = register_machine( arg_machine, *main_pid, @@ -3867,6 +3895,20 @@ static int run(int master, if (r < 0) return r; + if (!arg_use_cgns) { + /* helper_pid won't exit until this happens */ + const char *fs; + _cleanup_close_ int fd; + + fs = procfs_file_alloca(*main_pid, "cgroup"); + fd = open(fs, O_RDONLY|O_CLOEXEC); + if (fd < 0) + return log_error_errno(errno, "Failed to open cgroups of child: %m"); + + r = send_one_fd(cgroup_socket_pair[0], fd, 0); + if (r < 0) + return log_error_errno(r, "Failed to send cgroup fd: %m"); + } /* Wait for the outer child. */ r = wait_for_terminate_and_warn("namespace helper", *helper_pid, NULL); |