diff options
Diffstat (limited to 'src/nspawn/nspawn-cgroup.c')
-rw-r--r-- | src/nspawn/nspawn-cgroup.c | 450 |
1 files changed, 250 insertions, 200 deletions
diff --git a/src/nspawn/nspawn-cgroup.c b/src/nspawn/nspawn-cgroup.c
index 795ad5f4ae..dafeb4a1ce 100644
--- a/src/nspawn/nspawn-cgroup.c
+++ b/src/nspawn/nspawn-cgroup.c
@@ -37,6 +37,66 @@
 #include "nspawn-cgroup.h"
 #include "nspawn-mount.h"
 
+/* Code for managing the list of CGMounts ***************************/
+
+typedef enum CGMountType {
+        CGMOUNT_SYMLINK,
+        CGMOUNT_TMPFS,
+        CGMOUNT_CGROUP1,
+        CGMOUNT_CGROUP2,
+        _CGMOUNT_MAX
+} CGMountType;
+
+struct CGMount {
+        CGMountType type;
+        char *src;
+        char *dst;
+};
+
+static CGMount *cgmount_add(CGMounts *mounts, CGMountType type, const char *src, const char *dst) {
+        char *hsrc = NULL, *hdst = NULL;
+        CGMount *c, *ret;
+
+        assert(mounts);
+        assert(type >= 0 && type < _CGMOUNT_MAX);
+        assert(src);
+        assert(dst);
+
+        /* Grow the array before duplicating the strings, so that no failure path leaks the copies. */
+        c = realloc_multiply(mounts->mounts, sizeof(CGMount), mounts->n + 1);
+        if (!c)
+                return NULL;
+        mounts->mounts = c;
+
+        hsrc = strdup(src);
+        hdst = strdup(dst);
+        if (!hsrc || !hdst) {
+                free(hsrc);
+                free(hdst);
+                return NULL;
+        }
+
+        ret = &mounts->mounts[mounts->n++];
+
+        *ret = (CGMount) {
+                .type = type,
+                .src = hsrc,
+                .dst = hdst,
+        };
+        return ret;
+}
+
+void cgroup_free_mounts(CGMounts *mounts) {
+        for (size_t i = 0; i < mounts->n; i++) {
+                free(mounts->mounts[i].src);
+                free(mounts->mounts[i].dst);
+        }
+        mounts->mounts = mfree(mounts->mounts);
+        mounts->n = 0;
+}
+
+/********************************************************************/
+
 static int chown_cgroup_path(const char *path, uid_t uid_shift) {
         _cleanup_close_ int fd = -1;
         const char *fn;
@@ -293,84 +353,18 @@ static int get_v1_hierarchies(Set *subsystems) {
         return 0;
 }
 
-static int mount_legacy_cgroup_hierarchy(const char *dest, const char *controller, const char *hierarchy,
-                                         CGroupUnified inner_cgver, bool read_only) {
-        const char *to, *fstype, *opts;
-        int r;
-
-        to = strjoina(strempty(dest), "/sys/fs/cgroup/", hierarchy);
-
-        r = path_is_mount_point(to, 0);
-        if (r < 0 && r != -ENOENT)
-                return log_error_errno(r, "Failed to determine if %s is mounted already: %m", to);
-        if (r > 0)
-                return 0;
-
-        mkdir_p(to, 0755);
-
-        /* The superblock mount options of the mount point need to be
-         * identical to the hosts', and hence writable... */
-        if (streq(controller, SYSTEMD_CGROUP_CONTROLLER)) {
-                if (inner_cgver >= CGROUP_UNIFIED_SYSTEMD) {
-                        fstype = "cgroup2";
-                        opts = NULL;
-                } else {
-                        fstype = "cgroup";
-                        opts = "none,name=systemd,xattr";
-                }
-        } else {
-                fstype = "cgroup";
-                opts = controller;
-        }
-
-        r = mount_verbose(LOG_ERR, "cgroup", to, fstype, MS_NOSUID|MS_NOEXEC|MS_NODEV, opts);
-        if (r < 0)
-                return r;
-
-        /* ... hence let's only make the bind mount read-only, not the superblock. */
-        if (read_only) {
-                r = mount_verbose(LOG_ERR, NULL, to, NULL,
-                                  MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL);
-                if (r < 0)
-                        return r;
-        }
-
-        return 1;
-}
-
-/* Mount a legacy cgroup hierarchy when cgroup namespaces are supported. */
-static int mount_legacy_cgns_supported(
-                CGroupUnified outer_cgver, CGroupUnified inner_cgver, bool userns, uid_t uid_shift,
-                uid_t uid_range, const char *selinux_apifs_context) {
+/* Decide the legacy cgroup mounts when cgroup namespaces are used. */
+static int cgroup_decide_mounts_sd_y_cgns(
+                CGMounts *ret_mounts,
+                CGroupUnified outer_cgver, CGroupUnified inner_cgver) {
+        _cleanup_(cgroup_free_mounts) CGMounts mounts = {};
         _cleanup_set_free_free_ Set *hierarchies = NULL;
-        const char *cgroup_root = "/sys/fs/cgroup", *c;
+        const char *c;
         int r;
 
-        (void) mkdir_p(cgroup_root, 0755);
 
-        /* Mount a tmpfs to /sys/fs/cgroup if it's not mounted there yet. */
-        r = path_is_mount_point(cgroup_root, AT_SYMLINK_FOLLOW);
-        if (r < 0)
-                return log_error_errno(r, "Failed to determine if /sys/fs/cgroup is already mounted: %m");
-        if (r == 0) {
-                _cleanup_free_ char *options = NULL;
-
-                /* When cgroup namespaces are enabled and user namespaces are
-                 * used then the mount of the cgroupfs is done *inside* the new
-                 * user namespace. We're root in the new user namespace and the
-                 * kernel will happily translate our uid/gid to the correct
-                 * uid/gid as seen from e.g. /proc/1/mountinfo. So we simply
-                 * pass uid 0 and not uid_shift to tmpfs_patch_options().
-                 */
-                r = tmpfs_patch_options("mode=755", 0, selinux_apifs_context, &options);
-                if (r < 0)
-                        return log_oom();
-
-                r = mount_verbose(LOG_ERR, "tmpfs", cgroup_root, "tmpfs",
-                                  MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME, options);
-                if (r < 0)
-                        return r;
-        }
+        if (!cgmount_add(&mounts, CGMOUNT_TMPFS, "mode=755", ""))
+                return log_oom();
 
         if (outer_cgver >= CGROUP_UNIFIED_ALL)
                 goto skip_controllers;
@@ -393,9 +387,8 @@ static int mount_legacy_cgns_supported(
                 if (streq(hierarchy, "name=systemd"))
                         continue;
 
-                r = mount_legacy_cgroup_hierarchy("", hierarchy, hierarchy, inner_cgver, !userns);
-                if (r < 0)
-                        return r;
+                if (!cgmount_add(&mounts, CGMOUNT_CGROUP1, hierarchy, hierarchy))
+                        return log_oom();
 
                 /* When multiple hierarchies are co-mounted, make their
                  * constituting individual hierarchies a symlink to the
@@ -411,62 +404,43 @@ static int mount_legacy_cgns_supported(
                         if (r == 0)
                                 break;
 
-                        target = prefix_root("/sys/fs/cgroup", controller);
-                        if (!target)
+                        if (!cgmount_add(&mounts, CGMOUNT_SYMLINK, hierarchy, controller))
                                 return log_oom();
-
-                        if (streq(hierarchy, controller))
-                                break;
-
-                        r = symlink_idempotent(hierarchy, target);
-                        if (r == -EINVAL)
-                                return log_error_errno(r, "Invalid existing symlink for combined hierarchy: %m");
-                        if (r < 0)
-                                return log_error_errno(r, "Failed to create symlink for combined hierarchy: %m");
                 }
         }
 
 skip_controllers:
-        r = mount_legacy_cgroup_hierarchy("", SYSTEMD_CGROUP_CONTROLLER, "systemd", inner_cgver, false);
-        if (r < 0)
-                return r;
+        switch (inner_cgver) {
+        case CGROUP_UNIFIED_NONE: /* pure legacy: the systemd hierarchy is a named cgroup v1 hierarchy */
+                if (!cgmount_add(&mounts, CGMOUNT_CGROUP1, "name=systemd", "systemd"))
+                        return log_oom();
+                break;
+        case CGROUP_UNIFIED_SYSTEMD: /* hybrid: only the systemd hierarchy is cgroup v2 */
+                if (!cgmount_add(&mounts, CGMOUNT_CGROUP2, "", "systemd"))
+                        return log_oom();
+                break;
+        default:
+                assert_not_reached("non-legacy cgroup version desired in legacy setup function");
+                return -EINVAL;
+        }
 
-        if (!userns)
-                return mount_verbose(LOG_ERR, NULL, cgroup_root, NULL,
-                                     MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME|MS_RDONLY, "mode=755");
+        *ret_mounts = mounts;
+        mounts = (CGMounts){};
 
         return 0;
 }
 
-/* Mount legacy cgroup hierarchy when cgroup namespaces are unsupported. */
-static int mount_legacy_cgns_unsupported(
-                const char *dest,
-                CGroupUnified outer_cgver, CGroupUnified inner_cgver, bool userns, uid_t uid_shift, uid_t uid_range,
-                const char *selinux_apifs_context) {
+/* Decide the legacy cgroup mounts when cgroup namespaces are not used. */
+static int cgroup_decide_mounts_sd_n_cgns(
+                CGMounts *ret_mounts,
+                CGroupUnified outer_cgver, CGroupUnified inner_cgver) {
+        _cleanup_(cgroup_free_mounts) CGMounts mounts = {};
         _cleanup_set_free_free_ Set *controllers = NULL;
-        const char *cgroup_root;
         int r;
 
-        cgroup_root = prefix_roota(dest, "/sys/fs/cgroup");
-
-        (void) mkdir_p(cgroup_root, 0755);
 
-        /* Mount a tmpfs to /sys/fs/cgroup if it's not mounted there yet. */
-        r = path_is_mount_point(cgroup_root, AT_SYMLINK_FOLLOW);
-        if (r < 0)
-                return log_error_errno(r, "Failed to determine if /sys/fs/cgroup is already mounted: %m");
-        if (r == 0) {
-                _cleanup_free_ char *options = NULL;
-
-                r = tmpfs_patch_options("mode=755", uid_shift == 0 ? UID_INVALID : uid_shift, selinux_apifs_context, &options);
-                if (r < 0)
-                        return log_oom();
-
-                r = mount_verbose(LOG_ERR, "tmpfs", cgroup_root, "tmpfs",
-                                  MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME, options);
-                if (r < 0)
-                        return r;
-        }
+        if (!cgmount_add(&mounts, CGMOUNT_TMPFS, "mode=755", ""))
+                return log_oom();
 
         if (outer_cgver >= CGROUP_UNIFIED_ALL)
                 goto skip_controllers;
@@ -494,19 +468,12 @@ static int mount_legacy_cgns_unsupported(
                 if (r == -EINVAL) {
                         /* Not a symbolic link, but directly a single cgroup hierarchy */
 
-                        r = mount_legacy_cgroup_hierarchy(dest, controller, controller, inner_cgver, true);
-                        if (r < 0)
-                                return r;
+                        if (!cgmount_add(&mounts, CGMOUNT_CGROUP1, controller, controller))
+                                return log_oom();
 
                 } else if (r < 0)
                         return log_error_errno(r, "Failed to read link %s: %m", origin);
                 else {
-                        _cleanup_free_ char *target = NULL;
-
-                        target = prefix_root(dest, origin);
-                        if (!target)
-                                return log_oom();
-
                         /* A symbolic link, a combination of controllers in one hierarchy */
 
                         if (!filename_is_valid(combined)) {
@@ -514,101 +481,184 @@ static int mount_legacy_cgns_unsupported(
                                 continue;
                         }
 
-                        r = mount_legacy_cgroup_hierarchy(dest, combined, combined, inner_cgver, true);
-                        if (r < 0)
-                                return r;
+                        if (!cgmount_add(&mounts, CGMOUNT_CGROUP1, combined, combined))
+                                return log_oom();
 
-                        r = symlink_idempotent(combined, target);
-                        if (r == -EINVAL)
-                                return log_error_errno(r, "Invalid existing symlink for combined hierarchy: %m");
-                        if (r < 0)
-                                return log_error_errno(r, "Failed to create symlink for combined hierarchy: %m");
+                        if (!cgmount_add(&mounts, CGMOUNT_SYMLINK, combined, controller))
+                                return log_oom();
                 }
         }
 
 skip_controllers:
-        r = mount_legacy_cgroup_hierarchy(dest, SYSTEMD_CGROUP_CONTROLLER, "systemd", inner_cgver, false);
-        if (r < 0)
-                return r;
+        switch (inner_cgver) {
+        case CGROUP_UNIFIED_NONE: /* pure legacy: the systemd hierarchy is a named cgroup v1 hierarchy */
+                if (!cgmount_add(&mounts, CGMOUNT_CGROUP1, "name=systemd", "systemd"))
+                        return log_oom();
+                break;
+        case CGROUP_UNIFIED_SYSTEMD: /* hybrid: only the systemd hierarchy is cgroup v2 */
+                if (!cgmount_add(&mounts, CGMOUNT_CGROUP2, "", "systemd"))
+                        return log_oom();
+                break;
+        default:
+                assert_not_reached("non-legacy cgroup version desired in legacy setup function");
+                return -EINVAL;
+        }
 
-        return mount_verbose(LOG_ERR, NULL, cgroup_root, NULL,
-                             MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME|MS_RDONLY, "mode=755");
-}
+        *ret_mounts = mounts;
+        mounts = (CGMounts){};
 
-static int mount_unified_cgroups(const char *dest) {
-        const char *p;
-        int r;
+        return 0;
+}
 
-        assert(dest);
+int cgroup_decide_mounts(
+                CGMounts *ret_mounts,
+                CGroupUnified outer_cgver, CGroupUnified inner_cgver,
+                bool use_cgns) {
+        switch (inner_cgver) {
+        case CGROUP_UNIFIED_NONE:
+        case CGROUP_UNIFIED_SYSTEMD:
+                if (use_cgns)
+                        return cgroup_decide_mounts_sd_y_cgns(ret_mounts, outer_cgver, inner_cgver);
+                else
+                        return cgroup_decide_mounts_sd_n_cgns(ret_mounts, outer_cgver, inner_cgver);
+        case CGROUP_UNIFIED_ALL:
+                if (!cgmount_add(ret_mounts, CGMOUNT_CGROUP2, "cgroup", ""))
+                        return log_oom();
+                return 0;
+        default:
+                assert_not_reached("Invalid cgroup ver requested");
+                return -EINVAL;
+        }
+}
 
-        p = prefix_roota(dest, "/sys/fs/cgroup");
+/********************************************************************/
 
-        (void) mkdir_p(p, 0755);
+static int cgroup_mount_cg(
+                const char *mountpoint, const char *opts, CGMountType fstype,
+                FILE *cgfile, bool use_userns) {
+        const bool use_cgns = cgfile == NULL;
+        /* If we are using userns and cgns, then we always let it be RW, because we can count on the shifted root user
+         * to not have access to the things that would make us want to mount it RO.  Otherwise, we give the container
+         * RW access to its unified or name=systemd cgroup. */
+        const bool rw = (use_userns && use_cgns) || fstype == CGMOUNT_CGROUP2 || streq(mountpoint, "/sys/fs/cgroup/systemd");
+        int r;
 
-        r = path_is_mount_point(p, AT_SYMLINK_FOLLOW);
+        /* First the base mount; always RW so the superblock settings stay untouched. As before, cgroup2 gets no mount data. */
+        r = mount_verbose(LOG_ERR, "cgroup", mountpoint, fstype == CGMOUNT_CGROUP1 ? "cgroup" : "cgroup2",
+                          MS_NOSUID|MS_NOEXEC|MS_NODEV, fstype == CGMOUNT_CGROUP2 ? NULL : opts);
         if (r < 0)
-                return log_error_errno(r, "Failed to determine if %s is mounted already: %m", p);
-        if (r > 0) {
-                p = prefix_roota(dest, "/sys/fs/cgroup/cgroup.procs");
-                if (access(p, F_OK) >= 0)
-                        return 0;
-                if (errno != ENOENT)
-                        return log_error_errno(errno, "Failed to determine if mount point %s contains the unified cgroup hierarchy: %m", p);
-
-                log_error("%s is already mounted but not a unified cgroup hierarchy. Refusing.", p);
-                return -EINVAL;
-        }
-
-        return mount_verbose(LOG_ERR, "cgroup", p, "cgroup2", MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL);
-}
+                return r;
 
-int mount_cgroups(
-                const char *dest,
-                CGroupUnified outer_cgver, CGroupUnified inner_cgver,
-                bool userns, uid_t uid_shift, uid_t uid_range,
-                const char *selinux_apifs_context,
-                bool use_cgns) {
+        /* Now, if necessary, we remount RO */
+        if (rw) {
+                if (!use_cgns) {
+                        /* emulate cgns by mounting everything but our subcgroup RO */
+                        const char *cgpath;
+                        _cleanup_free_ char *cgroup = NULL;
+                        if (fstype == CGMOUNT_CGROUP2) {
+                                rewind(cgfile);
+                                r = cg_pid_get_path_internal(NULL, cgfile, &cgroup);
+                                if (r < 0)
+                                        return log_error_errno(r, "Failed to get child's cgroup v2 path");
+                        } else {
+                                const char *scontroller, *state;
+                                size_t controller_len;
+                                FOREACH_WORD_SEPARATOR(scontroller, controller_len, opts, ",", state) {
+                                        _cleanup_free_ const char *controller = strndup(scontroller, controller_len);
+                                        rewind(cgfile);
+                                        if (cg_pid_get_path_internal(controller, cgfile, &cgroup) == 0)
+                                                break;
+                                }
+                                if (!cgroup)
+                                        return log_error_errno(EBADMSG, "Failed to associate mounted cgroup hierarchy %s with numbered cgroup hierarchy", mountpoint);
+                        }
+                        cgpath = prefix_roota(mountpoint, cgroup);
 
-        if (inner_cgver >= CGROUP_UNIFIED_ALL)
-                return mount_unified_cgroups(dest);
-        else if (use_cgns)
-                return mount_legacy_cgns_supported(outer_cgver, inner_cgver, userns, uid_shift, uid_range, selinux_apifs_context);
+                        r = mount_verbose(LOG_ERR, cgpath, cgpath, NULL, MS_BIND, NULL);
+                        if (r < 0)
+                                return r;
+                        r = mount_verbose(LOG_ERR, NULL, mountpoint, NULL,
+                                          MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL);
+                        if (r < 0)
+                                return r;
+                }
+        } else {
+                r = mount_verbose(LOG_ERR, NULL, mountpoint, NULL,
+                                  MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL);
+                if (r < 0)
+                        return r;
+        }
 
-        return mount_legacy_cgns_unsupported(dest, outer_cgver, inner_cgver, userns, uid_shift, uid_range, selinux_apifs_context);
+        return 0;
 }
 
-int mount_systemd_cgroup_writable(
-                const char *dest,
-                CGroupUnified inner_cgver) {
+int cgroup_mount_mounts(CGMounts m, FILE *cgfile, uid_t uid_shift, const char *selinux_apifs_context) {
+        const bool use_cgns = cgfile == NULL;
+        const bool use_userns = uid_shift != UID_INVALID;
 
-        _cleanup_free_ char *own_cgroup_path = NULL;
-        const char *systemd_root, *systemd_own;
-        int r;
+        bool used_tmpfs = false;
 
-        assert(dest);
+        for (size_t i = 0; i < m.n; i++) {
+                _cleanup_free_ char *options = NULL;
+                const char *dst;
+                int r;
 
-        r = cg_pid_get_path(NULL, 0, &own_cgroup_path);
-        if (r < 0)
-                return log_error_errno(r, "Failed to determine our own cgroup path: %m");
+                dst = prefix_roota("/sys/fs/cgroup", m.mounts[i].dst);
 
-        /* If we are living in the top-level, then there's nothing to do... */
-        if (path_equal(own_cgroup_path, "/"))
-                return 0;
+                /* The checks here to see if things are already mounted are kind of primitive.  Perhaps they should
+                 * actually check the statfs() f_type to verify that the thing mounted is what we want to be mounted
+                 * (similar to cgroup-util's detection logic)?  But I don't really understand the use-case for having
+                 * any of these already mounted, so I'm not sure if such increased strictness would be unwelcome. */
 
-        if (inner_cgver >= CGROUP_UNIFIED_ALL) {
-                systemd_own = strjoina(dest, "/sys/fs/cgroup", own_cgroup_path);
-                systemd_root = prefix_roota(dest, "/sys/fs/cgroup");
-        } else {
-                systemd_own = strjoina(dest, "/sys/fs/cgroup/systemd", own_cgroup_path);
-                systemd_root = prefix_roota(dest, "/sys/fs/cgroup/systemd");
+                switch (m.mounts[i].type) {
+                case CGMOUNT_SYMLINK:
+                        (void) mkdir_parents(dst, 0755);
+                        r = symlink_idempotent(m.mounts[i].src, dst);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to create symlink for combined hierarchy: %m");
+                        break;
+                case CGMOUNT_TMPFS:
+                        used_tmpfs = true;
+                        r = path_is_mount_point(dst, AT_SYMLINK_FOLLOW);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to determine if %s is mounted already: %m", dst);
+                        if (r > 0)
+                                continue;
+                        r = tmpfs_patch_options(m.mounts[i].src, uid_shift, selinux_apifs_context, &options);
+                        if (r < 0)
+                                return log_oom();
+                        r = mount_verbose(LOG_ERR, /*name*/"tmpfs", dst, /*fstype*/"tmpfs",
+                                          MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME, options);
+                        if (r < 0)
+                                return r;
+                        break;
+                case CGMOUNT_CGROUP1:
+                case CGMOUNT_CGROUP2:
+                        r = path_is_mount_point(dst, 0);
+                        if (r < 0 && r != -ENOENT)
+                                return log_error_errno(r, "Failed to determine if %s is mounted already: %m", dst);
+                        if (r > 0) {
+                                if (access(prefix_roota(dst, "cgroup.procs"), F_OK) >= 0)
+                                        continue;
+                                if (errno != ENOENT)
+                                        return log_error_errno(errno, "Failed to determine if mount point %s is a cgroup hierarchy: %m", dst);
+                                return log_error_errno(EINVAL, "%s is already mounted but not a cgroup hierarchy. Refusing.", dst);
+                        }
+                        (void) mkdir_p(dst, 0755);
+                        r = cgroup_mount_cg(dst, m.mounts[i].src, m.mounts[i].type, cgfile, use_userns);
+                        if (r < 0)
+                                return r;
+                        break;
+                default:
+                        assert_not_reached("Invalid CGMount type");
+                        return -EINVAL;
+                }
         }
 
-        /* Make our own cgroup a (writable) bind mount */
-        r = mount_verbose(LOG_ERR, systemd_own, systemd_own, NULL, MS_BIND, NULL);
-        if (r < 0)
-                return r;
+        /* I'm going to be honest: I don't understand why we don't do this if we're using both userns and cgns. */
+        if (used_tmpfs && (!use_userns || !use_cgns))
+                return mount_verbose(LOG_ERR, NULL, "/sys/fs/cgroup", NULL,
+                                     MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME|MS_RDONLY, "mode=755");
 
-        /* And then remount the systemd cgroup root read-only */
-        return mount_verbose(LOG_ERR, NULL, systemd_root, NULL,
-                             MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL);
+        return 0;
 }