nspawn: Divorce the code deciding the cgroup mounts from the code performing them

TODO: a better commit message
author: Luke Shumaker <lukeshu@lukeshu.com> 2017-06-14 13:32:18 -0400
committer: Luke Shumaker <lukeshu@lukeshu.com> 2017-06-16 18:52:04 -0400
commit: a2c5a0f3e0f005899fd6de214ff0525b7415cb8a (patch)
tree: 8394b4d9e63233ec428d5d77a798feeda6658c23
parent: 76b7be5e7e8bd15b549aa12eabbe3a36fdc4e924 (diff)
5 files changed, 363 insertions, 237 deletions
diff --git a/src/basic/cgroup-util.c b/src/basic/cgroup-util.c
index 5bf68105f2..624626c867 100644
--- a/src/basic/cgroup-util.c
+++ b/src/basic/cgroup-util.c
@@ -921,11 +921,18 @@ int cg_get_xattr(const char *controller, const char *path, const char *name, voi
         return (int) n;
 }
 
+/**
+ * Returns the cgroup path of the process under the hierarchy specified by @controller:
+ *
+ *     controller         : whichever hierarchy has @controller bound to it (with the special case that
+ *                          SYSTEMD_CGROUP_CONTROLLER selects whichever hierarchy systemd is using, even if it is the
+ *                          v2 (unified) hierarchy and thus the SYSTEMD_CGROUP_CONTROLLER doesn't actually exist)
+ *
+ *     controller == NULL : equivalent to SYSTEMD_CGROUP_CONTROLLER
+ */
 int cg_pid_get_path(const char *controller, pid_t pid, char **path) {
         _cleanup_fclose_ FILE *f = NULL;
-        char line[LINE_MAX];
         const char *fs;
-        size_t cs = 0;
         int unified;
 
         assert(path);
@@ -940,20 +947,40 @@ int cg_pid_get_path(const char *controller, pid_t pid, char **path) {
         unified = cg_unified(controller);
         if (unified < 0)
                 return unified;
-        if (unified == 0)
-                cs = strlen(controller);
 
         fs = procfs_file_alloca(pid, "cgroup");
         f = fopen(fs, "re");
         if (!f)
                 return errno == ENOENT ? -ESRCH : -errno;
 
+        return cg_pid_get_path_internal(unified ? NULL : controller, f, path);
+}
+
+/**
+ * NB: The meaning of @controller is different here than for cg_pid_get_path():
+ *
+ *     controller         : the cgroup v1 hierarchy with this controller bound to it
+ *     controller == NULL : the cgroup v2 (unified) hierarchy
+ */
+int cg_pid_get_path_internal(const char *controller, FILE *f, char **path) {
+        char line[LINE_MAX];
+        size_t cs = 0;
+
+        assert(path);
+        assert(f);
+
+        if (controller && !cg_controller_is_valid(controller))
+                return -EINVAL;
+
+        if (controller)
+                cs = strlen(controller);
+
         FOREACH_LINE(line, f, return -errno) {
                 char *e, *p;
 
                 truncate_nl(line);
 
-                if (unified) {
+                if (!controller) {
                         e = startswith(line, "0:");
                         if (!e)
                                 continue;
diff --git a/src/basic/cgroup-util.h b/src/basic/cgroup-util.h
index d730f3490c..0e5b41103e 100644
--- a/src/basic/cgroup-util.h
+++ b/src/basic/cgroup-util.h
@@ -169,6 +169,7 @@ int cg_get_path(const char *controller, const char *path, const char *suffix, ch
 int cg_get_path_and_check(const char *controller, const char *path, const char *suffix, char **fs);
 
 int cg_pid_get_path(const char *controller, pid_t pid, char **path);
+int cg_pid_get_path_internal(const char *controller, FILE *f, char **path);
 
 int cg_trim(const char *controller, const char *path, bool delete_root);
 
diff --git a/src/nspawn/nspawn-cgroup.c b/src/nspawn/nspawn-cgroup.c
index 795ad5f4ae..dafeb4a1ce 100644
--- a/src/nspawn/nspawn-cgroup.c
+++ b/src/nspawn/nspawn-cgroup.c
@@ -37,6 +37,66 @@
 #include "nspawn-cgroup.h"
 #include "nspawn-mount.h"
 
+/* Code for managing the list of CGMounts ***************************/
+
+typedef enum CGMountType {
+        CGMOUNT_SYMLINK, /* create a symlink at dst that points to src */
+        CGMOUNT_TMPFS,   /* mount a tmpfs at dst; src holds the base mount options */
+        CGMOUNT_CGROUP1, /* mount a "cgroup" filesystem at dst; src holds the mount options */
+        CGMOUNT_CGROUP2, /* mount a "cgroup2" filesystem at dst; src holds the mount options */
+        _CGMOUNT_MAX     /* one past the last valid type; used for range checks */
+} CGMountType;
+
+struct CGMount {
+        CGMountType type;
+        char *src; /* owned copy; symlink target or mount options, depending on type */
+        char *dst; /* owned copy; path relative to /sys/fs/cgroup ("" means the root itself) */
+};
+
+/* Appends a copy of (type, src, dst) to @mounts; returns the new entry, or NULL on OOM (list unchanged). */
+static CGMount *cgmount_add(CGMounts *mounts, CGMountType type, const char *src, const char *dst) {
+        _cleanup_free_ char *hsrc = NULL, *hdst = NULL;
+        CGMount *c, *ret;
+
+        assert(mounts);
+        assert(type >= 0 && type < _CGMOUNT_MAX);
+        assert(src);
+        assert(dst);
+
+        hsrc = strdup(src);
+        hdst = strdup(dst);
+        if (!hsrc || !hdst)
+                return NULL;
+
+        /* If growing the array fails, the cleanup handlers release both copies instead of leaking them. */
+        c = realloc_multiply(mounts->mounts, sizeof(CGMount), mounts->n + 1);
+        if (!c)
+                return NULL;
+
+        mounts->mounts = c;
+        ret = &(mounts->mounts)[mounts->n];
+        (mounts->n)++;
+
+        *ret = (CGMount) {
+                .type = type,
+                .src = hsrc,
+                .dst = hdst,
+        };
+        hsrc = hdst = NULL; /* ownership has moved into *ret */
+        return ret;
+}
+
+void cgroup_free_mounts(CGMounts *mounts) {
+        /* Free each entry's strings back to front, then the array; the list ends up empty. */
+        for (; mounts->n > 0; mounts->n--) {
+                free(mounts->mounts[mounts->n - 1].src);
+                free(mounts->mounts[mounts->n - 1].dst);
+        }
+        mounts->mounts = mfree(mounts->mounts);
+}
+
+/********************************************************************/
+
 static int chown_cgroup_path(const char *path, uid_t uid_shift) {
         _cleanup_close_ int fd = -1;
         const char *fn;
@@ -293,84 +353,18 @@ static int get_v1_hierarchies(Set *subsystems) {
         return 0;
 }
 
-static int mount_legacy_cgroup_hierarchy(const char *dest, const char *controller, const char *hierarchy,
-                                         CGroupUnified inner_cgver, bool read_only) {
-        const char *to, *fstype, *opts;
-        int r;
-
-        to = strjoina(strempty(dest), "/sys/fs/cgroup/", hierarchy);
-
-        r = path_is_mount_point(to, 0);
-        if (r < 0 && r != -ENOENT)
-                return log_error_errno(r, "Failed to determine if %s is mounted already: %m", to);
-        if (r > 0)
-                return 0;
-
-        mkdir_p(to, 0755);
-
-        /* The superblock mount options of the mount point need to be
-         * identical to the hosts', and hence writable... */
-        if (streq(controller, SYSTEMD_CGROUP_CONTROLLER)) {
-                if (inner_cgver >= CGROUP_UNIFIED_SYSTEMD) {
-                        fstype = "cgroup2";
-                        opts = NULL;
-                } else {
-                        fstype = "cgroup";
-                        opts = "none,name=systemd,xattr";
-                }
-        } else {
-                fstype = "cgroup";
-                opts = controller;
-        }
-
-        r = mount_verbose(LOG_ERR, "cgroup", to, fstype, MS_NOSUID|MS_NOEXEC|MS_NODEV, opts);
-        if (r < 0)
-                return r;
-
-        /* ... hence let's only make the bind mount read-only, not the superblock. */
-        if (read_only) {
-                r = mount_verbose(LOG_ERR, NULL, to, NULL,
-                                  MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL);
-                if (r < 0)
-                        return r;
-        }
-
-        return 1;
-}
-
-/* Mount a legacy cgroup hierarchy when cgroup namespaces are supported. */
-static int mount_legacy_cgns_supported(
-                CGroupUnified outer_cgver, CGroupUnified inner_cgver, bool userns, uid_t uid_shift,
-                uid_t uid_range, const char *selinux_apifs_context) {
+/* Decide the legacy cgroup mounts when cgroup namespaces are used. */
+static int cgroup_decide_mounts_sd_y_cgns(
+                CGMounts *ret_mounts,
+                CGroupUnified outer_cgver, CGroupUnified inner_cgver) {
+        _cleanup_(cgroup_free_mounts) CGMounts mounts = {};
         _cleanup_set_free_free_ Set *hierarchies = NULL;
-        const char *cgroup_root = "/sys/fs/cgroup", *c;
+        const char *c;
         int r;
 
-        (void) mkdir_p(cgroup_root, 0755);
-
         /* Mount a tmpfs to /sys/fs/cgroup if it's not mounted there yet. */
-        r = path_is_mount_point(cgroup_root, AT_SYMLINK_FOLLOW);
-        if (r < 0)
-                return log_error_errno(r, "Failed to determine if /sys/fs/cgroup is already mounted: %m");
-        if (r == 0) {
-                _cleanup_free_ char *options = NULL;
-
-                /* When cgroup namespaces are enabled and user namespaces are
-                 * used then the mount of the cgroupfs is done *inside* the new
-                 * user namespace. We're root in the new user namespace and the
-                 * kernel will happily translate our uid/gid to the correct
-                 * uid/gid as seen from e.g. /proc/1/mountinfo. So we simply
-                 * pass uid 0 and not uid_shift to tmpfs_patch_options().
-                 */
-                r = tmpfs_patch_options("mode=755", 0, selinux_apifs_context, &options);
-                if (r < 0)
-                        return log_oom();
-
-                r = mount_verbose(LOG_ERR, "tmpfs", cgroup_root, "tmpfs",
-                                  MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME, options);
-                if (r < 0)
-                        return r;
-        }
+        if (!cgmount_add(&mounts, CGMOUNT_TMPFS, "mode=755", ""))
+                return log_oom();
 
         if (outer_cgver >= CGROUP_UNIFIED_ALL)
                 goto skip_controllers;
@@ -393,9 +387,8 @@ static int mount_legacy_cgns_supported(
                 if (streq(hierarchy, "name=systemd"))
                         continue;
 
-                r = mount_legacy_cgroup_hierarchy("", hierarchy, hierarchy, inner_cgver, !userns);
-                if (r < 0)
-                        return r;
+                if (!cgmount_add(&mounts, CGMOUNT_CGROUP1, hierarchy, hierarchy))
+                        return log_oom();
 
                 /* When multiple hierarchies are co-mounted, make their
                  * constituting individual hierarchies a symlink to the
@@ -411,62 +404,43 @@ static int mount_legacy_cgns_supported(
                         if (r == 0)
                                 break;
 
-                        target = prefix_root("/sys/fs/cgroup", controller);
-                        if (!target)
+                        if (!streq(hierarchy, controller) && !cgmount_add(&mounts, CGMOUNT_SYMLINK, hierarchy, controller)) /* never link a hierarchy onto itself */
                                 return log_oom();
-
-                        if (streq(hierarchy, controller))
-                                break;
-
-                        r = symlink_idempotent(hierarchy, target);
-                        if (r == -EINVAL)
-                                return log_error_errno(r, "Invalid existing symlink for combined hierarchy: %m");
-                        if (r < 0)
-                                return log_error_errno(r, "Failed to create symlink for combined hierarchy: %m");
                 }
         }
 
 skip_controllers:
-        r = mount_legacy_cgroup_hierarchy("", SYSTEMD_CGROUP_CONTROLLER, "systemd", inner_cgver, false);
-        if (r < 0)
-                return r;
+        switch (inner_cgver) {
+        case CGROUP_UNIFIED_NONE:
+                if (!cgmount_add(&mounts, CGMOUNT_CGROUP1, "name=systemd", "systemd"))
+                        return log_oom();
+                break;
+        case CGROUP_UNIFIED_SYSTEMD: /* hybrid: only the systemd hierarchy is cgroup2 */
+                if (!cgmount_add(&mounts, CGMOUNT_CGROUP2, "", "systemd"))
+                        return log_oom();
+                break;
+        default: /* CGROUP_UNIFIED_ALL is handled by cgroup_decide_mounts() and never gets here */
+                assert_not_reached("non-legacy cgroup version desired in legacy setup function");
+                return -EINVAL;
+        }
 
-        if (!userns)
-                return mount_verbose(LOG_ERR, NULL, cgroup_root, NULL,
-                                     MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME|MS_RDONLY, "mode=755");
+        *ret_mounts = mounts;
+        mounts = (CGMounts){};
 
         return 0;
 }
 
-/* Mount legacy cgroup hierarchy when cgroup namespaces are unsupported. */
-static int mount_legacy_cgns_unsupported(
-                const char *dest,
-                CGroupUnified outer_cgver, CGroupUnified inner_cgver, bool userns, uid_t uid_shift, uid_t uid_range,
-                const char *selinux_apifs_context) {
+/* Decide the legacy cgroup mounts when cgroup namespaces are not used. */
+static int cgroup_decide_mounts_sd_n_cgns(
+                CGMounts *ret_mounts,
+                CGroupUnified outer_cgver, CGroupUnified inner_cgver) {
+        _cleanup_(cgroup_free_mounts) CGMounts mounts = {};
         _cleanup_set_free_free_ Set *controllers = NULL;
-        const char *cgroup_root;
         int r;
 
-        cgroup_root = prefix_roota(dest, "/sys/fs/cgroup");
-
-        (void) mkdir_p(cgroup_root, 0755);
-
         /* Mount a tmpfs to /sys/fs/cgroup if it's not mounted there yet. */
-        r = path_is_mount_point(cgroup_root, AT_SYMLINK_FOLLOW);
-        if (r < 0)
-                return log_error_errno(r, "Failed to determine if /sys/fs/cgroup is already mounted: %m");
-        if (r == 0) {
-                _cleanup_free_ char *options = NULL;
-
-                r = tmpfs_patch_options("mode=755", uid_shift == 0 ? UID_INVALID : uid_shift, selinux_apifs_context, &options);
-                if (r < 0)
-                        return log_oom();
-
-                r = mount_verbose(LOG_ERR, "tmpfs", cgroup_root, "tmpfs",
-                                  MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME, options);
-                if (r < 0)
-                        return r;
-        }
+        if (!cgmount_add(&mounts, CGMOUNT_TMPFS, "mode=755", ""))
+                return log_oom();
 
         if (outer_cgver >= CGROUP_UNIFIED_ALL)
                 goto skip_controllers;
@@ -494,19 +468,12 @@ static int mount_legacy_cgns_unsupported(
                 if (r == -EINVAL) {
                         /* Not a symbolic link, but directly a single cgroup hierarchy */
 
-                        r = mount_legacy_cgroup_hierarchy(dest, controller, controller, inner_cgver, true);
-                        if (r < 0)
-                                return r;
+                        if (!cgmount_add(&mounts, CGMOUNT_CGROUP1, controller, controller))
+                                return log_oom();
 
                 } else if (r < 0)
                         return log_error_errno(r, "Failed to read link %s: %m", origin);
                 else {
-                        _cleanup_free_ char *target = NULL;
-
-                        target = prefix_root(dest, origin);
-                        if (!target)
-                                return log_oom();
-
                         /* A symbolic link, a combination of controllers in one hierarchy */
 
                         if (!filename_is_valid(combined)) {
@@ -514,101 +481,184 @@ static int mount_legacy_cgns_unsupported(
                                 continue;
                         }
 
-                        r = mount_legacy_cgroup_hierarchy(dest, combined, combined, inner_cgver, true);
-                        if (r < 0)
-                                return r;
+                        if (!cgmount_add(&mounts, CGMOUNT_CGROUP1, combined, combined))
+                                return log_oom();
 
-                        r = symlink_idempotent(combined, target);
-                        if (r == -EINVAL)
-                                return log_error_errno(r, "Invalid existing symlink for combined hierarchy: %m");
-                        if (r < 0)
-                                return log_error_errno(r, "Failed to create symlink for combined hierarchy: %m");
+                        if (!cgmount_add(&mounts, CGMOUNT_SYMLINK, combined, controller))
+                                return log_oom();
                 }
         }
 
 skip_controllers:
-        r = mount_legacy_cgroup_hierarchy(dest, SYSTEMD_CGROUP_CONTROLLER, "systemd", inner_cgver, false);
-        if (r < 0)
-                return r;
+        switch (inner_cgver) {
+        case CGROUP_UNIFIED_NONE:
+                if (!cgmount_add(&mounts, CGMOUNT_CGROUP1, "name=systemd", "systemd"))
+                        return log_oom();
+                break;
+        case CGROUP_UNIFIED_SYSTEMD: /* hybrid: only the systemd hierarchy is cgroup2 */
+                if (!cgmount_add(&mounts, CGMOUNT_CGROUP2, "", "systemd"))
+                        return log_oom();
+                break;
+        default: /* CGROUP_UNIFIED_ALL is handled by cgroup_decide_mounts() and never gets here */
+                assert_not_reached("non-legacy cgroup version desired in legacy setup function");
+                return -EINVAL;
+        }
 
-        return mount_verbose(LOG_ERR, NULL, cgroup_root, NULL,
-                             MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME|MS_RDONLY, "mode=755");
-}
+        *ret_mounts = mounts;
+        mounts = (CGMounts){};
 
-static int mount_unified_cgroups(const char *dest) {
-        const char *p;
-        int r;
+        return 0;
+}
 
-        assert(dest);
+/* Fills @ret_mounts with the mounts needed for @inner_cgver; returns 0 on success, a failure code otherwise. */
+int cgroup_decide_mounts(
+                CGMounts *ret_mounts,
+                CGroupUnified outer_cgver, CGroupUnified inner_cgver,
+                bool use_cgns) {
+        switch (inner_cgver) {
+        case CGROUP_UNIFIED_NONE:
+        case CGROUP_UNIFIED_SYSTEMD:
+                if (use_cgns)
+                        return cgroup_decide_mounts_sd_y_cgns(ret_mounts, outer_cgver, inner_cgver);
+                return cgroup_decide_mounts_sd_n_cgns(ret_mounts, outer_cgver, inner_cgver);
+        case CGROUP_UNIFIED_ALL:
+                if (!cgmount_add(ret_mounts, CGMOUNT_CGROUP2, "", "")) /* empty options, as for the other cgroup2 entries */
+                        return log_oom();
+                return 0;
+        default:
+                assert_not_reached("Invalid cgroup ver requested");
+                return -EINVAL;
+        }
+}
 
-        p = prefix_roota(dest, "/sys/fs/cgroup");
+/********************************************************************/
 
-        (void) mkdir_p(p, 0755);
+static int cgroup_mount_cg(
+                const char *mountpoint, const char *opts, CGMountType fstype,
+                FILE *cgfile, bool use_userns) {
+        const bool use_cgns = cgfile == NULL;
+        /* If we are using userns and cgns, then we always let it be RW, because we can count on the shifted root user
+         * to not have access to the things that would make us want to mount it RO.  Otherwise, we give the container
+         * RW access to its unified or name=systemd cgroup. */
+        const bool rw = (use_userns && use_cgns) || fstype == CGMOUNT_CGROUP2 || streq(mountpoint, "/sys/fs/cgroup/systemd");
+        int r;
 
-        r = path_is_mount_point(p, AT_SYMLINK_FOLLOW);
+        /* First the base mount; this is always RW, as to not change the superblock settings */
+        r = mount_verbose(LOG_ERR, "cgroup", mountpoint, fstype == CGMOUNT_CGROUP1 ? "cgroup" : "cgroup2",
+                          MS_NOSUID|MS_NOEXEC|MS_NODEV, opts);
         if (r < 0)
-                return log_error_errno(r, "Failed to determine if %s is mounted already: %m", p);
-        if (r > 0) {
-                p = prefix_roota(dest, "/sys/fs/cgroup/cgroup.procs");
-                if (access(p, F_OK) >= 0)
-                        return 0;
-                if (errno != ENOENT)
-                        return log_error_errno(errno, "Failed to determine if mount point %s contains the unified cgroup hierarchy: %m", p);
-
-                log_error("%s is already mounted but not a unified cgroup hierarchy. Refusing.", p);
-                return -EINVAL;
-        }
-
-        return mount_verbose(LOG_ERR, "cgroup", p, "cgroup2", MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL);
-}
+                return r;
 
-int mount_cgroups(
-                const char *dest,
-                CGroupUnified outer_cgver, CGroupUnified inner_cgver,
-                bool userns, uid_t uid_shift, uid_t uid_range,
-                const char *selinux_apifs_context,
-                bool use_cgns) {
+        /* Now, if necessary, we remount RO */
+        if (rw) {
+                if (!use_cgns) {
+                        /* emulate cgns by mounting everything but our subcgroup RO */
+                        _cleanup_free_ char *cgroup = NULL; /* released at scope exit, after the mounts below */
+                        if (fstype == CGMOUNT_CGROUP2) {
+                                rewind(cgfile);
+                                r = cg_pid_get_path_internal(NULL, cgfile, &cgroup);
+                                if (r < 0)
+                                        return log_error_errno(r, "Failed to get child's cgroup v2 path: %m");
+                        } else {
+                                const char *scontroller, *state;
+                                size_t controller_len;
+                                FOREACH_WORD_SEPARATOR(scontroller, controller_len, opts, ",", state) {
+                                        _cleanup_free_ const char *controller = strndup(scontroller, controller_len);
+                                        if (!controller) /* a NULL controller would be taken to mean the v2 hierarchy */
+                                                return log_oom();
+                                        rewind(cgfile);
+                                        if (cg_pid_get_path_internal(controller, cgfile, &cgroup) == 0)
+                                                break;
+                                }
+                                if (!cgroup)
+                                        return log_error_errno(EBADMSG, "Failed to associate mounted cgroup hierarchy %s with numbered cgroup hierarchy", mountpoint);
+                        }
+                        const char *cgpath = prefix_roota(mountpoint, cgroup);
-        if (inner_cgver >= CGROUP_UNIFIED_ALL)
-                return mount_unified_cgroups(dest);
-        else if (use_cgns)
-                return mount_legacy_cgns_supported(outer_cgver, inner_cgver, userns, uid_shift, uid_range, selinux_apifs_context);
+                        r = mount_verbose(LOG_ERR, cgpath, cgpath, NULL, MS_BIND, NULL);
+                        if (r < 0)
+                                return r;
+                        r = mount_verbose(LOG_ERR, NULL, mountpoint, NULL,
+                                          MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL);
+                        if (r < 0)
+                                return r;
+                }
+        } else {
+                r = mount_verbose(LOG_ERR, NULL, mountpoint, NULL,
+                                  MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL);
+                if (r < 0)
+                        return r;
+        }
 
-        return mount_legacy_cgns_unsupported(dest, outer_cgver, inner_cgver, userns, uid_shift, uid_range, selinux_apifs_context);
+        return 0;
 }
 
-int mount_systemd_cgroup_writable(
-                const char *dest,
-                CGroupUnified inner_cgver) {
+int cgroup_mount_mounts(CGMounts m, FILE *cgfile, uid_t uid_shift, const char *selinux_apifs_context) {
+        const bool use_cgns = cgfile == NULL;
+        const bool use_userns = uid_shift != UID_INVALID;
 
-        _cleanup_free_ char *own_cgroup_path = NULL;
-        const char *systemd_root, *systemd_own;
-        int r;
+        bool used_tmpfs = false;
 
-        assert(dest);
+        for (size_t i = 0; i < m.n; i++) {
+                _cleanup_free_ char *options = NULL;
+                const char *dst;
+                int r;
 
-        r = cg_pid_get_path(NULL, 0, &own_cgroup_path);
-        if (r < 0)
-                return log_error_errno(r, "Failed to determine our own cgroup path: %m");
+                dst = prefix_roota("/sys/fs/cgroup", m.mounts[i].dst);
 
-        /* If we are living in the top-level, then there's nothing to do... */
-        if (path_equal(own_cgroup_path, "/"))
-                return 0;
+                /* The checks here to see if things are already mounted are kind of primitive.  Perhaps they should
+                 * actually check the statfs() f_type to verify that the thing mounted is what we want to be mounted
+                 * (similar to cgroup-util's detection logic)?  But I don't really understand the use-case for having
+                 * any of these already mounted, so I'm not sure if such increased strictness would be unwelcome. */
 
-        if (inner_cgver >= CGROUP_UNIFIED_ALL) {
-                systemd_own = strjoina(dest, "/sys/fs/cgroup", own_cgroup_path);
-                systemd_root = prefix_roota(dest, "/sys/fs/cgroup");
-        } else {
-                systemd_own = strjoina(dest, "/sys/fs/cgroup/systemd", own_cgroup_path);
-                systemd_root = prefix_roota(dest, "/sys/fs/cgroup/systemd");
+                switch (m.mounts[i].type) {
+                case CGMOUNT_SYMLINK:
+                        (void) mkdir_parents(dst, 0755);
+                        r = symlink_idempotent(m.mounts[i].src, dst);
+                        if (r < 0) /* log here: the caller only propagates the code */
+                                return log_error_errno(r, "Failed to create symlink for combined hierarchy: %m");
+                        break;
+                case CGMOUNT_TMPFS:
+                        used_tmpfs = true;
+                        r = path_is_mount_point(dst, AT_SYMLINK_FOLLOW);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to determine if %s is mounted already: %m", dst);
+                        if (r > 0)
+                                continue;
+                        r = tmpfs_patch_options(m.mounts[i].src, uid_shift, selinux_apifs_context, &options);
+                        if (r < 0)
+                                return log_oom();
+                        r = mount_verbose(LOG_ERR, /*name*/"tmpfs", dst, /*fstype*/"tmpfs",
+                                          MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME, options);
+                        if (r < 0)
+                                return r;
+                        break;
+                case CGMOUNT_CGROUP1:
+                case CGMOUNT_CGROUP2:
+                        r = path_is_mount_point(dst, 0);
+                        if (r < 0 && r != -ENOENT)
+                                return log_error_errno(r, "Failed to determine if %s is mounted already: %m", dst);
+                        if (r > 0) {
+                                if (access(prefix_roota(dst, "cgroup.procs"), F_OK) >= 0)
+                                        continue;
+                                if (errno != ENOENT)
+                                        return log_error_errno(errno, "Failed to determine if mount point %s is a cgroup hierarchy: %m", dst);
+                                return log_error_errno(EINVAL, "%s is already mounted but not a cgroup hierarchy. Refusing.", dst);
+                        }
+                        (void) mkdir_p(dst, 0755);
+                        r = cgroup_mount_cg(dst, m.mounts[i].src, m.mounts[i].type, cgfile, use_userns);
+                        if (r < 0)
+                                return r;
+                        break;
+                default:
+                        assert_not_reached("Invalid CGMount type");
+                        return -EINVAL;
+                }
         }
 
-        /* Make our own cgroup a (writable) bind mount */
-        r = mount_verbose(LOG_ERR, systemd_own, systemd_own,  NULL, MS_BIND, NULL);
-        if (r < 0)
-                return r;
+        /* I'm going to be honest: I don't understand why we don't do this if we're using both userns and cgns. */
+        if (used_tmpfs && (!use_userns || !use_cgns))
+                return mount_verbose(LOG_ERR, NULL, "/sys/fs/cgroup", NULL,
+                                     MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME|MS_RDONLY, "mode=755");
 
-        /* And then remount the systemd cgroup root read-only */
-        return mount_verbose(LOG_ERR, NULL, systemd_root, NULL,
-                             MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL);
+        return 0;
 }
diff --git a/src/nspawn/nspawn-cgroup.h b/src/nspawn/nspawn-cgroup.h
index e677766726..46c669c446 100644
--- a/src/nspawn/nspawn-cgroup.h
+++ b/src/nspawn/nspawn-cgroup.h
@@ -24,7 +24,13 @@
 
 #include "cgroup-util.h"
 
-int cgroup_setup(pid_t pid, CGroupUnified outer_cgver, CGroupUnified inner_cgver, uid_t uid_shift);
+typedef struct CGMount CGMount;
+typedef struct CGMounts {
+        CGMount *mounts; /* array holding n entries; released by cgroup_free_mounts() */
+        size_t n;        /* number of entries in mounts */
+} CGMounts;
 
-int mount_cgroups(const char *dest, CGroupUnified outer_cgver, CGroupUnified inner_cgver, bool userns, uid_t uid_shift, uid_t uid_range, const char *selinux_apifs_context, bool use_cgns);
-int mount_systemd_cgroup_writable(const char *dest, CGroupUnified inner_cgver);
+int cgroup_setup(pid_t pid, CGroupUnified outer_cgver, CGroupUnified inner_cgver, uid_t uid_shift);
+int cgroup_decide_mounts(CGMounts *ret_mounts, CGroupUnified outer_cgver, CGroupUnified inner_cgver, bool use_cgns);
+int cgroup_mount_mounts(CGMounts mounts, FILE *cgfile, uid_t uid_shift, const char *selinux_apifs_context);
+void cgroup_free_mounts(CGMounts *mounts);
diff --git a/src/nspawn/nspawn.c b/src/nspawn/nspawn.c
index 5c4341e0ee..94c7eea9b7 100644
--- a/src/nspawn/nspawn.c
+++ b/src/nspawn/nspawn.c
@@ -2648,7 +2648,7 @@ static int inner_child(
                 int kmsg_socket,
                 int rtnl_socket,
                 FDSet *fds,
-                CGroupUnified outer_cgver) {
+                CGMounts cgmounts) {
 
         _cleanup_free_ char *home = NULL;
         char as_uuid[37];
@@ -2706,19 +2706,11 @@ static int inner_child(
                 r = unshare(CLONE_NEWCGROUP);
                 if (r < 0)
                         return log_error_errno(errno, "Failed to unshare cgroup namespace");
-                r = mount_cgroups(
-                                "",
-                                outer_cgver,
-                                arg_unified_cgroup_hierarchy,
-                                arg_userns_mode != USER_NAMESPACE_NO,
-                                arg_uid_shift,
-                                arg_uid_range,
-                                arg_selinux_apifs_context,
-                                true);
-                if (r < 0)
-                        return r;
-        } else {
-                r = mount_systemd_cgroup_writable("", arg_unified_cgroup_hierarchy);
+                r = cgroup_mount_mounts(cgmounts,
+                                        NULL,
+                                        arg_userns_mode == USER_NAMESPACE_NO ? UID_INVALID : 0,
+                                        arg_selinux_apifs_context);
+                cgroup_free_mounts(&cgmounts);
                 if (r < 0)
                         return r;
         }
@@ -2915,6 +2907,7 @@ static int outer_child(
                 int kmsg_socket,
                 int rtnl_socket,
                 int uid_shift_socket,
+                int cgroup_socket,
                 FDSet *fds,
                 CGroupUnified outer_cgver) {
 
@@ -2922,6 +2915,7 @@ static int outer_child(
         ssize_t l;
         int r;
         _cleanup_close_ int fd = -1;
+        _cleanup_(cgroup_free_mounts) CGMounts cgmounts = {};
 
         assert(barrier);
         assert(directory);
@@ -3110,19 +3104,11 @@ static int outer_child(
         if (r < 0)
                 return r;
 
-        if (!arg_use_cgns) {
-                r = mount_cgroups(
-                                directory,
-                                outer_cgver,
-                                arg_unified_cgroup_hierarchy,
-                                arg_userns_mode != USER_NAMESPACE_NO,
-                                arg_uid_shift,
-                                arg_uid_range,
-                                arg_selinux_apifs_context,
-                                false);
-                if (r < 0)
-                        return r;
-        }
+        r = cgroup_decide_mounts(&cgmounts,
+                                 outer_cgver, arg_unified_cgroup_hierarchy,
+                                 arg_use_cgns);
+        if (r < 0)
+                return r;
 
         r = mount_move_root(directory);
         if (r < 0)
@@ -3143,12 +3129,13 @@ static int outer_child(
                 uuid_socket = safe_close(uuid_socket);
                 notify_socket = safe_close(notify_socket);
                 uid_shift_socket = safe_close(uid_shift_socket);
+                cgroup_socket = safe_close(cgroup_socket);
 
                 /* The inner child has all namespaces that are
                  * requested, so that we all are owned by the user if
                  * user namespaces are turned on. */
 
-                r = inner_child(barrier, directory, secondary, kmsg_socket, rtnl_socket, fds, outer_cgver);
+                r = inner_child(barrier, directory, secondary, kmsg_socket, rtnl_socket, fds, cgmounts);
                 if (r < 0)
                         _exit(EXIT_FAILURE);
 
@@ -3175,11 +3162,42 @@ static int outer_child(
         if (r < 0)
                 return log_error_errno(r, "Failed to send notify fd: %m");
 
+        /* If !arg_use_cgns, we need to perform the cgroup mounts here: without a cgroup namespace, cgroups can't be
+         * mounted from inside a less privileged mount namespace (and using userns makes the mountns less privileged). */
+        if (!arg_use_cgns) {
+                /* If !use_cgns, then cgroup_mount_mounts() needs to look at /proc/pid/cgroup; but because we've
+                 * already chroot()ed, we don't have access to /proc.  So the parent opens the file and sends it to
+                 * us. */
+                int cgfd;
+                _cleanup_fclose_ FILE *cgfile = NULL;
+
+                assert(cgroup_socket);
+
+                cgfd = receive_one_fd(cgroup_socket, 0);
+                if (cgfd < 0)
+                        return log_error_errno(cgfd, "Failed to recv cgroup fd: %m");
+
+                cgfile = fdopen(cgfd, "re");
+                if (!cgfile) {
+                        r = -errno; /* in case safe_close sets errno */
+                        cgfd = safe_close(cgfd);
+                        return log_error_errno(r, "Failed to create a stream object for cgroup fd: %m");
+                }
+
+                r = cgroup_mount_mounts(cgmounts,
+                                        cgfile,
+                                        arg_userns_mode == USER_NAMESPACE_NO ? UID_INVALID : arg_uid_shift,
+                                        arg_selinux_apifs_context);
+                if (r < 0)
+                        return r;
+        }
+
         pid_socket = safe_close(pid_socket);
         uuid_socket = safe_close(uuid_socket);
         notify_socket = safe_close(notify_socket);
         kmsg_socket = safe_close(kmsg_socket);
         rtnl_socket = safe_close(rtnl_socket);
+        cgroup_socket = safe_close(cgroup_socket);
 
         return 0;
 }
@@ -3611,7 +3629,8 @@ static int run(int master,
                 pid_socket_pair[2] = { -1, -1 },
                 uuid_socket_pair[2] = { -1, -1 },
                 notify_socket_pair[2] = { -1, -1 },
-                uid_shift_socket_pair[2] = { -1, -1 };
+                uid_shift_socket_pair[2] = { -1, -1 },
+                cgroup_socket_pair[2] = {-1, -1 };
         _cleanup_close_ int notify_socket= -1;
         _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
         _cleanup_(sd_event_unrefp) sd_event *event = NULL;
@@ -3662,6 +3681,10 @@ static int run(int master,
                 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, uid_shift_socket_pair) < 0)
                         return log_error_errno(errno, "Failed to create uid shift socket pair: %m");
 
+        if (!arg_use_cgns)
+                if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, cgroup_socket_pair) < 0)
+                        return log_error_errno(errno, "Failed to create cgroup socket pair: %m");
+
         /* Child can be killed before execv(), so handle SIGCHLD in order to interrupt
          * parent's blocking calls and give it a chance to call wait() and terminate. */
         r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
@@ -3690,6 +3713,7 @@ static int run(int master,
                 uuid_socket_pair[0] = safe_close(uuid_socket_pair[0]);
                 notify_socket_pair[0] = safe_close(notify_socket_pair[0]);
                 uid_shift_socket_pair[0] = safe_close(uid_shift_socket_pair[0]);
+                cgroup_socket_pair[0] = safe_close(cgroup_socket_pair[0]);
 
                 (void) reset_all_signal_handlers();
                 (void) reset_signal_mask();
@@ -3709,6 +3733,7 @@ static int run(int master,
                                 kmsg_socket_pair[1],
                                 rtnl_socket_pair[1],
                                 uid_shift_socket_pair[1],
+                                cgroup_socket_pair[1],
                                 fds,
                                 outer_cgver);
                 if (r < 0)
@@ -3727,6 +3752,7 @@ static int run(int master,
         uuid_socket_pair[1] = safe_close(uuid_socket_pair[1]);
         notify_socket_pair[1] = safe_close(notify_socket_pair[1]);
         uid_shift_socket_pair[1] = safe_close(uid_shift_socket_pair[1]);
+        cgroup_socket_pair[1] = safe_close(cgroup_socket_pair[1]);
 
         if (arg_userns_mode != USER_NAMESPACE_NO) {
                 /* The child just let us know the UID shift it might have read from the image. */
@@ -3847,6 +3873,8 @@ static int run(int master,
         }
 
         if (arg_register) {
+                /* If the child is to be placed into a different cgroup,
+                 * this is what does it. */
                 r = register_machine(
                                 arg_machine,
                                 *main_pid,
@@ -3867,6 +3895,20 @@ static int run(int master,
         if (r < 0)
                 return r;
 
+        if (!arg_use_cgns) {
+                /* The outer child (helper_pid) waits to receive this fd, so it won't exit until we send it. */
+                const char *fs;
+                _cleanup_close_ int fd;
+
+                fs = procfs_file_alloca(*main_pid, "cgroup");
+                fd = open(fs, O_RDONLY|O_CLOEXEC);
+                if (fd < 0)
+                        return log_error_errno(errno, "Failed to open cgroups of child: %m");
+
+                r = send_one_fd(cgroup_socket_pair[0], fd, 0);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to send cgroup fd: %m");
+        }
 
         /* Wait for the outer child. */
         r = wait_for_terminate_and_warn("namespace helper", *helper_pid, NULL);
author	Luke Shumaker <lukeshu@lukeshu.com>	2017-06-14 13:32:18 -0400
committer	Luke Shumaker <lukeshu@lukeshu.com>	2017-06-16 18:52:04 -0400
commit	a2c5a0f3e0f005899fd6de214ff0525b7415cb8a (patch)
tree	8394b4d9e63233ec428d5d77a798feeda6658c23
parent	76b7be5e7e8bd15b549aa12eabbe3a36fdc4e924 (diff)