nspawn: handle cgroup namespaces

(NOTE: Cgroup namespaces work with legacy and unified hierarchies: "This is completely backward compatible and will be completely invisible to any existing cgroup users (except for those running inside a cgroup namespace and looking at /proc/pid/cgroup of tasks outside their namespace.)" (https://lists.linuxfoundation.org/pipermail/containers/2016-January/036582.html) So there is no need to special case unified.) If cgroup namespaces are supported we skip mount_cgroups() in the outer_child(). Instead, we unshare(CLONE_NEWCGROUP) in the inner_child() and only then do we call mount_cgroups(). The clean way to handle cgroup namespaces would be to delegate mounting of cgroups completely to the init system in the container. However, this would likely break backward compatibility with the UNIFIED_CGROUP_HIERARCHY flag of systemd-nspawn. Also no cgroupfs would be mounted whenever the user simply requests a shell and no init is available to mount cgroups. Hence, we introduce mount_legacy_cgns_supported(). After calling unshare(CLONE_NEWCGROUP) it parses /proc/self/cgroup to find the mounted controllers and mounts them inside the new cgroup namespace. This should preserve backward compatibility with the UNIFIED_CGROUP_HIERARCHY flag and mount a cgroupfs when no init in the container is running.
author: Christian Brauner <cbrauner@suse.de> 2016-06-23 13:41:56 +0200
committer: Christian Brauner <cbrauner@suse.de> 2016-07-09 06:34:11 +0200
commit: 0996ef00fb5c0770d49670f81a230fcc2552af89 (patch)
tree: 816306f265ce35d08f57044572bac32ca353abb1 /src/nspawn
parent: 3228995c534c4cae609e97502f7c7ca1d4a14840 (diff)
2 files changed, 201 insertions, 28 deletions
diff --git a/src/nspawn/nspawn-mount.c b/src/nspawn/nspawn-mount.c
index 8e2d2d543c..b76b09b9cb 100644
--- a/src/nspawn/nspawn-mount.c
+++ b/src/nspawn/nspawn-mount.c
@@ -23,6 +23,8 @@
 #include "alloc-util.h"
 #include "cgroup-util.h"
 #include "escape.h"
+#include "fd-util.h"
+#include "fileio.h"
 #include "fs-util.h"
 #include "label.h"
 #include "mkdir.h"
@@ -181,13 +183,15 @@ int tmpfs_mount_parse(CustomMount **l, unsigned *n, const char *s) {
 
 static int tmpfs_patch_options(
                 const char *options,
-                bool userns, uid_t uid_shift, uid_t uid_range,
+                bool userns,
+                uid_t uid_shift, uid_t uid_range,
+                bool patch_ids,
                 const char *selinux_apifs_context,
                 char **ret) {
 
         char *buf = NULL;
 
-        if (userns && uid_shift != 0) {
+        if ((userns && uid_shift != 0) || patch_ids) {
                 assert(uid_shift != UID_INVALID);
 
                 if (options)
@@ -218,7 +222,13 @@ static int tmpfs_patch_options(
         }
 #endif
 
+        if (!buf && options) {
+                buf = strdup(options);
+                if (!buf)
+                        return -ENOMEM;
+        }
         *ret = buf;
+
         return !!buf;
 }
 
@@ -271,7 +281,15 @@ int mount_sysfs(const char *dest) {
                 return log_error_errno(errno, "Failed to remove %s: %m", full);
 
         x = prefix_roota(top, "/fs/kdbus");
-        (void) mkdir(x, 0755);
+        (void) mkdir_p(x, 0755);
+
+        /* Create mountpoint for cgroups. Otherwise we are not allowed since we
+         * remount /sys read-only.
+         */
+        if (cg_ns_supported()) {
+                x = prefix_roota(top, "/fs/cgroup");
+                (void) mkdir_p(x, 0755);
+        }
 
         if (mount(NULL, top, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, NULL) < 0)
                 return log_error_errno(errno, "Failed to make %s read-only: %m", top);
@@ -348,7 +366,7 @@ int mount_all(const char *dest,
 
                 o = mount_table[k].options;
                 if (streq_ptr(mount_table[k].type, "tmpfs")) {
-                        r = tmpfs_patch_options(o, use_userns, uid_shift, uid_range, selinux_apifs_context, &options);
+                        r = tmpfs_patch_options(o, use_userns, uid_shift, uid_range, false, selinux_apifs_context, &options);
                         if (r < 0)
                                 return log_oom();
                         if (r > 0)
@@ -485,7 +503,7 @@ static int mount_tmpfs(
         if (r < 0 && r != -EEXIST)
                 return log_error_errno(r, "Creating mount point for tmpfs %s failed: %m", where);
 
-        r = tmpfs_patch_options(m->options, userns, uid_shift, uid_range, selinux_apifs_context, &buf);
+        r = tmpfs_patch_options(m->options, userns, uid_shift, uid_range, false, selinux_apifs_context, &buf);
         if (r < 0)
                 return log_oom();
         options = r > 0 ? buf : m->options;
@@ -600,6 +618,48 @@ int mount_custom(
         return 0;
 }
 
+/* Retrieve existing subsystems. This function is called in a new cgroup
+ * namespace.
+ */
+static int get_controllers(Set *subsystems) {
+        _cleanup_fclose_ FILE *f = NULL;
+        char line[LINE_MAX];
+
+        assert(subsystems);
+
+        f = fopen("/proc/self/cgroup", "re");
+        if (!f)
+                return errno == ENOENT ? -ESRCH : -errno;
+
+        FOREACH_LINE(line, f, return -errno) {
+                int r;
+                char *e, *l, *p;
+
+                truncate_nl(line);
+
+                l = strchr(line, ':');
+                if (!l)
+                        continue;
+
+                l++;
+                e = strchr(l, ':');
+                if (!e)
+                        continue;
+
+                *e = 0;
+
+                if (streq(l, "") || streq(l, "name=systemd"))
+                        continue;
+
+                p = strdup(l);
+                r = set_consume(subsystems, p);
+                if (r < 0)
+                        return r;
+        }
+
+        return 0;
+}
+
 static int mount_legacy_cgroup_hierarchy(const char *dest, const char *controller, const char *hierarchy, bool read_only) {
         char *to;
         int r;
@@ -628,11 +688,107 @@ static int mount_legacy_cgroup_hierarchy(const char *dest, const char *controlle
         return 1;
 }
 
-static int mount_legacy_cgroups(
-                const char *dest,
+/* Mount a legacy cgroup hierarchy when cgroup namespaces are supported. */
+static int mount_legacy_cgns_supported(
                 bool userns, uid_t uid_shift, uid_t uid_range,
                 const char *selinux_apifs_context) {
+        _cleanup_set_free_free_ Set *controllers = NULL;
+        const char *cgroup_root = "/sys/fs/cgroup", *c;
+        int r;
 
+        (void) mkdir_p(cgroup_root, 0755);
+
+        /* Mount a tmpfs to /sys/fs/cgroup if it's not mounted there yet. */
+        r = path_is_mount_point(cgroup_root, AT_SYMLINK_FOLLOW);
+        if (r < 0)
+                return log_error_errno(r, "Failed to determine if /sys/fs/cgroup is already mounted: %m");
+        if (r == 0) {
+                _cleanup_free_ char *options = NULL;
+
+                /* When cgroup namespaces are enabled and user namespaces are
+                 * used then the mount of the cgroupfs is done *inside* the new
+                 * user namespace. We're root in the new user namespace and the
+                 * kernel will happily translate our uid/gid to the correct
+                 * uid/gid as seen from e.g. /proc/1/mountinfo. So we simply
+                 * pass uid 0 and not uid_shift to tmpfs_patch_options().
+                 */
+                r = tmpfs_patch_options("mode=755", userns, 0, uid_range, true, selinux_apifs_context, &options);
+                if (r < 0)
+                        return log_oom();
+
+                if (mount("tmpfs", cgroup_root, "tmpfs", MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME, options) < 0)
+                        return log_error_errno(errno, "Failed to mount /sys/fs/cgroup: %m");
+        }
+
+        if (cg_unified() > 0)
+                goto skip_controllers;
+
+        controllers = set_new(&string_hash_ops);
+        if (!controllers)
+                return log_oom();
+
+        r = get_controllers(controllers);
+        if (r < 0)
+                return log_error_errno(r, "Failed to determine cgroup controllers: %m");
+
+        for (;;) {
+                _cleanup_free_ const char *controller = NULL;
+
+                controller = set_steal_first(controllers);
+                if (!controller)
+                        break;
+
+                r = mount_legacy_cgroup_hierarchy("", controller, controller, !userns);
+                if (r < 0)
+                        return r;
+
+                /* When multiple hierarchies are co-mounted, make their
+                 * constituting individual hierarchies a symlink to the
+                 * co-mount.
+                 */
+                c = controller;
+                for (;;) {
+                        _cleanup_free_ char *target = NULL, *tok = NULL;
+
+                        r = extract_first_word(&c, &tok, ",", 0);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to extract co-mounted cgroup controller: %m");
+                        if (r == 0)
+                                break;
+
+                        target = prefix_root("/sys/fs/cgroup", tok);
+                        if (!target)
+                                return log_oom();
+
+                        if (streq(controller, tok))
+                                break;
+
+                        r = symlink_idempotent(controller, target);
+                        if (r == -EINVAL)
+                                return log_error_errno(r, "Invalid existing symlink for combined hierarchy: %m");
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to create symlink for combined hierarchy: %m");
+                }
+        }
+
+skip_controllers:
+        r = mount_legacy_cgroup_hierarchy("", "none,name=systemd,xattr", "systemd", false);
+        if (r < 0)
+                return r;
+
+        if (!userns) {
+                if (mount(NULL, cgroup_root, NULL, MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME|MS_RDONLY, "mode=755") < 0)
+                        return log_error_errno(errno, "Failed to remount %s read-only: %m", cgroup_root);
+        }
+
+        return 0;
+}
+
+/* Mount legacy cgroup hierarchy when cgroup namespaces are unsupported. */
+static int mount_legacy_cgns_unsupported(
+                const char *dest,
+                bool userns, uid_t uid_shift, uid_t uid_range,
+                const char *selinux_apifs_context) {
         _cleanup_set_free_free_ Set *controllers = NULL;
         const char *cgroup_root;
         int r;
@@ -648,7 +804,7 @@ static int mount_legacy_cgroups(
         if (r == 0) {
                 _cleanup_free_ char *options = NULL;
 
-                r = tmpfs_patch_options("mode=755", userns, uid_shift, uid_range, selinux_apifs_context, &options);
+                r = tmpfs_patch_options("mode=755", userns, uid_shift, uid_range, false, selinux_apifs_context, &options);
                 if (r < 0)
                         return log_oom();
 
@@ -707,10 +863,8 @@ static int mount_legacy_cgroups(
                                 return r;
 
                         r = symlink_idempotent(combined, target);
-                        if (r == -EINVAL) {
-                                log_error("Invalid existing symlink for combined hierarchy");
-                                return r;
-                        }
+                        if (r == -EINVAL)
+                                return log_error_errno(r, "Invalid existing symlink for combined hierarchy: %m");
                         if (r < 0)
                                 return log_error_errno(r, "Failed to create symlink for combined hierarchy: %m");
                 }
@@ -765,8 +919,10 @@ int mount_cgroups(
 
         if (unified_requested)
                 return mount_unified_cgroups(dest);
-        else
-                return mount_legacy_cgroups(dest, userns, uid_shift, uid_range, selinux_apifs_context);
+        else if (cg_ns_supported())
+                return mount_legacy_cgns_supported(userns, uid_shift, uid_range, selinux_apifs_context);
+
+        return mount_legacy_cgns_unsupported(dest, userns, uid_shift, uid_range, selinux_apifs_context);
 }
 
 int mount_systemd_cgroup_writable(
@@ -834,7 +990,7 @@ int setup_volatile_state(
                 return log_error_errno(errno, "Failed to create %s: %m", directory);
 
         options = "mode=755";
-        r = tmpfs_patch_options(options, userns, uid_shift, uid_range, selinux_apifs_context, &buf);
+        r = tmpfs_patch_options(options, userns, uid_shift, uid_range, false, selinux_apifs_context, &buf);
         if (r < 0)
                 return log_oom();
         if (r > 0)
@@ -870,7 +1026,7 @@ int setup_volatile(
                 return log_error_errno(errno, "Failed to create temporary directory: %m");
 
         options = "mode=755";
-        r = tmpfs_patch_options(options, userns, uid_shift, uid_range, selinux_apifs_context, &buf);
+        r = tmpfs_patch_options(options, userns, uid_shift, uid_range, false, selinux_apifs_context, &buf);
         if (r < 0)
                 return log_oom();
         if (r > 0)
diff --git a/src/nspawn/nspawn.c b/src/nspawn/nspawn.c
index 73c56d7310..9e74a77758 100644
--- a/src/nspawn/nspawn.c
+++ b/src/nspawn/nspawn.c
@@ -2594,9 +2594,24 @@ static int inner_child(
                 return -ESRCH;
         }
 
-        r = mount_systemd_cgroup_writable("", arg_unified_cgroup_hierarchy);
-        if (r < 0)
-                return r;
+        if (cg_ns_supported()) {
+                r = unshare(CLONE_NEWCGROUP);
+                if (r < 0)
+                        return log_error_errno(errno, "Failed to unshare cgroup namespace");
+                r = mount_cgroups(
+                                "",
+                                arg_unified_cgroup_hierarchy,
+                                arg_userns_mode != USER_NAMESPACE_NO,
+                                arg_uid_shift,
+                                arg_uid_range,
+                                arg_selinux_apifs_context);
+                if (r < 0)
+                        return r;
+        } else {
+                r = mount_systemd_cgroup_writable("", arg_unified_cgroup_hierarchy);
+                if (r < 0)
+                        return r;
+        }
 
         r = reset_uid_gid();
         if (r < 0)
@@ -2978,15 +2993,17 @@ static int outer_child(
         if (r < 0)
                 return r;
 
-        r = mount_cgroups(
-                        directory,
-                        arg_unified_cgroup_hierarchy,
-                        arg_userns_mode != USER_NAMESPACE_NO,
-                        arg_uid_shift,
-                        arg_uid_range,
-                        arg_selinux_apifs_context);
-        if (r < 0)
-                return r;
+        if (!cg_ns_supported()) {
+                r = mount_cgroups(
+                                directory,
+                                arg_unified_cgroup_hierarchy,
+                                arg_userns_mode != USER_NAMESPACE_NO,
+                                arg_uid_shift,
+                                arg_uid_range,
+                                arg_selinux_apifs_context);
+                if (r < 0)
+                        return r;
+        }
 
         r = mount_move_root(directory);
         if (r < 0)
author	Christian Brauner <cbrauner@suse.de>	2016-06-23 13:41:56 +0200
committer	Christian Brauner <cbrauner@suse.de>	2016-07-09 06:34:11 +0200
commit	0996ef00fb5c0770d49670f81a230fcc2552af89 (patch)
tree	816306f265ce35d08f57044572bac32ca353abb1 /src/nspawn
parent	3228995c534c4cae609e97502f7c7ca1d4a14840 (diff)