summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLuke Shumaker <lukeshu@lukeshu.com>2017-06-14 13:32:18 -0400
committerLuke Shumaker <lukeshu@lukeshu.com>2017-06-16 18:52:04 -0400
commita2c5a0f3e0f005899fd6de214ff0525b7415cb8a (patch)
tree8394b4d9e63233ec428d5d77a798feeda6658c23
parent76b7be5e7e8bd15b549aa12eabbe3a36fdc4e924 (diff)
nspawn: Divorce the code deciding the cgroup mounts from the code performing them
TODO: a better commit message
-rw-r--r--src/basic/cgroup-util.c37
-rw-r--r--src/basic/cgroup-util.h1
-rw-r--r--src/nspawn/nspawn-cgroup.c450
-rw-r--r--src/nspawn/nspawn-cgroup.h12
-rw-r--r--src/nspawn/nspawn.c100
5 files changed, 363 insertions, 237 deletions
diff --git a/src/basic/cgroup-util.c b/src/basic/cgroup-util.c
index 5bf68105f2..624626c867 100644
--- a/src/basic/cgroup-util.c
+++ b/src/basic/cgroup-util.c
@@ -921,11 +921,18 @@ int cg_get_xattr(const char *controller, const char *path, const char *name, voi
return (int) n;
}
+/**
+ * Returns the cgroup path of the process under the hierarchy specified by @controller:
+ *
+ * controller : whichever hierarchy has @controller bound to it (with the special case that
+ * SYSTEMD_CGROUP_CONTROLLER selects whichever hierarchy systemd is using, even if it is the
+ * v2 (unified) hierarchy and thus the SYSTEMD_CGROUP_CONTROLLER doesn't actually exist)
+ *
+ * controller == NULL : equivalent to SYSTEMD_CGROUP_CONTROLLER
+ */
int cg_pid_get_path(const char *controller, pid_t pid, char **path) {
_cleanup_fclose_ FILE *f = NULL;
- char line[LINE_MAX];
const char *fs;
- size_t cs = 0;
int unified;
assert(path);
@@ -940,20 +947,40 @@ int cg_pid_get_path(const char *controller, pid_t pid, char **path) {
unified = cg_unified(controller);
if (unified < 0)
return unified;
- if (unified == 0)
- cs = strlen(controller);
fs = procfs_file_alloca(pid, "cgroup");
f = fopen(fs, "re");
if (!f)
return errno == ENOENT ? -ESRCH : -errno;
+ return cg_pid_get_path_internal(unified ? NULL : controller, f, path);
+}
+
+/**
+ * NB: The meaning of @controller is different here than for cg_pid_get_path():
+ *
+ * controller : the cgroup v1 hierarchy with this controller bound to it
+ * controller == NULL : the cgroup v2 (unified) hierarchy
+ */
+int cg_pid_get_path_internal(const char *controller, FILE *f, char **path) {
+ char line[LINE_MAX];
+ size_t cs = 0;
+
+ assert(path);
+ assert(f);
+
+ if (controller && !cg_controller_is_valid(controller))
+ return -EINVAL;
+
+ if (controller)
+ cs = strlen(controller);
+
FOREACH_LINE(line, f, return -errno) {
char *e, *p;
truncate_nl(line);
- if (unified) {
+ if (!controller) {
e = startswith(line, "0:");
if (!e)
continue;
diff --git a/src/basic/cgroup-util.h b/src/basic/cgroup-util.h
index d730f3490c..0e5b41103e 100644
--- a/src/basic/cgroup-util.h
+++ b/src/basic/cgroup-util.h
@@ -169,6 +169,7 @@ int cg_get_path(const char *controller, const char *path, const char *suffix, ch
int cg_get_path_and_check(const char *controller, const char *path, const char *suffix, char **fs);
int cg_pid_get_path(const char *controller, pid_t pid, char **path);
+int cg_pid_get_path_internal(const char *controller, FILE *f, char **path);
int cg_trim(const char *controller, const char *path, bool delete_root);
diff --git a/src/nspawn/nspawn-cgroup.c b/src/nspawn/nspawn-cgroup.c
index 795ad5f4ae..dafeb4a1ce 100644
--- a/src/nspawn/nspawn-cgroup.c
+++ b/src/nspawn/nspawn-cgroup.c
@@ -37,6 +37,66 @@
#include "nspawn-cgroup.h"
#include "nspawn-mount.h"
+/* Code for managing the list of CGMounts ***************************/
+
+/* The kind of action a CGMount entry describes; cgroup_mount_mounts() switches on this. */
+typedef enum CGMountType {
+ CGMOUNT_SYMLINK, /* create a symlink at dst whose target is src */
+ CGMOUNT_TMPFS, /* mount a tmpfs at dst, with src as the base mount options */
+ CGMOUNT_CGROUP1, /* mount a "cgroup" (v1) filesystem at dst, with src as the mount options */
+ CGMOUNT_CGROUP2, /* mount a "cgroup2" filesystem at dst, with src as the mount options */
+ _CGMOUNT_MAX /* not a real type; upper bound used to range-check a CGMountType */
+} CGMountType;
+
+/* A single planned symlink or mount. Both strings are heap copies made by cgmount_add() and are released by
+ * cgroup_free_mounts(). */
+struct CGMount {
+ CGMountType type;
+ char *src; /* meaning depends on type: the symlink target, or the mount options */
+ char *dst; /* location, interpreted relative to /sys/fs/cgroup ("" is /sys/fs/cgroup itself) */
+};
+
+/**
+ * Appends a new entry of the given @type to @mounts, storing private copies of @src and @dst.
+ *
+ * Returns a pointer to the new entry, or NULL if memory allocation failed. On failure nothing is
+ * appended and no memory is leaked. The returned pointer refers into the list's array, so it may be
+ * invalidated by the next cgmount_add() call on the same list.
+ */
+static CGMount *cgmount_add(CGMounts *mounts, CGMountType type, const char *src, const char *dst) {
+        char *hsrc = NULL, *hdst = NULL;
+        CGMount *c, *ret;
+
+        assert(mounts);
+        assert(type >= 0 && type < _CGMOUNT_MAX);
+        assert(src);
+        assert(dst);
+
+        hsrc = strdup(src);
+        hdst = strdup(dst);
+        if (!hsrc || !hdst)
+                goto fail;
+
+        c = realloc_multiply(mounts->mounts, sizeof(CGMount), mounts->n + 1);
+        if (!c)
+                goto fail;
+
+        mounts->mounts = c;
+        ret = &(mounts->mounts)[mounts->n];
+        (mounts->n)++;
+
+        *ret = (CGMount) {
+                .type = type,
+                .src = hsrc,
+                .dst = hdst,
+        };
+        return ret;
+
+fail:
+        /* The copies have not been handed over to the list yet, so they are still ours to release;
+         * free(NULL) is a no-op, which covers the case where only one strdup() succeeded. */
+        free(hsrc);
+        free(hdst);
+        return NULL;
+}
+
+/* Releases the strings of every entry in @mounts, then the array itself, and leaves @mounts as an
+ * empty list (mounts == NULL, n == 0). */
+void cgroup_free_mounts(CGMounts *mounts) {
+        size_t idx = 0;
+
+        while (idx < mounts->n) {
+                CGMount *entry = &mounts->mounts[idx++];
+
+                free(entry->src);
+                free(entry->dst);
+        }
+
+        free(mounts->mounts);
+        mounts->mounts = NULL;
+        mounts->n = 0;
+}
+
+/********************************************************************/
+
static int chown_cgroup_path(const char *path, uid_t uid_shift) {
_cleanup_close_ int fd = -1;
const char *fn;
@@ -293,84 +353,18 @@ static int get_v1_hierarchies(Set *subsystems) {
return 0;
}
-static int mount_legacy_cgroup_hierarchy(const char *dest, const char *controller, const char *hierarchy,
- CGroupUnified inner_cgver, bool read_only) {
- const char *to, *fstype, *opts;
- int r;
-
- to = strjoina(strempty(dest), "/sys/fs/cgroup/", hierarchy);
-
- r = path_is_mount_point(to, 0);
- if (r < 0 && r != -ENOENT)
- return log_error_errno(r, "Failed to determine if %s is mounted already: %m", to);
- if (r > 0)
- return 0;
-
- mkdir_p(to, 0755);
-
- /* The superblock mount options of the mount point need to be
- * identical to the hosts', and hence writable... */
- if (streq(controller, SYSTEMD_CGROUP_CONTROLLER)) {
- if (inner_cgver >= CGROUP_UNIFIED_SYSTEMD) {
- fstype = "cgroup2";
- opts = NULL;
- } else {
- fstype = "cgroup";
- opts = "none,name=systemd,xattr";
- }
- } else {
- fstype = "cgroup";
- opts = controller;
- }
-
- r = mount_verbose(LOG_ERR, "cgroup", to, fstype, MS_NOSUID|MS_NOEXEC|MS_NODEV, opts);
- if (r < 0)
- return r;
-
- /* ... hence let's only make the bind mount read-only, not the superblock. */
- if (read_only) {
- r = mount_verbose(LOG_ERR, NULL, to, NULL,
- MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL);
- if (r < 0)
- return r;
- }
-
- return 1;
-}
-
-/* Mount a legacy cgroup hierarchy when cgroup namespaces are supported. */
-static int mount_legacy_cgns_supported(
- CGroupUnified outer_cgver, CGroupUnified inner_cgver, bool userns, uid_t uid_shift,
- uid_t uid_range, const char *selinux_apifs_context) {
+/* Decide the legacy cgroup mounts when cgroup namespaces are used. */
+static int cgroup_decide_mounts_sd_y_cgns(
+ CGMounts *ret_mounts,
+ CGroupUnified outer_cgver, CGroupUnified inner_cgver) {
+ _cleanup_(cgroup_free_mounts) CGMounts mounts = {};
_cleanup_set_free_free_ Set *hierarchies = NULL;
- const char *cgroup_root = "/sys/fs/cgroup", *c;
+ const char *c;
int r;
- (void) mkdir_p(cgroup_root, 0755);
-
/* Mount a tmpfs to /sys/fs/cgroup if it's not mounted there yet. */
- r = path_is_mount_point(cgroup_root, AT_SYMLINK_FOLLOW);
- if (r < 0)
- return log_error_errno(r, "Failed to determine if /sys/fs/cgroup is already mounted: %m");
- if (r == 0) {
- _cleanup_free_ char *options = NULL;
-
- /* When cgroup namespaces are enabled and user namespaces are
- * used then the mount of the cgroupfs is done *inside* the new
- * user namespace. We're root in the new user namespace and the
- * kernel will happily translate our uid/gid to the correct
- * uid/gid as seen from e.g. /proc/1/mountinfo. So we simply
- * pass uid 0 and not uid_shift to tmpfs_patch_options().
- */
- r = tmpfs_patch_options("mode=755", 0, selinux_apifs_context, &options);
- if (r < 0)
- return log_oom();
-
- r = mount_verbose(LOG_ERR, "tmpfs", cgroup_root, "tmpfs",
- MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME, options);
- if (r < 0)
- return r;
- }
+ if (!cgmount_add(&mounts, CGMOUNT_TMPFS, "mode=755", ""))
+ return log_oom();
if (outer_cgver >= CGROUP_UNIFIED_ALL)
goto skip_controllers;
@@ -393,9 +387,8 @@ static int mount_legacy_cgns_supported(
if (streq(hierarchy, "name=systemd"))
continue;
- r = mount_legacy_cgroup_hierarchy("", hierarchy, hierarchy, inner_cgver, !userns);
- if (r < 0)
- return r;
+ if (!cgmount_add(&mounts, CGMOUNT_CGROUP1, hierarchy, hierarchy))
+ return log_oom();
/* When multiple hierarchies are co-mounted, make their
* constituting individual hierarchies a symlink to the
@@ -411,62 +404,43 @@ static int mount_legacy_cgns_supported(
if (r == 0)
break;
- target = prefix_root("/sys/fs/cgroup", controller);
- if (!target)
+ if (!cgmount_add(&mounts, CGMOUNT_SYMLINK, hierarchy, controller))
return log_oom();
-
- if (streq(hierarchy, controller))
- break;
-
- r = symlink_idempotent(hierarchy, target);
- if (r == -EINVAL)
- return log_error_errno(r, "Invalid existing symlink for combined hierarchy: %m");
- if (r < 0)
- return log_error_errno(r, "Failed to create symlink for combined hierarchy: %m");
}
}
skip_controllers:
- r = mount_legacy_cgroup_hierarchy("", SYSTEMD_CGROUP_CONTROLLER, "systemd", inner_cgver, false);
- if (r < 0)
- return r;
+ /* Finally the systemd hierarchy itself. cgroup_decide_mounts() only sends CGROUP_UNIFIED_NONE and
+  * CGROUP_UNIFIED_SYSTEMD here; in the latter (hybrid) case /sys/fs/cgroup/systemd is a cgroup2 mount.
+  * CGROUP_UNIFIED_ALL is accepted too, mirroring the old ">= CGROUP_UNIFIED_SYSTEMD" check. */
+ switch (inner_cgver) {
+ case CGROUP_UNIFIED_NONE:
+ if (!cgmount_add(&mounts, CGMOUNT_CGROUP1, "name=systemd", "systemd"))
+ return log_oom();
+ break;
+ case CGROUP_UNIFIED_SYSTEMD:
+ case CGROUP_UNIFIED_ALL:
+ if (!cgmount_add(&mounts, CGMOUNT_CGROUP2, "", "systemd"))
+ return log_oom();
+ break;
+ default:
+ assert_not_reached("non-legacy cgroup version desired in legacy setup function");
+ return -EINVAL;
+ }
- if (!userns)
- return mount_verbose(LOG_ERR, NULL, cgroup_root, NULL,
- MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME|MS_RDONLY, "mode=755");
+ *ret_mounts = mounts;
+ mounts = (CGMounts){};
return 0;
}
-/* Mount legacy cgroup hierarchy when cgroup namespaces are unsupported. */
-static int mount_legacy_cgns_unsupported(
- const char *dest,
- CGroupUnified outer_cgver, CGroupUnified inner_cgver, bool userns, uid_t uid_shift, uid_t uid_range,
- const char *selinux_apifs_context) {
+/* Decide the legacy cgroup mounts when cgroup namespaces are not used. */
+static int cgroup_decide_mounts_sd_n_cgns(
+ CGMounts *ret_mounts,
+ CGroupUnified outer_cgver, CGroupUnified inner_cgver) {
+ _cleanup_(cgroup_free_mounts) CGMounts mounts = {};
_cleanup_set_free_free_ Set *controllers = NULL;
- const char *cgroup_root;
int r;
- cgroup_root = prefix_roota(dest, "/sys/fs/cgroup");
-
- (void) mkdir_p(cgroup_root, 0755);
-
/* Mount a tmpfs to /sys/fs/cgroup if it's not mounted there yet. */
- r = path_is_mount_point(cgroup_root, AT_SYMLINK_FOLLOW);
- if (r < 0)
- return log_error_errno(r, "Failed to determine if /sys/fs/cgroup is already mounted: %m");
- if (r == 0) {
- _cleanup_free_ char *options = NULL;
-
- r = tmpfs_patch_options("mode=755", uid_shift == 0 ? UID_INVALID : uid_shift, selinux_apifs_context, &options);
- if (r < 0)
- return log_oom();
-
- r = mount_verbose(LOG_ERR, "tmpfs", cgroup_root, "tmpfs",
- MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME, options);
- if (r < 0)
- return r;
- }
+ if (!cgmount_add(&mounts, CGMOUNT_TMPFS, "mode=755", ""))
+ return log_oom();
if (outer_cgver >= CGROUP_UNIFIED_ALL)
goto skip_controllers;
@@ -494,19 +468,12 @@ static int mount_legacy_cgns_unsupported(
if (r == -EINVAL) {
/* Not a symbolic link, but directly a single cgroup hierarchy */
- r = mount_legacy_cgroup_hierarchy(dest, controller, controller, inner_cgver, true);
- if (r < 0)
- return r;
+ if (!cgmount_add(&mounts, CGMOUNT_CGROUP1, controller, controller))
+ return log_oom();
} else if (r < 0)
return log_error_errno(r, "Failed to read link %s: %m", origin);
else {
- _cleanup_free_ char *target = NULL;
-
- target = prefix_root(dest, origin);
- if (!target)
- return log_oom();
-
/* A symbolic link, a combination of controllers in one hierarchy */
if (!filename_is_valid(combined)) {
@@ -514,101 +481,184 @@ static int mount_legacy_cgns_unsupported(
continue;
}
- r = mount_legacy_cgroup_hierarchy(dest, combined, combined, inner_cgver, true);
- if (r < 0)
- return r;
+ if (!cgmount_add(&mounts, CGMOUNT_CGROUP1, combined, combined))
+ return log_oom();
- r = symlink_idempotent(combined, target);
- if (r == -EINVAL)
- return log_error_errno(r, "Invalid existing symlink for combined hierarchy: %m");
- if (r < 0)
- return log_error_errno(r, "Failed to create symlink for combined hierarchy: %m");
+ if (!cgmount_add(&mounts, CGMOUNT_SYMLINK, combined, controller))
+ return log_oom();
}
}
skip_controllers:
- r = mount_legacy_cgroup_hierarchy(dest, SYSTEMD_CGROUP_CONTROLLER, "systemd", inner_cgver, false);
- if (r < 0)
- return r;
+ /* Finally the systemd hierarchy itself. cgroup_decide_mounts() only sends CGROUP_UNIFIED_NONE and
+  * CGROUP_UNIFIED_SYSTEMD here; in the latter (hybrid) case /sys/fs/cgroup/systemd is a cgroup2 mount.
+  * CGROUP_UNIFIED_ALL is accepted too, mirroring the old ">= CGROUP_UNIFIED_SYSTEMD" check. */
+ switch (inner_cgver) {
+ case CGROUP_UNIFIED_NONE:
+ if (!cgmount_add(&mounts, CGMOUNT_CGROUP1, "name=systemd", "systemd"))
+ return log_oom();
+ break;
+ case CGROUP_UNIFIED_SYSTEMD:
+ case CGROUP_UNIFIED_ALL:
+ if (!cgmount_add(&mounts, CGMOUNT_CGROUP2, "", "systemd"))
+ return log_oom();
+ break;
+ default:
+ assert_not_reached("non-legacy cgroup version desired in legacy setup function");
+ return -EINVAL;
+ }
- return mount_verbose(LOG_ERR, NULL, cgroup_root, NULL,
- MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME|MS_RDONLY, "mode=755");
-}
+ *ret_mounts = mounts;
+ mounts = (CGMounts){};
-static int mount_unified_cgroups(const char *dest) {
- const char *p;
- int r;
+ return 0;
+}
- assert(dest);
+/**
+ * Decides which cgroup symlinks/mounts the container needs, without performing any of them.
+ *
+ * ret_mounts : receives the resulting list; it should be an empty (zero-initialised) list on entry. The caller
+ *              releases it with cgroup_free_mounts().
+ * outer_cgver: cgroup setup of the host, forwarded to the legacy helpers.
+ * inner_cgver: cgroup setup wanted inside the container; selects between the legacy/hybrid helpers and a
+ *              single cgroup2 mount.
+ * use_cgns   : whether cgroup namespaces are used; selects which legacy helper runs.
+ *
+ * Returns 0 on success, or a negative errno-style value on failure.
+ */
+int cgroup_decide_mounts(
+                CGMounts *ret_mounts,
+                CGroupUnified outer_cgver, CGroupUnified inner_cgver,
+                bool use_cgns) {
+        switch (inner_cgver) {
+        case CGROUP_UNIFIED_NONE:
+        case CGROUP_UNIFIED_SYSTEMD:
+                if (use_cgns)
+                        return cgroup_decide_mounts_sd_y_cgns(ret_mounts, outer_cgver, inner_cgver);
+                else
+                        return cgroup_decide_mounts_sd_n_cgns(ret_mounts, outer_cgver, inner_cgver);
+        case CGROUP_UNIFIED_ALL:
+                /* An entry's src ends up as the mount *options* (the mount source name is fixed when the
+                 * mount is performed), so it must stay empty here, as for the other CGMOUNT_CGROUP2 entries;
+                 * a stray "cgroup" option would presumably be rejected by cgroup2. */
+                if (!cgmount_add(ret_mounts, CGMOUNT_CGROUP2, "", ""))
+                        return log_oom();
+                return 0;
+        default:
+                assert_not_reached("Invalid cgroup ver requested");
+                return -EINVAL;
+        }
+}
- p = prefix_roota(dest, "/sys/fs/cgroup");
+/********************************************************************/
- (void) mkdir_p(p, 0755);
+static int cgroup_mount_cg(
+ const char *mountpoint, const char *opts, CGMountType fstype,
+ FILE *cgfile, bool use_userns) {
+ const bool use_cgns = cgfile == NULL;
+ /* If we are using userns and cgns, then we always let it be RW, because we can count on the shifted root user
+ * to not have access to the things that would make us want to mount it RO. Otherwise, we give the container
+ * RW access to its unified or name=systemd cgroup. */
+ const bool rw = (use_userns && use_cgns) || fstype == CGMOUNT_CGROUP2 || streq(mountpoint, "/sys/fs/cgroup/systemd");
+ int r;
- r = path_is_mount_point(p, AT_SYMLINK_FOLLOW);
+ /* First the base mount; this is always RW, as to not change the superblock settings */
+ r = mount_verbose(LOG_ERR, "cgroup", mountpoint, fstype == CGMOUNT_CGROUP1 ? "cgroup" : "cgroup2",
+ MS_NOSUID|MS_NOEXEC|MS_NODEV, opts);
if (r < 0)
- return log_error_errno(r, "Failed to determine if %s is mounted already: %m", p);
- if (r > 0) {
- p = prefix_roota(dest, "/sys/fs/cgroup/cgroup.procs");
- if (access(p, F_OK) >= 0)
- return 0;
- if (errno != ENOENT)
- return log_error_errno(errno, "Failed to determine if mount point %s contains the unified cgroup hierarchy: %m", p);
-
- log_error("%s is already mounted but not a unified cgroup hierarchy. Refusing.", p);
- return -EINVAL;
- }
-
- return mount_verbose(LOG_ERR, "cgroup", p, "cgroup2", MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL);
-}
+ return r;
-int mount_cgroups(
- const char *dest,
- CGroupUnified outer_cgver, CGroupUnified inner_cgver,
- bool userns, uid_t uid_shift, uid_t uid_range,
- const char *selinux_apifs_context,
- bool use_cgns) {
+ /* Now, if necessary, we remount RO */
+ if (rw) {
+ if (!use_cgns) {
+ /* emulate cgns by mounting everything but our subcgroup RO */
+ const char *cgpath;
+ char *cgroup = NULL;
+ if (fstype == CGMOUNT_CGROUP2) {
+ rewind(cgfile);
+ r = cg_pid_get_path_internal(NULL, cgfile, &cgroup);
+ if (r < 0)
+ return log_error_errno(r, "Failed to get child's cgroup v2 path");
+ } else {
+ const char *scontroller, *state;
+ size_t controller_len;
+ FOREACH_WORD_SEPARATOR(scontroller, controller_len, opts, ",", state) {
+ _cleanup_free_ const char *controller = strndup(scontroller, controller_len);
+ rewind(cgfile);
+ if (cg_pid_get_path_internal(controller, cgfile, &cgroup) == 0)
+ break;
+ }
+ if (!cgroup)
+ return log_error_errno(EBADMSG, "Failed to associate mounted cgroup hierarchy %s with numbered cgroup hierarchy", mountpoint);
+ }
+ cgpath = prefix_roota(mountpoint, cgroup);
- if (inner_cgver >= CGROUP_UNIFIED_ALL)
- return mount_unified_cgroups(dest);
- else if (use_cgns)
- return mount_legacy_cgns_supported(outer_cgver, inner_cgver, userns, uid_shift, uid_range, selinux_apifs_context);
+ r = mount_verbose(LOG_ERR, cgpath, cgpath, NULL, MS_BIND, NULL);
+ if (r < 0)
+ return r;
+ r = mount_verbose(LOG_ERR, NULL, mountpoint, NULL,
+ MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL);
+ if (r < 0)
+ return r;
+ }
+ } else {
+ r = mount_verbose(LOG_ERR, NULL, mountpoint, NULL,
+ MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL);
+ if (r < 0)
+ return r;
+ }
- return mount_legacy_cgns_unsupported(dest, outer_cgver, inner_cgver, userns, uid_shift, uid_range, selinux_apifs_context);
+ return 0;
}
-int mount_systemd_cgroup_writable(
- const char *dest,
- CGroupUnified inner_cgver) {
+int cgroup_mount_mounts(CGMounts m, FILE *cgfile, uid_t uid_shift, const char *selinux_apifs_context) {
+ const bool use_cgns = cgfile == NULL;
+ const bool use_userns = uid_shift != UID_INVALID;
- _cleanup_free_ char *own_cgroup_path = NULL;
- const char *systemd_root, *systemd_own;
- int r;
+ bool used_tmpfs = false;
- assert(dest);
+ for (size_t i = 0; i < m.n; i++) {
+ _cleanup_free_ char *options = NULL;
+ const char *dst;
+ int r;
- r = cg_pid_get_path(NULL, 0, &own_cgroup_path);
- if (r < 0)
- return log_error_errno(r, "Failed to determine our own cgroup path: %m");
+ dst = prefix_roota("/sys/fs/cgroup", m.mounts[i].dst);
- /* If we are living in the top-level, then there's nothing to do... */
- if (path_equal(own_cgroup_path, "/"))
- return 0;
+ /* The checks here to see if things are already mounted are kind of primitive. Perhaps they should
+ * actually check the statfs() f_type to verify that the thing mounted is what we want to be mounted
+ * (similar to cgroup-util's detection logic)? But I don't really understand the use-case for having
+ * any of these already mounted, so I'm not sure if such increased strictness would be unwelcome. */
- if (inner_cgver >= CGROUP_UNIFIED_ALL) {
- systemd_own = strjoina(dest, "/sys/fs/cgroup", own_cgroup_path);
- systemd_root = prefix_roota(dest, "/sys/fs/cgroup");
- } else {
- systemd_own = strjoina(dest, "/sys/fs/cgroup/systemd", own_cgroup_path);
- systemd_root = prefix_roota(dest, "/sys/fs/cgroup/systemd");
+ switch (m.mounts[i].type) {
+ case CGMOUNT_SYMLINK:
+ (void) mkdir_parents(dst, 0755);
+ r = symlink_idempotent(m.mounts[i].src, dst);
+ if (r < 0)
+ return r;
+ break;
+ case CGMOUNT_TMPFS:
+ used_tmpfs = true;
+ r = path_is_mount_point(dst, AT_SYMLINK_FOLLOW);
+ if (r < 0)
+ return log_error_errno(r, "Failed to determine if %s is mounted already: %m", dst);
+ if (r > 0)
+ continue;
+ r = tmpfs_patch_options(m.mounts[i].src, uid_shift, selinux_apifs_context, &options);
+ if (r < 0)
+ return log_oom();
+ r = mount_verbose(LOG_ERR, /*name*/"tmpfs", dst, /*fstype*/"tmpfs",
+ MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME, options);
+ if (r < 0)
+ return r;
+ break;
+ case CGMOUNT_CGROUP1:
+ case CGMOUNT_CGROUP2:
+ r = path_is_mount_point(dst, 0);
+ if (r < 0 && r != -ENOENT)
+ return log_error_errno(r, "Failed to determine if %s is mounted already: %m", dst);
+ if (r > 0) {
+ if (access(prefix_roota(dst, "cgroup.procs"), F_OK) >= 0)
+ continue;
+ if (errno != ENOENT)
+ return log_error_errno(errno, "Failed to determine if mount point %s is a cgroup hierarchy: %m", dst);
+ return log_error_errno(EINVAL, "%s is already mounted but not a cgroup hierarchy. Refusing.", dst);
+ }
+ (void) mkdir_p(dst, 0755);
+ r = cgroup_mount_cg(dst, m.mounts[i].src, m.mounts[i].type, cgfile, use_userns);
+ if (r < 0)
+ return r;
+ break;
+ default:
+ assert_not_reached("Invalid CGMount type");
+ return -EINVAL;
+ }
}
- /* Make our own cgroup a (writable) bind mount */
- r = mount_verbose(LOG_ERR, systemd_own, systemd_own, NULL, MS_BIND, NULL);
- if (r < 0)
- return r;
+ /* I'm going to be honest: I don't understand why we don't do this if we're using both userns and cgns. */
+ if (used_tmpfs && (!use_userns || !use_cgns))
+ return mount_verbose(LOG_ERR, NULL, "/sys/fs/cgroup", NULL,
+ MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME|MS_RDONLY, "mode=755");
- /* And then remount the systemd cgroup root read-only */
- return mount_verbose(LOG_ERR, NULL, systemd_root, NULL,
- MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL);
+ return 0;
}
diff --git a/src/nspawn/nspawn-cgroup.h b/src/nspawn/nspawn-cgroup.h
index e677766726..46c669c446 100644
--- a/src/nspawn/nspawn-cgroup.h
+++ b/src/nspawn/nspawn-cgroup.h
@@ -24,7 +24,13 @@
#include "cgroup-util.h"
-int cgroup_setup(pid_t pid, CGroupUnified outer_cgver, CGroupUnified inner_cgver, uid_t uid_shift);
+/* Opaque in this header; the struct is defined in nspawn-cgroup.c. */
+typedef struct CGMount CGMount;
+/* A list of planned cgroup mounts/symlinks: filled in by cgroup_decide_mounts(), carried out by
+ * cgroup_mount_mounts(), and released with cgroup_free_mounts(). A zero-initialised value is an empty list. */
+typedef struct CGMounts {
+ CGMount *mounts; /* array of n entries */
+ size_t n; /* number of entries in mounts */
+} CGMounts;
-int mount_cgroups(const char *dest, CGroupUnified outer_cgver, CGroupUnified inner_cgver, bool userns, uid_t uid_shift, uid_t uid_range, const char *selinux_apifs_context, bool use_cgns);
-int mount_systemd_cgroup_writable(const char *dest, CGroupUnified inner_cgver);
+int cgroup_setup(pid_t pid, CGroupUnified outer_cgver, CGroupUnified inner_cgver, uid_t uid_shift);
+int cgroup_decide_mounts(CGMounts *ret_mounts, CGroupUnified outer_cgver, CGroupUnified inner_cgver, bool use_cgns);
+int cgroup_mount_mounts(CGMounts mounts, FILE *cgfile, uid_t uid_shift, const char *selinux_apifs_context);
+void cgroup_free_mounts(CGMounts *mounts);
diff --git a/src/nspawn/nspawn.c b/src/nspawn/nspawn.c
index 5c4341e0ee..94c7eea9b7 100644
--- a/src/nspawn/nspawn.c
+++ b/src/nspawn/nspawn.c
@@ -2648,7 +2648,7 @@ static int inner_child(
int kmsg_socket,
int rtnl_socket,
FDSet *fds,
- CGroupUnified outer_cgver) {
+ CGMounts cgmounts) {
_cleanup_free_ char *home = NULL;
char as_uuid[37];
@@ -2706,19 +2706,11 @@ static int inner_child(
r = unshare(CLONE_NEWCGROUP);
if (r < 0)
return log_error_errno(errno, "Failed to unshare cgroup namespace");
- r = mount_cgroups(
- "",
- outer_cgver,
- arg_unified_cgroup_hierarchy,
- arg_userns_mode != USER_NAMESPACE_NO,
- arg_uid_shift,
- arg_uid_range,
- arg_selinux_apifs_context,
- true);
- if (r < 0)
- return r;
- } else {
- r = mount_systemd_cgroup_writable("", arg_unified_cgroup_hierarchy);
+ r = cgroup_mount_mounts(cgmounts,
+ NULL,
+ arg_userns_mode == USER_NAMESPACE_NO ? UID_INVALID : 0,
+ arg_selinux_apifs_context);
+ cgroup_free_mounts(&cgmounts);
if (r < 0)
return r;
}
@@ -2915,6 +2907,7 @@ static int outer_child(
int kmsg_socket,
int rtnl_socket,
int uid_shift_socket,
+ int cgroup_socket,
FDSet *fds,
CGroupUnified outer_cgver) {
@@ -2922,6 +2915,7 @@ static int outer_child(
ssize_t l;
int r;
_cleanup_close_ int fd = -1;
+ _cleanup_(cgroup_free_mounts) CGMounts cgmounts = {};
assert(barrier);
assert(directory);
@@ -3110,19 +3104,11 @@ static int outer_child(
if (r < 0)
return r;
- if (!arg_use_cgns) {
- r = mount_cgroups(
- directory,
- outer_cgver,
- arg_unified_cgroup_hierarchy,
- arg_userns_mode != USER_NAMESPACE_NO,
- arg_uid_shift,
- arg_uid_range,
- arg_selinux_apifs_context,
- false);
- if (r < 0)
- return r;
- }
+ r = cgroup_decide_mounts(&cgmounts,
+ outer_cgver, arg_unified_cgroup_hierarchy,
+ arg_use_cgns);
+ if (r < 0)
+ return r;
r = mount_move_root(directory);
if (r < 0)
@@ -3143,12 +3129,13 @@ static int outer_child(
uuid_socket = safe_close(uuid_socket);
notify_socket = safe_close(notify_socket);
uid_shift_socket = safe_close(uid_shift_socket);
+ cgroup_socket = safe_close(cgroup_socket);
/* The inner child has all namespaces that are
* requested, so that we all are owned by the user if
* user namespaces are turned on. */
- r = inner_child(barrier, directory, secondary, kmsg_socket, rtnl_socket, fds, outer_cgver);
+ r = inner_child(barrier, directory, secondary, kmsg_socket, rtnl_socket, fds, cgmounts);
if (r < 0)
_exit(EXIT_FAILURE);
@@ -3175,11 +3162,42 @@ static int outer_child(
if (r < 0)
return log_error_errno(r, "Failed to send notify fd: %m");
+ /* If !use_cgns, then we need to do this here because without cgns cgroups can't be mounted inside of a
+ * less privileged mountns (and using userns causes the mountns to be less privileged). */
+ if (!arg_use_cgns) {
+ /* If !use_cgns, then cgroup_mount_mounts() needs to look at /proc/pid/cgroup; but because we've
+ * already chroot()ed, we don't have access to /proc. So the parent opens the file and sends it to
+ * us. */
+ int cgfd;
+ _cleanup_fclose_ FILE *cgfile = NULL;
+
+ assert(cgroup_socket);
+
+ cgfd = receive_one_fd(cgroup_socket, 0);
+ if (cgfd < 0)
+ return log_error_errno(cgfd, "Failed to recv cgroup fd: %m");
+
+ cgfile = fdopen(cgfd, "re");
+ if (!cgfile) {
+ r = -errno; /* in case safe_close sets errno */
+ cgfd = safe_close(cgfd);
+ return log_error_errno(r, "Failed to create a stream object for cgroup fd: %m");
+ }
+
+ r = cgroup_mount_mounts(cgmounts,
+ cgfile,
+ arg_userns_mode == USER_NAMESPACE_NO ? UID_INVALID : arg_uid_shift,
+ arg_selinux_apifs_context);
+ if (r < 0)
+ return r;
+ }
+
pid_socket = safe_close(pid_socket);
uuid_socket = safe_close(uuid_socket);
notify_socket = safe_close(notify_socket);
kmsg_socket = safe_close(kmsg_socket);
rtnl_socket = safe_close(rtnl_socket);
+ cgroup_socket = safe_close(cgroup_socket);
return 0;
}
@@ -3611,7 +3629,8 @@ static int run(int master,
pid_socket_pair[2] = { -1, -1 },
uuid_socket_pair[2] = { -1, -1 },
notify_socket_pair[2] = { -1, -1 },
- uid_shift_socket_pair[2] = { -1, -1 };
+ uid_shift_socket_pair[2] = { -1, -1 },
+ cgroup_socket_pair[2] = {-1, -1 };
_cleanup_close_ int notify_socket= -1;
_cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
_cleanup_(sd_event_unrefp) sd_event *event = NULL;
@@ -3662,6 +3681,10 @@ static int run(int master,
if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, uid_shift_socket_pair) < 0)
return log_error_errno(errno, "Failed to create uid shift socket pair: %m");
+ if (!arg_use_cgns)
+ if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, cgroup_socket_pair) < 0)
+ return log_error_errno(errno, "Failed to create cgroup socket pair: %m");
+
/* Child can be killed before execv(), so handle SIGCHLD in order to interrupt
* parent's blocking calls and give it a chance to call wait() and terminate. */
r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
@@ -3690,6 +3713,7 @@ static int run(int master,
uuid_socket_pair[0] = safe_close(uuid_socket_pair[0]);
notify_socket_pair[0] = safe_close(notify_socket_pair[0]);
uid_shift_socket_pair[0] = safe_close(uid_shift_socket_pair[0]);
+ cgroup_socket_pair[0] = safe_close(cgroup_socket_pair[0]);
(void) reset_all_signal_handlers();
(void) reset_signal_mask();
@@ -3709,6 +3733,7 @@ static int run(int master,
kmsg_socket_pair[1],
rtnl_socket_pair[1],
uid_shift_socket_pair[1],
+ cgroup_socket_pair[1],
fds,
outer_cgver);
if (r < 0)
@@ -3727,6 +3752,7 @@ static int run(int master,
uuid_socket_pair[1] = safe_close(uuid_socket_pair[1]);
notify_socket_pair[1] = safe_close(notify_socket_pair[1]);
uid_shift_socket_pair[1] = safe_close(uid_shift_socket_pair[1]);
+ cgroup_socket_pair[1] = safe_close(cgroup_socket_pair[1]);
if (arg_userns_mode != USER_NAMESPACE_NO) {
/* The child just let us know the UID shift it might have read from the image. */
@@ -3847,6 +3873,8 @@ static int run(int master,
}
if (arg_register) {
+ /* If the child is to be placed into a different cgroup,
+ * this is what does it. */
r = register_machine(
arg_machine,
*main_pid,
@@ -3867,6 +3895,20 @@ static int run(int master,
if (r < 0)
return r;
+ if (!arg_use_cgns) {
+ /* helper_pid won't exit until this happens */
+ const char *fs;
+ _cleanup_close_ int fd;
+
+ fs = procfs_file_alloca(*main_pid, "cgroup");
+ fd = open(fs, O_RDONLY|O_CLOEXEC);
+ if (fd < 0)
+ return log_error_errno(errno, "Failed to open cgroups of child: %m");
+
+ r = send_one_fd(cgroup_socket_pair[0], fd, 0);
+ if (r < 0)
+ return log_error_errno(r, "Failed to send cgroup fd: %m");
+ }
/* Wait for the outer child. */
r = wait_for_terminate_and_warn("namespace helper", *helper_pid, NULL);