summaryrefslogtreecommitdiff
path: root/src/nspawn/nspawn-cgroup.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/nspawn/nspawn-cgroup.c')
-rw-r--r--src/nspawn/nspawn-cgroup.c450
1 files changed, 250 insertions, 200 deletions
diff --git a/src/nspawn/nspawn-cgroup.c b/src/nspawn/nspawn-cgroup.c
index 795ad5f4ae..dafeb4a1ce 100644
--- a/src/nspawn/nspawn-cgroup.c
+++ b/src/nspawn/nspawn-cgroup.c
@@ -37,6 +37,66 @@
#include "nspawn-cgroup.h"
#include "nspawn-mount.h"
+/* Code for managing the list of CGMounts ***************************/
+
+typedef enum CGMountType {
+ CGMOUNT_SYMLINK,
+ CGMOUNT_TMPFS,
+ CGMOUNT_CGROUP1,
+ CGMOUNT_CGROUP2,
+ _CGMOUNT_MAX
+} CGMountType;
+
+struct CGMount {
+ CGMountType type;
+ char *src;
+ char *dst;
+};
+
+static CGMount *cgmount_add(CGMounts *mounts, CGMountType type, const char *src, const char *dst) {
+ char *hsrc = NULL, *hdst = NULL;
+ CGMount *c, *ret;
+
+ assert(mounts);
+ assert(type >= 0 && type < _CGMOUNT_MAX);
+ assert(src);
+ assert(dst);
+
+ hsrc = strdup(src);
+ hdst = strdup(dst);
+ if (!hsrc || !hdst) {
+ free(hsrc);
+ free(hdst);
+ return NULL;
+ }
+
+ c = realloc_multiply(mounts->mounts, sizeof(CGMount), mounts->n + 1);
+ if (!c)
+ return NULL;
+
+ mounts->mounts = c;
+ ret = &(mounts->mounts)[mounts->n];
+ (mounts->n)++;
+
+ *ret = (CGMount) {
+ .type = type,
+ .src = hsrc,
+ .dst = hdst,
+ };
+ return ret;
+}
+
+void cgroup_free_mounts(CGMounts *mounts) {
+ for (size_t i = 0; i < mounts->n; i++) {
+ free(mounts->mounts[i].src);
+ free(mounts->mounts[i].dst);
+ }
+ mounts->mounts = mfree(mounts->mounts);
+ mounts->n = 0;
+}
+
+/********************************************************************/
+
static int chown_cgroup_path(const char *path, uid_t uid_shift) {
_cleanup_close_ int fd = -1;
const char *fn;
@@ -293,84 +353,18 @@ static int get_v1_hierarchies(Set *subsystems) {
return 0;
}
-static int mount_legacy_cgroup_hierarchy(const char *dest, const char *controller, const char *hierarchy,
- CGroupUnified inner_cgver, bool read_only) {
- const char *to, *fstype, *opts;
- int r;
-
- to = strjoina(strempty(dest), "/sys/fs/cgroup/", hierarchy);
-
- r = path_is_mount_point(to, 0);
- if (r < 0 && r != -ENOENT)
- return log_error_errno(r, "Failed to determine if %s is mounted already: %m", to);
- if (r > 0)
- return 0;
-
- mkdir_p(to, 0755);
-
- /* The superblock mount options of the mount point need to be
- * identical to the hosts', and hence writable... */
- if (streq(controller, SYSTEMD_CGROUP_CONTROLLER)) {
- if (inner_cgver >= CGROUP_UNIFIED_SYSTEMD) {
- fstype = "cgroup2";
- opts = NULL;
- } else {
- fstype = "cgroup";
- opts = "none,name=systemd,xattr";
- }
- } else {
- fstype = "cgroup";
- opts = controller;
- }
-
- r = mount_verbose(LOG_ERR, "cgroup", to, fstype, MS_NOSUID|MS_NOEXEC|MS_NODEV, opts);
- if (r < 0)
- return r;
-
- /* ... hence let's only make the bind mount read-only, not the superblock. */
- if (read_only) {
- r = mount_verbose(LOG_ERR, NULL, to, NULL,
- MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL);
- if (r < 0)
- return r;
- }
-
- return 1;
-}
-
-/* Mount a legacy cgroup hierarchy when cgroup namespaces are supported. */
-static int mount_legacy_cgns_supported(
- CGroupUnified outer_cgver, CGroupUnified inner_cgver, bool userns, uid_t uid_shift,
- uid_t uid_range, const char *selinux_apifs_context) {
+/* Decide the legacy cgroup mounts when cgroup namespaces are used. */
+static int cgroup_decide_mounts_sd_y_cgns(
+ CGMounts *ret_mounts,
+ CGroupUnified outer_cgver, CGroupUnified inner_cgver) {
+ _cleanup_(cgroup_free_mounts) CGMounts mounts = {};
_cleanup_set_free_free_ Set *hierarchies = NULL;
- const char *cgroup_root = "/sys/fs/cgroup", *c;
+ const char *c;
int r;
- (void) mkdir_p(cgroup_root, 0755);
-
/* Mount a tmpfs to /sys/fs/cgroup if it's not mounted there yet. */
- r = path_is_mount_point(cgroup_root, AT_SYMLINK_FOLLOW);
- if (r < 0)
- return log_error_errno(r, "Failed to determine if /sys/fs/cgroup is already mounted: %m");
- if (r == 0) {
- _cleanup_free_ char *options = NULL;
-
- /* When cgroup namespaces are enabled and user namespaces are
- * used then the mount of the cgroupfs is done *inside* the new
- * user namespace. We're root in the new user namespace and the
- * kernel will happily translate our uid/gid to the correct
- * uid/gid as seen from e.g. /proc/1/mountinfo. So we simply
- * pass uid 0 and not uid_shift to tmpfs_patch_options().
- */
- r = tmpfs_patch_options("mode=755", 0, selinux_apifs_context, &options);
- if (r < 0)
- return log_oom();
-
- r = mount_verbose(LOG_ERR, "tmpfs", cgroup_root, "tmpfs",
- MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME, options);
- if (r < 0)
- return r;
- }
+ if (!cgmount_add(&mounts, CGMOUNT_TMPFS, "mode=755", ""))
+ return log_oom();
if (outer_cgver >= CGROUP_UNIFIED_ALL)
goto skip_controllers;
@@ -393,9 +387,8 @@ static int mount_legacy_cgns_supported(
if (streq(hierarchy, "name=systemd"))
continue;
- r = mount_legacy_cgroup_hierarchy("", hierarchy, hierarchy, inner_cgver, !userns);
- if (r < 0)
- return r;
+ if (!cgmount_add(&mounts, CGMOUNT_CGROUP1, hierarchy, hierarchy))
+ return log_oom();
/* When multiple hierarchies are co-mounted, make their
* constituting individual hierarchies a symlink to the
@@ -411,62 +404,43 @@ static int mount_legacy_cgns_supported(
if (r == 0)
break;
- target = prefix_root("/sys/fs/cgroup", controller);
- if (!target)
+ if (!cgmount_add(&mounts, CGMOUNT_SYMLINK, hierarchy, controller))
return log_oom();
-
- if (streq(hierarchy, controller))
- break;
-
- r = symlink_idempotent(hierarchy, target);
- if (r == -EINVAL)
- return log_error_errno(r, "Invalid existing symlink for combined hierarchy: %m");
- if (r < 0)
- return log_error_errno(r, "Failed to create symlink for combined hierarchy: %m");
}
}
skip_controllers:
- r = mount_legacy_cgroup_hierarchy("", SYSTEMD_CGROUP_CONTROLLER, "systemd", inner_cgver, false);
- if (r < 0)
- return r;
+ switch (inner_cgver) {
+ case CGROUP_UNIFIED_NONE:
+ if (!cgmount_add(&mounts, CGMOUNT_CGROUP1, "name=systmed", "systemd"))
+ return log_oom();
+ break;
+ case CGROUP_UNIFIED_ALL:
+ if (!cgmount_add(&mounts, CGMOUNT_CGROUP2, "", "systemd"))
+ return log_oom();
+ break;
+ default:
+ assert_not_reached("non-legacy cgroup version desired in legacy setup function");
+ return -EINVAL;
+ }
- if (!userns)
- return mount_verbose(LOG_ERR, NULL, cgroup_root, NULL,
- MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME|MS_RDONLY, "mode=755");
+ *ret_mounts = mounts;
+ mounts = (CGMounts){};
return 0;
}
-/* Mount legacy cgroup hierarchy when cgroup namespaces are unsupported. */
-static int mount_legacy_cgns_unsupported(
- const char *dest,
- CGroupUnified outer_cgver, CGroupUnified inner_cgver, bool userns, uid_t uid_shift, uid_t uid_range,
- const char *selinux_apifs_context) {
+/* Decide the legacy cgroup mounts when cgroup namespaces are not used. */
+static int cgroup_decide_mounts_sd_n_cgns(
+ CGMounts *ret_mounts,
+ CGroupUnified outer_cgver, CGroupUnified inner_cgver) {
+ _cleanup_(cgroup_free_mounts) CGMounts mounts = {};
_cleanup_set_free_free_ Set *controllers = NULL;
- const char *cgroup_root;
int r;
- cgroup_root = prefix_roota(dest, "/sys/fs/cgroup");
-
- (void) mkdir_p(cgroup_root, 0755);
-
/* Mount a tmpfs to /sys/fs/cgroup if it's not mounted there yet. */
- r = path_is_mount_point(cgroup_root, AT_SYMLINK_FOLLOW);
- if (r < 0)
- return log_error_errno(r, "Failed to determine if /sys/fs/cgroup is already mounted: %m");
- if (r == 0) {
- _cleanup_free_ char *options = NULL;
-
- r = tmpfs_patch_options("mode=755", uid_shift == 0 ? UID_INVALID : uid_shift, selinux_apifs_context, &options);
- if (r < 0)
- return log_oom();
-
- r = mount_verbose(LOG_ERR, "tmpfs", cgroup_root, "tmpfs",
- MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME, options);
- if (r < 0)
- return r;
- }
+ if (!cgmount_add(&mounts, CGMOUNT_TMPFS, "mode=755", ""))
+ return log_oom();
if (outer_cgver >= CGROUP_UNIFIED_ALL)
goto skip_controllers;
@@ -494,19 +468,12 @@ static int mount_legacy_cgns_unsupported(
if (r == -EINVAL) {
/* Not a symbolic link, but directly a single cgroup hierarchy */
- r = mount_legacy_cgroup_hierarchy(dest, controller, controller, inner_cgver, true);
- if (r < 0)
- return r;
+ if (!cgmount_add(&mounts, CGMOUNT_CGROUP1, controller, controller))
+ return log_oom();
} else if (r < 0)
return log_error_errno(r, "Failed to read link %s: %m", origin);
else {
- _cleanup_free_ char *target = NULL;
-
- target = prefix_root(dest, origin);
- if (!target)
- return log_oom();
-
/* A symbolic link, a combination of controllers in one hierarchy */
if (!filename_is_valid(combined)) {
@@ -514,101 +481,184 @@ static int mount_legacy_cgns_unsupported(
continue;
}
- r = mount_legacy_cgroup_hierarchy(dest, combined, combined, inner_cgver, true);
- if (r < 0)
- return r;
+ if (!cgmount_add(&mounts, CGMOUNT_CGROUP1, combined, combined))
+ return log_oom();
- r = symlink_idempotent(combined, target);
- if (r == -EINVAL)
- return log_error_errno(r, "Invalid existing symlink for combined hierarchy: %m");
- if (r < 0)
- return log_error_errno(r, "Failed to create symlink for combined hierarchy: %m");
+ if (!cgmount_add(&mounts, CGMOUNT_SYMLINK, combined, controller))
+ return log_oom();
}
}
skip_controllers:
- r = mount_legacy_cgroup_hierarchy(dest, SYSTEMD_CGROUP_CONTROLLER, "systemd", inner_cgver, false);
- if (r < 0)
- return r;
+ switch (inner_cgver) {
+ case CGROUP_UNIFIED_NONE:
+ if (!cgmount_add(&mounts, CGMOUNT_CGROUP1, "name=systmed", "systemd"))
+ return log_oom();
+ break;
+ case CGROUP_UNIFIED_ALL:
+ if (!cgmount_add(&mounts, CGMOUNT_CGROUP2, "", "systemd"))
+ return log_oom();
+ break;
+ default:
+ assert_not_reached("non-legacy cgroup version desired in legacy setup function");
+ return -EINVAL;
+ }
- return mount_verbose(LOG_ERR, NULL, cgroup_root, NULL,
- MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME|MS_RDONLY, "mode=755");
-}
+ *ret_mounts = mounts;
+ mounts = (CGMounts){};
-static int mount_unified_cgroups(const char *dest) {
- const char *p;
- int r;
+ return 0;
+}
- assert(dest);
+int cgroup_decide_mounts(
+ CGMounts *ret_mounts,
+ CGroupUnified outer_cgver, CGroupUnified inner_cgver,
+ bool use_cgns) {
+ switch (inner_cgver) {
+ case CGROUP_UNIFIED_NONE:
+ case CGROUP_UNIFIED_SYSTEMD:
+ if (use_cgns)
+ return cgroup_decide_mounts_sd_y_cgns(ret_mounts, outer_cgver, inner_cgver);
+ else
+ return cgroup_decide_mounts_sd_n_cgns(ret_mounts, outer_cgver, inner_cgver);
+ case CGROUP_UNIFIED_ALL:
+ if (!cgmount_add(ret_mounts, CGMOUNT_CGROUP2, "cgroup", ""))
+ return log_oom();
+ return 0;
+ default:
+ assert_not_reached("Invalid cgroup ver requested");
+ return -EINVAL;
+ }
+}
- p = prefix_roota(dest, "/sys/fs/cgroup");
+/********************************************************************/
- (void) mkdir_p(p, 0755);
+static int cgroup_mount_cg(
+ const char *mountpoint, const char *opts, CGMountType fstype,
+ FILE *cgfile, bool use_userns) {
+ const bool use_cgns = cgfile == NULL;
+ /* If we are using userns and cgns, then we always let it be RW, because we can count on the shifted root user
+ * to not have access to the things that would make us want to mount it RO. Otherwise, we give the container
+ * RW access to its unified or name=systemd cgroup. */
+ const bool rw = (use_userns && use_cgns) || fstype == CGMOUNT_CGROUP2 || streq(mountpoint, "/sys/fs/cgroup/systemd");
+ int r;
- r = path_is_mount_point(p, AT_SYMLINK_FOLLOW);
+ /* First the base mount; this is always RW, as to not change the superblock settings */
+ r = mount_verbose(LOG_ERR, "cgroup", mountpoint, fstype == CGMOUNT_CGROUP1 ? "cgroup" : "cgroup2",
+ MS_NOSUID|MS_NOEXEC|MS_NODEV, opts);
if (r < 0)
- return log_error_errno(r, "Failed to determine if %s is mounted already: %m", p);
- if (r > 0) {
- p = prefix_roota(dest, "/sys/fs/cgroup/cgroup.procs");
- if (access(p, F_OK) >= 0)
- return 0;
- if (errno != ENOENT)
- return log_error_errno(errno, "Failed to determine if mount point %s contains the unified cgroup hierarchy: %m", p);
-
- log_error("%s is already mounted but not a unified cgroup hierarchy. Refusing.", p);
- return -EINVAL;
- }
-
- return mount_verbose(LOG_ERR, "cgroup", p, "cgroup2", MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL);
-}
+ return r;
-int mount_cgroups(
- const char *dest,
- CGroupUnified outer_cgver, CGroupUnified inner_cgver,
- bool userns, uid_t uid_shift, uid_t uid_range,
- const char *selinux_apifs_context,
- bool use_cgns) {
+ /* Now, if nescessary, we remount RO */
+ if (rw) {
+ if (!use_cgns) {
+ /* emulate cgns by mounting everything but our subcgroup RO */
+ const char *cgpath;
+ char *cgroup = NULL;
+ if (fstype == CGMOUNT_CGROUP2) {
+ rewind(cgfile);
+ r = cg_pid_get_path_internal(NULL, cgfile, &cgroup);
+ if (r < 0)
+ return log_error_errno(r, "Failed to get child's cgroup v2 path");
+ } else {
+ const char *scontroller, *state;
+ size_t controller_len;
+ FOREACH_WORD_SEPARATOR(scontroller, controller_len, opts, ",", state) {
+ _cleanup_free_ const char *controller = strndup(scontroller, controller_len);
+ rewind(cgfile);
+ if (cg_pid_get_path_internal(controller, cgfile, &cgroup) == 0)
+ break;
+ }
+ if (!cgroup)
+ return log_error_errno(EBADMSG, "Failed to associate mounted cgroup hierarchy %s with numbered cgroup hierarchy", mountpoint);
+ }
+ cgpath = prefix_roota(mountpoint, cgroup);
- if (inner_cgver >= CGROUP_UNIFIED_ALL)
- return mount_unified_cgroups(dest);
- else if (use_cgns)
- return mount_legacy_cgns_supported(outer_cgver, inner_cgver, userns, uid_shift, uid_range, selinux_apifs_context);
+ r = mount_verbose(LOG_ERR, cgpath, cgpath, NULL, MS_BIND, NULL);
+ if (r < 0)
+ return r;
+ r = mount_verbose(LOG_ERR, NULL, mountpoint, NULL,
+ MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL);
+ if (r < 0)
+ return r;
+ }
+ } else {
+ r = mount_verbose(LOG_ERR, NULL, mountpoint, NULL,
+ MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL);
+ if (r < 0)
+ return r;
+ }
- return mount_legacy_cgns_unsupported(dest, outer_cgver, inner_cgver, userns, uid_shift, uid_range, selinux_apifs_context);
+ return 0;
}
-int mount_systemd_cgroup_writable(
- const char *dest,
- CGroupUnified inner_cgver) {
+int cgroup_mount_mounts(CGMounts m, FILE *cgfile, uid_t uid_shift, const char *selinux_apifs_context) {
+ const bool use_cgns = cgfile == NULL;
+ const bool use_userns = uid_shift != UID_INVALID;
- _cleanup_free_ char *own_cgroup_path = NULL;
- const char *systemd_root, *systemd_own;
- int r;
+ bool used_tmpfs = false;
- assert(dest);
+ for (size_t i = 0; i < m.n; i++) {
+ _cleanup_free_ char *options = NULL;
+ const char *dst;
+ int r;
- r = cg_pid_get_path(NULL, 0, &own_cgroup_path);
- if (r < 0)
- return log_error_errno(r, "Failed to determine our own cgroup path: %m");
+ dst = prefix_roota("/sys/fs/cgroup", m.mounts[i].dst);
- /* If we are living in the top-level, then there's nothing to do... */
- if (path_equal(own_cgroup_path, "/"))
- return 0;
+ /* The checks here to see if things are already mounted are kind of primative. Perhaps they should
+ * actually check the statfs() f_type to verify that the thing mounted is what we want to be mounted
+ * (similar to cgroup-util's detection logic)? But I don't really understand the use-case for having
+ * any of these already mounted, so I'm not sure if such increased strictness would be unwelcome. */
- if (inner_cgver >= CGROUP_UNIFIED_ALL) {
- systemd_own = strjoina(dest, "/sys/fs/cgroup", own_cgroup_path);
- systemd_root = prefix_roota(dest, "/sys/fs/cgroup");
- } else {
- systemd_own = strjoina(dest, "/sys/fs/cgroup/systemd", own_cgroup_path);
- systemd_root = prefix_roota(dest, "/sys/fs/cgroup/systemd");
+ switch (m.mounts[i].type) {
+ case CGMOUNT_SYMLINK:
+ (void) mkdir_parents(dst, 0755);
+ r = symlink_idempotent(m.mounts[i].src, dst);
+ if (r < 0)
+ return r;
+ break;
+ case CGMOUNT_TMPFS:
+ used_tmpfs = true;
+ r = path_is_mount_point(dst, AT_SYMLINK_FOLLOW);
+ if (r < 0)
+ return log_error_errno(r, "Failed to determine if %s is mounted already: %m", dst);
+ if (r > 0)
+ continue;
+ r = tmpfs_patch_options(m.mounts[i].src, uid_shift, selinux_apifs_context, &options);
+ if (r < 0)
+ return log_oom();
+ r = mount_verbose(LOG_ERR, /*name*/"tmpfs", dst, /*fstype*/"tmpfs",
+ MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME, options);
+ if (r < 0)
+ return r;
+ break;
+ case CGMOUNT_CGROUP1:
+ case CGMOUNT_CGROUP2:
+ r = path_is_mount_point(dst, 0);
+ if (r < 0 && r != -ENOENT)
+ return log_error_errno(r, "Failed to determine if %s is mounted already: %m", dst);
+ if (r > 0) {
+ if (access(prefix_roota(dst, "cgroup.procs"), F_OK) >= 0)
+ continue;
+ if (errno != ENOENT)
+ return log_error_errno(errno, "Failed to determine if mount point %s is a cgroup hierarchy: %m", dst);
+ return log_error_errno(EINVAL, "%s is already mounted but not a cgroup hierarchy. Refusing.", dst);
+ }
+ (void) mkdir_p(dst, 0755);
+ r = cgroup_mount_cg(dst, m.mounts[i].src, m.mounts[i].type, cgfile, use_userns);
+ if (r < 0)
+ return r;
+ break;
+ default:
+ assert_not_reached("Invalid CGMount type");
+ return -EINVAL;
+ }
}
- /* Make our own cgroup a (writable) bind mount */
- r = mount_verbose(LOG_ERR, systemd_own, systemd_own, NULL, MS_BIND, NULL);
- if (r < 0)
- return r;
+ /* I'm going to be honest: I don't understand why we don't do this if we're using both userns and cgns. */
+ if (used_tmpfs && (!use_userns || !use_cgns))
+ return mount_verbose(LOG_ERR, NULL, "/sys/fs/cgroup", NULL,
+ MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME|MS_RDONLY, "mode=755");
- /* And then remount the systemd cgroup root read-only */
- return mount_verbose(LOG_ERR, NULL, systemd_root, NULL,
- MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL);
+ return 0;
}