summaryrefslogtreecommitdiff
path: root/src/nspawn
diff options
context:
space:
mode:
authorSergiusz Urbaniak <sergiusz.urbaniak@gmail.com>2016-10-14 14:00:15 +0200
committerSergiusz Urbaniak <sergiusz.urbaniak@gmail.com>2016-11-18 09:50:40 +0100
commit4f086aab52812472a24c9b8b627589880a38696e (patch)
tree22fd04bdbb8346cbfda4201627eabf57c5b9ce51 /src/nspawn
parent843d5baf6aad6c53fc00ea8d95d83209a4f92de1 (diff)
nspawn: R/W support for /sys, and /proc/sys
This commit adds the possibility to leave /sys, and /proc/sys read-write. It introduces a new (undocumented) env var SYSTEMD_NSPAWN_API_VFS_WRITABLE to enable this feature. If set to "yes", /sys, and /proc/sys will be read-write. If set to "no", /sys, and /proc/sys will be read-only. If set to "network" /proc/sys/net will be read-write. This is useful in use-cases, where systemd-nspawn is used in an external network namespace. This adds the possibility to start privileged containers which need more control over settings in the /proc, and /sys filesystem. This is also a follow-up on the discussion from https://github.com/systemd/systemd/pull/4018#r76971862 where an introduction of a simple env var to enable R/W support for those directories was already discussed.
Diffstat (limited to 'src/nspawn')
-rw-r--r--src/nspawn/nspawn-mount.c70
-rw-r--r--src/nspawn/nspawn-mount.h13
-rw-r--r--src/nspawn/nspawn.c53
3 files changed, 99 insertions, 37 deletions
diff --git a/src/nspawn/nspawn-mount.c b/src/nspawn/nspawn-mount.c
index 0c24b8e18a..95bb3c09b0 100644
--- a/src/nspawn/nspawn-mount.c
+++ b/src/nspawn/nspawn-mount.c
@@ -225,9 +225,10 @@ static int tmpfs_patch_options(
return !!buf;
}
-int mount_sysfs(const char *dest) {
+int mount_sysfs(const char *dest, MountSettingsMask mount_settings) {
const char *full, *top, *x;
int r;
+ unsigned long extra_flags = 0;
top = prefix_roota(dest, "/sys");
r = path_check_fstype(top, SYSFS_MAGIC);
@@ -244,8 +245,11 @@ int mount_sysfs(const char *dest) {
(void) mkdir(full, 0755);
+ if (mount_settings & MOUNT_APPLY_APIVFS_RO)
+ extra_flags |= MS_RDONLY;
+
r = mount_verbose(LOG_ERR, "sysfs", full, "sysfs",
- MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL);
+ MS_NOSUID|MS_NOEXEC|MS_NODEV|extra_flags, NULL);
if (r < 0)
return r;
@@ -267,7 +271,7 @@ int mount_sysfs(const char *dest) {
return r;
r = mount_verbose(LOG_ERR, NULL, to, NULL,
- MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, NULL);
+ MS_BIND|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT|extra_flags, NULL);
if (r < 0)
return r;
}
@@ -291,7 +295,7 @@ int mount_sysfs(const char *dest) {
}
return mount_verbose(LOG_ERR, NULL, top, NULL,
- MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, NULL);
+ MS_BIND|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT|extra_flags, NULL);
}
static int mkdir_userns(const char *path, mode_t mode, bool in_userns, uid_t uid_shift) {
@@ -348,8 +352,7 @@ static int mkdir_userns_p(const char *prefix, const char *path, mode_t mode, boo
}
int mount_all(const char *dest,
- bool use_userns, bool in_userns,
- bool use_netns,
+ MountSettingsMask mount_settings,
uid_t uid_shift, uid_t uid_range,
const char *selinux_apifs_context) {
@@ -359,41 +362,52 @@ int mount_all(const char *dest,
const char *type;
const char *options;
unsigned long flags;
- bool fatal;
- bool in_userns;
- bool use_netns;
+ MountSettingsMask mount_settings;
} MountPoint;
static const MountPoint mount_table[] = {
- { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true, true, false },
- { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, true, true, false }, /* Bind mount first ...*/
- { "/proc/sys/net", "/proc/sys/net", NULL, NULL, MS_BIND, true, true, true }, /* (except for this) */
- { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, true, true, false }, /* ... then, make it r/o */
- { "/proc/sysrq-trigger", "/proc/sysrq-trigger", NULL, NULL, MS_BIND, false, true, false }, /* Bind mount first ...*/
- { NULL, "/proc/sysrq-trigger", NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, false, true, false }, /* ... then, make it r/o */
- { "tmpfs", "/sys", "tmpfs", "mode=755", MS_NOSUID|MS_NOEXEC|MS_NODEV, true, false, true },
- { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true, false, false },
- { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true, false, false },
- { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true, false, false },
- { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true, false, false },
- { "tmpfs", "/tmp", "tmpfs", "mode=1777", MS_STRICTATIME, true, true, false },
+ /* inner child mounts */
+ { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, MOUNT_FATAL|MOUNT_IN_USERNS },
+ { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, MOUNT_FATAL|MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO }, /* Bind mount first ...*/
+ { "/proc/sys/net", "/proc/sys/net", NULL, NULL, MS_BIND, MOUNT_FATAL|MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO|MOUNT_APPLY_APIVFS_NETNS }, /* (except for this) */
+ { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, MOUNT_FATAL|MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO }, /* ... then, make it r/o */
+ { "/proc/sysrq-trigger", "/proc/sysrq-trigger", NULL, NULL, MS_BIND, MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO }, /* Bind mount first ...*/
+ { NULL, "/proc/sysrq-trigger", NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO }, /* ... then, make it r/o */
+ { "tmpfs", "/tmp", "tmpfs", "mode=1777", MS_STRICTATIME, MOUNT_FATAL|MOUNT_IN_USERNS },
+
+ /* outer child mounts */
+ { "tmpfs", "/sys", "tmpfs", "mode=755", MS_NOSUID|MS_NOEXEC|MS_NODEV, MOUNT_FATAL|MOUNT_APPLY_APIVFS_NETNS },
+ { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, MOUNT_FATAL|MOUNT_APPLY_APIVFS_RO }, /* skipped if above was mounted */
+ { "sysfs", "/sys", "sysfs", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, MOUNT_FATAL }, /* skipped if above was mounted */
+
+ { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, MOUNT_FATAL },
+ { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, MOUNT_FATAL },
+ { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, MOUNT_FATAL },
#ifdef HAVE_SELINUX
- { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, false, false, false }, /* Bind mount first */
- { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, false, false, false }, /* Then, make it r/o */
+ { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, 0 }, /* Bind mount first */
+ { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, 0 }, /* Then, make it r/o */
#endif
};
unsigned k;
int r;
+ bool use_userns = (mount_settings & MOUNT_USE_USERNS);
+ bool netns = (mount_settings & MOUNT_APPLY_APIVFS_NETNS);
+ bool ro = (mount_settings & MOUNT_APPLY_APIVFS_RO);
+ bool in_userns = (mount_settings & MOUNT_IN_USERNS);
for (k = 0; k < ELEMENTSOF(mount_table); k++) {
_cleanup_free_ char *where = NULL, *options = NULL;
const char *o;
+ bool fatal = (mount_table[k].mount_settings & MOUNT_FATAL);
+
+ if (in_userns != (bool)(mount_table[k].mount_settings & MOUNT_IN_USERNS))
+ continue;
- if (in_userns != mount_table[k].in_userns)
+ if (!netns && (bool)(mount_table[k].mount_settings & MOUNT_APPLY_APIVFS_NETNS))
continue;
- if (!use_netns && mount_table[k].use_netns)
+ if (!ro && (bool)(mount_table[k].mount_settings & MOUNT_APPLY_APIVFS_RO))
continue;
where = prefix_root(dest, mount_table[k].where);
@@ -410,7 +424,7 @@ int mount_all(const char *dest,
r = mkdir_userns_p(dest, where, 0755, in_userns, uid_shift);
if (r < 0 && r != -EEXIST) {
- if (mount_table[k].fatal)
+ if (fatal)
return log_error_errno(r, "Failed to create directory %s: %m", where);
log_debug_errno(r, "Failed to create directory %s: %m", where);
@@ -429,13 +443,13 @@ int mount_all(const char *dest,
o = options;
}
- r = mount_verbose(mount_table[k].fatal ? LOG_ERR : LOG_DEBUG,
+ r = mount_verbose(fatal ? LOG_ERR : LOG_DEBUG,
mount_table[k].what,
where,
mount_table[k].type,
mount_table[k].flags,
o);
- if (r < 0 && mount_table[k].fatal)
+ if (r < 0 && fatal)
return r;
}
diff --git a/src/nspawn/nspawn-mount.h b/src/nspawn/nspawn-mount.h
index 7307a838a5..74aee7ee7f 100644
--- a/src/nspawn/nspawn-mount.h
+++ b/src/nspawn/nspawn-mount.h
@@ -23,6 +23,15 @@
#include "cgroup-util.h"
+typedef enum MountSettingsMask {
+ MOUNT_FATAL = 1 << 0, /* if set, a mount error is considered fatal */
+ MOUNT_USE_USERNS = 1 << 1, /* if set, mounts are patched considering uid/gid shifts in a user namespace */
+ MOUNT_IN_USERNS = 1 << 2, /* if set, the mount is executed in the inner child, otherwise in the outer child */
+ MOUNT_APPLY_APIVFS_RO = 1 << 3, /* if set, /proc/sys, and /sysfs will be mounted read-only, otherwise read-write. */
+ MOUNT_APPLY_APIVFS_NETNS = 1 << 4, /* if set, /proc/sys/net will be mounted read-write.
+ Works only if MOUNT_APPLY_APIVFS_RO is also set. */
+} MountSettingsMask;
+
typedef enum VolatileMode {
VOLATILE_NO,
VOLATILE_YES,
@@ -57,8 +66,8 @@ int tmpfs_mount_parse(CustomMount **l, unsigned *n, const char *s);
int custom_mount_compare(const void *a, const void *b);
-int mount_all(const char *dest, bool use_userns, bool in_userns, bool use_netns, uid_t uid_shift, uid_t uid_range, const char *selinux_apifs_context);
-int mount_sysfs(const char *dest);
+int mount_all(const char *dest, MountSettingsMask mount_settings, uid_t uid_shift, uid_t uid_range, const char *selinux_apifs_context);
+int mount_sysfs(const char *dest, MountSettingsMask mount_settings);
int mount_cgroups(const char *dest, CGroupUnified unified_requested, bool userns, uid_t uid_shift, uid_t uid_range, const char *selinux_apifs_context, bool use_cgns);
int mount_systemd_cgroup_writable(const char *dest, CGroupUnified unified_requested);
diff --git a/src/nspawn/nspawn.c b/src/nspawn/nspawn.c
index 9b9ae909c9..69b9efe320 100644
--- a/src/nspawn/nspawn.c
+++ b/src/nspawn/nspawn.c
@@ -195,6 +195,7 @@ static const char *arg_container_service_name = "systemd-nspawn";
static bool arg_notify_ready = false;
static bool arg_use_cgns = true;
static unsigned long arg_clone_ns_flags = CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS;
+static MountSettingsMask arg_mount_settings = MOUNT_APPLY_APIVFS_RO;
static void help(void) {
printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
@@ -378,6 +379,31 @@ static void parse_share_ns_env(const char *name, unsigned long ns_flag) {
arg_clone_ns_flags = (arg_clone_ns_flags & ~ns_flag) | (r > 0 ? 0 : ns_flag);
}
+static void parse_mount_settings_env(void) {
+ int r;
+ const char *e;
+
+ e = getenv("SYSTEMD_NSPAWN_API_VFS_WRITABLE");
+ if (!e)
+ return;
+
+ if (streq(e, "network")) {
+ arg_mount_settings |= MOUNT_APPLY_APIVFS_RO|MOUNT_APPLY_APIVFS_NETNS;
+ return;
+ }
+
+ r = parse_boolean(e);
+ if (r < 0) {
+ log_warning_errno(r, "Failed to parse SYSTEMD_NSPAWN_API_VFS_WRITABLE from environment, ignoring.");
+ return;
+ } else if (r > 0)
+ arg_mount_settings &= ~MOUNT_APPLY_APIVFS_RO;
+ else
+ arg_mount_settings |= MOUNT_APPLY_APIVFS_RO;
+
+ arg_mount_settings &= ~MOUNT_APPLY_APIVFS_NETNS;
+}
+
static int parse_argv(int argc, char *argv[]) {
enum {
@@ -1070,6 +1096,14 @@ static int parse_argv(int argc, char *argv[]) {
parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_UTS", CLONE_NEWUTS);
parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_SYSTEM", CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS);
+ if (arg_userns_mode != USER_NAMESPACE_NO)
+ arg_mount_settings |= MOUNT_USE_USERNS;
+
+ if (arg_private_network)
+ arg_mount_settings |= MOUNT_APPLY_APIVFS_NETNS;
+
+ parse_mount_settings_env();
+
if (!(arg_clone_ns_flags & CLONE_NEWPID) ||
!(arg_clone_ns_flags & CLONE_NEWUTS)) {
arg_register = false;
@@ -1164,6 +1198,15 @@ static int parse_argv(int argc, char *argv[]) {
}
static int verify_arguments(void) {
+ if (arg_userns_mode != USER_NAMESPACE_NO && (arg_mount_settings & MOUNT_APPLY_APIVFS_NETNS) && !arg_private_network) {
+ log_error("Invalid namespacing settings. Mounting sysfs with --private-users requires --private-network.");
+ return -EINVAL;
+ }
+
+ if (arg_userns_mode != USER_NAMESPACE_NO && !(arg_mount_settings & MOUNT_APPLY_APIVFS_RO)) {
+ log_error("Cannot combine --private-users with read-write mounts.");
+ return -EINVAL;
+ }
if (arg_volatile_mode != VOLATILE_NO && arg_read_only) {
log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
@@ -2700,9 +2743,7 @@ static int inner_child(
return log_error_errno(r, "Couldn't become new root: %m");
r = mount_all(NULL,
- arg_userns_mode != USER_NAMESPACE_NO,
- true,
- arg_private_network,
+ arg_mount_settings | MOUNT_IN_USERNS,
arg_uid_shift,
arg_uid_range,
arg_selinux_apifs_context);
@@ -2710,7 +2751,7 @@ static int inner_child(
if (r < 0)
return r;
- r = mount_sysfs(NULL);
+ r = mount_sysfs(NULL, arg_mount_settings);
if (r < 0)
return r;
@@ -3077,9 +3118,7 @@ static int outer_child(
}
r = mount_all(directory,
- arg_userns_mode != USER_NAMESPACE_NO,
- false,
- arg_private_network,
+ arg_mount_settings,
arg_uid_shift,
arg_uid_range,
arg_selinux_apifs_context);