diff options
Diffstat (limited to 'src/core/namespace.c')
-rw-r--r-- | src/core/namespace.c | 760 |
1 files changed, 519 insertions, 241 deletions
diff --git a/src/core/namespace.c b/src/core/namespace.c index 045321e1d4..43a2f4ba6e 100644 --- a/src/core/namespace.c +++ b/src/core/namespace.c @@ -1,5 +1,3 @@ -/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/ - /*** This file is part of systemd. @@ -20,23 +18,34 @@ ***/ #include <errno.h> -#include <sys/mount.h> -#include <string.h> +#include <sched.h> #include <stdio.h> -#include <unistd.h> +#include <string.h> +#include <sys/mount.h> #include <sys/stat.h> -#include <sched.h> +#include <unistd.h> #include <linux/fs.h> -#include "strv.h" -#include "util.h" -#include "path-util.h" -#include "missing.h" -#include "loopback-setup.h" +#include "alloc-util.h" #include "dev-setup.h" -#include "selinux-util.h" -#include "namespace.h" +#include "fd-util.h" +#include "fs-util.h" +#include "loopback-setup.h" +#include "missing.h" #include "mkdir.h" +#include "mount-util.h" +#include "namespace.h" +#include "path-util.h" +#include "selinux-util.h" +#include "socket-util.h" +#include "string-table.h" +#include "string-util.h" +#include "strv.h" +#include "umask-util.h" +#include "user-util.h" +#include "util.h" + +#define DEV_MOUNT_OPTIONS (MS_NOSUID|MS_STRICTATIME|MS_NOEXEC) typedef enum MountMode { /* This is ordered by priority! */ @@ -45,62 +54,230 @@ typedef enum MountMode { PRIVATE_TMP, PRIVATE_VAR_TMP, PRIVATE_DEV, - PRIVATE_BUS_ENDPOINT, - READWRITE + READWRITE, } MountMode; typedef struct BindMount { - const char *path; + const char *path; /* stack memory, doesn't need to be freed explicitly */ + char *chased; /* malloc()ed memory, needs to be freed */ MountMode mode; - bool done; - bool ignore; + bool ignore; /* Ignore if path does not exist */ } BindMount; +typedef struct TargetMount { + const char *path; + MountMode mode; + bool ignore; /* Ignore if path does not exist */ +} TargetMount; + +/* + * The following Protect tables are to protect paths and mark some of them + * READONLY, in case a path is covered by an option from another table, then + * it is marked READWRITE in the current one, and the more restrictive mode is + * applied from that other table. This way all options can be combined in a + * safe and comprehensible way for users. + */ + +/* ProtectKernelTunables= option and the related filesystem APIs */ +static const TargetMount protect_kernel_tunables_table[] = { + { "/proc/sys", READONLY, false }, + { "/proc/sysrq-trigger", READONLY, true }, + { "/proc/latency_stats", READONLY, true }, + { "/proc/mtrr", READONLY, true }, + { "/proc/apm", READONLY, true }, + { "/proc/acpi", READONLY, true }, + { "/proc/timer_stats", READONLY, true }, + { "/proc/asound", READONLY, true }, + { "/proc/bus", READONLY, true }, + { "/proc/fs", READONLY, true }, + { "/proc/irq", READONLY, true }, + { "/sys", READONLY, false }, + { "/sys/kernel/debug", READONLY, true }, + { "/sys/kernel/tracing", READONLY, true }, + { "/sys/fs/cgroup", READWRITE, false }, /* READONLY is set by ProtectControlGroups= option */ +}; + +/* + * ProtectHome=read-only table, protect $HOME and $XDG_RUNTIME_DIR and rest of + * system should be protected by ProtectSystem= + */ +static const TargetMount protect_home_read_only_table[] = { + { "/home", READONLY, true }, + { "/run/user", READONLY, true }, + { "/root", READONLY, true }, +}; + +/* ProtectHome=yes table */ +static const TargetMount protect_home_yes_table[] = { + { "/home", INACCESSIBLE, true }, + { "/run/user", INACCESSIBLE, true }, + { "/root", INACCESSIBLE, true }, +}; + +/* ProtectSystem=yes table */ +static const TargetMount protect_system_yes_table[] = { + { "/usr", READONLY, false }, + { "/boot", READONLY, true }, + { "/efi", READONLY, true }, +}; + +/* ProtectSystem=full includes ProtectSystem=yes */ +static const TargetMount protect_system_full_table[] = { + { "/usr", READONLY, false }, + { "/boot", READONLY, true }, + { "/efi", READONLY, true }, + { "/etc", READONLY, false }, +}; + +/* + * ProtectSystem=strict table. In this strict mode, we mount everything + * read-only, except for /proc, /dev, /sys which are the kernel API VFS, + * which are left writable, but PrivateDevices= + ProtectKernelTunables= + * protect those, and these options should be fully orthogonal. + * (And of course /home and friends are also left writable, as ProtectHome= + * shall manage those, orthogonally). + */ +static const TargetMount protect_system_strict_table[] = { + { "/", READONLY, false }, + { "/proc", READWRITE, false }, /* ProtectKernelTunables= */ + { "/sys", READWRITE, false }, /* ProtectKernelTunables= */ + { "/dev", READWRITE, false }, /* PrivateDevices= */ + { "/home", READWRITE, true }, /* ProtectHome= */ + { "/run/user", READWRITE, true }, /* ProtectHome= */ + { "/root", READWRITE, true }, /* ProtectHome= */ +}; + +static void set_bind_mount(BindMount **p, const char *path, MountMode mode, bool ignore) { + (*p)->path = path; + (*p)->mode = mode; + (*p)->ignore = ignore; +} + static int append_mounts(BindMount **p, char **strv, MountMode mode) { char **i; assert(p); STRV_FOREACH(i, strv) { + bool ignore = false; - (*p)->ignore = false; - (*p)->done = false; - - if ((mode == INACCESSIBLE || mode == READONLY || mode == READWRITE) && (*i)[0] == '-') { - (*p)->ignore = true; + if (IN_SET(mode, INACCESSIBLE, READONLY, READWRITE) && startswith(*i, "-")) { (*i)++; + ignore = true; } if (!path_is_absolute(*i)) return -EINVAL; - (*p)->path = *i; - (*p)->mode = mode; + set_bind_mount(p, *i, mode, ignore); (*p)++; } return 0; } -static int mount_path_compare(const void *a, const void *b) { - const BindMount *p = a, *q = b; - int d; +static int append_target_mounts(BindMount **p, const char *root_directory, const TargetMount *mounts, const size_t size) { + unsigned i; - d = path_compare(p->path, q->path); + assert(p); + assert(mounts); + + for (i = 0; i < size; i++) { + /* + * Here we assume that the ignore field is set during + * declaration we do not support "-" at the beginning. + */ + const TargetMount *m = &mounts[i]; + const char *path = prefix_roota(root_directory, m->path); + + if (!path_is_absolute(path)) + return -EINVAL; + + set_bind_mount(p, path, m->mode, m->ignore); + (*p)++; + } + + return 0; +} + +static int append_protect_kernel_tunables(BindMount **p, const char *root_directory) { + assert(p); + + return append_target_mounts(p, root_directory, protect_kernel_tunables_table, + ELEMENTSOF(protect_kernel_tunables_table)); +} + +static int append_protect_home(BindMount **p, const char *root_directory, ProtectHome protect_home) { + int r = 0; + + assert(p); + + if (protect_home == PROTECT_HOME_NO) + return 0; + + switch (protect_home) { + case PROTECT_HOME_READ_ONLY: + r = append_target_mounts(p, root_directory, protect_home_read_only_table, + ELEMENTSOF(protect_home_read_only_table)); + break; + case PROTECT_HOME_YES: + r = append_target_mounts(p, root_directory, protect_home_yes_table, + ELEMENTSOF(protect_home_yes_table)); + break; + default: + r = -EINVAL; + break; + } + + return r; +} - if (d == 0) { - /* If the paths are equal, check the mode */ - if (p->mode < q->mode) - return -1; +static int append_protect_system(BindMount **p, const char *root_directory, ProtectSystem protect_system) { + int r = 0; - if (p->mode > q->mode) - return 1; + assert(p); + if (protect_system == PROTECT_SYSTEM_NO) return 0; + + switch (protect_system) { + case PROTECT_SYSTEM_STRICT: + r = append_target_mounts(p, root_directory, protect_system_strict_table, + ELEMENTSOF(protect_system_strict_table)); + break; + case PROTECT_SYSTEM_YES: + r = append_target_mounts(p, root_directory, protect_system_yes_table, + ELEMENTSOF(protect_system_yes_table)); + break; + case PROTECT_SYSTEM_FULL: + r = append_target_mounts(p, root_directory, protect_system_full_table, + ELEMENTSOF(protect_system_full_table)); + break; + default: + r = -EINVAL; + break; } + return r; +} + +static int mount_path_compare(const void *a, const void *b) { + const BindMount *p = a, *q = b; + int d; + /* If the paths are not equal, then order prefixes first */ - return d; + d = path_compare(p->path, q->path); + if (d != 0) + return d; + + /* If the paths are equal, check the mode */ + if (p->mode < q->mode) + return -1; + + if (p->mode > q->mode) + return 1; + + return 0; } static void drop_duplicates(BindMount *m, unsigned *n) { @@ -109,16 +286,110 @@ static void drop_duplicates(BindMount *m, unsigned *n) { assert(m); assert(n); + /* Drops duplicate entries. Expects that the array is properly ordered already. */ + for (f = m, t = m, previous = NULL; f < m+*n; f++) { - /* The first one wins */ - if (previous && path_equal(f->path, previous->path)) + /* The first one wins (which is the one with the more restrictive mode), see mount_path_compare() + * above. */ + if (previous && path_equal(f->path, previous->path)) { + log_debug("%s is duplicate.", f->path); continue; + } *t = *f; - previous = t; + t++; + } + + *n = t - m; +} + +static void drop_inaccessible(BindMount *m, unsigned *n) { + BindMount *f, *t; + const char *clear = NULL; + + assert(m); + assert(n); + + /* Drops all entries obstructed by another entry further up the tree. Expects that the array is properly + * ordered already. */ + + for (f = m, t = m; f < m+*n; f++) { + + /* If we found a path set for INACCESSIBLE earlier, and this entry has it as prefix we should drop + * it, as inaccessible paths really should drop the entire subtree. */ + if (clear && path_startswith(f->path, clear)) { + log_debug("%s is masked by %s.", f->path, clear); + continue; + } + + clear = f->mode == INACCESSIBLE ? f->path : NULL; + + *t = *f; + t++; + } + + *n = t - m; +} + +static void drop_nop(BindMount *m, unsigned *n) { + BindMount *f, *t; + + assert(m); + assert(n); + + /* Drops all entries which have an immediate parent that has the same type, as they are redundant. Assumes the + * list is ordered by prefixes. */ + + for (f = m, t = m; f < m+*n; f++) { + + /* Only suppress such subtrees for READONLY and READWRITE entries */ + if (IN_SET(f->mode, READONLY, READWRITE)) { + BindMount *p; + bool found = false; + + /* Now let's find the first parent of the entry we are looking at. */ + for (p = t-1; p >= m; p--) { + if (path_startswith(f->path, p->path)) { + found = true; + break; + } + } + + /* We found it, let's see if it's the same mode, if so, we can drop this entry */ + if (found && p->mode == f->mode) { + log_debug("%s is redundant by %s", f->path, p->path); + continue; + } + } + + *t = *f; + t++; + } + + *n = t - m; +} + +static void drop_outside_root(const char *root_directory, BindMount *m, unsigned *n) { + BindMount *f, *t; + + assert(m); + assert(n); + + if (!root_directory) + return; + + /* Drops all mounts that are outside of the root directory. */ + for (f = m, t = m; f < m+*n; f++) { + + if (!path_startswith(f->path, root_directory)) { + log_debug("%s is outside of root directory.", f->path); + continue; + } + + *t = *f; t++; } @@ -148,7 +419,7 @@ static int mount_dev(BindMount *m) { dev = strjoina(temporary_mount, "/dev"); (void) mkdir(dev, 0755); - if (mount("tmpfs", dev, "tmpfs", MS_NOSUID|MS_STRICTATIME, "mode=755") < 0) { + if (mount("tmpfs", dev, "tmpfs", DEV_MOUNT_OPTIONS, "mode=755") < 0) { r = -errno; goto fail; } @@ -232,6 +503,8 @@ static int mount_dev(BindMount *m) { */ (void) mkdir_p_label(m->path, 0755); + /* Unmount everything in old /dev */ + umount_recursive(m->path, 0); if (mount(dev, m->path, NULL, MS_MOVE, NULL) < 0) { r = -errno; goto fail; @@ -262,79 +535,6 @@ fail: return r; } -static int mount_kdbus(BindMount *m) { - - char temporary_mount[] = "/tmp/kdbus-dev-XXXXXX"; - _cleanup_free_ char *basepath = NULL; - _cleanup_umask_ mode_t u; - char *busnode = NULL, *root; - struct stat st; - int r; - - assert(m); - - u = umask(0000); - - if (!mkdtemp(temporary_mount)) - return log_error_errno(errno, "Failed create temp dir: %m"); - - root = strjoina(temporary_mount, "/kdbus"); - (void) mkdir(root, 0755); - if (mount("tmpfs", root, "tmpfs", MS_NOSUID|MS_STRICTATIME, "mode=777") < 0) { - r = -errno; - goto fail; - } - - /* create a new /dev/null dev node copy so we have some fodder to - * bind-mount the custom endpoint over. */ - if (stat("/dev/null", &st) < 0) { - log_error_errno(errno, "Failed to stat /dev/null: %m"); - r = -errno; - goto fail; - } - - busnode = strjoina(root, "/bus"); - if (mknod(busnode, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) { - log_error_errno(errno, "mknod() for %s failed: %m", busnode); - r = -errno; - goto fail; - } - - r = mount(m->path, busnode, NULL, MS_BIND, NULL); - if (r < 0) { - log_error_errno(errno, "bind mount of %s failed: %m", m->path); - r = -errno; - goto fail; - } - - basepath = dirname_malloc(m->path); - if (!basepath) { - r = -ENOMEM; - goto fail; - } - - if (mount(root, basepath, NULL, MS_MOVE, NULL) < 0) { - log_error_errno(errno, "bind mount of %s failed: %m", basepath); - r = -errno; - goto fail; - } - - rmdir(temporary_mount); - return 0; - -fail: - if (busnode) { - umount(busnode); - unlink(busnode); - } - - umount(root); - rmdir(root); - rmdir(temporary_mount); - - return r; -} - static int apply_mount( BindMount *m, const char *tmp_dir, @@ -345,23 +545,41 @@ static int apply_mount( assert(m); + log_debug("Applying namespace mount on %s", m->path); + switch (m->mode) { - case INACCESSIBLE: + case INACCESSIBLE: { + struct stat target; /* First, get rid of everything that is below if there * is anything... Then, overmount it with an - * inaccessible directory. */ - umount_recursive(m->path, 0); + * inaccessible path. */ + (void) umount_recursive(m->path, 0); - what = "/run/systemd/inaccessible"; + if (lstat(m->path, &target) < 0) + return log_debug_errno(errno, "Failed to lstat() %s to determine what to mount over it: %m", m->path); + + what = mode_to_inaccessible_node(target.st_mode); + if (!what) { + log_debug("File type not supported for inaccessible mounts. Note that symlinks are not allowed"); + return -ELOOP; + } break; + } case READONLY: case READWRITE: - /* Nothing to mount here, we just later toggle the - * MS_RDONLY bit for the mount point */ - return 0; + + r = path_is_mount_point(m->path, 0); + if (r < 0) + return log_debug_errno(r, "Failed to determine whether %s is already a mount point: %m", m->path); + if (r > 0) /* Nothing to do here, it is already a mount. We just later toggle the MS_RDONLY bit for the mount point if needed. */ + return 0; + + /* This isn't a mount point yet, let's make it one. */ + what = m->path; + break; case PRIVATE_TMP: what = tmp_dir; @@ -374,85 +592,155 @@ static int apply_mount( case PRIVATE_DEV: return mount_dev(m); - case PRIVATE_BUS_ENDPOINT: - return mount_kdbus(m); - default: assert_not_reached("Unknown mode"); } assert(what); - r = mount(what, m->path, NULL, MS_BIND|MS_REC, NULL); - if (r >= 0) - log_debug("Successfully mounted %s to %s", what, m->path); - else if (m->ignore && errno == ENOENT) + if (mount(what, m->path, NULL, MS_BIND|MS_REC, NULL) < 0) + return log_debug_errno(errno, "Failed to mount %s to %s: %m", what, m->path); + + log_debug("Successfully mounted %s to %s", what, m->path); + return 0; +} + +static int make_read_only(BindMount *m, char **blacklist) { + int r = 0; + + assert(m); + + if (IN_SET(m->mode, INACCESSIBLE, READONLY)) + r = bind_remount_recursive(m->path, true, blacklist); + else if (m->mode == PRIVATE_DEV) { /* Can be readonly but the submounts can't*/ + if (mount(NULL, m->path, NULL, MS_REMOUNT|DEV_MOUNT_OPTIONS|MS_RDONLY, NULL) < 0) + r = -errno; + } else return 0; + /* Not that we only turn on the MS_RDONLY flag here, we never turn it off. Something that was marked read-only + * already stays this way. This improves compatibility with container managers, where we won't attempt to undo + * read-only mounts already applied. */ + return r; } -static int make_read_only(BindMount *m) { +static int chase_all_symlinks(const char *root_directory, BindMount *m, unsigned *n) { + BindMount *f, *t; int r; assert(m); + assert(n); - if (IN_SET(m->mode, INACCESSIBLE, READONLY)) - r = bind_remount_recursive(m->path, true); - else if (IN_SET(m->mode, READWRITE, PRIVATE_TMP, PRIVATE_VAR_TMP, PRIVATE_DEV)) - r = bind_remount_recursive(m->path, false); - else - r = 0; + /* Since mount() will always follow symlinks and we need to take the different root directory into account we + * chase the symlinks on our own first. This call wil do so for all entries and remove all entries where we + * can't resolve the path, and which have been marked for such removal. */ - if (m->ignore && r == -ENOENT) - return 0; + for (f = m, t = m; f < m+*n; f++) { - return r; + r = chase_symlinks(f->path, root_directory, &f->chased); + if (r == -ENOENT && f->ignore) /* Doesn't exist? Then remove it! */ + continue; + if (r < 0) + return log_debug_errno(r, "Failed to chase symlinks for %s: %m", f->path); + + if (path_equal(f->path, f->chased)) + f->chased = mfree(f->chased); + else { + log_debug("Chased %s → %s", f->path, f->chased); + f->path = f->chased; + } + + *t = *f; + t++; + } + + *n = t - m; + return 0; +} + +static unsigned namespace_calculate_mounts( + char** read_write_paths, + char** read_only_paths, + char** inaccessible_paths, + const char* tmp_dir, + const char* var_tmp_dir, + bool private_dev, + bool protect_sysctl, + bool protect_cgroups, + ProtectHome protect_home, + ProtectSystem protect_system) { + + unsigned protect_home_cnt; + unsigned protect_system_cnt = + (protect_system == PROTECT_SYSTEM_STRICT ? + ELEMENTSOF(protect_system_strict_table) : + ((protect_system == PROTECT_SYSTEM_FULL) ? + ELEMENTSOF(protect_system_full_table) : + ((protect_system == PROTECT_SYSTEM_YES) ? + ELEMENTSOF(protect_system_yes_table) : 0))); + + protect_home_cnt = + (protect_home == PROTECT_HOME_YES ? + ELEMENTSOF(protect_home_yes_table) : + ((protect_home == PROTECT_HOME_READ_ONLY) ? + ELEMENTSOF(protect_home_read_only_table) : 0)); + + return !!tmp_dir + !!var_tmp_dir + + strv_length(read_write_paths) + + strv_length(read_only_paths) + + strv_length(inaccessible_paths) + + private_dev + + (protect_sysctl ? ELEMENTSOF(protect_kernel_tunables_table) : 0) + + (protect_cgroups ? 1 : 0) + + protect_home_cnt + protect_system_cnt; } int setup_namespace( const char* root_directory, - char** read_write_dirs, - char** read_only_dirs, - char** inaccessible_dirs, + char** read_write_paths, + char** read_only_paths, + char** inaccessible_paths, const char* tmp_dir, const char* var_tmp_dir, - const char* bus_endpoint_path, bool private_dev, + bool protect_sysctl, + bool protect_cgroups, ProtectHome protect_home, ProtectSystem protect_system, unsigned long mount_flags) { BindMount *m, *mounts = NULL; + bool make_slave = false; unsigned n; int r = 0; if (mount_flags == 0) mount_flags = MS_SHARED; - if (unshare(CLONE_NEWNS) < 0) - return -errno; + n = namespace_calculate_mounts(read_write_paths, + read_only_paths, + inaccessible_paths, + tmp_dir, var_tmp_dir, + private_dev, protect_sysctl, + protect_cgroups, protect_home, + protect_system); - n = !!tmp_dir + !!var_tmp_dir + !!bus_endpoint_path + - strv_length(read_write_dirs) + - strv_length(read_only_dirs) + - strv_length(inaccessible_dirs) + - private_dev + - (protect_home != PROTECT_HOME_NO ? 3 : 0) + - (protect_system != PROTECT_SYSTEM_NO ? 2 : 0) + - (protect_system == PROTECT_SYSTEM_FULL ? 1 : 0); + /* Set mount slave mode */ + if (root_directory || n > 0) + make_slave = true; if (n > 0) { m = mounts = (BindMount *) alloca0(n * sizeof(BindMount)); - r = append_mounts(&m, read_write_dirs, READWRITE); + r = append_mounts(&m, read_write_paths, READWRITE); if (r < 0) return r; - r = append_mounts(&m, read_only_dirs, READONLY); + r = append_mounts(&m, read_only_paths, READONLY); if (r < 0) return r; - r = append_mounts(&m, inaccessible_dirs, INACCESSIBLE); + r = append_mounts(&m, inaccessible_paths, INACCESSIBLE); if (r < 0) return r; @@ -474,101 +762,112 @@ int setup_namespace( m++; } - if (bus_endpoint_path) { - m->path = prefix_roota(root_directory, bus_endpoint_path); - m->mode = PRIVATE_BUS_ENDPOINT; - m++; - } - - if (protect_home != PROTECT_HOME_NO) { - const char *home_dir, *run_user_dir, *root_dir; + if (protect_sysctl) + append_protect_kernel_tunables(&m, root_directory); - home_dir = prefix_roota(root_directory, "/home"); - home_dir = strjoina("-", home_dir); - run_user_dir = prefix_roota(root_directory, "/run/user"); - run_user_dir = strjoina("-", run_user_dir); - root_dir = prefix_roota(root_directory, "/root"); - root_dir = strjoina("-", root_dir); - - r = append_mounts(&m, STRV_MAKE(home_dir, run_user_dir, root_dir), - protect_home == PROTECT_HOME_READ_ONLY ? READONLY : INACCESSIBLE); - if (r < 0) - return r; + if (protect_cgroups) { + m->path = prefix_roota(root_directory, "/sys/fs/cgroup"); + m->mode = READONLY; + m++; } - if (protect_system != PROTECT_SYSTEM_NO) { - const char *usr_dir, *boot_dir, *etc_dir; - - usr_dir = prefix_roota(root_directory, "/usr"); - boot_dir = prefix_roota(root_directory, "/boot"); - boot_dir = strjoina("-", boot_dir); - etc_dir = prefix_roota(root_directory, "/etc"); + r = append_protect_home(&m, root_directory, protect_home); + if (r < 0) + return r; - r = append_mounts(&m, protect_system == PROTECT_SYSTEM_FULL - ? STRV_MAKE(usr_dir, boot_dir, etc_dir) - : STRV_MAKE(usr_dir, boot_dir), READONLY); - if (r < 0) - return r; - } + r = append_protect_system(&m, root_directory, protect_system); + if (r < 0) + return r; assert(mounts + n == m); + /* Resolve symlinks manually first, as mount() will always follow them relative to the host's + * root. Moreover we want to suppress duplicates based on the resolved paths. This of course is a bit + * racy. */ + r = chase_all_symlinks(root_directory, mounts, &n); + if (r < 0) + goto finish; + qsort(mounts, n, sizeof(BindMount), mount_path_compare); + drop_duplicates(mounts, &n); + drop_outside_root(root_directory, mounts, &n); + drop_inaccessible(mounts, &n); + drop_nop(mounts, &n); + } + + if (unshare(CLONE_NEWNS) < 0) { + r = -errno; + goto finish; } - if (n > 0 || root_directory) { + if (make_slave) { /* Remount / as SLAVE so that nothing now mounted in the namespace shows up in the parent */ - if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) - return -errno; + if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) { + r = -errno; + goto finish; + } } if (root_directory) { - /* Turn directory into bind mount */ - if (mount(root_directory, root_directory, NULL, MS_BIND|MS_REC, NULL) < 0) - return -errno; + /* Turn directory into bind mount, if it isn't one yet */ + r = path_is_mount_point(root_directory, AT_SYMLINK_FOLLOW); + if (r < 0) + goto finish; + if (r == 0) { + if (mount(root_directory, root_directory, NULL, MS_BIND|MS_REC, NULL) < 0) { + r = -errno; + goto finish; + } + } } if (n > 0) { + char **blacklist; + unsigned j; + + /* First round, add in all special mounts we need */ for (m = mounts; m < mounts + n; ++m) { r = apply_mount(m, tmp_dir, var_tmp_dir); if (r < 0) - goto fail; + goto finish; } + /* Create a blacklist we can pass to bind_mount_recursive() */ + blacklist = newa(char*, n+1); + for (j = 0; j < n; j++) + blacklist[j] = (char*) mounts[j].path; + blacklist[j] = NULL; + + /* Second round, flip the ro bits if necessary. */ for (m = mounts; m < mounts + n; ++m) { - r = make_read_only(m); + r = make_read_only(m, blacklist); if (r < 0) - goto fail; + goto finish; } } if (root_directory) { /* MS_MOVE does not work on MS_SHARED so the remount MS_SHARED will be done later */ r = mount_move_root(root_directory); - - /* at this point, we cannot rollback */ if (r < 0) - return r; + goto finish; } /* Remount / as the desired mode. Not that this will not * reestablish propagation from our side to the host, since * what's disconnected is disconnected. */ if (mount(NULL, "/", NULL, mount_flags | MS_REC, NULL) < 0) { - /* at this point, we cannot rollback */ - return -errno; + r = -errno; + goto finish; } - return 0; + r = 0; -fail: - if (n > 0) { - for (m = mounts; m < mounts + n; ++m) - if (m->done) - (void) umount2(m->path, MNT_DETACH); - } +finish: + for (m = mounts; m < mounts + n; m++) + free(m->chased); return r; } @@ -645,16 +944,7 @@ int setup_tmp_dirs(const char *id, char **tmp_dir, char **var_tmp_dir) { int setup_netns(int netns_storage_socket[2]) { _cleanup_close_ int netns = -1; - union { - struct cmsghdr cmsghdr; - uint8_t buf[CMSG_SPACE(sizeof(int))]; - } control = {}; - struct msghdr mh = { - .msg_control = &control, - .msg_controllen = sizeof(control), - }; - struct cmsghdr *cmsg; - int r; + int r, q; assert(netns_storage_socket); assert(netns_storage_socket[0] >= 0); @@ -671,12 +961,8 @@ int setup_netns(int netns_storage_socket[2]) { if (lockf(netns_storage_socket[0], F_LOCK, 0) < 0) return -errno; - if (recvmsg(netns_storage_socket[0], &mh, MSG_DONTWAIT|MSG_CMSG_CLOEXEC) < 0) { - if (errno != EAGAIN) { - r = -errno; - goto fail; - } - + netns = receive_one_fd(netns_storage_socket[0], MSG_DONTWAIT); + if (netns == -EAGAIN) { /* Nothing stored yet, so let's create a new namespace */ if (unshare(CLONE_NEWNET) < 0) { @@ -693,15 +979,13 @@ int setup_netns(int netns_storage_socket[2]) { } r = 1; - } else { - /* Yay, found something, so let's join the namespace */ - CMSG_FOREACH(cmsg, &mh) - if (cmsg->cmsg_level == SOL_SOCKET && cmsg->cmsg_type == SCM_RIGHTS) { - assert(cmsg->cmsg_len == CMSG_LEN(sizeof(int))); - netns = *(int*) CMSG_DATA(cmsg); - } + } else if (netns < 0) { + r = netns; + goto fail; + } else { + /* Yay, found something, so let's join the namespace */ if (setns(netns, CLONE_NEWNET) < 0) { r = -errno; goto fail; @@ -710,21 +994,14 @@ int setup_netns(int netns_storage_socket[2]) { r = 0; } - cmsg = CMSG_FIRSTHDR(&mh); - cmsg->cmsg_level = SOL_SOCKET; - cmsg->cmsg_type = SCM_RIGHTS; - cmsg->cmsg_len = CMSG_LEN(sizeof(int)); - memcpy(CMSG_DATA(cmsg), &netns, sizeof(int)); - mh.msg_controllen = cmsg->cmsg_len; - - if (sendmsg(netns_storage_socket[1], &mh, MSG_DONTWAIT|MSG_NOSIGNAL) < 0) { - r = -errno; + q = send_one_fd(netns_storage_socket[1], netns, MSG_DONTWAIT); + if (q < 0) { + r = q; goto fail; } fail: - lockf(netns_storage_socket[0], F_ULOCK, 0); - + (void) lockf(netns_storage_socket[0], F_ULOCK, 0); return r; } @@ -740,6 +1017,7 @@ static const char *const protect_system_table[_PROTECT_SYSTEM_MAX] = { [PROTECT_SYSTEM_NO] = "no", [PROTECT_SYSTEM_YES] = "yes", [PROTECT_SYSTEM_FULL] = "full", + [PROTECT_SYSTEM_STRICT] = "strict", }; DEFINE_STRING_TABLE_LOOKUP(protect_system, ProtectSystem); |