diff options
author | Lennart Poettering <lennart@poettering.net> | 2015-05-21 16:30:58 +0200 |
---|---|---|
committer | Lennart Poettering <lennart@poettering.net> | 2015-05-21 16:32:01 +0200 |
commit | 03cfe0d51499e86b1573d121337594719d9f2012 (patch) | |
tree | cb76a92a501b16909f879944a4ad29ce88eefbad /src/nspawn | |
parent | ef6fc8ee57eff8a2b612de0270c9a25e066ee290 (diff) |
nspawn: finish user namespace support
Diffstat (limited to 'src/nspawn')
-rw-r--r-- | src/nspawn/nspawn.c | 1528 |
1 files changed, 890 insertions, 638 deletions
diff --git a/src/nspawn/nspawn.c b/src/nspawn/nspawn.c index 1f919c082b..d6b24c6f8f 100644 --- a/src/nspawn/nspawn.c +++ b/src/nspawn/nspawn.c @@ -222,6 +222,8 @@ static void help(void) { " --uuid=UUID Set a specific machine UUID for the container\n" " -S --slice=SLICE Place the container in the specified slice\n" " --property=NAME=VALUE Set scope unit property\n" + " --private-users[=UIDBASE[:NUIDS]]\n" + " Run within user namespace\n" " --private-network Disable network in container\n" " --network-interface=INTERFACE\n" " Assign an existing network interface to the\n" @@ -238,8 +240,6 @@ static void help(void) { " Add a virtual ethernet connection between host\n" " and container and add it to an existing bridge on\n" " the host\n" - " --private-users[=UIDBASE[:NUIDS]]\n" - " Run within user namespace\n" " -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n" " Expose a container IP port on the host\n" " -Z --selinux-context=SECLABEL\n" @@ -1020,7 +1020,44 @@ static int parse_argv(int argc, char *argv[]) { return 1; } -static int mount_all(const char *dest) { +static int tmpfs_patch_options(const char *options, char **ret) { + char *buf = NULL; + + if (arg_userns && arg_uid_shift != 0) { + + if (options) + asprintf(&buf, "%s,uid=" UID_FMT ",gid=" UID_FMT, options, arg_uid_shift, arg_uid_shift); + else + asprintf(&buf, "uid=" UID_FMT ",gid=" UID_FMT, arg_uid_shift, arg_uid_shift); + if (!buf) + return -ENOMEM; + + options = buf; + } + +#ifdef HAVE_SELINUX + if (arg_selinux_apifs_context) { + char *t; + + if (options) + t = strjoin(options, ",context=\"", arg_selinux_apifs_context, "\"", NULL); + else + t = strjoin("context=\"", arg_selinux_apifs_context, "\"", NULL); + if (!t) { + free(buf); + return -ENOMEM; + } + + free(buf); + buf = t; + } +#endif + + *ret = buf; + return !!buf; +} + +static int mount_all(const char *dest, bool userns) { typedef struct MountPoint { const char *what; @@ -1029,88 +1066,63 @@ static int mount_all(const char *dest) { const char *options; unsigned long flags; bool fatal; + bool userns; } MountPoint; static const MountPoint mount_table[] = { - { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true }, - { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, true }, /* Bind mount first */ - { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, true }, /* Then, make it r/o */ - { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true }, - { "tmpfs", "/sys/fs/cgroup", "tmpfs", "mode=755", MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME, true }, - { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true }, - { "devpts", "/dev/pts", "devpts", "newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true }, - { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true }, - { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true }, - { "tmpfs", "/tmp", "tmpfs", "mode=1777", MS_STRICTATIME, true }, + { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true, true }, + { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, true, true }, /* Bind mount first */ + { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, true, true }, /* Then, make it r/o */ + { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true, false }, + { "tmpfs", "/sys/fs/cgroup", "tmpfs", "mode=755", MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME, true, false }, + { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true, false }, + { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true, false }, + { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true, false }, + { "tmpfs", "/tmp", "tmpfs", "mode=1777", MS_STRICTATIME, true, false }, #ifdef HAVE_SELINUX - { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, false }, /* Bind mount first */ - { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false }, /* Then, make it r/o */ + { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, false, false }, /* Bind mount first */ + { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false, false }, /* Then, make it r/o */ #endif }; unsigned k; - int r = 0; + int r; for (k = 0; k < ELEMENTSOF(mount_table); k++) { _cleanup_free_ char *where = NULL, *options = NULL; const char *o; - int t; - where = strjoin(dest, "/", mount_table[k].where, NULL); + if (userns != mount_table[k].userns) + continue; + + where = prefix_root(dest, mount_table[k].where); if (!where) return log_oom(); - t = path_is_mount_point(where, true); - if (t < 0 && t != -ENOENT) { - log_error_errno(t, "Failed to detect whether %s is a mount point: %m", where); - - if (r == 0) - r = t; - - continue; - } + r = path_is_mount_point(where, true); + if (r < 0 && r != -ENOENT) + return log_error_errno(r, "Failed to detect whether %s is a mount point: %m", where); /* Skip this entry if it is not a remount. */ - if (mount_table[k].what && t > 0) + if (mount_table[k].what && r > 0) continue; - t = mkdir_p(where, 0755); - if (t < 0) { - if (mount_table[k].fatal) { - log_error_errno(t, "Failed to create directory %s: %m", where); - - if (r == 0) - r = t; - } else - log_warning_errno(t, "Failed to create directory %s: %m", where); + r = mkdir_p(where, 0755); + if (r < 0) { + if (mount_table[k].fatal) + return log_error_errno(r, "Failed to create directory %s: %m", where); + log_warning_errno(r, "Failed to create directory %s: %m", where); continue; } -#ifdef HAVE_SELINUX - if (arg_selinux_apifs_context && - (streq_ptr(mount_table[k].what, "tmpfs") || streq_ptr(mount_table[k].what, "devpts"))) { - options = strjoin(mount_table[k].options, ",context=\"", arg_selinux_apifs_context, "\"", NULL); - if (!options) - return log_oom(); - - o = options; - } else -#endif - o = mount_table[k].options; - - if (arg_userns && arg_uid_shift != UID_INVALID && streq_ptr(mount_table[k].type, "tmpfs")) { - char *uid_options = NULL; - - if (o) - asprintf(&uid_options, "%s,uid=" UID_FMT ",gid=" UID_FMT, o, arg_uid_shift, arg_uid_shift); - else - asprintf(&uid_options, "uid=" UID_FMT ",gid=" UID_FMT, arg_uid_shift, arg_uid_shift); - if (!uid_options) + o = mount_table[k].options; + if (streq_ptr(mount_table[k].type, "tmpfs")) { + r = tmpfs_patch_options(o, &options); + if (r < 0) return log_oom(); - - free(options); - o = options = uid_options; + if (r > 0) + o = options; } if (mount(mount_table[k].what, @@ -1119,34 +1131,29 @@ static int mount_all(const char *dest) { mount_table[k].flags, o) < 0) { - if (mount_table[k].fatal) { - log_error_errno(errno, "mount(%s) failed: %m", where); + if (mount_table[k].fatal) + return log_error_errno(errno, "mount(%s) failed: %m", where); - if (r == 0) - r = -errno; - } else - log_warning_errno(errno, "mount(%s) failed: %m", where); + log_warning_errno(errno, "mount(%s) failed, ignoring: %m", where); } } - return r; + return 0; } static int mount_bind(const char *dest, CustomMount *m) { struct stat source_st, dest_st; - char *where; + const char *where; int r; - assert(dest); assert(m); if (stat(m->source, &source_st) < 0) return log_error_errno(errno, "Failed to stat %s: %m", m->source); - where = strjoina(dest, m->destination); + where = prefix_roota(dest, m->destination); - r = stat(where, &dest_st); - if (r >= 0) { + if (stat(where, &dest_st) >= 0) { if (S_ISDIR(source_st.st_mode) && !S_ISDIR(dest_st.st_mode)) { log_error("Cannot bind mount directory %s on file %s.", m->source, where); return -EINVAL; @@ -1190,19 +1197,25 @@ static int mount_bind(const char *dest, CustomMount *m) { } static int mount_tmpfs(const char *dest, CustomMount *m) { - char *where; + const char *where, *options; + _cleanup_free_ char *buf = NULL; int r; assert(dest); assert(m); - where = strjoina(dest, m->destination); + where = prefix_roota(dest, m->destination); - r = mkdir_label(where, 0755); + r = mkdir_p_label(where, 0755); if (r < 0 && r != -EEXIST) return log_error_errno(r, "Creating mount point for tmpfs %s failed: %m", where); - if (mount("tmpfs", where, "tmpfs", MS_NODEV|MS_STRICTATIME, m->options) < 0) + r = tmpfs_patch_options(m->options, &buf); + if (r < 0) + return log_oom(); + options = r > 0 ? buf : m->options; + + if (mount("tmpfs", where, "tmpfs", MS_NODEV|MS_STRICTATIME, options) < 0) return log_error_errno(errno, "tmpfs mount to %s failed: %m", where); return 0; @@ -1210,13 +1223,13 @@ static int mount_tmpfs(const char *dest, CustomMount *m) { static int mount_overlay(const char *dest, CustomMount *m) { _cleanup_free_ char *lower = NULL; - char *where, *options; + const char *where, *options; int r; assert(dest); assert(m); - where = strjoina(dest, m->destination); + where = prefix_roota(dest, m->destination); r = mkdir_label(where, 0755); if (r < 0 && r != -EEXIST) @@ -1227,7 +1240,6 @@ static int mount_overlay(const char *dest, CustomMount *m) { strv_reverse(m->lower); lower = strv_join(m->lower, ":"); strv_reverse(m->lower); - if (!lower) return log_oom(); @@ -1310,8 +1322,7 @@ static int mount_cgroup_hierarchy(const char *dest, const char *controller, cons static int mount_cgroup(const char *dest) { _cleanup_set_free_free_ Set *controllers = NULL; - _cleanup_free_ char *own_cgroup_path = NULL; - const char *cgroup_root, *systemd_root, *systemd_own; + const char *cgroup_root; int r; controllers = set_new(&string_hash_ops); @@ -1322,10 +1333,6 @@ static int mount_cgroup(const char *dest) { if (r < 0) return log_error_errno(r, "Failed to determine cgroup controllers: %m"); - r = cg_pid_get_path(NULL, 0, &own_cgroup_path); - if (r < 0) - return log_error_errno(r, "Failed to determine our own cgroup path: %m"); - for (;;) { _cleanup_free_ char *controller = NULL, *origin = NULL, *combined = NULL; @@ -1333,7 +1340,7 @@ static int mount_cgroup(const char *dest) { if (!controller) break; - origin = strappend("/sys/fs/cgroup/", controller); + origin = prefix_root("/sys/fs/cgroup/", controller); if (!origin) return log_oom(); @@ -1350,7 +1357,7 @@ static int mount_cgroup(const char *dest) { else { _cleanup_free_ char *target = NULL; - target = strjoin(dest, "/sys/fs/cgroup/", controller, NULL); + target = prefix_root(dest, origin); if (!target) return log_oom(); @@ -1379,25 +1386,82 @@ static int mount_cgroup(const char *dest) { if (r < 0) return r; + cgroup_root = prefix_roota(dest, "/sys/fs/cgroup"); + if (mount(NULL, cgroup_root, NULL, MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME|MS_RDONLY, "mode=755") < 0) + return log_error_errno(errno, "Failed to remount %s read-only: %m", cgroup_root); + + return 0; +} + +static int mount_systemd_cgroup_writable(const char *dest) { + _cleanup_free_ char *own_cgroup_path = NULL; + const char *systemd_root, *systemd_own; + int r; + + assert(dest); + + r = cg_pid_get_path(NULL, 0, &own_cgroup_path); + if (r < 0) + return log_error_errno(r, "Failed to determine our own cgroup path: %m"); + /* Make our own cgroup a (writable) bind mount */ systemd_own = strjoina(dest, "/sys/fs/cgroup/systemd", own_cgroup_path); if (mount(systemd_own, systemd_own, NULL, MS_BIND, NULL) < 0) return log_error_errno(errno, "Failed to turn %s into a bind mount: %m", own_cgroup_path); /* And then remount the systemd cgroup root read-only */ - systemd_root = strjoina(dest, "/sys/fs/cgroup/systemd"); + systemd_root = prefix_roota(dest, "/sys/fs/cgroup/systemd"); if (mount(NULL, systemd_root, NULL, MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL) < 0) return log_error_errno(errno, "Failed to mount cgroup root read-only: %m"); - cgroup_root = strjoina(dest, "/sys/fs/cgroup"); - if (mount(NULL, cgroup_root, NULL, MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME|MS_RDONLY, "mode=755") < 0) - return log_error_errno(errno, "Failed to remount %s read-only: %m", cgroup_root); + return 0; +} + +static int userns_lchown(const char *p, uid_t uid, gid_t gid) { + assert(p); + + if (!arg_userns) + return 0; + + if (uid == UID_INVALID && gid == GID_INVALID) + return 0; + + if (uid != UID_INVALID) { + uid += arg_uid_shift; + + if (uid < arg_uid_shift || uid >= arg_uid_shift + arg_uid_range) + return -EOVERFLOW; + } + + if (gid != GID_INVALID) { + gid += (gid_t) arg_uid_shift; + + if (gid < (gid_t) arg_uid_shift || gid >= (gid_t) (arg_uid_shift + arg_uid_range)) + return -EOVERFLOW; + } + + if (lchown(p, uid, gid) < 0) + return -errno; return 0; } +static int userns_mkdir(const char *root, const char *path, mode_t mode, uid_t uid, gid_t gid) { + const char *q; + + q = prefix_roota(root, path); + if (mkdir(q, mode) < 0) { + if (errno == EEXIST) + return 0; + return -errno; + } + + return userns_lchown(q, uid, gid); +} + static int setup_timezone(const char *dest) { - _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL; + _cleanup_free_ char *p = NULL, *q = NULL; + const char *where, *check, *what; char *z, *y; int r; @@ -1418,10 +1482,7 @@ static int setup_timezone(const char *dest) { return 0; } - where = strappend(dest, "/etc/localtime"); - if (!where) - return log_oom(); - + where = prefix_roota(dest, "/etc/localtime"); r = readlink_malloc(where, &q); if (r >= 0) { y = path_startswith(q, "../usr/share/zoneinfo/"); @@ -1433,43 +1494,34 @@ static int setup_timezone(const char *dest) { return 0; } - check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL); - if (!check) - return log_oom(); - - if (access(check, F_OK) < 0) { + check = strjoina("/usr/share/zoneinfo/", z); + check = prefix_root(dest, check); + if (laccess(check, F_OK) < 0) { log_warning("Timezone %s does not exist in container, not updating container timezone.", z); return 0; } - what = strappend("../usr/share/zoneinfo/", z); - if (!what) - return log_oom(); - - r = mkdir_parents(where, 0755); - if (r < 0) { - log_error_errno(r, "Failed to create directory for timezone info %s in container: %m", where); - - return 0; - } - r = unlink(where); if (r < 0 && errno != ENOENT) { log_error_errno(errno, "Failed to remove existing timezone info %s in container: %m", where); - return 0; } + what = strjoina("../usr/share/zoneinfo/", z); if (symlink(what, where) < 0) { log_error_errno(errno, "Failed to correct timezone of container: %m"); return 0; } + r = userns_lchown(where, 0, 0); + if (r < 0) + return log_warning_errno(r, "Failed to chown /etc/localtime: %m"); + return 0; } static int setup_resolv_conf(const char *dest) { - _cleanup_free_ char *where = NULL; + const char *where = NULL; int r; assert(dest); @@ -1478,31 +1530,24 @@ static int setup_resolv_conf(const char *dest) { return 0; /* Fix resolv.conf, if possible */ - where = strappend(dest, "/etc/resolv.conf"); - if (!where) - return log_oom(); - - /* We don't really care for the results of this really. If it - * fails, it fails, but meh... */ - r = mkdir_parents(where, 0755); - if (r < 0) { - log_warning_errno(r, "Failed to create parent directory for resolv.conf %s: %m", where); - - return 0; - } + where = prefix_roota(dest, "/etc/resolv.conf"); r = copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW, 0644, 0); if (r < 0) { log_warning_errno(r, "Failed to copy /etc/resolv.conf to %s: %m", where); - return 0; } + r = userns_lchown(where, 0, 0); + if (r < 0) + log_warning_errno(r, "Failed to chown /etc/resolv.conf: %m"); + return 0; } static int setup_volatile_state(const char *directory) { - const char *p; + _cleanup_free_ char *buf = NULL; + const char *p, *options; int r; assert(directory); @@ -1517,12 +1562,19 @@ static int setup_volatile_state(const char *directory) { if (r < 0) return log_error_errno(r, "Failed to remount %s read-only: %m", directory); - p = strjoina(directory, "/var"); + p = prefix_roota(directory, "/var"); r = mkdir(p, 0755); if (r < 0 && errno != EEXIST) return log_error_errno(errno, "Failed to create %s: %m", directory); - if (mount("tmpfs", p, "tmpfs", MS_STRICTATIME, "mode=755") < 0) + options = "mode=755"; + r = tmpfs_patch_options(options, &buf); + if (r < 0) + return log_oom(); + if (r > 0) + options = buf; + + if (mount("tmpfs", p, "tmpfs", MS_STRICTATIME, options) < 0) return log_error_errno(errno, "Failed to mount tmpfs to /var: %m"); return 0; @@ -1531,7 +1583,8 @@ static int setup_volatile_state(const char *directory) { static int setup_volatile(const char *directory) { bool tmpfs_mounted = false, bind_mounted = false; char template[] = "/tmp/nspawn-volatile-XXXXXX"; - const char *f, *t; + _cleanup_free_ char *buf = NULL; + const char *f, *t, *options; int r; assert(directory); @@ -1545,27 +1598,31 @@ static int setup_volatile(const char *directory) { if (!mkdtemp(template)) return log_error_errno(errno, "Failed to create temporary directory: %m"); - if (mount("tmpfs", template, "tmpfs", MS_STRICTATIME, "mode=755") < 0) { - log_error_errno(errno, "Failed to mount tmpfs for root directory: %m"); - r = -errno; + options = "mode=755"; + r = tmpfs_patch_options(options, &buf); + if (r < 0) + return log_oom(); + if (r > 0) + options = buf; + + if (mount("tmpfs", template, "tmpfs", MS_STRICTATIME, options) < 0) { + r = log_error_errno(errno, "Failed to mount tmpfs for root directory: %m"); goto fail; } tmpfs_mounted = true; - f = strjoina(directory, "/usr"); - t = strjoina(template, "/usr"); + f = prefix_roota(directory, "/usr"); + t = prefix_roota(template, "/usr"); r = mkdir(t, 0755); if (r < 0 && errno != EEXIST) { - log_error_errno(errno, "Failed to create %s: %m", t); - r = -errno; + r = log_error_errno(errno, "Failed to create %s: %m", t); goto fail; } if (mount(f, t, NULL, MS_BIND|MS_REC, NULL) < 0) { - log_error_errno(errno, "Failed to create /usr bind mount: %m"); - r = -errno; + r = log_error_errno(errno, "Failed to create /usr bind mount: %m"); goto fail; } @@ -1578,25 +1635,26 @@ static int setup_volatile(const char *directory) { } if (mount(template, directory, NULL, MS_MOVE, NULL) < 0) { - log_error_errno(errno, "Failed to move root mount: %m"); - r = -errno; + r = log_error_errno(errno, "Failed to move root mount: %m"); goto fail; } - rmdir(template); + (void) rmdir(template); return 0; fail: if (bind_mounted) - umount(t); + (void) umount(t); + if (tmpfs_mounted) - umount(template); - rmdir(template); + (void) umount(template); + (void) rmdir(template); return r; } static char* id128_format_as_uuid(sd_id128_t id, char s[37]) { + assert(s); snprintf(s, 37, "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x", @@ -1606,23 +1664,19 @@ static char* id128_format_as_uuid(sd_id128_t id, char s[37]) { } static int setup_boot_id(const char *dest) { - _cleanup_free_ char *from = NULL, *to = NULL; + const char *from, *to; sd_id128_t rnd = {}; char as_uuid[37]; int r; - assert(dest); - if (arg_share_system) return 0; /* Generate a new randomized boot ID, so that each boot-up of * the container gets a new one */ - from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id"); - to = strappend(dest, "/proc/sys/kernel/random/boot_id"); - if (!from || !to) - return log_oom(); + from = prefix_roota(dest, "/run/proc-sys-kernel-random-boot-id"); + to = prefix_roota(dest, "/proc/sys/kernel/random/boot_id"); r = sd_id128_randomize(&rnd); if (r < 0) @@ -1634,10 +1688,9 @@ static int setup_boot_id(const char *dest) { if (r < 0) return log_error_errno(r, "Failed to write boot id: %m"); - if (mount(from, to, NULL, MS_BIND, NULL) < 0) { - log_error_errno(errno, "Failed to bind mount boot id: %m"); - r = -errno; - } else if (mount(from, to, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL)) + if (mount(from, to, NULL, MS_BIND, NULL) < 0) + r = log_error_errno(errno, "Failed to bind mount boot id: %m"); + else if (mount(NULL, to, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL) < 0) log_warning_errno(errno, "Failed to make boot id read-only: %m"); unlink(from); @@ -1663,14 +1716,16 @@ static int copy_devnodes(const char *dest) { u = umask(0000); + /* Create /dev/net, so that we can create /dev/net/tun in it */ + if (userns_mkdir(dest, "/dev/net", 0755, 0, 0) < 0) + return log_error_errno(r, "Failed to create /dev/net directory: %m"); + NULSTR_FOREACH(d, devnodes) { _cleanup_free_ char *from = NULL, *to = NULL; struct stat st; from = strappend("/dev/", d); - to = strjoin(dest, "/dev/", d, NULL); - if (!from || !to) - return log_oom(); + to = prefix_root(dest, from); if (stat(from, &st) < 0) { @@ -1679,16 +1734,10 @@ static int copy_devnodes(const char *dest) { } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) { - log_error("%s is not a char or block device, cannot copy", from); + log_error("%s is not a char or block device, cannot copy.", from); return -EIO; } else { - r = mkdir_parents(to, 0775); - if (r < 0) { - log_error_errno(r, "Failed to create parent directory of %s: %m", to); - return -r; - } - if (mknod(to, st.st_mode, st.st_rdev) < 0) { if (errno != EPERM) return log_error_errno(errno, "mknod(%s) failed: %m", to); @@ -1702,28 +1751,56 @@ static int copy_devnodes(const char *dest) { return log_error_errno(errno, "Both mknod and bind mount (%s) failed: %m", to); } - if (arg_userns && arg_uid_shift != UID_INVALID) - if (lchown(to, arg_uid_shift, arg_uid_shift) < 0) - return log_error_errno(errno, "chown() of device node %s failed: %m", to); + r = userns_lchown(to, 0, 0); + if (r < 0) + return log_error_errno(r, "chown() of device node %s failed: %m", to); } } return r; } -static int setup_ptmx(const char *dest) { - _cleanup_free_ char *p = NULL; +static int setup_pts(const char *dest) { + _cleanup_free_ char *options = NULL; + const char *p; + +#ifdef HAVE_SELINUX + if (arg_selinux_apifs_context) + (void) asprintf(&options, + "newinstance,ptmxmode=0666,mode=620,uid=" UID_FMT ",gid=" GID_FMT ",context=\"%s\"", + arg_uid_shift, + arg_uid_shift + TTY_GID, + arg_selinux_apifs_context); + else +#endif + (void) asprintf(&options, + "newinstance,ptmxmode=0666,mode=620,uid=" UID_FMT ",gid=" GID_FMT, + arg_uid_shift, + arg_uid_shift + TTY_GID); - p = strappend(dest, "/dev/ptmx"); - if (!p) + if (!options) return log_oom(); + /* Mount /dev/pts itself */ + p = prefix_root(dest, "/dev/pts"); + if (mkdir(p, 0755) < 0) + return log_error_errno(errno, "Failed to create /dev/pts: %m"); + if (mount("devpts", p, "devpts", MS_NOSUID|MS_NOEXEC, options) < 0) + return log_error_errno(errno, "Failed to mount /dev/pts: %m"); + if (userns_lchown(p, 0, 0) < 0) + return log_error_errno(errno, "Failed to chown /dev/pts: %m"); + + /* Create /dev/ptmx symlink */ + p = prefix_roota(dest, "/dev/ptmx"); if (symlink("pts/ptmx", p) < 0) return log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m"); + if (userns_lchown(p, 0, 0) < 0) + return log_error_errno(errno, "Failed to chown /dev/ptmx: %m"); - if (arg_userns && arg_uid_shift != UID_INVALID) - if (lchown(p, arg_uid_shift, arg_uid_shift) < 0) - return log_error_errno(errno, "lchown() of symlink %s failed: %m", p); + /* And fix /dev/pts/ptmx ownership */ + p = prefix_roota(dest, "/dev/pts/ptmx"); + if (userns_lchown(p, 0, 0) < 0) + return log_error_errno(errno, "Failed to chown /dev/pts/ptmx: %m"); return 0; } @@ -1738,7 +1815,7 @@ static int setup_dev_console(const char *dest, const char *console) { u = umask(0000); - r = chmod_and_chown(console, 0600, 0, 0); + r = chmod_and_chown(console, 0600, arg_uid_shift, arg_uid_shift); if (r < 0) return log_error_errno(r, "Failed to correct access mode for TTY: %m"); @@ -1746,7 +1823,7 @@ static int setup_dev_console(const char *dest, const char *console) { * ptys can only exist on pts file systems. To have something * to bind mount things on we create a empty regular file. */ - to = strjoina(dest, "/dev/console"); + to = prefix_roota(dest, "/dev/console"); r = touch(to); if (r < 0) return log_error_errno(r, "touch() for /dev/console failed: %m"); @@ -1758,9 +1835,9 @@ static int setup_dev_console(const char *dest, const char *console) { } static int setup_kmsg(const char *dest, int kmsg_socket) { - _cleanup_free_ char *from = NULL, *to = NULL; + const char *from, *to; _cleanup_umask_ mode_t u; - int r, fd, k; + int fd, k; union { struct cmsghdr cmsghdr; uint8_t buf[CMSG_SPACE(sizeof(int))]; @@ -1771,29 +1848,22 @@ static int setup_kmsg(const char *dest, int kmsg_socket) { }; struct cmsghdr *cmsg; - assert(dest); assert(kmsg_socket >= 0); u = umask(0000); - /* We create the kmsg FIFO as /dev/kmsg, but immediately + /* We create the kmsg FIFO as /run/kmsg, but immediately * delete it after bind mounting it to /proc/kmsg. While FIFOs * on the reading side behave very similar to /proc/kmsg, * their writing side behaves differently from /dev/kmsg in * that writing blocks when nothing is reading. In order to * avoid any problems with containers deadlocking due to this * we simply make /dev/kmsg unavailable to the container. */ - if (asprintf(&from, "%s/dev/kmsg", dest) < 0 || - asprintf(&to, "%s/proc/kmsg", dest) < 0) - return log_oom(); + from = prefix_roota(dest, "/run/kmsg"); + to = prefix_roota(dest, "/proc/kmsg"); if (mkfifo(from, 0600) < 0) - return log_error_errno(errno, "mkfifo() for /dev/kmsg failed: %m"); - - r = chmod_and_chown(from, 0600, 0, 0); - if (r < 0) - return log_error_errno(r, "Failed to correct access mode for /dev/kmsg: %m"); - + return log_error_errno(errno, "mkfifo() for /run/kmsg failed: %m"); if (mount(from, to, NULL, MS_BIND, NULL) < 0) return log_error_errno(errno, "Bind mount for /proc/kmsg failed: %m"); @@ -1817,8 +1887,9 @@ static int setup_kmsg(const char *dest, int kmsg_socket) { if (k < 0) return log_error_errno(errno, "Failed to send FIFO fd: %m"); - /* And now make the FIFO unavailable as /dev/kmsg... */ - unlink(from); + /* And now make the FIFO unavailable as /run/kmsg... */ + (void) unlink(from); + return 0; } @@ -1842,7 +1913,7 @@ static int send_rtnl(int send_fd) { fd = socket(PF_NETLINK, SOCK_RAW|SOCK_CLOEXEC|SOCK_NONBLOCK, NETLINK_ROUTE); if (fd < 0) - return log_error_errno(errno, "failed to allocate container netlink: %m"); + return log_error_errno(errno, "Failed to allocate container netlink: %m"); cmsg = CMSG_FIRSTHDR(&mh); cmsg->cmsg_level = SOL_SOCKET; @@ -2027,7 +2098,8 @@ static int setup_hostname(void) { static int setup_journal(const char *directory) { sd_id128_t machine_id, this_id; - _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL; + _cleanup_free_ char *b = NULL, *d = NULL; + const char *etc_machine_id, *p, *q; char *id; int r; @@ -2035,15 +2107,13 @@ static int setup_journal(const char *directory) { if (arg_ephemeral) return 0; - p = strappend(directory, "/etc/machine-id"); - if (!p) - return log_oom(); + etc_machine_id = prefix_roota(directory, "/etc/machine-id"); - r = read_one_line_file(p, &b); + r = read_one_line_file(etc_machine_id, &b); if (r == -ENOENT && arg_link_journal == LINK_AUTO) return 0; else if (r < 0) - return log_error_errno(r, "Failed to read machine ID from %s: %m", p); + return log_error_errno(r, "Failed to read machine ID from %s: %m", etc_machine_id); id = strstrip(b); if (isempty(id) && arg_link_journal == LINK_AUTO) @@ -2052,7 +2122,7 @@ static int setup_journal(const char *directory) { /* Verify validity */ r = sd_id128_from_string(id, &machine_id); if (r < 0) - return log_error_errno(r, "Failed to parse machine ID from %s: %m", p); + return log_error_errno(r, "Failed to parse machine ID from %s: %m", etc_machine_id); r = sd_id128_get_machine(&this_id); if (r < 0) @@ -2069,11 +2139,20 @@ static int setup_journal(const char *directory) { if (arg_link_journal == LINK_NO) return 0; - free(p); - p = strappend("/var/log/journal/", id); - q = strjoin(directory, "/var/log/journal/", id, NULL); - if (!p || !q) - return log_oom(); + r = userns_mkdir(directory, "/var", 0755, 0, 0); + if (r < 0) + return log_error_errno(r, "Failed to create /var: %m"); + + r = userns_mkdir(directory, "/var/log", 0755, 0, 0); + if (r < 0) + return log_error_errno(r, "Failed to create /var/log: %m"); + + r = userns_mkdir(directory, "/var/log/journal", 0755, 0, 0); + if (r < 0) + return log_error_errno(r, "Failed to create /var/log/journal: %m"); + + p = strjoina("/var/log/journal/", id); + q = prefix_roota(directory, p); if (path_is_mount_point(p, false) > 0) { if (arg_link_journal != LINK_AUTO) { @@ -2099,7 +2178,7 @@ static int setup_journal(const char *directory) { arg_link_journal == LINK_AUTO) && path_equal(d, q)) { - r = mkdir_p(q, 0755); + r = userns_mkdir(directory, p, 0755, 0, 0); if (r < 0) log_warning_errno(errno, "Failed to create directory %s: %m", q); return 0; @@ -2137,7 +2216,7 @@ static int setup_journal(const char *directory) { } } - r = mkdir_p(q, 0755); + r = userns_mkdir(directory, p, 0755, 0, 0); if (r < 0) log_warning_errno(errno, "Failed to create directory %s: %m", q); return 0; @@ -2163,7 +2242,7 @@ static int setup_journal(const char *directory) { if (dir_is_empty(q) == 0) log_warning("%s is not empty, proceeding anyway.", q); - r = mkdir_p(q, 0755); + r = userns_mkdir(directory, p, 0755, 0, 0); if (r < 0) { log_error_errno(errno, "Failed to create %s: %m", q); return r; @@ -2941,10 +3020,16 @@ static int setup_propagate(const char *root) { p = strjoina("/run/systemd/nspawn/propagate/", arg_machine); (void) mkdir_p(p, 0600); - q = strjoina(root, "/run/systemd/nspawn/incoming"); - mkdir_parents(q, 0755); - mkdir_p(q, 0600); + if (userns_mkdir(root, "/run/systemd", 0755, 0, 0) < 0) + return log_error_errno(errno, "Failed to create /run/systemd: %m"); + if (userns_mkdir(root, "/run/systemd/nspawn", 0755, 0, 0) < 0) + return log_error_errno(errno, "Failed to create /run/systemd/nspawn: %m"); + + if (userns_mkdir(root, "/run/systemd/nspawn/incoming", 0600, 0, 0) < 0) + return log_error_errno(errno, "Failed to create /run/systemd/nspawn/incoming: %m"); + + q = prefix_roota(root, "/run/systemd/nspawn/incoming"); if (mount(p, q, NULL, MS_BIND, NULL) < 0) return log_error_errno(errno, "Failed to install propagation bind mount."); @@ -3603,14 +3688,9 @@ static int change_uid_gid(char **_home) { if (!arg_user || streq(arg_user, "root") || streq(arg_user, "0")) { /* Reset everything fully to 0, just in case */ - if (setgroups(0, NULL) < 0) - return log_error_errno(errno, "setgroups() failed: %m"); - - if (setresgid(0, 0, 0) < 0) - return log_error_errno(errno, "setregid() failed: %m"); - - if (setresuid(0, 0, 0) < 0) - return log_error_errno(errno, "setreuid() failed: %m"); + r = reset_uid_gid(); + if (r < 0) + return log_error_errno(r, "Failed to become root: %m"); *_home = NULL; return 0; @@ -3754,9 +3834,9 @@ static int change_uid_gid(char **_home) { if (r < 0 && r != -EEXIST) return log_error_errno(r, "Failed to make home directory: %m"); - fchown(STDIN_FILENO, uid, gid); - fchown(STDOUT_FILENO, uid, gid); - fchown(STDERR_FILENO, uid, gid); + (void) fchown(STDIN_FILENO, uid, gid); + (void) fchown(STDOUT_FILENO, uid, gid); + (void) fchown(STDERR_FILENO, uid, gid); if (setgroups(n_uids, uids) < 0) return log_error_errno(errno, "Failed to set auxiliary groups: %m"); @@ -3925,23 +4005,25 @@ static int determine_names(void) { return 0; } -static int determine_uid_shift(void) { +static int determine_uid_shift(const char *directory) { int r; - if (!arg_userns) + if (!arg_userns) { + arg_uid_shift = 0; return 0; + } if (arg_uid_shift == UID_INVALID) { struct stat st; - r = stat(arg_directory, &st); + r = stat(directory, &st); if (r < 0) - return log_error_errno(errno, "Failed to determine UID base of %s: %m", arg_directory); + return log_error_errno(errno, "Failed to determine UID base of %s: %m", directory); arg_uid_shift = st.st_uid & UINT32_C(0xffff0000); if (arg_uid_shift != (st.st_gid & UINT32_C(0xffff0000))) { - log_error("UID and GID base of %s don't match.", arg_directory); + log_error("UID and GID base of %s don't match.", directory); return -EINVAL; } @@ -3957,6 +4039,430 @@ static int determine_uid_shift(void) { return 0; } +static int inner_child( + Barrier *barrier, + const char *directory, + bool secondary, + int kmsg_socket, + int rtnl_socket, + FDSet *fds, + int argc, + char *argv[]) { + + _cleanup_free_ char *home = NULL; + unsigned n_env = 2; + const char *envp[] = { + "PATH=" DEFAULT_PATH_SPLIT_USR, + "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */ + NULL, /* TERM */ + NULL, /* HOME */ + NULL, /* USER */ + NULL, /* LOGNAME */ + NULL, /* container_uuid */ + NULL, /* LISTEN_FDS */ + NULL, /* LISTEN_PID */ + NULL + }; + + char **env_use; + int r; + + assert(barrier); + assert(directory); + assert(kmsg_socket >= 0); + + if (arg_userns) { + /* Tell the parent, that it now can write the UID map. */ + (void) barrier_place(barrier); /* #1 */ + + /* Wait until the parent wrote the UID map */ + if (!barrier_place_and_sync(barrier)) { /* #2 */ + log_error("Parent died too early"); + return -ESRCH; + } + } + + r = mount_all(NULL, true); + if (r < 0) + return r; + + /* Wait until we are cgroup-ified, so that we + * can mount the right cgroup path writable */ + if (!barrier_place_and_sync(barrier)) { /* #3 */ + log_error("Parent died too early"); + return -ESRCH; + } + + r = mount_systemd_cgroup_writable(""); + if (r < 0) + return r; + + r = reset_uid_gid(); + if (r < 0) + return log_error_errno(r, "Couldn't become new root: %m"); + + r = setup_boot_id(NULL); + if (r < 0) + return r; + + r = setup_kmsg(NULL, kmsg_socket); + if (r < 0) + return r; + kmsg_socket = safe_close(kmsg_socket); + + umask(0022); + + if (setsid() < 0) + return log_error_errno(errno, "setsid() failed: %m"); + + if (arg_private_network) + loopback_setup(); + + r = send_rtnl(rtnl_socket); + if (r < 0) + return r; + rtnl_socket = safe_close(rtnl_socket); + + if (drop_capabilities() < 0) + return log_error_errno(errno, "drop_capabilities() failed: %m"); + + setup_hostname(); + + if (arg_personality != 0xffffffffLU) { + if (personality(arg_personality) < 0) + return log_error_errno(errno, "personality() failed: %m"); + } else if (secondary) { + if (personality(PER_LINUX32) < 0) + return log_error_errno(errno, "personality() failed: %m"); + } + +#ifdef HAVE_SELINUX + if (arg_selinux_context) + if (setexeccon((security_context_t) arg_selinux_context) < 0) + return log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context); +#endif + + r = change_uid_gid(&home); + if (r < 0) + return r; + + envp[n_env] = strv_find_prefix(environ, "TERM="); + if (envp[n_env]) + n_env ++; + + if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) || + (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) || + (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) + return log_oom(); + + if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) { + char as_uuid[37]; + + if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_format_as_uuid(arg_uuid, as_uuid)) < 0) + return log_oom(); + } + + if (fdset_size(fds) > 0) { + r = fdset_cloexec(fds, false); + if (r < 0) + return log_error_errno(r, "Failed to unset O_CLOEXEC for file descriptors."); + + if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", fdset_size(fds)) < 0) || + (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0)) + return log_oom(); + } + + if (!strv_isempty(arg_setenv)) { + char **n; + + n = strv_env_merge(2, envp, arg_setenv); + if (!n) + return log_oom(); + + env_use = n; + } else + env_use = (char**) envp; + + /* Let the parent know that we are ready and + * wait until the parent is ready with the + * setup, too... */ + if (!barrier_place_and_sync(barrier)) { /* #4 */ + log_error("Parent died too early"); + return -ESRCH; + } + + /* Now, explicitly close the log, so that we + * then can close all remaining fds. Closing + * the log explicitly first has the benefit + * that the logging subsystem knows about it, + * and is thus ready to be reopened should we + * need it again. Note that the other fds + * closed here are at least the locking and + * barrier fds. */ + log_close(); + (void) fdset_close_others(fds); + + if (arg_boot) { + char **a; + size_t m; + + /* Automatically search for the init system */ + + m = 1 + argc - optind; + a = newa(char*, m + 1); + memcpy(a + 1, argv + optind, m * sizeof(char*)); + + a[0] = (char*) "/usr/lib/systemd/systemd"; + execve(a[0], a, env_use); + + a[0] = (char*) "/lib/systemd/systemd"; + execve(a[0], a, env_use); + + a[0] = (char*) "/sbin/init"; + execve(a[0], a, env_use); + } else if (argc > optind) + execvpe(argv[optind], argv + optind, env_use); + else { + chdir(home ? home : "/root"); + execle("/bin/bash", "-bash", NULL, env_use); + execle("/bin/sh", "-sh", NULL, env_use); + } + + (void) log_open(); + return log_error_errno(errno, "execv() failed: %m"); +} + +static int outer_child( + Barrier *barrier, + const char *directory, + const char *console, + const char *root_device, bool root_device_rw, + const char *home_device, bool home_device_rw, + const char *srv_device, bool srv_device_rw, + bool interactive, + bool secondary, + int pid_socket, + int kmsg_socket, + int rtnl_socket, + FDSet *fds, + int argc, + char *argv[]) { + + pid_t pid; + ssize_t l; + int r; + + assert(barrier); + assert(directory); + assert(console); + assert(pid_socket >= 0); + assert(kmsg_socket >= 0); + + if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) + return log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m"); + + if (interactive) { + close_nointr(STDIN_FILENO); + close_nointr(STDOUT_FILENO); + close_nointr(STDERR_FILENO); + + r = open_terminal(console, O_RDWR); + if (r != STDIN_FILENO) { + if (r >= 0) { + safe_close(r); + r = -EINVAL; + } + + return log_error_errno(r, "Failed to open console: %m"); + } + + if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO || + dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) + return log_error_errno(errno, "Failed to duplicate console: %m"); + } + + r = reset_audit_loginuid(); + if (r < 0) + return r; + + /* Mark everything as slave, so that we still + * receive mounts from the real root, but don't + * propagate mounts to the real root. */ + if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) + return log_error_errno(errno, "MS_SLAVE|MS_REC failed: %m"); + + r = mount_devices(directory, + root_device, root_device_rw, + home_device, home_device_rw, + srv_device, srv_device_rw); + if (r < 0) + return r; + + r = determine_uid_shift(directory); + if (r < 0) + return r; + + /* Turn directory into bind mount */ + if (mount(directory, directory, NULL, MS_BIND|MS_REC, NULL) < 0) + return log_error_errno(errno, "Failed to make bind mount: %m"); + + access("alive12", F_OK); + + r = setup_volatile(directory); + if (r < 0) + return r; + + access("alive3", F_OK); + + r = setup_volatile_state(directory); + if (r < 0) + return r; + + access("alive4", F_OK); + + r = base_filesystem_create(directory, arg_uid_shift, (gid_t) arg_uid_shift); + if (r < 0) + return r; + + access("alive5", F_OK); + + if (arg_read_only) { + r = bind_remount_recursive(directory, true); + if (r < 0) + return log_error_errno(r, "Failed to make tree read-only: %m"); + } + + access("alive6", F_OK); + + r = mount_all(directory, false); + if (r < 0) + return r; + + if (copy_devnodes(directory) < 0) + return r; + + dev_setup(directory, arg_uid_shift, arg_uid_shift); + + if (setup_pts(directory) < 0) + return r; + + r = setup_propagate(directory); + if (r < 0) + return r; + + r = setup_dev_console(directory, console); + if (r < 0) + return r; + + r = setup_seccomp(); + if (r < 0) + return r; + + r = setup_timezone(directory); + if (r < 0) + return r; + + r = setup_resolv_conf(directory); + if (r < 0) + return r; + + r = setup_journal(directory); + if (r < 0) + return r; + + r = mount_custom(directory); + if (r < 0) + return r; + + r = mount_cgroup(directory); + if (r < 0) + return r; + + r = mount_move_root(directory); + if (r < 0) + return log_error_errno(r, "Failed to move root directory: %m"); + + pid = raw_clone(SIGCHLD|CLONE_NEWNS| + (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS) | + (arg_private_network ? CLONE_NEWNET : 0) | + (arg_userns ? CLONE_NEWUSER : 0), + NULL); + if (pid < 0) + return log_error_errno(errno, "Failed to fork inner child: %m"); + + if (pid == 0) { + pid_socket = safe_close(pid_socket); + + /* The inner child has all namespaces that are + * requested, so that we all are owned by the user if + * user namespaces are turned on. */ + + r = inner_child(barrier, directory, secondary, kmsg_socket, rtnl_socket, fds, argc, argv); + if (r < 0) + _exit(EXIT_FAILURE); + + _exit(EXIT_SUCCESS); + } + + l = send(pid_socket, &pid, sizeof(pid), MSG_NOSIGNAL); + if (l < 0) + return log_error_errno(errno, "Failed to send PID: %m"); + if (l != sizeof(pid)) { + log_error("Short write while sending PID."); + return -EIO; + } + + pid_socket = safe_close(pid_socket); + + return 0; +} + +static int setup_uid_map(pid_t pid) { + char uid_map[strlen("/proc//uid_map") + DECIMAL_STR_MAX(uid_t) + 1], line[DECIMAL_STR_MAX(uid_t)*3+3+1]; + int r; + + assert(pid > 1); + + xsprintf(uid_map, "/proc/" PID_FMT "/uid_map", pid); + xsprintf(line, UID_FMT " " UID_FMT " " UID_FMT "\n", 0, arg_uid_shift, arg_uid_range); + r = write_string_file(uid_map, line); + if (r < 0) + return log_error_errno(r, "Failed to write UID map: %m"); + + /* We always assign the same UID and GID ranges */ + xsprintf(uid_map, "/proc/" PID_FMT "/gid_map", pid); + r = write_string_file(uid_map, line); + if (r < 0) + return log_error_errno(r, "Failed to write GID map: %m"); + + return 0; +} + +static int chown_cgroup(pid_t pid) { + _cleanup_free_ char *path = NULL, *fs = NULL; + _cleanup_close_ int fd = -1; + const char *fn; + int r; + + r = cg_pid_get_path(NULL, pid, &path); + if (r < 0) + return log_error_errno(r, "Failed to get container cgroup path: %m"); + + r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, path, NULL, &fs); + if (r < 0) + return log_error_errno(r, "Failed to get file system path for container cgroup: %m"); + + fd = open(fs, O_RDONLY|O_CLOEXEC|O_DIRECTORY); + if (fd < 0) + return log_error_errno(errno, "Failed to open %s: %m", fs); + + FOREACH_STRING(fn, ".", "tasks", "notify_on_release", "cgroup.procs", "cgroup.clone_children") + if (fchownat(fd, fn, arg_uid_shift, arg_uid_shift, 0) < 0) + log_warning_errno(errno, "Failed to chown() cgroup file %s, ignoring: %m", fn); + + return 0; +} + int main(int argc, char *argv[]) { _cleanup_free_ char *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL, *console = NULL; @@ -4136,15 +4642,13 @@ int main(int argc, char *argv[]) { goto finish; } - r = determine_uid_shift(); - if (r < 0) - goto finish; - r = custom_mounts_prepare(); if (r < 0) goto finish; - interactive = isatty(STDIN_FILENO) > 0 && isatty(STDOUT_FILENO) > 0; + interactive = + isatty(STDIN_FILENO) > 0 && + isatty(STDOUT_FILENO) > 0; master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY); if (master < 0) { @@ -4174,14 +4678,25 @@ int main(int argc, char *argv[]) { assert_se(sigemptyset(&mask_chld) == 0); assert_se(sigaddset(&mask_chld, SIGCHLD) == 0); + if (prctl(PR_SET_CHILD_SUBREAPER, 1) < 0) { + r = log_error_errno(errno, "Failed to become subreaper: %m"); + goto finish; + } + for (;;) { - _cleanup_close_pair_ int kmsg_socket_pair[2] = { -1, -1 }, rtnl_socket_pair[2] = { -1, -1 }; + _cleanup_close_pair_ int kmsg_socket_pair[2] = { -1, -1 }, rtnl_socket_pair[2] = { -1, -1 }, pid_socket_pair[2] = { -1, -1 }; ContainerStatus container_status; _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL; - struct sigaction sa = { + static const struct sigaction sa = { .sa_handler = nop_handler, .sa_flags = SA_NOCLDSTOP, }; + int ifi = 0; + ssize_t l; + _cleanup_event_unref_ sd_event *event = NULL; + _cleanup_(pty_forward_freep) PTYForward *forward = NULL; + _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL; + char last_char = 0; r = barrier_create(&barrier); if (r < 0) { @@ -4199,6 +4714,11 @@ int main(int argc, char *argv[]) { goto finish; } + if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, pid_socket_pair) < 0) { + r = log_error_errno(errno, "Failed to create pid socket pair: %m"); + goto finish; + } + /* Child can be killed before execv(), so handle SIGCHLD * in order to interrupt parent's blocking calls and * give it a chance to call wait() and terminate. */ @@ -4214,9 +4734,7 @@ int main(int argc, char *argv[]) { goto finish; } - pid = raw_clone(SIGCHLD|CLONE_NEWNS| - (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS)| - (arg_private_network ? CLONE_NEWNET : 0), NULL); + pid = raw_clone(SIGCHLD|CLONE_NEWNS, NULL); if (pid < 0) { if (errno == EINVAL) r = log_error_errno(errno, "clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m"); @@ -4227,457 +4745,191 @@ int main(int argc, char *argv[]) { } if (pid == 0) { - /* child */ - _cleanup_free_ char *home = NULL; - unsigned n_env = 2; - const char *envp[] = { - "PATH=" DEFAULT_PATH_SPLIT_USR, - "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */ - NULL, /* TERM */ - NULL, /* HOME */ - NULL, /* USER */ - NULL, /* LOGNAME */ - NULL, /* container_uuid */ - NULL, /* LISTEN_FDS */ - NULL, /* LISTEN_PID */ - NULL - }; - char **env_use; - + /* The outer child only has a file system namespace. */ barrier_set_role(&barrier, BARRIER_CHILD); - envp[n_env] = strv_find_prefix(environ, "TERM="); - if (envp[n_env]) - n_env ++; - master = safe_close(master); kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]); rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]); + pid_socket_pair[0] = safe_close(pid_socket_pair[0]); reset_all_signal_handlers(); reset_signal_mask(); - if (interactive) { - close_nointr(STDIN_FILENO); - close_nointr(STDOUT_FILENO); - close_nointr(STDERR_FILENO); - - r = open_terminal(console, O_RDWR); - if (r != STDIN_FILENO) { - if (r >= 0) { - safe_close(r); - r = -EINVAL; - } - - log_error_errno(r, "Failed to open console: %m"); - _exit(EXIT_FAILURE); - } - - if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO || - dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) { - log_error_errno(errno, "Failed to duplicate console: %m"); - _exit(EXIT_FAILURE); - } - } - - if (setsid() < 0) { - log_error_errno(errno, "setsid() failed: %m"); - _exit(EXIT_FAILURE); - } - - if (reset_audit_loginuid() < 0) - _exit(EXIT_FAILURE); - - if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) { - log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m"); - _exit(EXIT_FAILURE); - } - - if (arg_private_network) - loopback_setup(); - - /* Mark everything as slave, so that we still - * receive mounts from the real root, but don't - * propagate mounts to the real root. */ - if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) { - log_error_errno(errno, "MS_SLAVE|MS_REC failed: %m"); - _exit(EXIT_FAILURE); - } - - if (mount_devices(arg_directory, - root_device, root_device_rw, - home_device, home_device_rw, - srv_device, srv_device_rw) < 0) - _exit(EXIT_FAILURE); - - /* Turn directory into bind mount */ - if (mount(arg_directory, arg_directory, NULL, MS_BIND|MS_REC, NULL) < 0) { - log_error_errno(errno, "Failed to make bind mount: %m"); - _exit(EXIT_FAILURE); - } - - r = setup_volatile(arg_directory); - if (r < 0) - _exit(EXIT_FAILURE); - - if (setup_volatile_state(arg_directory) < 0) - _exit(EXIT_FAILURE); - - r = base_filesystem_create(arg_directory); - if (r < 0) - _exit(EXIT_FAILURE); - - if (arg_read_only) { - r = bind_remount_recursive(arg_directory, true); - if (r < 0) { - log_error_errno(r, "Failed to make tree read-only: %m"); - _exit(EXIT_FAILURE); - } - } - - if (mount_all(arg_directory) < 0) - _exit(EXIT_FAILURE); - - if (copy_devnodes(arg_directory) < 0) - _exit(EXIT_FAILURE); - - if (setup_ptmx(arg_directory) < 0) - _exit(EXIT_FAILURE); - - dev_setup(arg_directory); - - if (setup_propagate(arg_directory) < 0) - _exit(EXIT_FAILURE); - - if (setup_seccomp() < 0) - _exit(EXIT_FAILURE); - - if (setup_dev_console(arg_directory, console) < 0) - _exit(EXIT_FAILURE); - - if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0) - _exit(EXIT_FAILURE); - kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]); - - if (send_rtnl(rtnl_socket_pair[1]) < 0) - _exit(EXIT_FAILURE); - rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]); - - /* Tell the parent that we are ready, and that - * it can cgroupify us to that we lack access - * to certain devices and resources. */ - (void) barrier_place(&barrier); /* #1 */ - - if (setup_boot_id(arg_directory) < 0) - _exit(EXIT_FAILURE); - - if (setup_timezone(arg_directory) < 0) - _exit(EXIT_FAILURE); - - if (setup_resolv_conf(arg_directory) < 0) - _exit(EXIT_FAILURE); - - if (setup_journal(arg_directory) < 0) - _exit(EXIT_FAILURE); - - if (mount_custom(arg_directory) < 0) - _exit(EXIT_FAILURE); - - /* Wait until we are cgroup-ified, so that we - * can mount the right cgroup path writable */ - (void) barrier_place_and_sync(&barrier); /* #2 */ - - if (mount_cgroup(arg_directory) < 0) - _exit(EXIT_FAILURE); - - r = mount_move_root(arg_directory); - if (r < 0) { - log_error_errno(r, "Failed to move root directory: %m"); - _exit(EXIT_FAILURE); - } - - if (arg_userns) { - if (unshare(CLONE_NEWUSER) < 0) { - log_error_errno(errno, "unshare(CLONE_NEWUSER) failed: %m"); - _exit(EXIT_FAILURE); - } - - /* Tell the parent, that it now can - * write the UID map. */ - (void) barrier_place(&barrier); /* #3 */ - - /* Wait until the parent wrote the UID - * map */ - (void) barrier_place_and_sync(&barrier); /* #4 */ - } - - umask(0022); - - if (drop_capabilities() < 0) { - log_error_errno(errno, "drop_capabilities() failed: %m"); - _exit(EXIT_FAILURE); - } - - setup_hostname(); - - if (arg_personality != 0xffffffffLU) { - if (personality(arg_personality) < 0) { - log_error_errno(errno, "personality() failed: %m"); - _exit(EXIT_FAILURE); - } - } else if (secondary) { - if (personality(PER_LINUX32) < 0) { - log_error_errno(errno, "personality() failed: %m"); - _exit(EXIT_FAILURE); - } - } - -#ifdef HAVE_SELINUX - if (arg_selinux_context) - if (setexeccon((security_context_t) arg_selinux_context) < 0) { - log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context); - _exit(EXIT_FAILURE); - } -#endif - - r = change_uid_gid(&home); + r = outer_child(&barrier, + arg_directory, + console, + root_device, root_device_rw, + home_device, home_device_rw, + srv_device, srv_device_rw, + interactive, + secondary, + pid_socket_pair[1], + kmsg_socket_pair[1], + rtnl_socket_pair[1], + fds, + argc, argv); if (r < 0) _exit(EXIT_FAILURE); - if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) || - (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) || - (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) { - log_oom(); - _exit(EXIT_FAILURE); - } - - if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) { - char as_uuid[37]; - - if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_format_as_uuid(arg_uuid, as_uuid)) < 0) { - log_oom(); - _exit(EXIT_FAILURE); - } - } - - if (fdset_size(fds) > 0) { - r = fdset_cloexec(fds, false); - if (r < 0) { - log_error_errno(r, "Failed to unset O_CLOEXEC for file descriptors."); - _exit(EXIT_FAILURE); - } - - if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) || - (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0)) { - log_oom(); - _exit(EXIT_FAILURE); - } - } - - if (!strv_isempty(arg_setenv)) { - char **n; - - n = strv_env_merge(2, envp, arg_setenv); - if (!n) { - log_oom(); - _exit(EXIT_FAILURE); - } - - env_use = n; - } else - env_use = (char**) envp; - - /* Let the parent know that we are ready and - * wait until the parent is ready with the - * setup, too... */ - (void) barrier_place_and_sync(&barrier); /* #5 */ - - /* Now, explicitly close the log, so that we - * then can close all remaining fds. Closing - * the log explicitly first has the benefit - * that the logging subsystem knows about it, - * and is thus ready to be reopened should we - * need it again. Note that the other fds - * closed here are at least the locking and - * barrier fds. */ - log_close(); - (void) fdset_close_others(fds); - - if (arg_boot) { - char **a; - size_t l; - - /* Automatically search for the init system */ - - l = 1 + argc - optind; - a = newa(char*, l + 1); - memcpy(a + 1, argv + optind, l * sizeof(char*)); - - a[0] = (char*) "/usr/lib/systemd/systemd"; - execve(a[0], a, env_use); - - a[0] = (char*) "/lib/systemd/systemd"; - execve(a[0], a, env_use); - - a[0] = (char*) "/sbin/init"; - execve(a[0], a, env_use); - } else if (argc > optind) - execvpe(argv[optind], argv + optind, env_use); - else { - chdir(home ? home : "/root"); - execle("/bin/bash", "-bash", NULL, env_use); - execle("/bin/sh", "-sh", NULL, env_use); - } - - (void) log_open(); - log_error_errno(errno, "execv() failed: %m"); - _exit(EXIT_FAILURE); + _exit(EXIT_SUCCESS); } barrier_set_role(&barrier, BARRIER_PARENT); + fdset_free(fds); fds = NULL; kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]); rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]); + pid_socket_pair[1] = safe_close(pid_socket_pair[1]); - (void) barrier_place(&barrier); /* #1 */ + /* Wait for the outer child. */ + r = wait_for_terminate_and_warn("namespace helper", pid, NULL); + if (r < 0) + goto finish; + if (r != 0) { + r = -EIO; + goto finish; + } + pid = 0; - /* Wait for the most basic Child-setup to be done, - * before we add hardware to it, and place it in a - * cgroup. */ - if (barrier_sync(&barrier)) { /* #1 */ - int ifi = 0; + /* And now retrieve the PID of the inner child. */ + l = recv(pid_socket_pair[0], &pid, sizeof(pid), 0); + if (l < 0) { + r = log_error_errno(errno, "Failed to read inner child PID: %m"); + goto finish; + } + if (l != sizeof(pid)) { + log_error("Short read while reading inner child PID: %m"); + r = EIO; + goto finish; + } - r = move_network_interfaces(pid); - if (r < 0) - goto finish; + log_debug("Init process invoked as PID " PID_FMT, pid); - r = setup_veth(pid, veth_name, &ifi); - if (r < 0) + if (arg_userns) { + if (!barrier_place_and_sync(&barrier)) { /* #1 */ + log_error("Child died too early."); + r = -ESRCH; goto finish; + } - r = setup_bridge(veth_name, &ifi); + r = setup_uid_map(pid); if (r < 0) goto finish; - r = setup_macvlan(pid); - if (r < 0) - goto finish; + (void) barrier_place(&barrier); /* #2 */ + } - r = setup_ipvlan(pid); - if (r < 0) - goto finish; + r = move_network_interfaces(pid); + if (r < 0) + goto finish; - r = register_machine(pid, ifi); - if (r < 0) - goto finish; + r = setup_veth(pid, veth_name, &ifi); + if (r < 0) + goto finish; - /* Notify the child that the parent is ready with all - * its setup, and that the child can now hand over - * control to the code to run inside the container. */ - (void) barrier_place(&barrier); /* #2 */ + r = setup_bridge(veth_name, &ifi); + if (r < 0) + goto finish; - if (arg_userns) { - char uid_map[strlen("/proc//uid_map") + DECIMAL_STR_MAX(uid_t) + 1], line[DECIMAL_STR_MAX(uid_t)*3+3+1]; + r = setup_macvlan(pid); + if (r < 0) + goto finish; - (void) barrier_place_and_sync(&barrier); /* #3 */ + r = setup_ipvlan(pid); + if (r < 0) + goto finish; - xsprintf(uid_map, "/proc/" PID_FMT "/uid_map", pid); - xsprintf(line, UID_FMT " " UID_FMT " " UID_FMT "\n", 0, arg_uid_shift, arg_uid_range); - r = write_string_file(uid_map, line); - if (r < 0) { - log_error_errno(r, "Failed to write UID map: %m"); - goto finish; - } + r = register_machine(pid, ifi); + if (r < 0) + goto finish; - /* We always assign the same UID and GID ranges */ - xsprintf(uid_map, "/proc/" PID_FMT "/gid_map", pid); - r = write_string_file(uid_map, line); - if (r < 0) { - log_error_errno(r, "Failed to write GID map: %m"); - goto finish; - } + r = chown_cgroup(pid); + if (r < 0) + goto finish; - (void) barrier_place(&barrier); /* #4 */ - } + /* Notify the child that the parent is ready with all + * its setup (including cgroup-ification), and that + * the child can now hand over control to the code to + * run inside the container. */ + (void) barrier_place(&barrier); /* #3 */ - /* Block SIGCHLD here, before notifying child. - * process_pty() will handle it with the other signals. */ - r = sigprocmask(SIG_BLOCK, &mask_chld, NULL); - if (r < 0) - goto finish; + /* Block SIGCHLD here, before notifying child. + * process_pty() will handle it with the other signals. */ + assert_se(sigprocmask(SIG_BLOCK, &mask_chld, NULL) >= 0); - /* Reset signal to default */ - r = default_signals(SIGCHLD, -1); - if (r < 0) - goto finish; + /* Reset signal to default */ + r = default_signals(SIGCHLD, -1); + if (r < 0) { + log_error_errno(r, "Failed to reset SIGCHLD: %m"); + goto finish; + } - /* Let the child know that we are ready and wait that the child is completely ready now. */ - if (barrier_place_and_sync(&barrier)) { /* #5 */ - _cleanup_event_unref_ sd_event *event = NULL; - _cleanup_(pty_forward_freep) PTYForward *forward = NULL; - _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL; - char last_char = 0; + /* Let the child know that we are ready and wait that the child is completely ready now. */ + if (!barrier_place_and_sync(&barrier)) { /* #5 */ + log_error("Client died too early."); + r = -ESRCH; + goto finish; + } - sd_notifyf(false, - "READY=1\n" - "STATUS=Container running.\n" - "X_NSPAWN_LEADER_PID=" PID_FMT, pid); + sd_notifyf(false, + "READY=1\n" + "STATUS=Container running.\n" + "X_NSPAWN_LEADER_PID=" PID_FMT, pid); - r = sd_event_new(&event); - if (r < 0) { - log_error_errno(r, "Failed to get default event source: %m"); - goto finish; - } + r = sd_event_new(&event); + if (r < 0) { + log_error_errno(r, "Failed to get default event source: %m"); + goto finish; + } - if (arg_kill_signal > 0) { - /* Try to kill the init system on SIGINT or SIGTERM */ - sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, UINT32_TO_PTR(pid)); - sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, UINT32_TO_PTR(pid)); - } else { - /* Immediately exit */ - sd_event_add_signal(event, NULL, SIGINT, NULL, NULL); - sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL); - } + if (arg_kill_signal > 0) { + /* Try to kill the init system on SIGINT or SIGTERM */ + sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, UINT32_TO_PTR(pid)); + sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, UINT32_TO_PTR(pid)); + } else { + /* Immediately exit */ + sd_event_add_signal(event, NULL, SIGINT, NULL, NULL); + sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL); + } - /* simply exit on sigchld */ - sd_event_add_signal(event, NULL, SIGCHLD, NULL, NULL); + /* simply exit on sigchld */ + sd_event_add_signal(event, NULL, SIGCHLD, NULL, NULL); - if (arg_expose_ports) { - r = watch_rtnl(event, rtnl_socket_pair[0], &exposed, &rtnl); - if (r < 0) - goto finish; + if (arg_expose_ports) { + r = watch_rtnl(event, rtnl_socket_pair[0], &exposed, &rtnl); + if (r < 0) + goto finish; - (void) expose_ports(rtnl, &exposed); - } + (void) expose_ports(rtnl, &exposed); + } - rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]); + rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]); - r = pty_forward_new(event, master, true, !interactive, &forward); - if (r < 0) { - log_error_errno(r, "Failed to create PTY forwarder: %m"); - goto finish; - } + r = pty_forward_new(event, master, true, !interactive, &forward); + if (r < 0) { + log_error_errno(r, "Failed to create PTY forwarder: %m"); + goto finish; + } - r = sd_event_loop(event); - if (r < 0) { - log_error_errno(r, "Failed to run event loop: %m"); - goto finish; - } + r = sd_event_loop(event); + if (r < 0) { + log_error_errno(r, "Failed to run event loop: %m"); + goto finish; + } - pty_forward_get_last_char(forward, &last_char); + pty_forward_get_last_char(forward, &last_char); - forward = pty_forward_free(forward); + forward = pty_forward_free(forward); - if (!arg_quiet && last_char != '\n') - putc('\n', stdout); + if (!arg_quiet && last_char != '\n') + putc('\n', stdout); - /* Kill if it is not dead yet anyway */ - terminate_machine(pid); - } - } + /* Kill if it is not dead yet anyway */ + terminate_machine(pid); /* Normally redundant, but better safe than sorry */ kill(pid, SIGKILL); @@ -4723,11 +4975,11 @@ finish: "STOPPING=1\n" "STATUS=Terminating..."); - loop_remove(loop_nr, &image_fd); - if (pid > 0) kill(pid, SIGKILL); + loop_remove(loop_nr, &image_fd); + if (remove_subvol && arg_directory) { int k; |