From 6402d5c628f1872a4874508bbe975aaac1cc747b Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Thu, 21 Apr 2016 12:47:36 +0200 Subject: util: copy_file_range() returns EBADF when used on a tty In nspawn we invoke copy_bytes() on a TTY fd. copy_file_range() returns EBADF on a TTY and this error is considered fatal by copy_bytes() so far. Correct that, so that nspawn's copy_bytes() operation works again. This is a follow-up for a44202e98b638024c45e50ad404c7069c7835c04. --- src/basic/copy.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'src') diff --git a/src/basic/copy.c b/src/basic/copy.c index e2db4be9ff..03487a6878 100644 --- a/src/basic/copy.c +++ b/src/basic/copy.c @@ -102,7 +102,7 @@ int copy_bytes(int fdf, int fdt, uint64_t max_bytes, bool try_reflink) { if (try_cfr) { n = try_copy_file_range(fdf, NULL, fdt, NULL, m, 0u); if (n < 0) { - if (!IN_SET(n, -EINVAL, -ENOSYS, -EXDEV)) + if (!IN_SET(n, -EINVAL, -ENOSYS, -EXDEV, -EBADF)) return n; try_cfr = false; -- cgit v1.2.3-54-g00ecf From 7336138eedf1c9b09b432428c4cccc2da25ab9e0 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Wed, 20 Apr 2016 22:53:39 +0200 Subject: nspawn: optionally fix up OS tree uid/gids for userns This adds a new --private-userns-chown switch that may be used in combination with --private-userns. If it is passed a recursive chmod() operation is run on the OS tree, fixing all file owner UID/GIDs to the right ranges. This should make user namespacing pretty workable, as the OS trees don't need to be prepared manually anymore. --- .gitignore | 1 + Makefile.am | 13 ++ src/nspawn/nspawn-patch-uid.c | 417 ++++++++++++++++++++++++++++++++++++++++++ src/nspawn/nspawn-patch-uid.h | 23 +++ src/nspawn/nspawn.c | 47 ++++- src/nspawn/test-patch-uid.c | 61 ++++++ 6 files changed, 560 insertions(+), 2 deletions(-) create mode 100644 src/nspawn/nspawn-patch-uid.c create mode 100644 src/nspawn/nspawn-patch-uid.h create mode 100644 src/nspawn/test-patch-uid.c (limited to 'src') diff --git a/.gitignore b/.gitignore index 02ba86ef6f..c7eb14452d 100644 --- a/.gitignore +++ b/.gitignore @@ -240,6 +240,7 @@ /test-ns /test-nss /test-parse-util +/test-patch-uid /test-path /test-path-lookup /test-path-util diff --git a/Makefile.am b/Makefile.am index 0f475c6d09..b323de55c6 100644 --- a/Makefile.am +++ b/Makefile.am @@ -3021,6 +3021,8 @@ systemd_nspawn_SOURCES = \ src/nspawn/nspawn-setuid.h \ src/nspawn/nspawn-stub-pid1.c \ src/nspawn/nspawn-stub-pid1.h \ + src/nspawn/nspawn-patch-uid.c \ + src/nspawn/nspawn-patch-uid.h \ src/core/mount-setup.c \ src/core/mount-setup.h \ src/core/loopback-setup.c \ @@ -3048,6 +3050,17 @@ systemd_nspawn_LDADD += \ libfirewall.la endif +test_patch_uid_SOURCES = \ + src/nspawn/nspawn-patch-uid.c \ + src/nspawn/nspawn-patch-uid.h \ + src/nspawn/test-patch-uid.c + +test_patch_uid_LDADD = \ + libshared.la + +manual_tests += \ + test-patch-uid + # ------------------------------------------------------------------------------ systemd_run_SOURCES = \ src/run/run.c diff --git a/src/nspawn/nspawn-patch-uid.c b/src/nspawn/nspawn-patch-uid.c new file mode 100644 index 0000000000..f53164fbbf --- /dev/null +++ b/src/nspawn/nspawn-patch-uid.c @@ -0,0 +1,417 @@ +/*** + This file is part of systemd. + + Copyright 2016 Lennart Poettering + + systemd is free software; you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published by + the Free Software Foundation; either version 2.1 of the License, or + (at your option) any later version. + + systemd is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with systemd; If not, see . +***/ + +#include +#ifdef HAVE_ACL +#include +#endif +#include +#include + +#include "acl-util.h" +#include "dirent-util.h" +#include "fd-util.h" +#include "nspawn-patch-uid.h" +#include "stdio-util.h" +#include "string-util.h" +#include "strv.h" +#include "user-util.h" + +#ifdef HAVE_ACL + +static int get_acl(int fd, const char *name, acl_type_t type, acl_t *ret) { + char procfs_path[strlen("/proc/self/fd/") + DECIMAL_STR_MAX(int) + 1]; + acl_t acl; + + assert(fd >= 0); + assert(ret); + + if (name) { + _cleanup_close_ int child_fd = -1; + + child_fd = openat(fd, name, O_PATH|O_CLOEXEC|O_NOFOLLOW); + if (child_fd < 0) + return -errno; + + xsprintf(procfs_path, "/proc/self/fd/%i", child_fd); + acl = acl_get_file(procfs_path, type); + } else if (type == ACL_TYPE_ACCESS) + acl = acl_get_fd(fd); + else { + xsprintf(procfs_path, "/proc/self/fd/%i", fd); + acl = acl_get_file(procfs_path, type); + } + if (!acl) + return -errno; + + *ret = acl; + return 0; +} + +static int set_acl(int fd, const char *name, acl_type_t type, acl_t acl) { + char procfs_path[strlen("/proc/self/fd/") + DECIMAL_STR_MAX(int) + 1]; + int r; + + assert(fd >= 0); + assert(acl); + + if (name) { + _cleanup_close_ int child_fd = -1; + + child_fd = openat(fd, name, O_PATH|O_CLOEXEC|O_NOFOLLOW); + if (child_fd < 0) + return -errno; + + xsprintf(procfs_path, "/proc/self/fd/%i", child_fd); + r = acl_set_file(procfs_path, type, acl); + } else if (type == ACL_TYPE_ACCESS) + r = acl_set_fd(fd, acl); + else { + xsprintf(procfs_path, "/proc/self/fd/%i", fd); + r = acl_set_file(procfs_path, type, acl); + } + if (r < 0) + return -errno; + + return 0; +} + +static int shift_acl(acl_t acl, uid_t shift, acl_t *ret) { + _cleanup_(acl_freep) acl_t copy = NULL; + acl_entry_t i; + int r; + + assert(acl); + assert(ret); + + r = acl_get_entry(acl, ACL_FIRST_ENTRY, &i); + if (r < 0) + return -errno; + while (r > 0) { + uid_t *old_uid, new_uid; + bool modify = false; + acl_tag_t tag; + + if (acl_get_tag_type(i, &tag) < 0) + return -errno; + + if (IN_SET(tag, ACL_USER, ACL_GROUP)) { + + /* We don't distuingish here between uid_t and gid_t, let's make sure the compiler checks that + * this is actually OK */ + assert_cc(sizeof(uid_t) == sizeof(gid_t)); + + old_uid = acl_get_qualifier(i); + if (!old_uid) + return -errno; + + new_uid = shift | (*old_uid & UINT32_C(0xFFFF)); + if (!uid_is_valid(new_uid)) + return -EINVAL; + + modify = new_uid != *old_uid; + if (modify && !copy) { + int n; + + /* There's no copy of the ACL yet? if so, let's create one, and start the loop from the + * beginning, so that we copy all entries, starting from the first, this time. */ + + n = acl_entries(acl); + if (n < 0) + return -errno; + + copy = acl_init(n); + if (!copy) + return -errno; + + /* Seek back to the beginning */ + r = acl_get_entry(acl, ACL_FIRST_ENTRY, &i); + if (r < 0) + return -errno; + continue; + } + } + + if (copy) { + acl_entry_t new_entry; + + if (acl_create_entry(©, &new_entry) < 0) + return -errno; + + if (acl_copy_entry(new_entry, i) < 0) + return -errno; + + if (modify) + if (acl_set_qualifier(new_entry, &new_uid) < 0) + return -errno; + } + + r = acl_get_entry(acl, ACL_NEXT_ENTRY, &i); + if (r < 0) + return -errno; + } + + *ret = copy; + copy = NULL; + + return !!*ret; +} + +static int patch_acls(int fd, const char *name, const struct stat *st, uid_t shift) { + _cleanup_(acl_freep) acl_t acl = NULL, shifted = NULL; + bool changed = false; + int r; + + assert(fd >= 0); + assert(st); + + /* ACLs are not supported on symlinks, there's no point in trying */ + if (S_ISLNK(st->st_mode)) + return 0; + + r = get_acl(fd, name, ACL_TYPE_ACCESS, &acl); + if (r == -EOPNOTSUPP) + return 0; + if (r < 0) + return r; + + r = shift_acl(acl, shift, &shifted); + if (r < 0) + return r; + if (r > 0) { + r = set_acl(fd, name, ACL_TYPE_ACCESS, shifted); + if (r < 0) + return r; + + changed = true; + } + + if (S_ISDIR(st->st_mode)) { + acl_free(acl); + acl_free(shifted); + + acl = shifted = NULL; + + r = get_acl(fd, name, ACL_TYPE_DEFAULT, &acl); + if (r < 0) + return r; + + r = shift_acl(acl, shift, &shifted); + if (r < 0) + return r; + if (r > 0) { + r = set_acl(fd, name, ACL_TYPE_DEFAULT, shifted); + if (r < 0) + return r; + + changed = true; + } + } + + return changed; +} + +#else + +static int patch_acls(int fd, const char *name, const struct stat *st, uid_t shift) { + return 0; +} + +#endif + +static int patch_fd(int fd, const char *name, const struct stat *st, uid_t shift) { + uid_t new_uid; + gid_t new_gid; + bool changed = false; + int r; + + assert(fd >= 0); + assert(st); + + new_uid = shift | (st->st_uid & UINT32_C(0xFFFF)); + new_gid = (gid_t) shift | (st->st_gid & UINT32_C(0xFFFF)); + + if (!uid_is_valid(new_uid) || !gid_is_valid(new_gid)) + return -EINVAL; + + if (st->st_uid != new_uid || st->st_gid != new_gid) { + if (name) + r = fchownat(fd, name, new_uid, new_gid, AT_SYMLINK_NOFOLLOW); + else + r = fchown(fd, new_uid, new_gid); + if (r < 0) + return -errno; + + /* The Linux kernel alters the mode in some cases of chown(). Let's undo this. */ + if (name && !S_ISLNK(st->st_mode)) + r = fchmodat(fd, name, st->st_mode, 0); + else + r = fchmod(fd, st->st_mode); + if (r < 0) + return -errno; + + changed = true; + } + + r = patch_acls(fd, name, st, shift); + if (r < 0) + return r; + + return r > 0 || changed; +} + +static int recurse_fd(int fd, bool donate_fd, const struct stat *st, uid_t shift) { + bool changed = false; + int r; + + assert(fd >= 0); + + r = patch_fd(fd, NULL, st, shift); + if (r < 0) + goto finish; + + if (S_ISDIR(st->st_mode)) { + _cleanup_closedir_ DIR *d = NULL; + struct dirent *de; + + if (!donate_fd) { + int copy; + + copy = fcntl(fd, F_DUPFD_CLOEXEC, 3); + if (copy < 0) + return -errno; + + fd = copy; + donate_fd = true; + } + + d = fdopendir(fd); + if (!d) { + r = -errno; + goto finish; + } + fd = -1; + + FOREACH_DIRENT_ALL(de, d, r = -errno; goto finish) { + struct stat fst; + + if (STR_IN_SET(de->d_name, ".", "..")) + continue; + + if (fstatat(dirfd(d), de->d_name, &fst, AT_SYMLINK_NOFOLLOW) < 0) { + r = -errno; + goto finish; + } + + if (S_ISDIR(fst.st_mode)) { + int subdir_fd; + + subdir_fd = openat(dirfd(d), de->d_name, O_RDONLY|O_NONBLOCK|O_DIRECTORY|O_CLOEXEC|O_NOFOLLOW|O_NOATIME); + if (subdir_fd < 0) { + r = -errno; + goto finish; + + } + + r = recurse_fd(subdir_fd, true, &fst, shift); + if (r < 0) + goto finish; + if (r > 0) + changed = true; + + } else { + r = patch_fd(dirfd(d), de->d_name, &fst, shift); + if (r < 0) + goto finish; + if (r > 0) + changed = true; + } + } + } + + r = changed; + +finish: + if (donate_fd) + safe_close(fd); + + return r; +} + +static int fd_patch_uid_internal(int fd, bool donate_fd, uid_t shift, uid_t range) { + struct stat st; + int r; + + assert(fd >= 0); + + /* Recursively adjusts the UID/GIDs of all files of a directory tree. This is used to automatically fix up an + * OS tree to the used user namespace UID range. Note that this automatic adjustment only works for UID ranges + * following the concept that the upper 16bit of a UID identify the container, and the lower 16bit are the actual + * UID within the container. */ + + if ((shift & 0xFFFF) != 0) { + /* We only support containers where the shift starts at a 2^16 boundary */ + r = -EOPNOTSUPP; + goto finish; + } + + if (range != 0x10000) { + /* We only support containers with 16bit UID ranges for the patching logic */ + r = -EOPNOTSUPP; + goto finish; + } + + if (fstat(fd, &st) < 0) { + r = -errno; + goto finish; + } + + if ((uint32_t) st.st_uid >> 16 != (uint32_t) st.st_gid >> 16) { + /* We only support containers where the uid/gid container ID match */ + r = -EBADE; + goto finish; + } + + /* Try to detect if the range is already right. Of course, this a pretty drastic optimization, as we assume + * that if the top-level dir has the right upper 16bit assigned, then everything below will have too... */ + if (((uint32_t) (st.st_uid ^ shift) >> 16) == 0) + return 0; + + return recurse_fd(fd, donate_fd, &st, shift); + +finish: + if (donate_fd) + safe_close(fd); + + return r; +} + +int fd_patch_uid(int fd, uid_t shift, uid_t range) { + return fd_patch_uid_internal(fd, false, shift, range); +} + +int path_patch_uid(const char *path, uid_t shift, uid_t range) { + int fd; + + fd = open(path, O_RDONLY|O_NONBLOCK|O_DIRECTORY|O_CLOEXEC|O_NOFOLLOW|O_NOATIME); + if (fd < 0) + return -errno; + + return fd_patch_uid_internal(fd, true, shift, range); +} diff --git a/src/nspawn/nspawn-patch-uid.h b/src/nspawn/nspawn-patch-uid.h new file mode 100644 index 0000000000..55d0990016 --- /dev/null +++ b/src/nspawn/nspawn-patch-uid.h @@ -0,0 +1,23 @@ +/*** + This file is part of systemd. + + Copyright 2016 Lennart Poettering + + systemd is free software; you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published by + the Free Software Foundation; either version 2.1 of the License, or + (at your option) any later version. + + systemd is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with systemd; If not, see . +***/ + +#include + +int fd_patch_uid(int fd, uid_t shift, uid_t range); +int path_patch_uid(const char *path, uid_t shift, uid_t range); diff --git a/src/nspawn/nspawn.c b/src/nspawn/nspawn.c index e1d37d383a..a3190545fa 100644 --- a/src/nspawn/nspawn.c +++ b/src/nspawn/nspawn.c @@ -75,6 +75,7 @@ #include "nspawn-expose-ports.h" #include "nspawn-mount.h" #include "nspawn-network.h" +#include "nspawn-patch-uid.h" #include "nspawn-register.h" #include "nspawn-settings.h" #include "nspawn-setuid.h" @@ -175,6 +176,7 @@ static ExposePort *arg_expose_ports = NULL; static char **arg_property = NULL; static uid_t arg_uid_shift = UID_INVALID, arg_uid_range = 0x10000U; static bool arg_userns = false; +static bool arg_userns_chown = false; static int arg_kill_signal = 0; static bool arg_unified_cgroup_hierarchy = false; static SettingsMask arg_settings_mask = 0; @@ -204,6 +206,7 @@ static void help(void) { " --property=NAME=VALUE Set scope unit property\n" " --private-users[=UIDBASE[:NUIDS]]\n" " Run within user namespace\n" + " --private-user-chown Adjust OS tree file ownership for private user range\n" " --private-network Disable network in container\n" " --network-interface=INTERFACE\n" " Assign an existing network interface to the\n" @@ -349,6 +352,7 @@ static int parse_argv(int argc, char *argv[]) { ARG_KILL_SIGNAL, ARG_SETTINGS, ARG_CHDIR, + ARG_PRIVATE_USERS_CHOWN, }; static const struct option options[] = { @@ -392,6 +396,7 @@ static int parse_argv(int argc, char *argv[]) { { "port", required_argument, NULL, 'p' }, { "property", required_argument, NULL, ARG_PROPERTY }, { "private-users", optional_argument, NULL, ARG_PRIVATE_USERS }, + { "private-users-chown", optional_argument, NULL, ARG_PRIVATE_USERS_CHOWN}, { "kill-signal", required_argument, NULL, ARG_KILL_SIGNAL }, { "settings", required_argument, NULL, ARG_SETTINGS }, { "chdir", required_argument, NULL, ARG_CHDIR }, @@ -825,6 +830,10 @@ static int parse_argv(int argc, char *argv[]) { arg_userns = true; break; + case ARG_PRIVATE_USERS_CHOWN: + arg_userns_chown = true; + break; + case ARG_KILL_SIGNAL: arg_kill_signal = signal_from_string_try_harder(optarg); if (arg_kill_signal < 0) { @@ -933,8 +942,15 @@ static int parse_argv(int argc, char *argv[]) { return -EINVAL; } - if (arg_userns && access("/proc/self/uid_map", F_OK) < 0) - return log_error_errno(EOPNOTSUPP, "--private-users= is not supported, kernel compiled without user namespace support."); + if (arg_userns && access("/proc/self/uid_map", F_OK) < 0) { + log_error("--private-users= is not supported, kernel compiled without user namespace support."); + return -EOPNOTSUPP; + } + + if (arg_userns_chown && arg_read_only) { + log_error("--read-only and --private-users-chown may not be combined."); + return -EINVAL; + } if (argc > optind) { arg_parameters = strv_copy(argv + optind); @@ -2218,6 +2234,29 @@ static int setup_machine_id(const char *directory) { return 0; } +static int recursive_chown(const char *directory, uid_t shift, uid_t range) { + int r; + + assert(directory); + + if (!arg_userns || !arg_userns_chown) + return 0; + + r = path_patch_uid(directory, arg_uid_shift, arg_uid_range); + if (r == -EOPNOTSUPP) + return log_error_errno(r, "Automatic UID/GID adjusting is only supported for UID/GID ranges starting at multiples of 2^16 with a range of 2^16."); + if (r == -EBADE) + return log_error_errno(r, "Upper 16 bits of root directory UID and GID do not match."); + if (r < 0) + return log_error_errno(r, "Failed to adjust UID/GID shift of OS tree: %m"); + if (r == 0) + log_debug("Root directory of image is already owned by the right UID/GID range, skipping recursive chown operation."); + else + log_debug("Patched directory tree to match UID/GID range."); + + return r; +} + static int mount_devices( const char *where, const char *root_device, bool root_device_rw, @@ -2763,6 +2802,10 @@ static int outer_child( if (mount(directory, directory, NULL, MS_BIND|MS_REC, NULL) < 0) return log_error_errno(errno, "Failed to make bind mount: %m"); + r = recursive_chown(directory, arg_uid_shift, arg_uid_range); + if (r < 0) + return r; + r = setup_volatile(directory, arg_volatile_mode, arg_userns, arg_uid_shift, arg_uid_range, arg_selinux_context); if (r < 0) return r; diff --git a/src/nspawn/test-patch-uid.c b/src/nspawn/test-patch-uid.c new file mode 100644 index 0000000000..11c5321788 --- /dev/null +++ b/src/nspawn/test-patch-uid.c @@ -0,0 +1,61 @@ +/*** + This file is part of systemd. + + Copyright 2016 Lennart Poettering + + systemd is free software; you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published by + the Free Software Foundation; either version 2.1 of the License, or + (at your option) any later version. + + systemd is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with systemd; If not, see . +***/ + +#include + +#include "log.h" +#include "nspawn-patch-uid.h" +#include "user-util.h" +#include "util.h" + +int main(int argc, char *argv[]) { + uid_t shift, range; + int r; + + log_set_max_level(LOG_DEBUG); + log_parse_environment(); + log_open(); + + if (argc != 4) { + log_error("Expected PATH SHIFT RANGE parameters."); + return EXIT_FAILURE; + } + + r = parse_uid(argv[2], &shift); + if (r < 0) { + log_error_errno(r, "Failed to parse UID shift %s.", argv[2]); + return EXIT_FAILURE; + } + + r = parse_gid(argv[3], &range); + if (r < 0) { + log_error_errno(r, "Failed to parse UID range %s.", argv[3]); + return EXIT_FAILURE; + } + + r = path_patch_uid(argv[1], shift, range); + if (r < 0) { + log_error_errno(r, "Failed to patch directory tree: %m"); + return EXIT_FAILURE; + } + + log_info("Changed: %s", yes_no(r)); + + return EXIT_SUCCESS; +} -- cgit v1.2.3-54-g00ecf From 0e7ac7515f2fe0782f4062bb223904e2748b535d Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Fri, 22 Apr 2016 11:28:09 +0200 Subject: nspawn: optionally, automatically allocate a UID/GID range for userns containers This adds the new value "pick" to --private-users=. When specified a new UID/GID range of 65536 users is automatically and randomly allocated from the host range 0x00080000-0xDFFF0000 and used for the container. The setting implies --private-users-chown, so that container directory is recursively chown()ed to the newly allocated UID/GID range, if that's necessary. As an optimization before picking a randomized UID/GID the UID of the container's root directory is used as starting point and used if currently not used otherwise. To protect against using the same UID/GID range multiple times a few mechanisms are in place: - The first and the last UID and GID of the range are checked with getpwuid() and getgrgid(). If an entry already exists a different range is picked. Note that by "last" UID the user 65534 is used, as 65535 is the 16bit (uid_t) -1. - A lock file for the range is taken in /run/systemd/nspawn-uid/. Since the ranges are taken in a non-overlapping fashion, and always start on 64K boundaries this allows us to maintain a single lock file for each range that can be randomly picked. This protects nspawn from picking the same range in two parallel instances. - If possible the /etc/passwd lock file is taken while a new range is selected until the container is up. This means adduser/addgroup should safely avoid the range as long as nss-mymachines is used, since the allocated range will then show up in the user database. The UID/GID range nspawn picks from is compiled in and not configurable at the moment. That should probably stay that way, since we already provide ways how users can pick their own ranges manually if they don't like the automatic logic. The new --private-users=pick logic makes user namespacing pretty useful now, as it relieves the user from managing UID/GID ranges. --- src/nspawn/nspawn.c | 226 +++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 189 insertions(+), 37 deletions(-) (limited to 'src') diff --git a/src/nspawn/nspawn.c b/src/nspawn/nspawn.c index a3190545fa..c330456ff9 100644 --- a/src/nspawn/nspawn.c +++ b/src/nspawn/nspawn.c @@ -22,7 +22,9 @@ #endif #include #include +#include #include +#include #include #ifdef HAVE_SECCOMP #include @@ -102,6 +104,11 @@ #include "user-util.h" #include "util.h" +/* Note that devpts's gid= parameter parses GIDs as signed values, hence we stay away from the upper half of the 32bit + * UID range here */ +#define UID_SHIFT_PICK_MIN ((uid_t) UINT32_C(0x00080000)) +#define UID_SHIFT_PICK_MAX ((uid_t) UINT32_C(0x6FFF0000)) + typedef enum ContainerStatus { CONTAINER_TERMINATED, CONTAINER_REBOOTED @@ -174,9 +181,10 @@ static char *arg_image = NULL; static VolatileMode arg_volatile_mode = VOLATILE_NO; static ExposePort *arg_expose_ports = NULL; static char **arg_property = NULL; -static uid_t arg_uid_shift = UID_INVALID, arg_uid_range = 0x10000U; static bool arg_userns = false; static bool arg_userns_chown = false; +static uid_t arg_uid_shift = UID_INVALID, arg_uid_range = 0x10000U; +static bool arg_uid_shift_pick = false; static int arg_kill_signal = 0; static bool arg_unified_cgroup_hierarchy = false; static SettingsMask arg_settings_mask = 0; @@ -275,9 +283,15 @@ static int custom_mounts_prepare(void) { for (i = 0; i < arg_n_custom_mounts; i++) { CustomMount *m = &arg_custom_mounts[i]; - if (arg_userns && arg_uid_shift == UID_INVALID && path_equal(m->destination, "/")) { - log_error("--private-users with automatic UID shift may not be combined with custom root mounts."); - return -EINVAL; + if (path_equal(m->destination, "/") && arg_userns) { + + if (arg_userns_chown) { + log_error("--private-users-chown may not be combined with custom root mounts."); + return -EINVAL; + } else if (arg_uid_shift == UID_INVALID) { + log_error("--private-users with automatic UID shift may not be combined with custom root mounts."); + return -EINVAL; + } } if (m->type != CUSTOM_MOUNT_OVERLAY) @@ -806,25 +820,37 @@ static int parse_argv(int argc, char *argv[]) { _cleanup_free_ char *buffer = NULL; const char *range, *shift; - range = strchr(optarg, ':'); - if (range) { - buffer = strndup(optarg, range - optarg); - if (!buffer) - return log_oom(); - shift = buffer; - - range++; - if (safe_atou32(range, &arg_uid_range) < 0 || arg_uid_range <= 0) { - log_error("Failed to parse UID range: %s", range); + if (streq(optarg, "pick")) { + arg_uid_shift = UID_INVALID; + arg_uid_range = 0x10000U; + arg_uid_shift_pick = true; + } else { + range = strchr(optarg, ':'); + if (range) { + buffer = strndup(optarg, range - optarg); + if (!buffer) + return log_oom(); + shift = buffer; + + range++; + if (safe_atou32(range, &arg_uid_range) < 0 || arg_uid_range <= 0) { + log_error("Failed to parse UID range: %s", range); + return -EINVAL; + } + } else + shift = optarg; + + if (parse_uid(shift, &arg_uid_shift) < 0) { + log_error("Failed to parse UID: %s", optarg); return -EINVAL; } - } else - shift = optarg; - if (parse_uid(shift, &arg_uid_shift) < 0) { - log_error("Failed to parse UID: %s", optarg); - return -EINVAL; + arg_uid_shift_pick = false; } + } else { + arg_uid_shift = UID_INVALID; + arg_uid_range = 0x10000U; + arg_uid_shift_pick = false; } arg_userns = true; @@ -902,6 +928,9 @@ static int parse_argv(int argc, char *argv[]) { if (arg_share_system) arg_register = false; + if (arg_uid_shift_pick) + arg_userns_chown = true; + if (arg_start_mode != START_PID1 && arg_share_system) { log_error("--boot and --share-system may not be combined."); return -EINVAL; @@ -2501,7 +2530,6 @@ static int determine_uid_shift(const char *directory) { return -EINVAL; } - log_info("Using user namespaces with base " UID_FMT " and range " UID_FMT ".", arg_uid_shift, arg_uid_range); return 0; } @@ -2789,6 +2817,7 @@ static int outer_child( return r; if (arg_userns) { + /* Let the parent know which UID shift we read from the image */ l = send(uid_shift_socket, &arg_uid_shift, sizeof(arg_uid_shift), MSG_NOSIGNAL); if (l < 0) return log_error_errno(errno, "Failed to send UID shift: %m"); @@ -2796,6 +2825,22 @@ static int outer_child( log_error("Short write while sending UID shift."); return -EIO; } + + if (arg_uid_shift_pick) { + /* When we are supposed to pick the UID shift, the parent will check now whether the UID shift + * we just read from the image is available. If yes, it will send the UID shift back to us, if + * not it will pick a different one, and send it back to us. */ + + l = recv(uid_shift_socket, &arg_uid_shift, sizeof(arg_uid_shift), 0); + if (l < 0) + return log_error_errno(errno, "Failed to recv UID shift: %m"); + if (l != sizeof(arg_uid_shift)) { + log_error("Short read while recieving UID shift."); + return -EIO; + } + } + + log_info("Selected user namespace base " UID_FMT " and range " UID_FMT ".", arg_uid_shift, arg_uid_range); } /* Turn directory into bind mount */ @@ -2925,6 +2970,61 @@ static int outer_child( return 0; } +static int uid_shift_pick(uid_t *shift, LockFile *ret_lock_file) { + unsigned n_tries = 100; + uid_t candidate; + int r; + + assert(shift); + assert(ret_lock_file); + assert(arg_uid_shift_pick); + assert(arg_uid_range == 0x10000U); + + candidate = *shift; + + (void) mkdir("/run/systemd/nspawn-uid", 0755); + + for (;;) { + char lock_path[strlen("/run/systemd/nspawn-uid/") + DECIMAL_STR_MAX(uid_t) + 1]; + _cleanup_release_lock_file_ LockFile lf = LOCK_FILE_INIT; + + if (--n_tries <= 0) + return -EBUSY; + + if (candidate < UID_SHIFT_PICK_MIN || candidate > UID_SHIFT_PICK_MAX) + goto next; + if ((candidate & UINT32_C(0xFFFF)) != 0) + goto next; + + xsprintf(lock_path, "/run/systemd/nspawn-uid/" UID_FMT, candidate); + r = make_lock_file(lock_path, LOCK_EX|LOCK_NB, &lf); + if (r == -EBUSY) /* Range already taken by another nspawn instance */ + goto next; + if (r < 0) + return r; + + /* Make some superficial checks whether the range is currently known in the user database */ + if (getpwuid(candidate)) + goto next; + if (getpwuid(candidate + UINT32_C(0xFFFE))) + goto next; + if (getgrgid(candidate)) + goto next; + if (getgrgid(candidate + UINT32_C(0xFFFE))) + goto next; + + *ret_lock_file = lf; + lf = (struct LockFile) LOCK_FILE_INIT; + *shift = candidate; + return 0; + + next: + random_bytes(&candidate, sizeof(candidate)); + candidate = (candidate % (UID_SHIFT_PICK_MAX - UID_SHIFT_PICK_MIN)) + UID_SHIFT_PICK_MIN; + candidate &= (uid_t) UINT32_C(0xFFFF0000); + } +} + static int setup_uid_map(pid_t pid) { char uid_map[strlen("/proc//uid_map") + DECIMAL_STR_MAX(uid_t) + 1], line[DECIMAL_STR_MAX(uid_t)*3+3+1]; int r; @@ -3394,20 +3494,42 @@ int main(int argc, char *argv[]) { } for (;;) { - _cleanup_close_pair_ int kmsg_socket_pair[2] = { -1, -1 }, rtnl_socket_pair[2] = { -1, -1 }, - pid_socket_pair[2] = { -1, -1 }, uuid_socket_pair[2] = { -1, -1 }, uid_shift_socket_pair[2] = { -1, -1 }; - ContainerStatus container_status; - _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL; static const struct sigaction sa = { .sa_handler = nop_signal_handler, .sa_flags = SA_NOCLDSTOP, }; - int ifi = 0; - ssize_t l; + + _cleanup_release_lock_file_ LockFile uid_shift_lock = LOCK_FILE_INIT; + _cleanup_close_ int etc_passwd_lock = -1; + _cleanup_close_pair_ int + kmsg_socket_pair[2] = { -1, -1 }, + rtnl_socket_pair[2] = { -1, -1 }, + pid_socket_pair[2] = { -1, -1 }, + uuid_socket_pair[2] = { -1, -1 }, + uid_shift_socket_pair[2] = { -1, -1 }; + _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL; _cleanup_(sd_event_unrefp) sd_event *event = NULL; _cleanup_(pty_forward_freep) PTYForward *forward = NULL; _cleanup_(sd_netlink_unrefp) sd_netlink *rtnl = NULL; + ContainerStatus container_status; char last_char = 0; + int ifi = 0; + ssize_t l; + + if (arg_uid_shift_pick) { + /* When we shall pick the UID/GID range, let's first lock /etc/passwd, so that we can safely + * check with getpwuid() if the specific user already exists. Note that /etc might be + * read-only, in which case this will fail with EROFS. But that's really OK, as in that case we + * can be reasonably sure that no users are going to be added. Note that getpwuid() checks are + * really just an extra safety net. We kinda assume that the UID range we allocate from is + * really ours. */ + + etc_passwd_lock = take_etc_passwd_lock(NULL); + if (etc_passwd_lock < 0 && etc_passwd_lock != -EROFS) { + log_error_errno(r, "Failed to take /etc/passwd lock: %m"); + goto finish; + } + } r = barrier_create(&barrier); if (r < 0) { @@ -3511,6 +3633,43 @@ int main(int argc, char *argv[]) { uuid_socket_pair[1] = safe_close(uuid_socket_pair[1]); uid_shift_socket_pair[1] = safe_close(uid_shift_socket_pair[1]); + if (arg_userns) { + /* The child just let us know the UID shift it might have read from the image. */ + l = recv(uid_shift_socket_pair[0], &arg_uid_shift, sizeof(arg_uid_shift), 0); + if (l < 0) { + r = log_error_errno(errno, "Failed to read UID shift: %m"); + goto finish; + } + if (l != sizeof(arg_uid_shift)) { + log_error("Short read while reading UID shift."); + r = EIO; + goto finish; + } + + if (arg_uid_shift_pick) { + /* If we are supposed to pick the UID shift, let's try to use the shift read from the + * image, but if that's already in use, pick a new one, and report back to the child, + * which one we now picked. */ + + r = uid_shift_pick(&arg_uid_shift, &uid_shift_lock); + if (r < 0) { + log_error_errno(r, "Failed to pick suitable UID/GID range: %m"); + goto finish; + } + + l = send(uid_shift_socket_pair[0], &arg_uid_shift, sizeof(arg_uid_shift), MSG_NOSIGNAL); + if (l < 0) { + r = log_error_errno(errno, "Failed to send UID shift: %m"); + goto finish; + } + if (l != sizeof(arg_uid_shift)) { + log_error("Short write while writing UID shift."); + r = -EIO; + goto finish; + } + } + } + /* Wait for the outer child. */ r = wait_for_terminate_and_warn("namespace helper", pid, NULL); if (r < 0) @@ -3554,17 +3713,6 @@ int main(int argc, char *argv[]) { goto finish; } - l = recv(uid_shift_socket_pair[0], &arg_uid_shift, sizeof(arg_uid_shift), 0); - if (l < 0) { - r = log_error_errno(errno, "Failed to read UID shift: %m"); - goto finish; - } - if (l != sizeof(arg_uid_shift)) { - log_error("Short read while reading UID shift."); - r = EIO; - goto finish; - } - r = setup_uid_map(pid); if (r < 0) goto finish; @@ -3662,6 +3810,10 @@ int main(int argc, char *argv[]) { goto finish; } + /* At this point we have made use of the UID we picked, and thus nss-mymachines will make them appear + * in getpwuid(), thus we can release the /etc/passwd lock. */ + etc_passwd_lock = safe_close(etc_passwd_lock); + sd_notifyf(false, "READY=1\n" "STATUS=Container running.\n" -- cgit v1.2.3-54-g00ecf From 19aac838fc3b7bcaed272f19a0bec3962eef7418 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Fri, 22 Apr 2016 11:47:35 +0200 Subject: nspawn: add -U as shortcut for --private-users=pick Given that user namespacing is pretty useful now, let's add a shortcut command line switch for the logic. --- src/nspawn/nspawn.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) (limited to 'src') diff --git a/src/nspawn/nspawn.c b/src/nspawn/nspawn.c index c330456ff9..3e32f59f75 100644 --- a/src/nspawn/nspawn.c +++ b/src/nspawn/nspawn.c @@ -212,9 +212,10 @@ static void help(void) { " --uuid=UUID Set a specific machine UUID for the container\n" " -S --slice=SLICE Place the container in the specified slice\n" " --property=NAME=VALUE Set scope unit property\n" + " -U --private-users=pick Run within user namespace, pick UID/GID range automatically\n" " --private-users[=UIDBASE[:NUIDS]]\n" - " Run within user namespace\n" - " --private-user-chown Adjust OS tree file ownership for private user range\n" + " Run within user namespace, user configured UID/GID range\n" + " --private-user-chown Adjust OS tree file ownership for private UID/GID range\n" " --private-network Disable network in container\n" " --network-interface=INTERFACE\n" " Assign an existing network interface to the\n" @@ -425,7 +426,7 @@ static int parse_argv(int argc, char *argv[]) { assert(argc >= 0); assert(argv); - while ((c = getopt_long(argc, argv, "+hD:u:abL:M:jS:Z:qi:xp:n", options, NULL)) >= 0) + while ((c = getopt_long(argc, argv, "+hD:u:abL:M:jS:Z:qi:xp:nU", options, NULL)) >= 0) switch (c) { @@ -860,6 +861,14 @@ static int parse_argv(int argc, char *argv[]) { arg_userns_chown = true; break; + case 'U': + arg_userns = true; + arg_userns_chown = true; + arg_uid_shift = UID_INVALID; + arg_uid_range = 0x10000U; + arg_uid_shift_pick = true; + break; + case ARG_KILL_SIGNAL: arg_kill_signal = signal_from_string_try_harder(optarg); if (arg_kill_signal < 0) { -- cgit v1.2.3-54-g00ecf From 0de7accea9e0f7b696f108a43bff7e454fa8f016 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Fri, 22 Apr 2016 13:02:53 +0200 Subject: nspawn: allow configuration of user namespaces in .nspawn files In order to implement this we change the bool arg_userns into an enum UserNamespaceMode, which can take one of NO, PICK or FIXED, and replace the arg_uid_range_pick bool with it. --- src/nspawn/nspawn-gperf.gperf | 4 +- src/nspawn/nspawn-settings.c | 86 ++++++++++++++++++++ src/nspawn/nspawn-settings.h | 15 +++- src/nspawn/nspawn.c | 183 ++++++++++++++++++++++++++++-------------- 4 files changed, 225 insertions(+), 63 deletions(-) (limited to 'src') diff --git a/src/nspawn/nspawn-gperf.gperf b/src/nspawn/nspawn-gperf.gperf index 116655cdd2..34e1310e29 100644 --- a/src/nspawn/nspawn-gperf.gperf +++ b/src/nspawn/nspawn-gperf.gperf @@ -16,7 +16,7 @@ struct ConfigPerfItem; %includes %% Exec.Boot, config_parse_boot, 0, 0 -Exec.ProcessTwo, config_parse_pid2, 0, 0, +Exec.ProcessTwo, config_parse_pid2, 0, 0 Exec.Parameters, config_parse_strv, 0, offsetof(Settings, parameters) Exec.Environment, config_parse_strv, 0, offsetof(Settings, environment) Exec.User, config_parse_string, 0, offsetof(Settings, user) @@ -26,11 +26,13 @@ Exec.KillSignal, config_parse_signal, 0, offsetof(Settings, Exec.Personality, config_parse_personality, 0, offsetof(Settings, personality) Exec.MachineID, config_parse_id128, 0, offsetof(Settings, machine_id) Exec.WorkingDirectory, config_parse_path, 0, offsetof(Settings, working_directory) +Exec.PrivateUsers, config_parse_private_users, 0, 0 Files.ReadOnly, config_parse_tristate, 0, offsetof(Settings, read_only) Files.Volatile, config_parse_volatile_mode, 0, offsetof(Settings, volatile_mode) Files.Bind, config_parse_bind, 0, 0 Files.BindReadOnly, config_parse_bind, 1, 0 Files.TemporaryFileSystem, config_parse_tmpfs, 0, 0 +Files.PrivateUsersChown, config_parse_tristate, 0, offsetof(Settings, userns_chown) Network.Private, config_parse_tristate, 0, offsetof(Settings, private_network) Network.Interface, config_parse_strv, 0, offsetof(Settings, network_interfaces) Network.MACVLAN, config_parse_strv, 0, offsetof(Settings, network_macvlan) diff --git a/src/nspawn/nspawn-settings.c b/src/nspawn/nspawn-settings.c index 4fb0054698..b98a79fd09 100644 --- a/src/nspawn/nspawn-settings.c +++ b/src/nspawn/nspawn-settings.c @@ -25,7 +25,9 @@ #include "parse-util.h" #include "process-util.h" #include "strv.h" +#include "user-util.h" #include "util.h" +#include "string-util.h" int settings_load(FILE *f, const char *path, Settings **ret) { _cleanup_(settings_freep) Settings *s = NULL; @@ -40,9 +42,13 @@ int settings_load(FILE *f, const char *path, Settings **ret) { s->start_mode = _START_MODE_INVALID; s->personality = PERSONALITY_INVALID; + s->userns_mode = _USER_NAMESPACE_MODE_INVALID; + s->uid_shift = UID_INVALID; + s->uid_range = UID_INVALID; s->read_only = -1; s->volatile_mode = _VOLATILE_MODE_INVALID; + s->userns_chown = -1; s->private_network = -1; s->network_veth = -1; @@ -59,6 +65,16 @@ int settings_load(FILE *f, const char *path, Settings **ret) { if (r < 0) return r; + /* Make sure that if userns_mode is set, userns_chown is set to something appropriate, and vice versa. Either + * both fields shall be initialized or neither. */ + if (s->userns_mode == USER_NAMESPACE_PICK) + s->userns_chown = true; + else if (s->userns_mode != _USER_NAMESPACE_MODE_INVALID && s->userns_chown < 0) + s->userns_chown = false; + + if (s->userns_chown >= 0 && s->userns_mode == _USER_NAMESPACE_MODE_INVALID) + s->userns_mode = USER_NAMESPACE_NO; + *ret = s; s = NULL; @@ -392,3 +408,73 @@ conflict: log_syntax(unit, LOG_ERR, filename, line, r, "Conflicting Boot= or ProcessTwo= setting found. Ignoring."); return 0; } + +int config_parse_private_users( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + Settings *settings = data; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + r = parse_boolean(rvalue); + if (r == 0) { + /* no: User namespacing off */ + settings->userns_mode = USER_NAMESPACE_NO; + settings->uid_shift = UID_INVALID; + settings->uid_range = UINT32_C(0x10000); + } else if (r > 0) { + /* yes: User namespacing on, UID range is read from root dir */ + settings->userns_mode = USER_NAMESPACE_FIXED; + settings->uid_shift = UID_INVALID; + settings->uid_range = UINT32_C(0x10000); + } else if (streq(rvalue, "pick")) { + /* pick: User namespacing on, UID range is picked randomly */ + settings->userns_mode = USER_NAMESPACE_PICK; + settings->uid_shift = UID_INVALID; + settings->uid_range = UINT32_C(0x10000); + } else { + const char *range, *shift; + uid_t sh, rn; + + /* anything else: User namespacing on, UID range is explicitly configured */ + + range = strchr(rvalue, ':'); + if (range) { + shift = strndupa(rvalue, range - rvalue); + range++; + + r = safe_atou32(range, &rn); + if (r < 0 || rn <= 0) { + log_syntax(unit, LOG_ERR, filename, line, r, "UID/GID range invalid, ignoring: %s", range); + return 0; + } + } else { + shift = rvalue; + rn = UINT32_C(0x10000); + } + + r = parse_uid(shift, &sh); + if (r < 0) { + log_syntax(unit, LOG_ERR, filename, line, r, "UID/GID shift invalid, ignoring: %s", range); + return 0; + } + + settings->userns_mode = USER_NAMESPACE_FIXED; + settings->uid_shift = sh; + settings->uid_range = rn; + } + + return 0; +} diff --git a/src/nspawn/nspawn-settings.h b/src/nspawn/nspawn-settings.h index a017405cd9..e12e91b886 100644 --- a/src/nspawn/nspawn-settings.h +++ b/src/nspawn/nspawn-settings.h @@ -33,6 +33,14 @@ typedef enum StartMode { _START_MODE_INVALID = -1 } StartMode; +typedef enum UserNamespaceMode { + USER_NAMESPACE_NO, + USER_NAMESPACE_FIXED, + USER_NAMESPACE_PICK, + _USER_NAMESPACE_MODE_MAX, + _USER_NAMESPACE_MODE_INVALID = -1, +} UserNamespaceMode; + typedef enum SettingsMask { SETTING_START_MODE = 1 << 0, SETTING_ENVIRONMENT = 1 << 1, @@ -47,7 +55,8 @@ typedef enum SettingsMask { SETTING_VOLATILE_MODE = 1 << 10, SETTING_CUSTOM_MOUNTS = 1 << 11, SETTING_WORKING_DIRECTORY = 1 << 12, - _SETTINGS_MASK_ALL = (1 << 13) -1 + SETTING_USERNS = 1 << 13, + _SETTINGS_MASK_ALL = (1 << 14) -1 } SettingsMask; typedef struct Settings { @@ -62,12 +71,15 @@ typedef struct Settings { unsigned long personality; sd_id128_t machine_id; char *working_directory; + UserNamespaceMode userns_mode; + uid_t uid_shift, uid_range; /* [Image] */ int read_only; VolatileMode volatile_mode; CustomMount *custom_mounts; unsigned n_custom_mounts; + int userns_chown; /* [Network] */ int private_network; @@ -99,3 +111,4 @@ int config_parse_tmpfs(const char *unit, const char *filename, unsigned line, co int config_parse_veth_extra(const char *unit, const char *filename, unsigned line, const char *section, unsigned section_line, const char *lvalue, int ltype, const char *rvalue, void *data, void *userdata); int config_parse_boot(const char *unit, const char *filename, unsigned line, const char *section, unsigned section_line, const char *lvalue, int ltype, const char *rvalue, void *data, void *userdata); int config_parse_pid2(const char *unit, const char *filename, unsigned line, const char *section, unsigned section_line, const char *lvalue, int ltype, const char *rvalue, void *data, void *userdata); +int config_parse_private_users(const char *unit, const char *filename, unsigned line, const char *section, unsigned section_line, const char *lvalue, int ltype, const char *rvalue, void *data, void *userdata); diff --git a/src/nspawn/nspawn.c b/src/nspawn/nspawn.c index 3e32f59f75..40e3d5a3fe 100644 --- a/src/nspawn/nspawn.c +++ b/src/nspawn/nspawn.c @@ -181,10 +181,9 @@ static char *arg_image = NULL; static VolatileMode arg_volatile_mode = VOLATILE_NO; static ExposePort *arg_expose_ports = NULL; static char **arg_property = NULL; -static bool arg_userns = false; -static bool arg_userns_chown = false; +static UserNamespaceMode arg_userns_mode = USER_NAMESPACE_NO; static uid_t arg_uid_shift = UID_INVALID, arg_uid_range = 0x10000U; -static bool arg_uid_shift_pick = false; +static bool arg_userns_chown = false; static int arg_kill_signal = 0; static bool arg_unified_cgroup_hierarchy = false; static SettingsMask arg_settings_mask = 0; @@ -284,7 +283,7 @@ static int custom_mounts_prepare(void) { for (i = 0; i < arg_n_custom_mounts; i++) { CustomMount *m = &arg_custom_mounts[i]; - if (path_equal(m->destination, "/") && arg_userns) { + if (path_equal(m->destination, "/") && arg_userns_mode != USER_NAMESPACE_NO) { if (arg_userns_chown) { log_error("--private-users-chown may not be combined with custom root mounts."); @@ -817,56 +816,67 @@ static int parse_argv(int argc, char *argv[]) { break; case ARG_PRIVATE_USERS: - if (optarg) { + + r = optarg ? parse_boolean(optarg) : 1; + if (r == 0) { + /* no: User namespacing off */ + arg_userns_mode = USER_NAMESPACE_NO; + arg_uid_shift = UID_INVALID; + arg_uid_range = UINT32_C(0x10000); + } else if (r > 0) { + /* yes: User namespacing on, UID range is read from root dir */ + arg_userns_mode = USER_NAMESPACE_FIXED; + arg_uid_shift = UID_INVALID; + arg_uid_range = UINT32_C(0x10000); + } else if (streq(optarg, "pick")) { + /* pick: User namespacing on, UID range is picked randomly */ + arg_userns_mode = USER_NAMESPACE_PICK; + arg_uid_shift = UID_INVALID; + arg_uid_range = UINT32_C(0x10000); + } else { _cleanup_free_ char *buffer = NULL; const char *range, *shift; - if (streq(optarg, "pick")) { - arg_uid_shift = UID_INVALID; - arg_uid_range = 0x10000U; - arg_uid_shift_pick = true; - } else { - range = strchr(optarg, ':'); - if (range) { - buffer = strndup(optarg, range - optarg); - if (!buffer) - return log_oom(); - shift = buffer; - - range++; - if (safe_atou32(range, &arg_uid_range) < 0 || arg_uid_range <= 0) { - log_error("Failed to parse UID range: %s", range); - return -EINVAL; - } - } else - shift = optarg; - - if (parse_uid(shift, &arg_uid_shift) < 0) { - log_error("Failed to parse UID: %s", optarg); + /* anything else: User namespacing on, UID range is explicitly configured */ + + range = strchr(optarg, ':'); + if (range) { + buffer = strndup(optarg, range - optarg); + if (!buffer) + return log_oom(); + shift = buffer; + + range++; + if (safe_atou32(range, &arg_uid_range) < 0 || arg_uid_range <= 0) { + log_error("Failed to parse UID range: %s", range); return -EINVAL; } + } else + shift = optarg; - arg_uid_shift_pick = false; + if (parse_uid(shift, &arg_uid_shift) < 0) { + log_error("Failed to parse UID: %s", optarg); + return -EINVAL; } - } else { - arg_uid_shift = UID_INVALID; - arg_uid_range = 0x10000U; - arg_uid_shift_pick = false; + + arg_userns_mode = USER_NAMESPACE_FIXED; } - arg_userns = true; + arg_settings_mask |= SETTING_USERNS; break; - case ARG_PRIVATE_USERS_CHOWN: - arg_userns_chown = true; + case 'U': + arg_userns_mode = USER_NAMESPACE_PICK; + arg_uid_shift = UID_INVALID; + arg_uid_range = UINT32_C(0x10000); + + arg_settings_mask |= SETTING_USERNS; break; - case 'U': - arg_userns = true; + case ARG_PRIVATE_USERS_CHOWN: arg_userns_chown = true; - arg_uid_shift = UID_INVALID; - arg_uid_range = 0x10000U; - arg_uid_shift_pick = true; + + arg_settings_mask |= SETTING_USERNS; break; case ARG_KILL_SIGNAL: @@ -937,7 +947,7 @@ static int parse_argv(int argc, char *argv[]) { if (arg_share_system) arg_register = false; - if (arg_uid_shift_pick) + if (arg_userns_mode == USER_NAMESPACE_PICK) arg_userns_chown = true; if (arg_start_mode != START_PID1 && arg_share_system) { @@ -980,7 +990,7 @@ static int parse_argv(int argc, char *argv[]) { return -EINVAL; } - if (arg_userns && access("/proc/self/uid_map", F_OK) < 0) { + if (arg_userns_mode != USER_NAMESPACE_NO && access("/proc/self/uid_map", F_OK) < 0) { log_error("--private-users= is not supported, kernel compiled without user namespace support."); return -EOPNOTSUPP; } @@ -1047,7 +1057,7 @@ static int verify_arguments(void) { static int userns_lchown(const char *p, uid_t uid, gid_t gid) { assert(p); - if (!arg_userns) + if (arg_userns_mode == USER_NAMESPACE_NO) return 0; if (uid == UID_INVALID && gid == GID_INVALID) @@ -2277,7 +2287,7 @@ static int recursive_chown(const char *directory, uid_t shift, uid_t range) { assert(directory); - if (!arg_userns || !arg_userns_chown) + if (arg_userns_mode == USER_NAMESPACE_NO || !arg_userns_chown) return 0; r = path_patch_uid(directory, arg_uid_shift, arg_uid_range); @@ -2512,7 +2522,7 @@ static int determine_names(void) { static int determine_uid_shift(const char *directory) { int r; - if (!arg_userns) { + if (arg_userns_mode == USER_NAMESPACE_NO) { arg_uid_shift = 0; return 0; } @@ -2575,7 +2585,7 @@ static int inner_child( cg_unified_flush(); - if (arg_userns) { + if (arg_userns_mode != USER_NAMESPACE_NO) { /* Tell the parent, that it now can write the UID map. */ (void) barrier_place(barrier); /* #1 */ @@ -2586,7 +2596,14 @@ static int inner_child( } } - r = mount_all(NULL, arg_userns, true, arg_uid_shift, arg_private_network, arg_uid_range, arg_selinux_apifs_context); + r = mount_all(NULL, + arg_userns_mode != USER_NAMESPACE_NO, + true, + arg_private_network, + arg_uid_shift, + arg_uid_range, + arg_selinux_apifs_context); + if (r < 0) return r; @@ -2825,7 +2842,7 @@ static int outer_child( if (r < 0) return r; - if (arg_userns) { + if (arg_userns_mode != USER_NAMESPACE_NO) { /* Let the parent know which UID shift we read from the image */ l = send(uid_shift_socket, &arg_uid_shift, sizeof(arg_uid_shift), MSG_NOSIGNAL); if (l < 0) @@ -2835,7 +2852,7 @@ static int outer_child( return -EIO; } - if (arg_uid_shift_pick) { + if (arg_userns_mode == USER_NAMESPACE_PICK) { /* When we are supposed to pick the UID shift, the parent will check now whether the UID shift * we just read from the image is available. If yes, it will send the UID shift back to us, if * not it will pick a different one, and send it back to us. */ @@ -2860,11 +2877,23 @@ static int outer_child( if (r < 0) return r; - r = setup_volatile(directory, arg_volatile_mode, arg_userns, arg_uid_shift, arg_uid_range, arg_selinux_context); + r = setup_volatile( + directory, + arg_volatile_mode, + arg_userns_mode != USER_NAMESPACE_NO, + arg_uid_shift, + arg_uid_range, + arg_selinux_context); if (r < 0) return r; - r = setup_volatile_state(directory, arg_volatile_mode, arg_userns, arg_uid_shift, arg_uid_range, arg_selinux_context); + r = setup_volatile_state( + directory, + arg_volatile_mode, + arg_userns_mode != USER_NAMESPACE_NO, + arg_uid_shift, + arg_uid_range, + arg_selinux_context); if (r < 0) return r; @@ -2878,7 +2907,13 @@ static int outer_child( return log_error_errno(r, "Failed to make tree read-only: %m"); } - r = mount_all(directory, arg_userns, false, arg_private_network, arg_uid_shift, arg_uid_range, arg_selinux_apifs_context); + r = mount_all(directory, + arg_userns_mode != USER_NAMESPACE_NO, + false, + arg_private_network, + arg_uid_shift, + arg_uid_range, + arg_selinux_apifs_context); if (r < 0) return r; @@ -2920,11 +2955,24 @@ static int outer_child( if (r < 0) return r; - r = mount_custom(directory, arg_custom_mounts, arg_n_custom_mounts, arg_userns, arg_uid_shift, arg_uid_range, arg_selinux_apifs_context); + r = mount_custom( + directory, + arg_custom_mounts, + arg_n_custom_mounts, + arg_userns_mode != USER_NAMESPACE_NO, + arg_uid_shift, + arg_uid_range, + arg_selinux_apifs_context); if (r < 0) return r; - r = mount_cgroups(directory, arg_unified_cgroup_hierarchy, arg_userns, arg_uid_shift, arg_uid_range, arg_selinux_apifs_context); + r = mount_cgroups( + directory, + arg_unified_cgroup_hierarchy, + arg_userns_mode != USER_NAMESPACE_NO, + arg_uid_shift, + arg_uid_range, + arg_selinux_apifs_context); if (r < 0) return r; @@ -2935,7 +2983,7 @@ static int outer_child( pid = raw_clone(SIGCHLD|CLONE_NEWNS| (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS) | (arg_private_network ? CLONE_NEWNET : 0) | - (arg_userns ? CLONE_NEWUSER : 0), + (arg_userns_mode != USER_NAMESPACE_NO ? CLONE_NEWUSER : 0), NULL); if (pid < 0) return log_error_errno(errno, "Failed to fork inner child: %m"); @@ -2986,7 +3034,7 @@ static int uid_shift_pick(uid_t *shift, LockFile *ret_lock_file) { assert(shift); assert(ret_lock_file); - assert(arg_uid_shift_pick); + assert(arg_userns_mode == USER_NAMESPACE_PICK); assert(arg_uid_range == 0x10000U); candidate = *shift; @@ -3265,6 +3313,19 @@ static int load_settings(void) { } } + if ((arg_settings_mask & SETTING_USERNS) == 0 && + settings->userns_mode != _USER_NAMESPACE_MODE_INVALID) { + + if (!arg_settings_trusted) + log_warning("Ignoring PrivateUsers= and PrivateUsersChown= settings, file %s is not trusted.", p); + else { + arg_userns_mode = settings->userns_mode; + arg_uid_shift = settings->uid_shift; + arg_uid_range = settings->uid_range; + arg_userns_chown = settings->userns_chown; + } + } + return 0; } @@ -3525,7 +3586,7 @@ int main(int argc, char *argv[]) { int ifi = 0; ssize_t l; - if (arg_uid_shift_pick) { + if (arg_userns_mode == USER_NAMESPACE_PICK) { /* When we shall pick the UID/GID range, let's first lock /etc/passwd, so that we can safely * check with getpwuid() if the specific user already exists. Note that /etc might be * read-only, in which case this will fail with EROFS. But that's really OK, as in that case we @@ -3566,7 +3627,7 @@ int main(int argc, char *argv[]) { goto finish; } - if (arg_userns) + if (arg_userns_mode != USER_NAMESPACE_NO) if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, uid_shift_socket_pair) < 0) { r = log_error_errno(errno, "Failed to create uid shift socket pair: %m"); goto finish; @@ -3642,7 +3703,7 @@ int main(int argc, char *argv[]) { uuid_socket_pair[1] = safe_close(uuid_socket_pair[1]); uid_shift_socket_pair[1] = safe_close(uid_shift_socket_pair[1]); - if (arg_userns) { + if (arg_userns_mode != USER_NAMESPACE_NO) { /* The child just let us know the UID shift it might have read from the image. */ l = recv(uid_shift_socket_pair[0], &arg_uid_shift, sizeof(arg_uid_shift), 0); if (l < 0) { @@ -3655,7 +3716,7 @@ int main(int argc, char *argv[]) { goto finish; } - if (arg_uid_shift_pick) { + if (arg_userns_mode == USER_NAMESPACE_PICK) { /* If we are supposed to pick the UID shift, let's try to use the shift read from the * image, but if that's already in use, pick a new one, and report back to the child, * which one we now picked. */ @@ -3715,7 +3776,7 @@ int main(int argc, char *argv[]) { log_debug("Init process invoked as PID " PID_FMT, pid); - if (arg_userns) { + if (arg_userns_mode != USER_NAMESPACE_NO) { if (!barrier_place_and_sync(&barrier)) { /* #1 */ log_error("Child died too early."); r = -ESRCH; -- cgit v1.2.3-54-g00ecf From ccabee0d6465f06b9d339cf449fd8eea0db13373 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Fri, 22 Apr 2016 14:10:09 +0200 Subject: nspawn: make -U a tiny bit smarter With this change -U will turn on user namespacing only if the kernel actually supports it and otherwise gracefully degrade to non-userns mode. --- man/systemd-nspawn.xml | 4 +++- src/basic/user-util.h | 5 +++++ src/nspawn/nspawn.c | 13 ++++++++----- 3 files changed, 16 insertions(+), 6 deletions(-) (limited to 'src') diff --git a/man/systemd-nspawn.xml b/man/systemd-nspawn.xml index ea0c6562f8..bd688a0ee1 100644 --- a/man/systemd-nspawn.xml +++ b/man/systemd-nspawn.xml @@ -444,7 +444,9 @@ - Equivalent to . + If the kernel supports the user namespaces feature, equivalent to + , otherwise equivalent to + . diff --git a/src/basic/user-util.h b/src/basic/user-util.h index c23f1d485d..8026eca3f4 100644 --- a/src/basic/user-util.h +++ b/src/basic/user-util.h @@ -21,6 +21,7 @@ #include #include +#include bool uid_is_valid(uid_t uid); @@ -63,3 +64,7 @@ int take_etc_passwd_lock(const char *root); #define PTR_TO_GID(p) ((gid_t) (((uintptr_t) (p))-1)) #define GID_TO_PTR(u) ((void*) (((uintptr_t) (u))+1)) + +static inline bool userns_supported(void) { + return access("/proc/self/uid_map", F_OK) >= 0; +} diff --git a/src/nspawn/nspawn.c b/src/nspawn/nspawn.c index 40e3d5a3fe..c8a7ec71a3 100644 --- a/src/nspawn/nspawn.c +++ b/src/nspawn/nspawn.c @@ -866,11 +866,14 @@ static int parse_argv(int argc, char *argv[]) { break; case 'U': - arg_userns_mode = USER_NAMESPACE_PICK; - arg_uid_shift = UID_INVALID; - arg_uid_range = UINT32_C(0x10000); + if (userns_supported()) { + arg_userns_mode = USER_NAMESPACE_PICK; + arg_uid_shift = UID_INVALID; + arg_uid_range = UINT32_C(0x10000); + + arg_settings_mask |= SETTING_USERNS; + } - arg_settings_mask |= SETTING_USERNS; break; case ARG_PRIVATE_USERS_CHOWN: @@ -990,7 +993,7 @@ static int parse_argv(int argc, char *argv[]) { return -EINVAL; } - if (arg_userns_mode != USER_NAMESPACE_NO && access("/proc/self/uid_map", F_OK) < 0) { + if (arg_userns_mode != USER_NAMESPACE_NO && !userns_supported()) { log_error("--private-users= is not supported, kernel compiled without user namespace support."); return -EOPNOTSUPP; } -- cgit v1.2.3-54-g00ecf From 88cd066e11aef5dd73b563c1753ad8bf4dfd9f62 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Fri, 22 Apr 2016 18:10:16 +0200 Subject: nspawn: don't try to patch UIDs/GIDs of procfs and suchlike --- src/basic/missing.h | 4 ++++ src/nspawn/nspawn-patch-uid.c | 44 +++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 46 insertions(+), 2 deletions(-) (limited to 'src') diff --git a/src/basic/missing.h b/src/basic/missing.h index 6616f0b720..b389e94cf7 100644 --- a/src/basic/missing.h +++ b/src/basic/missing.h @@ -445,6 +445,10 @@ struct btrfs_ioctl_quota_ctl_args { #define TMPFS_MAGIC 0x01021994 #endif +#ifndef MQUEUE_MAGIC +#define MQUEUE_MAGIC 0x19800202 +#endif + #ifndef MS_MOVE #define MS_MOVE 8192 #endif diff --git a/src/nspawn/nspawn-patch-uid.c b/src/nspawn/nspawn-patch-uid.c index f53164fbbf..429c45a3a7 100644 --- a/src/nspawn/nspawn-patch-uid.c +++ b/src/nspawn/nspawn-patch-uid.c @@ -18,16 +18,20 @@ ***/ #include +#include #ifdef HAVE_ACL #include #endif #include +#include #include #include "acl-util.h" #include "dirent-util.h" #include "fd-util.h" +#include "missing.h" #include "nspawn-patch-uid.h" +#include "stat-util.h" #include "stdio-util.h" #include "string-util.h" #include "strv.h" @@ -276,12 +280,46 @@ static int patch_fd(int fd, const char *name, const struct stat *st, uid_t shift return r > 0 || changed; } +static int is_procfs_sysfs_or_suchlike(int fd) { + struct statfs sfs; + + assert(fd >= 0); + + if (fstatfs(fd, &sfs) < 0) + return -errno; + + return F_TYPE_EQUAL(sfs.f_type, BINFMTFS_MAGIC) || + F_TYPE_EQUAL(sfs.f_type, CGROUP_SUPER_MAGIC) || + F_TYPE_EQUAL(sfs.f_type, CGROUP2_SUPER_MAGIC) || + F_TYPE_EQUAL(sfs.f_type, DEBUGFS_MAGIC) || + F_TYPE_EQUAL(sfs.f_type, DEVPTS_SUPER_MAGIC) || + F_TYPE_EQUAL(sfs.f_type, EFIVARFS_MAGIC) || + F_TYPE_EQUAL(sfs.f_type, HUGETLBFS_MAGIC) || + F_TYPE_EQUAL(sfs.f_type, MQUEUE_MAGIC) || + F_TYPE_EQUAL(sfs.f_type, PROC_SUPER_MAGIC) || + F_TYPE_EQUAL(sfs.f_type, PSTOREFS_MAGIC) || + F_TYPE_EQUAL(sfs.f_type, SELINUX_MAGIC) || + F_TYPE_EQUAL(sfs.f_type, SMACK_MAGIC) || + F_TYPE_EQUAL(sfs.f_type, SYSFS_MAGIC); +} + static int recurse_fd(int fd, bool donate_fd, const struct stat *st, uid_t shift) { bool changed = false; int r; assert(fd >= 0); + /* We generally want to permit crossing of mount boundaries when patching the UIDs/GIDs. However, we + * probably shouldn't do this for /proc and /sys if that is already mounted into place. Hence, let's + * stop the recursion when we hit a procfs or sysfs file system. */ + r = is_procfs_sysfs_or_suchlike(fd); + if (r < 0) + goto finish; + if (r > 0) { + r = 0; /* don't recurse */ + goto finish; + } + r = patch_fd(fd, NULL, st, shift); if (r < 0) goto finish; @@ -294,8 +332,10 @@ static int recurse_fd(int fd, bool donate_fd, const struct stat *st, uid_t shift int copy; copy = fcntl(fd, F_DUPFD_CLOEXEC, 3); - if (copy < 0) - return -errno; + if (copy < 0) { + r = -errno; + goto finish; + } fd = copy; donate_fd = true; -- cgit v1.2.3-54-g00ecf From 4aeb20f5aaec25ef969989b64d37377913b2a1ef Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Mon, 25 Apr 2016 12:48:05 +0200 Subject: nspawn: when readjusting UID/GID ownership of OS trees, skip read-only subtrees This should allow tools like rkt to pre-mount read-only subtrees in the OS tree, without breaking the patching code. Note that the code will still fail, if the top-level directory is already read-only. --- src/basic/fd-util.c | 10 ++++++++++ src/basic/fd-util.h | 2 ++ src/nspawn/nspawn-patch-uid.c | 18 +++++++++++++++--- 3 files changed, 27 insertions(+), 3 deletions(-) (limited to 'src') diff --git a/src/basic/fd-util.c b/src/basic/fd-util.c index ec9560cd07..3d46d708c7 100644 --- a/src/basic/fd-util.c +++ b/src/basic/fd-util.c @@ -25,11 +25,13 @@ #include #include "fd-util.h" +#include "fs-util.h" #include "macro.h" #include "missing.h" #include "parse-util.h" #include "path-util.h" #include "socket-util.h" +#include "stdio-util.h" #include "util.h" int close_nointr(int fd) { @@ -356,3 +358,11 @@ bool fdname_is_valid(const char *s) { return p - s < 256; } + +int fd_get_path(int fd, char **ret) { + char procfs_path[strlen("/proc/self/fd/") + DECIMAL_STR_MAX(int)]; + + xsprintf(procfs_path, "/proc/self/fd/%i", fd); + + return readlink_malloc(procfs_path, ret); +} diff --git a/src/basic/fd-util.h b/src/basic/fd-util.h index 44528c6e35..b86e41698a 100644 --- a/src/basic/fd-util.h +++ b/src/basic/fd-util.h @@ -72,6 +72,8 @@ void cmsg_close_all(struct msghdr *mh); bool fdname_is_valid(const char *s); +int fd_get_path(int fd, char **ret); + /* Hint: ENETUNREACH happens if we try to connect to "non-existing" special IP addresses, such as ::5 */ #define ERRNO_IS_DISCONNECT(r) \ IN_SET(r, ENOTCONN, ECONNRESET, ECONNREFUSED, ECONNABORTED, EPIPE, ENETUNREACH) diff --git a/src/nspawn/nspawn-patch-uid.c b/src/nspawn/nspawn-patch-uid.c index 429c45a3a7..c7382d412d 100644 --- a/src/nspawn/nspawn-patch-uid.c +++ b/src/nspawn/nspawn-patch-uid.c @@ -303,7 +303,7 @@ static int is_procfs_sysfs_or_suchlike(int fd) { F_TYPE_EQUAL(sfs.f_type, SYSFS_MAGIC); } -static int recurse_fd(int fd, bool donate_fd, const struct stat *st, uid_t shift) { +static int recurse_fd(int fd, bool donate_fd, const struct stat *st, uid_t shift, bool is_toplevel) { bool changed = false; int r; @@ -321,6 +321,18 @@ static int recurse_fd(int fd, bool donate_fd, const struct stat *st, uid_t shift } r = patch_fd(fd, NULL, st, shift); + if (r == -EROFS) { + _cleanup_free_ char *name = NULL; + + if (!is_toplevel) { + /* When we hit a ready-only subtree we simply skip it, but log about it. */ + (void) fd_get_path(fd, &name); + log_debug("Skippping read-only file or directory %s.", strna(name)); + r = 0; + } + + goto finish; + } if (r < 0) goto finish; @@ -369,7 +381,7 @@ static int recurse_fd(int fd, bool donate_fd, const struct stat *st, uid_t shift } - r = recurse_fd(subdir_fd, true, &fst, shift); + r = recurse_fd(subdir_fd, true, &fst, shift, false); if (r < 0) goto finish; if (r > 0) @@ -433,7 +445,7 @@ static int fd_patch_uid_internal(int fd, bool donate_fd, uid_t shift, uid_t rang if (((uint32_t) (st.st_uid ^ shift) >> 16) == 0) return 0; - return recurse_fd(fd, donate_fd, &st, shift); + return recurse_fd(fd, donate_fd, &st, shift, true); finish: if (donate_fd) -- cgit v1.2.3-54-g00ecf