diff options
Diffstat (limited to 'src/systemd-nspawn')
-rw-r--r-- | src/systemd-nspawn/Makefile | 17 | ||||
-rw-r--r-- | src/systemd-nspawn/nspawn-cgroup.c | 9 | ||||
-rw-r--r-- | src/systemd-nspawn/nspawn-gperf.gperf | 7 | ||||
-rw-r--r-- | src/systemd-nspawn/nspawn-mount.c | 25 | ||||
-rw-r--r-- | src/systemd-nspawn/nspawn-network.c | 184 | ||||
-rw-r--r-- | src/systemd-nspawn/nspawn-network.h | 5 | ||||
-rw-r--r-- | src/systemd-nspawn/nspawn-patch-uid.c | 469 | ||||
-rw-r--r-- | src/systemd-nspawn/nspawn-patch-uid.h | 23 | ||||
-rw-r--r-- | src/systemd-nspawn/nspawn-register.c | 1 | ||||
-rw-r--r-- | src/systemd-nspawn/nspawn-settings.c | 124 | ||||
-rw-r--r-- | src/systemd-nspawn/nspawn-settings.h | 17 | ||||
-rw-r--r-- | src/systemd-nspawn/nspawn.c | 592 | ||||
-rw-r--r-- | src/systemd-nspawn/test-patch-uid.c | 61 |
13 files changed, 1408 insertions, 126 deletions
diff --git a/src/systemd-nspawn/Makefile b/src/systemd-nspawn/Makefile index 380266ea7f..9702abcbc5 100644 --- a/src/systemd-nspawn/Makefile +++ b/src/systemd-nspawn/Makefile @@ -41,10 +41,14 @@ systemd_nspawn_SOURCES = \ src/nspawn/nspawn-setuid.h \ src/nspawn/nspawn-stub-pid1.c \ src/nspawn/nspawn-stub-pid1.h \ + src/nspawn/nspawn-patch-uid.c \ + src/nspawn/nspawn-patch-uid.h \ src/core/mount-setup.c \ src/core/mount-setup.h \ src/core/loopback-setup.c \ - src/core/loopback-setup.h + src/core/loopback-setup.h \ + src/core/machine-id-setup.c \ + src/core/machine-id-setup.h nodist_systemd_nspawn_SOURCES = \ src/nspawn/nspawn-gperf.c @@ -66,6 +70,17 @@ systemd_nspawn_LDADD += \ libfirewall.la endif # HAVE_LIBIPTC +test_patch_uid_SOURCES = \ + src/nspawn/nspawn-patch-uid.c \ + src/nspawn/nspawn-patch-uid.h \ + src/nspawn/test-patch-uid.c + +test_patch_uid_LDADD = \ + libshared.la + +manual_tests += \ + test-patch-uid + bin_PROGRAMS += systemd-nspawn systemd_nspawn_LDADD += libsystemd.la # was hidden by libshared->libsystemd systemd_nspawn_LDADD += libbasic.la # was hidden by libshared->libsystemd->libbasic diff --git a/src/systemd-nspawn/nspawn-cgroup.c b/src/systemd-nspawn/nspawn-cgroup.c index 1db5ba7116..f50f1ad6c2 100644 --- a/src/systemd-nspawn/nspawn-cgroup.c +++ b/src/systemd-nspawn/nspawn-cgroup.c @@ -55,8 +55,7 @@ int chown_cgroup(pid_t pid, uid_t uid_shift) { "cgroup.events", "cgroup.clone_children", "cgroup.controllers", - "cgroup.subtree_control", - "cgroup.populated") + "cgroup.subtree_control") if (fchownat(fd, fn, uid_shift, uid_shift, 0) < 0) log_full_errno(errno == ENOENT ? 
LOG_DEBUG : LOG_WARNING, errno, "Failed to chown() cgroup file %s, ignoring: %m", fn); @@ -73,7 +72,7 @@ int sync_cgroup(pid_t pid, bool unified_requested) { unified = cg_unified(); if (unified < 0) - return log_error_errno(unified, "Failed to determine whether the unified hierachy is used: %m"); + return log_error_errno(unified, "Failed to determine whether the unified hierarchy is used: %m"); if ((unified > 0) == unified_requested) return 0; @@ -94,7 +93,7 @@ int sync_cgroup(pid_t pid, bool unified_requested) { if (unified) r = mount("cgroup", tree, "cgroup", MS_NOSUID|MS_NOEXEC|MS_NODEV, "none,name=systemd,xattr"); else - r = mount("cgroup", tree, "cgroup", MS_NOSUID|MS_NOEXEC|MS_NODEV, "__DEVEL__sane_behavior"); + r = mount("cgroup", tree, "cgroup2", MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL); if (r < 0) { r = log_error_errno(errno, "Failed to mount unified hierarchy: %m"); goto finish; @@ -135,7 +134,7 @@ int create_subcgroup(pid_t pid, bool unified_requested) { unified = cg_unified(); if (unified < 0) - return log_error_errno(unified, "Failed to determine whether the unified hierachy is used: %m"); + return log_error_errno(unified, "Failed to determine whether the unified hierarchy is used: %m"); if (unified == 0) return 0; diff --git a/src/systemd-nspawn/nspawn-gperf.gperf b/src/systemd-nspawn/nspawn-gperf.gperf index 116655cdd2..2b5d452662 100644 --- a/src/systemd-nspawn/nspawn-gperf.gperf +++ b/src/systemd-nspawn/nspawn-gperf.gperf @@ -16,7 +16,7 @@ struct ConfigPerfItem; %includes %% Exec.Boot, config_parse_boot, 0, 0 -Exec.ProcessTwo, config_parse_pid2, 0, 0, +Exec.ProcessTwo, config_parse_pid2, 0, 0 Exec.Parameters, config_parse_strv, 0, offsetof(Settings, parameters) Exec.Environment, config_parse_strv, 0, offsetof(Settings, environment) Exec.User, config_parse_string, 0, offsetof(Settings, user) @@ -26,16 +26,19 @@ Exec.KillSignal, config_parse_signal, 0, offsetof(Settings, Exec.Personality, config_parse_personality, 0, offsetof(Settings, personality) 
Exec.MachineID, config_parse_id128, 0, offsetof(Settings, machine_id) Exec.WorkingDirectory, config_parse_path, 0, offsetof(Settings, working_directory) +Exec.PrivateUsers, config_parse_private_users, 0, 0 Files.ReadOnly, config_parse_tristate, 0, offsetof(Settings, read_only) Files.Volatile, config_parse_volatile_mode, 0, offsetof(Settings, volatile_mode) Files.Bind, config_parse_bind, 0, 0 Files.BindReadOnly, config_parse_bind, 1, 0 Files.TemporaryFileSystem, config_parse_tmpfs, 0, 0 +Files.PrivateUsersChown, config_parse_tristate, 0, offsetof(Settings, userns_chown) Network.Private, config_parse_tristate, 0, offsetof(Settings, private_network) Network.Interface, config_parse_strv, 0, offsetof(Settings, network_interfaces) Network.MACVLAN, config_parse_strv, 0, offsetof(Settings, network_macvlan) Network.IPVLAN, config_parse_strv, 0, offsetof(Settings, network_ipvlan) Network.VirtualEthernet, config_parse_tristate, 0, offsetof(Settings, network_veth) Network.VirtualEthernetExtra, config_parse_veth_extra, 0, 0 -Network.Bridge, config_parse_string, 0, offsetof(Settings, network_bridge) +Network.Bridge, config_parse_ifname, 0, offsetof(Settings, network_bridge) +Network.Zone, config_parse_network_zone, 0, 0 Network.Port, config_parse_expose_port, 0, 0 diff --git a/src/systemd-nspawn/nspawn-mount.c b/src/systemd-nspawn/nspawn-mount.c index 70cca15278..8e2d2d543c 100644 --- a/src/systemd-nspawn/nspawn-mount.c +++ b/src/systemd-nspawn/nspawn-mount.c @@ -438,21 +438,22 @@ static int mount_bind(const char *dest, CustomMount *m) { r = mkdir_parents_label(where, 0755); if (r < 0) return log_error_errno(r, "Failed to make parents of %s: %m", where); + + /* Create the mount point. Any non-directory file can be + * mounted on any non-directory file (regular, fifo, socket, + * char, block). 
+ */ + if (S_ISDIR(source_st.st_mode)) + r = mkdir_label(where, 0755); + else + r = touch(where); + if (r < 0) + return log_error_errno(r, "Failed to create mount point %s: %m", where); + } else { return log_error_errno(errno, "Failed to stat %s: %m", where); } - /* Create the mount point. Any non-directory file can be - * mounted on any non-directory file (regular, fifo, socket, - * char, block). - */ - if (S_ISDIR(source_st.st_mode)) - r = mkdir_label(where, 0755); - else - r = touch(where); - if (r < 0 && r != -EEXIST) - return log_error_errno(r, "Failed to create mount point %s: %m", where); - if (mount(m->source, where, NULL, mount_flags, mount_opts) < 0) return log_error_errno(errno, "mount(%s) failed: %m", where); @@ -750,7 +751,7 @@ static int mount_unified_cgroups(const char *dest) { return -EINVAL; } - if (mount("cgroup", p, "cgroup", MS_NOSUID|MS_NOEXEC|MS_NODEV, "__DEVEL__sane_behavior") < 0) + if (mount("cgroup", p, "cgroup2", MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL) < 0) return log_error_errno(errno, "Failed to mount unified cgroup hierarchy to %s: %m", p); return 0; diff --git a/src/systemd-nspawn/nspawn-network.c b/src/systemd-nspawn/nspawn-network.c index 58d6035ddb..917827eac1 100644 --- a/src/systemd-nspawn/nspawn-network.c +++ b/src/systemd-nspawn/nspawn-network.c @@ -26,8 +26,11 @@ #include "alloc-util.h" #include "ether-addr-util.h" +#include "lockfile-util.h" #include "nspawn-network.h" #include "siphash24.h" +#include "socket-util.h" +#include "stat-util.h" #include "string-util.h" #include "udev-util.h" #include "util.h" @@ -38,6 +41,30 @@ #define VETH_EXTRA_CONTAINER_HASH_KEY SD_ID128_MAKE(af,50,17,61,ce,f9,4d,35,84,0d,2b,20,54,be,ce,59) #define MACVLAN_HASH_KEY SD_ID128_MAKE(00,13,6d,bc,66,83,44,81,bb,0c,f9,51,1f,24,a6,6f) +static int remove_one_link(sd_netlink *rtnl, const char *name) { + _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *m = NULL; + int r; + + if (isempty(name)) + return 0; + + r = sd_rtnl_message_new_link(rtnl, &m, 
RTM_DELLINK, 0); + if (r < 0) + return log_error_errno(r, "Failed to allocate netlink message: %m"); + + r = sd_netlink_message_append_string(m, IFLA_IFNAME, name); + if (r < 0) + return log_error_errno(r, "Failed to add netlink interface name: %m"); + + r = sd_netlink_call(rtnl, m, 0, NULL); + if (r == -ENODEV) /* Already gone */ + return 0; + if (r < 0) + return log_error_errno(r, "Failed to remove interface %s: %m", name); + + return 1; +} + static int generate_mac( const char *machine_name, struct ether_addr *mac, @@ -231,51 +258,155 @@ int setup_veth_extra( if (r < 0) return r; - idx ++; + idx++; } return 0; } -int setup_bridge(const char *veth_name, const char *bridge_name) { +static int join_bridge(sd_netlink *rtnl, const char *veth_name, const char *bridge_name) { _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *m = NULL; - _cleanup_(sd_netlink_unrefp) sd_netlink *rtnl = NULL; int r, bridge_ifi; + assert(rtnl); assert(veth_name); assert(bridge_name); bridge_ifi = (int) if_nametoindex(bridge_name); if (bridge_ifi <= 0) - return log_error_errno(errno, "Failed to resolve interface %s: %m", bridge_name); - - r = sd_netlink_open(&rtnl); - if (r < 0) - return log_error_errno(r, "Failed to connect to netlink: %m"); + return -errno; r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, 0); if (r < 0) - return log_error_errno(r, "Failed to allocate netlink message: %m"); + return r; r = sd_rtnl_message_link_set_flags(m, IFF_UP, IFF_UP); if (r < 0) - return log_error_errno(r, "Failed to set IFF_UP flag: %m"); + return r; r = sd_netlink_message_append_string(m, IFLA_IFNAME, veth_name); if (r < 0) - return log_error_errno(r, "Failed to add netlink interface name field: %m"); + return r; r = sd_netlink_message_append_u32(m, IFLA_MASTER, bridge_ifi); if (r < 0) - return log_error_errno(r, "Failed to add netlink master field: %m"); + return r; r = sd_netlink_call(rtnl, m, 0, NULL); if (r < 0) - return log_error_errno(r, "Failed to add veth interface to bridge: %m"); 
+ return r; return bridge_ifi; } +static int create_bridge(sd_netlink *rtnl, const char *bridge_name) { + _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *m = NULL; + int r; + + r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0); + if (r < 0) + return r; + + r = sd_netlink_message_append_string(m, IFLA_IFNAME, bridge_name); + if (r < 0) + return r; + + r = sd_netlink_message_open_container(m, IFLA_LINKINFO); + if (r < 0) + return r; + + r = sd_netlink_message_open_container_union(m, IFLA_INFO_DATA, "bridge"); + if (r < 0) + return r; + + r = sd_netlink_message_close_container(m); + if (r < 0) + return r; + + r = sd_netlink_message_close_container(m); + if (r < 0) + return r; + + r = sd_netlink_call(rtnl, m, 0, NULL); + if (r < 0) + return r; + + return 0; +} + +int setup_bridge(const char *veth_name, const char *bridge_name, bool create) { + _cleanup_release_lock_file_ LockFile bridge_lock = LOCK_FILE_INIT; + _cleanup_(sd_netlink_unrefp) sd_netlink *rtnl = NULL; + int r, bridge_ifi; + unsigned n = 0; + + assert(veth_name); + assert(bridge_name); + + r = sd_netlink_open(&rtnl); + if (r < 0) + return log_error_errno(r, "Failed to connect to netlink: %m"); + + if (create) { + /* We take a system-wide lock here, so that we can safely check whether there's still a member in the + * bridge before removing it, without risking interference from other nspawn instances. */ + + r = make_lock_file("/run/systemd/nspawn-network-zone", LOCK_EX, &bridge_lock); + if (r < 0) + return log_error_errno(r, "Failed to take network zone lock: %m"); + } + + for (;;) { + bridge_ifi = join_bridge(rtnl, veth_name, bridge_name); + if (bridge_ifi >= 0) + return bridge_ifi; + if (bridge_ifi != -ENODEV || !create || n > 10) + return log_error_errno(bridge_ifi, "Failed to add interface %s to bridge %s: %m", veth_name, bridge_name); + + /* Count attempts, so that we don't enter an endless loop here. */ + n++; + + /* The bridge doesn't exist yet. 
Let's create it */ + r = create_bridge(rtnl, bridge_name); + if (r < 0) + return log_error_errno(r, "Failed to create bridge interface %s: %m", bridge_name); + + /* Try again, now that the bridge exists */ + } +} + +int remove_bridge(const char *bridge_name) { + _cleanup_release_lock_file_ LockFile bridge_lock = LOCK_FILE_INIT; + _cleanup_(sd_netlink_unrefp) sd_netlink *rtnl = NULL; + const char *path; + int r; + + /* Removes the specified bridge, but only if it is currently empty */ + + if (isempty(bridge_name)) + return 0; + + r = make_lock_file("/run/systemd/nspawn-network-zone", LOCK_EX, &bridge_lock); + if (r < 0) + return log_error_errno(r, "Failed to take network zone lock: %m"); + + path = strjoina("/sys/class/net/", bridge_name, "/brif"); + + r = dir_is_empty(path); + if (r == -ENOENT) /* Already gone? */ + return 0; + if (r < 0) + return log_error_errno(r, "Can't detect if bridge %s is empty: %m", bridge_name); + if (r == 0) /* Still populated, leave it around */ + return 0; + + r = sd_netlink_open(&rtnl); + if (r < 0) + return log_error_errno(r, "Failed to connect to netlink: %m"); + + return remove_one_link(rtnl, bridge_name); +} + static int parse_interface(struct udev *udev, const char *name) { _cleanup_udev_device_unref_ struct udev_device *d = NULL; char ifi_str[2 + DECIMAL_STR_MAX(int)]; @@ -514,13 +645,13 @@ int veth_extra_parse(char ***l, const char *p) { r = extract_first_word(&p, &a, ":", EXTRACT_DONT_COALESCE_SEPARATORS); if (r < 0) return r; - if (r == 0 || isempty(a)) + if (r == 0 || !ifname_valid(a)) return -EINVAL; r = extract_first_word(&p, &b, ":", EXTRACT_DONT_COALESCE_SEPARATORS); if (r < 0) return r; - if (r == 0 || isempty(b)) { + if (r == 0 || !ifname_valid(b)) { free(b); b = strdup(a); if (!b) @@ -537,3 +668,26 @@ int veth_extra_parse(char ***l, const char *p) { a = b = NULL; return 0; } + +int remove_veth_links(const char *primary, char **pairs) { + _cleanup_(sd_netlink_unrefp) sd_netlink *rtnl = NULL; + char **a, **b; + int r; + 
+ /* In some cases the kernel might pin the veth links between host and container even after the namespace + * died. Hence, let's better remove them explicitly too. */ + + if (isempty(primary) && strv_isempty(pairs)) + return 0; + + r = sd_netlink_open(&rtnl); + if (r < 0) + return log_error_errno(r, "Failed to connect to netlink: %m"); + + remove_one_link(rtnl, primary); + + STRV_FOREACH_PAIR(a, b, pairs) + remove_one_link(rtnl, *a); + + return 0; +} diff --git a/src/systemd-nspawn/nspawn-network.h b/src/systemd-nspawn/nspawn-network.h index 9ab1606d1c..3d8861e1e5 100644 --- a/src/systemd-nspawn/nspawn-network.h +++ b/src/systemd-nspawn/nspawn-network.h @@ -26,7 +26,8 @@ int setup_veth(const char *machine_name, pid_t pid, char iface_name[IFNAMSIZ], bool bridge); int setup_veth_extra(const char *machine_name, pid_t pid, char **pairs); -int setup_bridge(const char *veth_name, const char *bridge_name); +int setup_bridge(const char *veth_name, const char *bridge_name, bool create); +int remove_bridge(const char *bridge_name); int setup_macvlan(const char *machine_name, pid_t pid, char **ifaces); int setup_ipvlan(const char *machine_name, pid_t pid, char **ifaces); @@ -34,3 +35,5 @@ int setup_ipvlan(const char *machine_name, pid_t pid, char **ifaces); int move_network_interfaces(pid_t pid, char **ifaces); int veth_extra_parse(char ***l, const char *p); + +int remove_veth_links(const char *primary, char **pairs); diff --git a/src/systemd-nspawn/nspawn-patch-uid.c b/src/systemd-nspawn/nspawn-patch-uid.c new file mode 100644 index 0000000000..c7382d412d --- /dev/null +++ b/src/systemd-nspawn/nspawn-patch-uid.c @@ -0,0 +1,469 @@ +/*** + This file is part of systemd. + + Copyright 2016 Lennart Poettering + + systemd is free software; you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published by + the Free Software Foundation; either version 2.1 of the License, or + (at your option) any later version. 
+ + systemd is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with systemd; If not, see <http://www.gnu.org/licenses/>. +***/ + +#include <fcntl.h> +#include <linux/magic.h> +#ifdef HAVE_ACL +#include <sys/acl.h> +#endif +#include <sys/stat.h> +#include <sys/vfs.h> +#include <unistd.h> + +#include "acl-util.h" +#include "dirent-util.h" +#include "fd-util.h" +#include "missing.h" +#include "nspawn-patch-uid.h" +#include "stat-util.h" +#include "stdio-util.h" +#include "string-util.h" +#include "strv.h" +#include "user-util.h" + +#ifdef HAVE_ACL + +static int get_acl(int fd, const char *name, acl_type_t type, acl_t *ret) { + char procfs_path[strlen("/proc/self/fd/") + DECIMAL_STR_MAX(int) + 1]; + acl_t acl; + + assert(fd >= 0); + assert(ret); + + if (name) { + _cleanup_close_ int child_fd = -1; + + child_fd = openat(fd, name, O_PATH|O_CLOEXEC|O_NOFOLLOW); + if (child_fd < 0) + return -errno; + + xsprintf(procfs_path, "/proc/self/fd/%i", child_fd); + acl = acl_get_file(procfs_path, type); + } else if (type == ACL_TYPE_ACCESS) + acl = acl_get_fd(fd); + else { + xsprintf(procfs_path, "/proc/self/fd/%i", fd); + acl = acl_get_file(procfs_path, type); + } + if (!acl) + return -errno; + + *ret = acl; + return 0; +} + +static int set_acl(int fd, const char *name, acl_type_t type, acl_t acl) { + char procfs_path[strlen("/proc/self/fd/") + DECIMAL_STR_MAX(int) + 1]; + int r; + + assert(fd >= 0); + assert(acl); + + if (name) { + _cleanup_close_ int child_fd = -1; + + child_fd = openat(fd, name, O_PATH|O_CLOEXEC|O_NOFOLLOW); + if (child_fd < 0) + return -errno; + + xsprintf(procfs_path, "/proc/self/fd/%i", child_fd); + r = acl_set_file(procfs_path, type, acl); + } else if (type == ACL_TYPE_ACCESS) + r = 
acl_set_fd(fd, acl); + else { + xsprintf(procfs_path, "/proc/self/fd/%i", fd); + r = acl_set_file(procfs_path, type, acl); + } + if (r < 0) + return -errno; + + return 0; +} + +static int shift_acl(acl_t acl, uid_t shift, acl_t *ret) { + _cleanup_(acl_freep) acl_t copy = NULL; + acl_entry_t i; + int r; + + assert(acl); + assert(ret); + + r = acl_get_entry(acl, ACL_FIRST_ENTRY, &i); + if (r < 0) + return -errno; + while (r > 0) { + uid_t *old_uid, new_uid; + bool modify = false; + acl_tag_t tag; + + if (acl_get_tag_type(i, &tag) < 0) + return -errno; + + if (IN_SET(tag, ACL_USER, ACL_GROUP)) { + + /* We don't distinguish here between uid_t and gid_t, let's make sure the compiler checks that + * this is actually OK */ + assert_cc(sizeof(uid_t) == sizeof(gid_t)); + + old_uid = acl_get_qualifier(i); + if (!old_uid) + return -errno; + + new_uid = shift | (*old_uid & UINT32_C(0xFFFF)); + if (!uid_is_valid(new_uid)) + return -EINVAL; + + modify = new_uid != *old_uid; + if (modify && !copy) { + int n; + + /* There's no copy of the ACL yet? if so, let's create one, and start the loop from the + * beginning, so that we copy all entries, starting from the first, this time. 
*/ + + n = acl_entries(acl); + if (n < 0) + return -errno; + + copy = acl_init(n); + if (!copy) + return -errno; + + /* Seek back to the beginning */ + r = acl_get_entry(acl, ACL_FIRST_ENTRY, &i); + if (r < 0) + return -errno; + continue; + } + } + + if (copy) { + acl_entry_t new_entry; + + if (acl_create_entry(&copy, &new_entry) < 0) + return -errno; + + if (acl_copy_entry(new_entry, i) < 0) + return -errno; + + if (modify) + if (acl_set_qualifier(new_entry, &new_uid) < 0) + return -errno; + } + + r = acl_get_entry(acl, ACL_NEXT_ENTRY, &i); + if (r < 0) + return -errno; + } + + *ret = copy; + copy = NULL; + + return !!*ret; +} + +static int patch_acls(int fd, const char *name, const struct stat *st, uid_t shift) { + _cleanup_(acl_freep) acl_t acl = NULL, shifted = NULL; + bool changed = false; + int r; + + assert(fd >= 0); + assert(st); + + /* ACLs are not supported on symlinks, there's no point in trying */ + if (S_ISLNK(st->st_mode)) + return 0; + + r = get_acl(fd, name, ACL_TYPE_ACCESS, &acl); + if (r == -EOPNOTSUPP) + return 0; + if (r < 0) + return r; + + r = shift_acl(acl, shift, &shifted); + if (r < 0) + return r; + if (r > 0) { + r = set_acl(fd, name, ACL_TYPE_ACCESS, shifted); + if (r < 0) + return r; + + changed = true; + } + + if (S_ISDIR(st->st_mode)) { + acl_free(acl); + acl_free(shifted); + + acl = shifted = NULL; + + r = get_acl(fd, name, ACL_TYPE_DEFAULT, &acl); + if (r < 0) + return r; + + r = shift_acl(acl, shift, &shifted); + if (r < 0) + return r; + if (r > 0) { + r = set_acl(fd, name, ACL_TYPE_DEFAULT, shifted); + if (r < 0) + return r; + + changed = true; + } + } + + return changed; +} + +#else + +static int patch_acls(int fd, const char *name, const struct stat *st, uid_t shift) { + return 0; +} + +#endif + +static int patch_fd(int fd, const char *name, const struct stat *st, uid_t shift) { + uid_t new_uid; + gid_t new_gid; + bool changed = false; + int r; + + assert(fd >= 0); + assert(st); + + new_uid = shift | (st->st_uid & UINT32_C(0xFFFF)); 
+ new_gid = (gid_t) shift | (st->st_gid & UINT32_C(0xFFFF)); + + if (!uid_is_valid(new_uid) || !gid_is_valid(new_gid)) + return -EINVAL; + + if (st->st_uid != new_uid || st->st_gid != new_gid) { + if (name) + r = fchownat(fd, name, new_uid, new_gid, AT_SYMLINK_NOFOLLOW); + else + r = fchown(fd, new_uid, new_gid); + if (r < 0) + return -errno; + + /* The Linux kernel alters the mode in some cases of chown(). Let's undo this. */ + if (name && !S_ISLNK(st->st_mode)) + r = fchmodat(fd, name, st->st_mode, 0); + else + r = fchmod(fd, st->st_mode); + if (r < 0) + return -errno; + + changed = true; + } + + r = patch_acls(fd, name, st, shift); + if (r < 0) + return r; + + return r > 0 || changed; +} + +static int is_procfs_sysfs_or_suchlike(int fd) { + struct statfs sfs; + + assert(fd >= 0); + + if (fstatfs(fd, &sfs) < 0) + return -errno; + + return F_TYPE_EQUAL(sfs.f_type, BINFMTFS_MAGIC) || + F_TYPE_EQUAL(sfs.f_type, CGROUP_SUPER_MAGIC) || + F_TYPE_EQUAL(sfs.f_type, CGROUP2_SUPER_MAGIC) || + F_TYPE_EQUAL(sfs.f_type, DEBUGFS_MAGIC) || + F_TYPE_EQUAL(sfs.f_type, DEVPTS_SUPER_MAGIC) || + F_TYPE_EQUAL(sfs.f_type, EFIVARFS_MAGIC) || + F_TYPE_EQUAL(sfs.f_type, HUGETLBFS_MAGIC) || + F_TYPE_EQUAL(sfs.f_type, MQUEUE_MAGIC) || + F_TYPE_EQUAL(sfs.f_type, PROC_SUPER_MAGIC) || + F_TYPE_EQUAL(sfs.f_type, PSTOREFS_MAGIC) || + F_TYPE_EQUAL(sfs.f_type, SELINUX_MAGIC) || + F_TYPE_EQUAL(sfs.f_type, SMACK_MAGIC) || + F_TYPE_EQUAL(sfs.f_type, SYSFS_MAGIC); +} + +static int recurse_fd(int fd, bool donate_fd, const struct stat *st, uid_t shift, bool is_toplevel) { + bool changed = false; + int r; + + assert(fd >= 0); + + /* We generally want to permit crossing of mount boundaries when patching the UIDs/GIDs. However, we + * probably shouldn't do this for /proc and /sys if that is already mounted into place. Hence, let's + * stop the recursion when we hit a procfs or sysfs file system. 
*/ + r = is_procfs_sysfs_or_suchlike(fd); + if (r < 0) + goto finish; + if (r > 0) { + r = 0; /* don't recurse */ + goto finish; + } + + r = patch_fd(fd, NULL, st, shift); + if (r == -EROFS) { + _cleanup_free_ char *name = NULL; + + if (!is_toplevel) { + /* When we hit a read-only subtree we simply skip it, but log about it. */ + (void) fd_get_path(fd, &name); + log_debug("Skippping read-only file or directory %s.", strna(name)); + r = 0; + } + + goto finish; + } + if (r < 0) + goto finish; + + if (S_ISDIR(st->st_mode)) { + _cleanup_closedir_ DIR *d = NULL; + struct dirent *de; + + if (!donate_fd) { + int copy; + + copy = fcntl(fd, F_DUPFD_CLOEXEC, 3); + if (copy < 0) { + r = -errno; + goto finish; + } + + fd = copy; + donate_fd = true; + } + + d = fdopendir(fd); + if (!d) { + r = -errno; + goto finish; + } + fd = -1; + + FOREACH_DIRENT_ALL(de, d, r = -errno; goto finish) { + struct stat fst; + + if (STR_IN_SET(de->d_name, ".", "..")) + continue; + + if (fstatat(dirfd(d), de->d_name, &fst, AT_SYMLINK_NOFOLLOW) < 0) { + r = -errno; + goto finish; + } + + if (S_ISDIR(fst.st_mode)) { + int subdir_fd; + + subdir_fd = openat(dirfd(d), de->d_name, O_RDONLY|O_NONBLOCK|O_DIRECTORY|O_CLOEXEC|O_NOFOLLOW|O_NOATIME); + if (subdir_fd < 0) { + r = -errno; + goto finish; + + } + + r = recurse_fd(subdir_fd, true, &fst, shift, false); + if (r < 0) + goto finish; + if (r > 0) + changed = true; + + } else { + r = patch_fd(dirfd(d), de->d_name, &fst, shift); + if (r < 0) + goto finish; + if (r > 0) + changed = true; + } + } + } + + r = changed; + +finish: + if (donate_fd) + safe_close(fd); + + return r; +} + +static int fd_patch_uid_internal(int fd, bool donate_fd, uid_t shift, uid_t range) { + struct stat st; + int r; + + assert(fd >= 0); + + /* Recursively adjusts the UID/GIDs of all files of a directory tree. This is used to automatically fix up an + * OS tree to the used user namespace UID range. 
Note that this automatic adjustment only works for UID ranges + * following the concept that the upper 16bit of a UID identify the container, and the lower 16bit are the actual + * UID within the container. */ + + if ((shift & 0xFFFF) != 0) { + /* We only support containers where the shift starts at a 2^16 boundary */ + r = -EOPNOTSUPP; + goto finish; + } + + if (range != 0x10000) { + /* We only support containers with 16bit UID ranges for the patching logic */ + r = -EOPNOTSUPP; + goto finish; + } + + if (fstat(fd, &st) < 0) { + r = -errno; + goto finish; + } + + if ((uint32_t) st.st_uid >> 16 != (uint32_t) st.st_gid >> 16) { + /* We only support containers where the uid/gid container ID match */ + r = -EBADE; + goto finish; + } + + /* Try to detect if the range is already right. Of course, this a pretty drastic optimization, as we assume + * that if the top-level dir has the right upper 16bit assigned, then everything below will have too... */ + if (((uint32_t) (st.st_uid ^ shift) >> 16) == 0) + return 0; + + return recurse_fd(fd, donate_fd, &st, shift, true); + +finish: + if (donate_fd) + safe_close(fd); + + return r; +} + +int fd_patch_uid(int fd, uid_t shift, uid_t range) { + return fd_patch_uid_internal(fd, false, shift, range); +} + +int path_patch_uid(const char *path, uid_t shift, uid_t range) { + int fd; + + fd = open(path, O_RDONLY|O_NONBLOCK|O_DIRECTORY|O_CLOEXEC|O_NOFOLLOW|O_NOATIME); + if (fd < 0) + return -errno; + + return fd_patch_uid_internal(fd, true, shift, range); +} diff --git a/src/systemd-nspawn/nspawn-patch-uid.h b/src/systemd-nspawn/nspawn-patch-uid.h new file mode 100644 index 0000000000..55d0990016 --- /dev/null +++ b/src/systemd-nspawn/nspawn-patch-uid.h @@ -0,0 +1,23 @@ +/*** + This file is part of systemd. 
+ + Copyright 2016 Lennart Poettering + + systemd is free software; you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published by + the Free Software Foundation; either version 2.1 of the License, or + (at your option) any later version. + + systemd is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with systemd; If not, see <http://www.gnu.org/licenses/>. +***/ + +#include <sys/types.h> + +int fd_patch_uid(int fd, uid_t shift, uid_t range); +int path_patch_uid(const char *path, uid_t shift, uid_t range); diff --git a/src/systemd-nspawn/nspawn-register.c b/src/systemd-nspawn/nspawn-register.c index de1433a5e2..08cbff9731 100644 --- a/src/systemd-nspawn/nspawn-register.c +++ b/src/systemd-nspawn/nspawn-register.c @@ -20,6 +20,7 @@ #include <systemd/sd-bus.h> #include "bus-error.h" /* for bus_error_message */ +#include "bus-unit-util.h" #include "bus-util.h" #include "nspawn-register.h" #include "stat-util.h" diff --git a/src/systemd-nspawn/nspawn-settings.c b/src/systemd-nspawn/nspawn-settings.c index 4fb0054698..5f1522cfb6 100644 --- a/src/systemd-nspawn/nspawn-settings.c +++ b/src/systemd-nspawn/nspawn-settings.c @@ -24,7 +24,10 @@ #include "nspawn-settings.h" #include "parse-util.h" #include "process-util.h" +#include "socket-util.h" +#include "string-util.h" #include "strv.h" +#include "user-util.h" #include "util.h" int settings_load(FILE *f, const char *path, Settings **ret) { @@ -40,9 +43,13 @@ int settings_load(FILE *f, const char *path, Settings **ret) { s->start_mode = _START_MODE_INVALID; s->personality = PERSONALITY_INVALID; + s->userns_mode = _USER_NAMESPACE_MODE_INVALID; + s->uid_shift = UID_INVALID; + s->uid_range = UID_INVALID; 
s->read_only = -1; s->volatile_mode = _VOLATILE_MODE_INVALID; + s->userns_chown = -1; s->private_network = -1; s->network_veth = -1; @@ -59,6 +66,16 @@ int settings_load(FILE *f, const char *path, Settings **ret) { if (r < 0) return r; + /* Make sure that if userns_mode is set, userns_chown is set to something appropriate, and vice versa. Either + * both fields shall be initialized or neither. */ + if (s->userns_mode == USER_NAMESPACE_PICK) + s->userns_chown = true; + else if (s->userns_mode != _USER_NAMESPACE_MODE_INVALID && s->userns_chown < 0) + s->userns_chown = false; + + if (s->userns_chown >= 0 && s->userns_mode == _USER_NAMESPACE_MODE_INVALID) + s->userns_mode = USER_NAMESPACE_NO; + *ret = s; s = NULL; @@ -80,6 +97,7 @@ Settings* settings_free(Settings *s) { strv_free(s->network_ipvlan); strv_free(s->network_veth_extra); free(s->network_bridge); + free(s->network_zone); expose_port_free_all(s->expose_ports); custom_mount_free_all(s->custom_mounts, s->n_custom_mounts); @@ -95,6 +113,7 @@ bool settings_private_network(Settings *s) { s->private_network > 0 || s->network_veth > 0 || s->network_bridge || + s->network_zone || s->network_interfaces || s->network_macvlan || s->network_ipvlan || @@ -106,7 +125,8 @@ bool settings_network_veth(Settings *s) { return s->network_veth > 0 || - s->network_bridge; + s->network_bridge || + s->network_zone; } DEFINE_CONFIG_PARSE_ENUM(config_parse_volatile_mode, volatile_mode, VolatileMode, "Failed to parse volatile mode"); @@ -303,6 +323,38 @@ int config_parse_veth_extra( return 0; } +int config_parse_network_zone( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + Settings *settings = data; + _cleanup_free_ char *j = NULL; + + assert(filename); + assert(lvalue); + assert(rvalue); + + j = strappend("vz-", rvalue); + if (!ifname_valid(j)) { + log_syntax(unit, LOG_ERR, 
filename, line, 0, "Invalid network zone name %s, ignoring: %m", rvalue); + return 0; + } + + free(settings->network_zone); + settings->network_zone = j; + j = NULL; + + return 0; +} + int config_parse_boot( const char *unit, const char *filename, @@ -392,3 +444,73 @@ conflict: log_syntax(unit, LOG_ERR, filename, line, r, "Conflicting Boot= or ProcessTwo= setting found. Ignoring."); return 0; } + +int config_parse_private_users( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + Settings *settings = data; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + r = parse_boolean(rvalue); + if (r == 0) { + /* no: User namespacing off */ + settings->userns_mode = USER_NAMESPACE_NO; + settings->uid_shift = UID_INVALID; + settings->uid_range = UINT32_C(0x10000); + } else if (r > 0) { + /* yes: User namespacing on, UID range is read from root dir */ + settings->userns_mode = USER_NAMESPACE_FIXED; + settings->uid_shift = UID_INVALID; + settings->uid_range = UINT32_C(0x10000); + } else if (streq(rvalue, "pick")) { + /* pick: User namespacing on, UID range is picked randomly */ + settings->userns_mode = USER_NAMESPACE_PICK; + settings->uid_shift = UID_INVALID; + settings->uid_range = UINT32_C(0x10000); + } else { + const char *range, *shift; + uid_t sh, rn; + + /* anything else: User namespacing on, UID range is explicitly configured */ + + range = strchr(rvalue, ':'); + if (range) { + shift = strndupa(rvalue, range - rvalue); + range++; + + r = safe_atou32(range, &rn); + if (r < 0 || rn <= 0) { + log_syntax(unit, LOG_ERR, filename, line, r, "UID/GID range invalid, ignoring: %s", range); + return 0; + } + } else { + shift = rvalue; + rn = UINT32_C(0x10000); + } + + r = parse_uid(shift, &sh); + if (r < 0) { + log_syntax(unit, LOG_ERR, filename, line, r, "UID/GID shift invalid, ignoring: %s", range); + return 
0; + } + + settings->userns_mode = USER_NAMESPACE_FIXED; + settings->uid_shift = sh; + settings->uid_range = rn; + } + + return 0; +} diff --git a/src/systemd-nspawn/nspawn-settings.h b/src/systemd-nspawn/nspawn-settings.h index a017405cd9..1c47e37912 100644 --- a/src/systemd-nspawn/nspawn-settings.h +++ b/src/systemd-nspawn/nspawn-settings.h @@ -33,6 +33,14 @@ typedef enum StartMode { _START_MODE_INVALID = -1 } StartMode; +typedef enum UserNamespaceMode { + USER_NAMESPACE_NO, + USER_NAMESPACE_FIXED, + USER_NAMESPACE_PICK, + _USER_NAMESPACE_MODE_MAX, + _USER_NAMESPACE_MODE_INVALID = -1, +} UserNamespaceMode; + typedef enum SettingsMask { SETTING_START_MODE = 1 << 0, SETTING_ENVIRONMENT = 1 << 1, @@ -47,7 +55,8 @@ typedef enum SettingsMask { SETTING_VOLATILE_MODE = 1 << 10, SETTING_CUSTOM_MOUNTS = 1 << 11, SETTING_WORKING_DIRECTORY = 1 << 12, - _SETTINGS_MASK_ALL = (1 << 13) -1 + SETTING_USERNS = 1 << 13, + _SETTINGS_MASK_ALL = (1 << 14) -1 } SettingsMask; typedef struct Settings { @@ -62,17 +71,21 @@ typedef struct Settings { unsigned long personality; sd_id128_t machine_id; char *working_directory; + UserNamespaceMode userns_mode; + uid_t uid_shift, uid_range; /* [Image] */ int read_only; VolatileMode volatile_mode; CustomMount *custom_mounts; unsigned n_custom_mounts; + int userns_chown; /* [Network] */ int private_network; int network_veth; char *network_bridge; + char *network_zone; char **network_interfaces; char **network_macvlan; char **network_ipvlan; @@ -97,5 +110,7 @@ int config_parse_volatile_mode(const char *unit, const char *filename, unsigned int config_parse_bind(const char *unit, const char *filename, unsigned line, const char *section, unsigned section_line, const char *lvalue, int ltype, const char *rvalue, void *data, void *userdata); int config_parse_tmpfs(const char *unit, const char *filename, unsigned line, const char *section, unsigned section_line, const char *lvalue, int ltype, const char *rvalue, void *data, void *userdata); int 
config_parse_veth_extra(const char *unit, const char *filename, unsigned line, const char *section, unsigned section_line, const char *lvalue, int ltype, const char *rvalue, void *data, void *userdata); +int config_parse_network_zone(const char *unit, const char *filename, unsigned line, const char *section, unsigned section_line, const char *lvalue, int ltype, const char *rvalue, void *data, void *userdata); int config_parse_boot(const char *unit, const char *filename, unsigned line, const char *section, unsigned section_line, const char *lvalue, int ltype, const char *rvalue, void *data, void *userdata); int config_parse_pid2(const char *unit, const char *filename, unsigned line, const char *section, unsigned section_line, const char *lvalue, int ltype, const char *rvalue, void *data, void *userdata); +int config_parse_private_users(const char *unit, const char *filename, unsigned line, const char *section, unsigned section_line, const char *lvalue, int ltype, const char *rvalue, void *data, void *userdata); diff --git a/src/systemd-nspawn/nspawn.c b/src/systemd-nspawn/nspawn.c index 1de527b57b..6390197646 100644 --- a/src/systemd-nspawn/nspawn.c +++ b/src/systemd-nspawn/nspawn.c @@ -22,7 +22,9 @@ #endif #include <errno.h> #include <getopt.h> +#include <grp.h> #include <linux/loop.h> +#include <pwd.h> #include <sched.h> #ifdef HAVE_SECCOMP #include <seccomp.h> @@ -64,6 +66,7 @@ #include "hostname-util.h" #include "log.h" #include "loopback-setup.h" +#include "machine-id-setup.h" #include "machine-image.h" #include "macro.h" #include "missing.h" @@ -73,6 +76,7 @@ #include "nspawn-expose-ports.h" #include "nspawn-mount.h" #include "nspawn-network.h" +#include "nspawn-patch-uid.h" #include "nspawn-register.h" #include "nspawn-settings.h" #include "nspawn-setuid.h" @@ -86,6 +90,7 @@ #ifdef HAVE_SECCOMP #include "seccomp-util.h" #endif +#include "selinux-util.h" #include "signal-util.h" #include "socket-util.h" #include "stat-util.h" @@ -98,6 +103,11 @@ #include 
"user-util.h" #include "util.h" +/* Note that devpts's gid= parameter parses GIDs as signed values, hence we stay away from the upper half of the 32bit + * UID range here */ +#define UID_SHIFT_PICK_MIN ((uid_t) UINT32_C(0x00080000)) +#define UID_SHIFT_PICK_MAX ((uid_t) UINT32_C(0x6FFF0000)) + typedef enum ContainerStatus { CONTAINER_TERMINATED, CONTAINER_REBOOTED @@ -165,13 +175,15 @@ static char **arg_network_ipvlan = NULL; static bool arg_network_veth = false; static char **arg_network_veth_extra = NULL; static char *arg_network_bridge = NULL; +static char *arg_network_zone = NULL; static unsigned long arg_personality = PERSONALITY_INVALID; static char *arg_image = NULL; static VolatileMode arg_volatile_mode = VOLATILE_NO; static ExposePort *arg_expose_ports = NULL; static char **arg_property = NULL; +static UserNamespaceMode arg_userns_mode = USER_NAMESPACE_NO; static uid_t arg_uid_shift = UID_INVALID, arg_uid_range = 0x10000U; -static bool arg_userns = false; +static bool arg_userns_chown = false; static int arg_kill_signal = 0; static bool arg_unified_cgroup_hierarchy = false; static SettingsMask arg_settings_mask = 0; @@ -199,8 +211,10 @@ static void help(void) { " --uuid=UUID Set a specific machine UUID for the container\n" " -S --slice=SLICE Place the container in the specified slice\n" " --property=NAME=VALUE Set scope unit property\n" + " -U --private-users=pick Run within user namespace, pick UID/GID range automatically\n" " --private-users[=UIDBASE[:NUIDS]]\n" - " Run within user namespace\n" + " Run within user namespace, user configured UID/GID range\n" + " --private-user-chown Adjust OS tree file ownership for private UID/GID range\n" " --private-network Disable network in container\n" " --network-interface=INTERFACE\n" " Assign an existing network interface to the\n" @@ -220,6 +234,8 @@ static void help(void) { " Add a virtual Ethernet connection between host\n" " and container and add it to an existing bridge on\n" " the host\n" + " 
--network-zone=NAME Add a virtual Ethernet connection to the container,\n" + " and add it to an automatically managed bridge interface\n" " -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n" " Expose a container IP port on the host\n" " -Z --selinux-context=SECLABEL\n" @@ -247,7 +263,7 @@ static void help(void) { " the container\n" " --overlay-ro=PATH[:PATH...]:PATH\n" " Similar, but creates a read-only overlay mount\n" - " --setenv=NAME=VALUE Pass an environment variable to PID 1\n" + " -E --setenv=NAME=VALUE Pass an environment variable to PID 1\n" " --share-system Share system namespaces with host\n" " --register=BOOLEAN Register container as machine\n" " --keep-unit Do not register a scope for the machine, reuse\n" @@ -269,9 +285,15 @@ static int custom_mounts_prepare(void) { for (i = 0; i < arg_n_custom_mounts; i++) { CustomMount *m = &arg_custom_mounts[i]; - if (arg_userns && arg_uid_shift == UID_INVALID && path_equal(m->destination, "/")) { - log_error("--private-users with automatic UID shift may not be combined with custom root mounts."); - return -EINVAL; + if (path_equal(m->destination, "/") && arg_userns_mode != USER_NAMESPACE_NO) { + + if (arg_userns_chown) { + log_error("--private-users-chown may not be combined with custom root mounts."); + return -EINVAL; + } else if (arg_uid_shift == UID_INVALID) { + log_error("--private-users with automatic UID shift may not be combined with custom root mounts."); + return -EINVAL; + } } if (m->type != CUSTOM_MOUNT_OVERLAY) @@ -330,7 +352,6 @@ static int parse_argv(int argc, char *argv[]) { ARG_TMPFS, ARG_OVERLAY, ARG_OVERLAY_RO, - ARG_SETENV, ARG_SHARE_SYSTEM, ARG_REGISTER, ARG_KEEP_UNIT, @@ -338,6 +359,7 @@ static int parse_argv(int argc, char *argv[]) { ARG_NETWORK_MACVLAN, ARG_NETWORK_IPVLAN, ARG_NETWORK_BRIDGE, + ARG_NETWORK_ZONE, ARG_NETWORK_VETH_EXTRA, ARG_PERSONALITY, ARG_VOLATILE, @@ -347,6 +369,7 @@ static int parse_argv(int argc, char *argv[]) { ARG_KILL_SIGNAL, ARG_SETTINGS, ARG_CHDIR, + 
ARG_PRIVATE_USERS_CHOWN, }; static const struct option options[] = { @@ -371,7 +394,7 @@ static int parse_argv(int argc, char *argv[]) { { "overlay-ro", required_argument, NULL, ARG_OVERLAY_RO }, { "machine", required_argument, NULL, 'M' }, { "slice", required_argument, NULL, 'S' }, - { "setenv", required_argument, NULL, ARG_SETENV }, + { "setenv", required_argument, NULL, 'E' }, { "selinux-context", required_argument, NULL, 'Z' }, { "selinux-apifs-context", required_argument, NULL, 'L' }, { "quiet", no_argument, NULL, 'q' }, @@ -384,12 +407,14 @@ static int parse_argv(int argc, char *argv[]) { { "network-veth", no_argument, NULL, 'n' }, { "network-veth-extra", required_argument, NULL, ARG_NETWORK_VETH_EXTRA}, { "network-bridge", required_argument, NULL, ARG_NETWORK_BRIDGE }, + { "network-zone", required_argument, NULL, ARG_NETWORK_ZONE }, { "personality", required_argument, NULL, ARG_PERSONALITY }, { "image", required_argument, NULL, 'i' }, { "volatile", optional_argument, NULL, ARG_VOLATILE }, { "port", required_argument, NULL, 'p' }, { "property", required_argument, NULL, ARG_PROPERTY }, { "private-users", optional_argument, NULL, ARG_PRIVATE_USERS }, + { "private-users-chown", optional_argument, NULL, ARG_PRIVATE_USERS_CHOWN}, { "kill-signal", required_argument, NULL, ARG_KILL_SIGNAL }, { "settings", required_argument, NULL, ARG_SETTINGS }, { "chdir", required_argument, NULL, ARG_CHDIR }, @@ -404,7 +429,7 @@ static int parse_argv(int argc, char *argv[]) { assert(argc >= 0); assert(argv); - while ((c = getopt_long(argc, argv, "+hD:u:abL:M:jS:Z:qi:xp:n", options, NULL)) >= 0) + while ((c = getopt_long(argc, argv, "+hD:u:abL:M:jS:Z:qi:xp:nU", options, NULL)) >= 0) switch (c) { @@ -445,7 +470,35 @@ static int parse_argv(int argc, char *argv[]) { arg_settings_mask |= SETTING_USER; break; + case ARG_NETWORK_ZONE: { + char *j; + + j = strappend("vz-", optarg); + if (!j) + return log_oom(); + + if (!ifname_valid(j)) { + log_error("Network zone name not valid: %s", j); 
+ free(j); + return -EINVAL; + } + + free(arg_network_zone); + arg_network_zone = j; + + arg_network_veth = true; + arg_private_network = true; + arg_settings_mask |= SETTING_NETWORK; + break; + } + case ARG_NETWORK_BRIDGE: + + if (!ifname_valid(optarg)) { + log_error("Bridge interface name not valid: %s", optarg); + return -EINVAL; + } + r = free_and_strdup(&arg_network_bridge, optarg); if (r < 0) return log_oom(); @@ -468,6 +521,12 @@ static int parse_argv(int argc, char *argv[]) { break; case ARG_NETWORK_INTERFACE: + + if (!ifname_valid(optarg)) { + log_error("Network interface name not valid: %s", optarg); + return -EINVAL; + } + if (strv_extend(&arg_network_interfaces, optarg) < 0) return log_oom(); @@ -476,6 +535,12 @@ static int parse_argv(int argc, char *argv[]) { break; case ARG_NETWORK_MACVLAN: + + if (!ifname_valid(optarg)) { + log_error("MACVLAN network interface name not valid: %s", optarg); + return -EINVAL; + } + if (strv_extend(&arg_network_macvlan, optarg) < 0) return log_oom(); @@ -484,6 +549,12 @@ static int parse_argv(int argc, char *argv[]) { break; case ARG_NETWORK_IPVLAN: + + if (!ifname_valid(optarg)) { + log_error("IPVLAN network interface name not valid: %s", optarg); + return -EINVAL; + } + if (strv_extend(&arg_network_ipvlan, optarg) < 0) return log_oom(); @@ -560,7 +631,7 @@ static int parse_argv(int argc, char *argv[]) { case ARG_CAPABILITY: case ARG_DROP_CAPABILITY: { p = optarg; - for(;;) { + for (;;) { _cleanup_free_ char *t = NULL; r = extract_first_word(&p, &t, ",", 0); @@ -708,7 +779,7 @@ static int parse_argv(int argc, char *argv[]) { break; } - case ARG_SETENV: { + case 'E': { char **n; if (!env_assignment_is_valid(optarg)) { @@ -795,10 +866,29 @@ static int parse_argv(int argc, char *argv[]) { break; case ARG_PRIVATE_USERS: - if (optarg) { + + r = optarg ? 
parse_boolean(optarg) : 1; + if (r == 0) { + /* no: User namespacing off */ + arg_userns_mode = USER_NAMESPACE_NO; + arg_uid_shift = UID_INVALID; + arg_uid_range = UINT32_C(0x10000); + } else if (r > 0) { + /* yes: User namespacing on, UID range is read from root dir */ + arg_userns_mode = USER_NAMESPACE_FIXED; + arg_uid_shift = UID_INVALID; + arg_uid_range = UINT32_C(0x10000); + } else if (streq(optarg, "pick")) { + /* pick: User namespacing on, UID range is picked randomly */ + arg_userns_mode = USER_NAMESPACE_PICK; + arg_uid_shift = UID_INVALID; + arg_uid_range = UINT32_C(0x10000); + } else { _cleanup_free_ char *buffer = NULL; const char *range, *shift; + /* anything else: User namespacing on, UID range is explicitly configured */ + range = strchr(optarg, ':'); if (range) { buffer = strndup(optarg, range - optarg); @@ -818,9 +908,28 @@ static int parse_argv(int argc, char *argv[]) { log_error("Failed to parse UID: %s", optarg); return -EINVAL; } + + arg_userns_mode = USER_NAMESPACE_FIXED; } - arg_userns = true; + arg_settings_mask |= SETTING_USERNS; + break; + + case 'U': + if (userns_supported()) { + arg_userns_mode = USER_NAMESPACE_PICK; + arg_uid_shift = UID_INVALID; + arg_uid_range = UINT32_C(0x10000); + + arg_settings_mask |= SETTING_USERNS; + } + + break; + + case ARG_PRIVATE_USERS_CHOWN: + arg_userns_chown = true; + + arg_settings_mask |= SETTING_USERNS; break; case ARG_KILL_SIGNAL: @@ -891,6 +1000,9 @@ static int parse_argv(int argc, char *argv[]) { if (arg_share_system) arg_register = false; + if (arg_userns_mode == USER_NAMESPACE_PICK) + arg_userns_chown = true; + if (arg_start_mode != START_PID1 && arg_share_system) { log_error("--boot and --share-system may not be combined."); return -EINVAL; @@ -931,8 +1043,20 @@ static int parse_argv(int argc, char *argv[]) { return -EINVAL; } - if (arg_userns && access("/proc/self/uid_map", F_OK) < 0) - return log_error_errno(EOPNOTSUPP, "--private-users= is not supported, kernel compiled without user namespace 
support."); + if (arg_userns_mode != USER_NAMESPACE_NO && !userns_supported()) { + log_error("--private-users= is not supported, kernel compiled without user namespace support."); + return -EOPNOTSUPP; + } + + if (arg_userns_chown && arg_read_only) { + log_error("--read-only and --private-users-chown may not be combined."); + return -EINVAL; + } + + if (arg_network_bridge && arg_network_zone) { + log_error("--network-bridge= and --network-zone= may not be combined."); + return -EINVAL; + } if (argc > optind) { arg_parameters = strv_copy(argv + optind); @@ -975,6 +1099,13 @@ static int verify_arguments(void) { return -EINVAL; } +#ifndef HAVE_LIBIPTC + if (arg_expose_ports) { + log_error("--port= is not supported, compiled without libiptc support."); + return -EOPNOTSUPP; + } +#endif + if (arg_start_mode == START_BOOT && arg_kill_signal <= 0) arg_kill_signal = SIGRTMIN+3; @@ -984,7 +1115,7 @@ static int verify_arguments(void) { static int userns_lchown(const char *p, uid_t uid, gid_t gid) { assert(p); - if (!arg_userns) + if (arg_userns_mode == USER_NAMESPACE_NO) return 0; if (uid == UID_INVALID && gid == GID_INVALID) @@ -1366,11 +1497,11 @@ static int setup_hostname(void) { } static int setup_journal(const char *directory) { - sd_id128_t machine_id, this_id; - _cleanup_free_ char *b = NULL, *d = NULL; - const char *etc_machine_id, *p, *q; + sd_id128_t this_id; + _cleanup_free_ char *d = NULL; + const char *p, *q; bool try; - char *id; + char id[33]; int r; /* Don't link journals in ephemeral mode */ @@ -1382,30 +1513,13 @@ static int setup_journal(const char *directory) { try = arg_link_journal_try || arg_link_journal == LINK_AUTO; - etc_machine_id = prefix_roota(directory, "/etc/machine-id"); - - r = read_one_line_file(etc_machine_id, &b); - if (r == -ENOENT && try) - return 0; - else if (r < 0) - return log_error_errno(r, "Failed to read machine ID from %s: %m", etc_machine_id); - - id = strstrip(b); - if (isempty(id) && try) - return 0; - - /* Verify validity */ 
- r = sd_id128_from_string(id, &machine_id); - if (r < 0) - return log_error_errno(r, "Failed to parse machine ID from %s: %m", etc_machine_id); - r = sd_id128_get_machine(&this_id); if (r < 0) return log_error_errno(r, "Failed to retrieve machine ID: %m"); - if (sd_id128_equal(machine_id, this_id)) { + if (sd_id128_equal(arg_uuid, this_id)) { log_full(try ? LOG_WARNING : LOG_ERR, - "Host and machine ids are equal (%s): refusing to link journals", id); + "Host and machine ids are equal (%s): refusing to link journals", sd_id128_to_string(arg_uuid, id)); if (try) return 0; return -EEXIST; @@ -1423,6 +1537,8 @@ static int setup_journal(const char *directory) { if (r < 0) return log_error_errno(r, "Failed to create /var/log/journal: %m"); + (void) sd_id128_to_string(arg_uuid, id); + p = strjoina("/var/log/journal/", id); q = prefix_roota(directory, p); @@ -1487,7 +1603,7 @@ static int setup_journal(const char *directory) { } if (arg_link_journal == LINK_HOST) { - /* don't create parents here -- if the host doesn't have + /* don't create parents here — if the host doesn't have * permanent journal set up, don't force it here */ if (mkdir(p, 0755) < 0 && errno != EEXIST) { @@ -1596,7 +1712,6 @@ static int setup_seccomp(void) { } } - /* Audit is broken in containers, much of the userspace audit hookup will fail if running inside a container. 
We don't @@ -2192,6 +2307,61 @@ static int mount_device(const char *what, const char *where, const char *directo #endif } +static int setup_machine_id(const char *directory) { + int r; + const char *etc_machine_id, *t; + _cleanup_free_ char *s = NULL; + + etc_machine_id = prefix_roota(directory, "/etc/machine-id"); + + r = read_one_line_file(etc_machine_id, &s); + if (r < 0) + return log_error_errno(r, "Failed to read machine ID from %s: %m", etc_machine_id); + + t = strstrip(s); + + if (!isempty(t)) { + r = sd_id128_from_string(t, &arg_uuid); + if (r < 0) + return log_error_errno(r, "Failed to parse machine ID from %s: %m", etc_machine_id); + } else { + if (sd_id128_is_null(arg_uuid)) { + r = sd_id128_randomize(&arg_uuid); + if (r < 0) + return log_error_errno(r, "Failed to generate random machine ID: %m"); + } + } + + r = machine_id_setup(directory, arg_uuid); + if (r < 0) + return log_error_errno(r, "Failed to setup machine ID: %m"); + + return 0; +} + +static int recursive_chown(const char *directory, uid_t shift, uid_t range) { + int r; + + assert(directory); + + if (arg_userns_mode == USER_NAMESPACE_NO || !arg_userns_chown) + return 0; + + r = path_patch_uid(directory, arg_uid_shift, arg_uid_range); + if (r == -EOPNOTSUPP) + return log_error_errno(r, "Automatic UID/GID adjusting is only supported for UID/GID ranges starting at multiples of 2^16 with a range of 2^16."); + if (r == -EBADE) + return log_error_errno(r, "Upper 16 bits of root directory UID and GID do not match."); + if (r < 0) + return log_error_errno(r, "Failed to adjust UID/GID shift of OS tree: %m"); + if (r == 0) + log_debug("Root directory of image is already owned by the right UID/GID range, skipping recursive chown operation."); + else + log_debug("Patched directory tree to match UID/GID range."); + + return r; +} + static int mount_devices( const char *where, const char *root_device, bool root_device_rw, @@ -2409,7 +2579,7 @@ static int determine_names(void) { static int 
determine_uid_shift(const char *directory) { int r; - if (!arg_userns) { + if (arg_userns_mode == USER_NAMESPACE_NO) { arg_uid_shift = 0; return 0; } @@ -2436,7 +2606,6 @@ static int determine_uid_shift(const char *directory) { return -EINVAL; } - log_info("Using user namespaces with base " UID_FMT " and range " UID_FMT ".", arg_uid_shift, arg_uid_range); return 0; } @@ -2449,6 +2618,7 @@ static int inner_child( FDSet *fds) { _cleanup_free_ char *home = NULL; + char as_uuid[37]; unsigned n_env = 1; const char *envp[] = { "PATH=" DEFAULT_PATH_SPLIT_USR, @@ -2472,7 +2642,7 @@ static int inner_child( cg_unified_flush(); - if (arg_userns) { + if (arg_userns_mode != USER_NAMESPACE_NO) { /* Tell the parent, that it now can write the UID map. */ (void) barrier_place(barrier); /* #1 */ @@ -2483,7 +2653,14 @@ static int inner_child( } } - r = mount_all(NULL, arg_userns, true, arg_uid_shift, arg_private_network, arg_uid_range, arg_selinux_apifs_context); + r = mount_all(NULL, + arg_userns_mode != USER_NAMESPACE_NO, + true, + arg_private_network, + arg_uid_shift, + arg_uid_range, + arg_selinux_apifs_context); + if (r < 0) return r; @@ -2559,19 +2736,17 @@ static int inner_child( envp[n_env] = strv_find_prefix(environ, "TERM="); if (envp[n_env]) - n_env ++; + n_env++; if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) || (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) || (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? 
arg_user : "root") < 0)) return log_oom(); - if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) { - char as_uuid[37]; + assert(!sd_id128_equal(arg_uuid, SD_ID128_NULL)); - if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_format_as_uuid(arg_uuid, as_uuid)) < 0) - return log_oom(); - } + if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_format_as_uuid(arg_uuid, as_uuid)) < 0) + return log_oom(); if (fdset_size(fds) > 0) { r = fdset_cloexec(fds, false); @@ -2622,12 +2797,10 @@ static int inner_child( /* Automatically search for the init system */ - m = 1 + strv_length(arg_parameters); - a = newa(char*, m + 1); - if (strv_isempty(arg_parameters)) - a[1] = NULL; - else - memcpy(a + 1, arg_parameters, m * sizeof(char*)); + m = strv_length(arg_parameters); + a = newa(char*, m + 2); + memcpy_safe(a + 1, arg_parameters, m * sizeof(char*)); + a[1 + m] = NULL; a[0] = (char*) "/usr/lib/systemd/systemd"; execve(a[0], a, env_use); @@ -2641,7 +2814,8 @@ static int inner_child( execvpe(arg_parameters[0], arg_parameters, env_use); else { if (!arg_chdir) - chdir(home ?: "/root"); + /* If we cannot change the directory, we'll end up in /, that is expected. 
*/ + (void) chdir(home ?: "/root"); execle("/bin/bash", "-bash", NULL, env_use); execle("/bin/sh", "-sh", NULL, env_use); @@ -2662,6 +2836,7 @@ static int outer_child( bool interactive, bool secondary, int pid_socket, + int uuid_socket, int kmsg_socket, int rtnl_socket, int uid_shift_socket, @@ -2675,6 +2850,7 @@ static int outer_child( assert(directory); assert(console); assert(pid_socket >= 0); + assert(uuid_socket >= 0); assert(kmsg_socket >= 0); cg_unified_flush(); @@ -2723,7 +2899,8 @@ static int outer_child( if (r < 0) return r; - if (arg_userns) { + if (arg_userns_mode != USER_NAMESPACE_NO) { + /* Let the parent know which UID shift we read from the image */ l = send(uid_shift_socket, &arg_uid_shift, sizeof(arg_uid_shift), MSG_NOSIGNAL); if (l < 0) return log_error_errno(errno, "Failed to send UID shift: %m"); @@ -2731,17 +2908,49 @@ static int outer_child( log_error("Short write while sending UID shift."); return -EIO; } + + if (arg_userns_mode == USER_NAMESPACE_PICK) { + /* When we are supposed to pick the UID shift, the parent will check now whether the UID shift + * we just read from the image is available. If yes, it will send the UID shift back to us, if + * not it will pick a different one, and send it back to us. 
*/ + + l = recv(uid_shift_socket, &arg_uid_shift, sizeof(arg_uid_shift), 0); + if (l < 0) + return log_error_errno(errno, "Failed to recv UID shift: %m"); + if (l != sizeof(arg_uid_shift)) { + log_error("Short read while recieving UID shift."); + return -EIO; + } + } + + log_info("Selected user namespace base " UID_FMT " and range " UID_FMT ".", arg_uid_shift, arg_uid_range); } /* Turn directory into bind mount */ if (mount(directory, directory, NULL, MS_BIND|MS_REC, NULL) < 0) return log_error_errno(errno, "Failed to make bind mount: %m"); - r = setup_volatile(directory, arg_volatile_mode, arg_userns, arg_uid_shift, arg_uid_range, arg_selinux_context); + r = recursive_chown(directory, arg_uid_shift, arg_uid_range); if (r < 0) return r; - r = setup_volatile_state(directory, arg_volatile_mode, arg_userns, arg_uid_shift, arg_uid_range, arg_selinux_context); + r = setup_volatile( + directory, + arg_volatile_mode, + arg_userns_mode != USER_NAMESPACE_NO, + arg_uid_shift, + arg_uid_range, + arg_selinux_context); + if (r < 0) + return r; + + r = setup_volatile_state( + directory, + arg_volatile_mode, + arg_userns_mode != USER_NAMESPACE_NO, + arg_uid_shift, + arg_uid_range, + arg_selinux_context); if (r < 0) return r; @@ -2755,7 +2964,13 @@ static int outer_child( return log_error_errno(r, "Failed to make tree read-only: %m"); } - r = mount_all(directory, arg_userns, false, arg_private_network, arg_uid_shift, arg_uid_range, arg_selinux_apifs_context); + r = mount_all(directory, + arg_userns_mode != USER_NAMESPACE_NO, + false, + arg_private_network, + arg_uid_shift, + arg_uid_range, + arg_selinux_apifs_context); if (r < 0) return r; @@ -2789,15 +3004,32 @@ static int outer_child( if (r < 0) return r; + r = setup_machine_id(directory); + if (r < 0) + return r; + r = setup_journal(directory); if (r < 0) return r; - r = mount_custom(directory, arg_custom_mounts, arg_n_custom_mounts, arg_userns, arg_uid_shift, arg_uid_range, arg_selinux_apifs_context); + r = mount_custom( + 
directory, + arg_custom_mounts, + arg_n_custom_mounts, + arg_userns_mode != USER_NAMESPACE_NO, + arg_uid_shift, + arg_uid_range, + arg_selinux_apifs_context); if (r < 0) return r; - r = mount_cgroups(directory, arg_unified_cgroup_hierarchy, arg_userns, arg_uid_shift, arg_uid_range, arg_selinux_apifs_context); + r = mount_cgroups( + directory, + arg_unified_cgroup_hierarchy, + arg_userns_mode != USER_NAMESPACE_NO, + arg_uid_shift, + arg_uid_range, + arg_selinux_apifs_context); if (r < 0) return r; @@ -2808,12 +3040,13 @@ static int outer_child( pid = raw_clone(SIGCHLD|CLONE_NEWNS| (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS) | (arg_private_network ? CLONE_NEWNET : 0) | - (arg_userns ? CLONE_NEWUSER : 0), + (arg_userns_mode != USER_NAMESPACE_NO ? CLONE_NEWUSER : 0), NULL); if (pid < 0) return log_error_errno(errno, "Failed to fork inner child: %m"); if (pid == 0) { pid_socket = safe_close(pid_socket); + uuid_socket = safe_close(uuid_socket); uid_shift_socket = safe_close(uid_shift_socket); /* The inner child has all namespaces that are @@ -2835,13 +3068,77 @@ static int outer_child( return -EIO; } + l = send(uuid_socket, &arg_uuid, sizeof(arg_uuid), MSG_NOSIGNAL); + if (l < 0) + return log_error_errno(errno, "Failed to send machine ID: %m"); + if (l != sizeof(arg_uuid)) { + log_error("Short write while sending machine ID."); + return -EIO; + } + pid_socket = safe_close(pid_socket); + uuid_socket = safe_close(uuid_socket); kmsg_socket = safe_close(kmsg_socket); rtnl_socket = safe_close(rtnl_socket); return 0; } +static int uid_shift_pick(uid_t *shift, LockFile *ret_lock_file) { + unsigned n_tries = 100; + uid_t candidate; + int r; + + assert(shift); + assert(ret_lock_file); + assert(arg_userns_mode == USER_NAMESPACE_PICK); + assert(arg_uid_range == 0x10000U); + + candidate = *shift; + + (void) mkdir("/run/systemd/nspawn-uid", 0755); + + for (;;) { + char lock_path[strlen("/run/systemd/nspawn-uid/") + DECIMAL_STR_MAX(uid_t) + 1]; + 
_cleanup_release_lock_file_ LockFile lf = LOCK_FILE_INIT; + + if (--n_tries <= 0) + return -EBUSY; + + if (candidate < UID_SHIFT_PICK_MIN || candidate > UID_SHIFT_PICK_MAX) + goto next; + if ((candidate & UINT32_C(0xFFFF)) != 0) + goto next; + + xsprintf(lock_path, "/run/systemd/nspawn-uid/" UID_FMT, candidate); + r = make_lock_file(lock_path, LOCK_EX|LOCK_NB, &lf); + if (r == -EBUSY) /* Range already taken by another nspawn instance */ + goto next; + if (r < 0) + return r; + + /* Make some superficial checks whether the range is currently known in the user database */ + if (getpwuid(candidate)) + goto next; + if (getpwuid(candidate + UINT32_C(0xFFFE))) + goto next; + if (getgrgid(candidate)) + goto next; + if (getgrgid(candidate + UINT32_C(0xFFFE))) + goto next; + + *ret_lock_file = lf; + lf = (struct LockFile) LOCK_FILE_INIT; + *shift = candidate; + return 0; + + next: + random_bytes(&candidate, sizeof(candidate)); + candidate = (candidate % (UID_SHIFT_PICK_MAX - UID_SHIFT_PICK_MIN)) + UID_SHIFT_PICK_MIN; + candidate &= (uid_t) UINT32_C(0xFFFF0000); + } +} + static int setup_uid_map(pid_t pid) { char uid_map[strlen("/proc//uid_map") + DECIMAL_STR_MAX(uid_t) + 1], line[DECIMAL_STR_MAX(uid_t)*3+3+1]; int r; @@ -3028,6 +3325,7 @@ static int load_settings(void) { (settings->private_network >= 0 || settings->network_veth >= 0 || settings->network_bridge || + settings->network_zone || settings->network_interfaces || settings->network_macvlan || settings->network_ipvlan || @@ -3058,6 +3356,10 @@ static int load_settings(void) { free(arg_network_bridge); arg_network_bridge = settings->network_bridge; settings->network_bridge = NULL; + + free(arg_network_zone); + arg_network_zone = settings->network_zone; + settings->network_zone = NULL; } } @@ -3073,6 +3375,19 @@ static int load_settings(void) { } } + if ((arg_settings_mask & SETTING_USERNS) == 0 && + settings->userns_mode != _USER_NAMESPACE_MODE_INVALID) { + + if (!arg_settings_trusted) + log_warning("Ignoring 
PrivateUsers= and PrivateUsersChown= settings, file %s is not trusted.", p); + else { + arg_userns_mode = settings->userns_mode; + arg_uid_shift = settings->uid_shift; + arg_uid_range = settings->uid_range; + arg_userns_chown = settings->userns_chown; + } + } + return 0; } @@ -3083,14 +3398,14 @@ int main(int argc, char *argv[]) { _cleanup_close_ int master = -1, image_fd = -1; _cleanup_fdset_free_ FDSet *fds = NULL; int r, n_fd_passed, loop_nr = -1; - char veth_name[IFNAMSIZ]; + char veth_name[IFNAMSIZ] = ""; bool secondary = false, remove_subvol = false; sigset_t mask_chld; pid_t pid = 0; int ret = EXIT_SUCCESS; union in_addr_union exposed = {}; _cleanup_release_lock_file_ LockFile tree_global_lock = LOCK_FILE_INIT, tree_local_lock = LOCK_FILE_INIT; - bool interactive; + bool interactive, veth_created = false; log_parse_environment(); log_open(); @@ -3285,6 +3600,12 @@ int main(int argc, char *argv[]) { goto finish; } + if (arg_selinux_apifs_context) { + r = mac_selinux_apply(console, arg_selinux_apifs_context); + if (r < 0) + goto finish; + } + if (unlockpt(master) < 0) { r = log_error_errno(errno, "Failed to unlock tty: %m"); goto finish; @@ -3305,19 +3626,42 @@ int main(int argc, char *argv[]) { } for (;;) { - _cleanup_close_pair_ int kmsg_socket_pair[2] = { -1, -1 }, rtnl_socket_pair[2] = { -1, -1 }, pid_socket_pair[2] = { -1, -1 }, uid_shift_socket_pair[2] = { -1, -1 }; - ContainerStatus container_status; - _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL; static const struct sigaction sa = { .sa_handler = nop_signal_handler, .sa_flags = SA_NOCLDSTOP, }; - int ifi = 0; - ssize_t l; + + _cleanup_release_lock_file_ LockFile uid_shift_lock = LOCK_FILE_INIT; + _cleanup_close_ int etc_passwd_lock = -1; + _cleanup_close_pair_ int + kmsg_socket_pair[2] = { -1, -1 }, + rtnl_socket_pair[2] = { -1, -1 }, + pid_socket_pair[2] = { -1, -1 }, + uuid_socket_pair[2] = { -1, -1 }, + uid_shift_socket_pair[2] = { -1, -1 }; + _cleanup_(barrier_destroy) Barrier barrier 
= BARRIER_NULL; _cleanup_(sd_event_unrefp) sd_event *event = NULL; _cleanup_(pty_forward_freep) PTYForward *forward = NULL; _cleanup_(sd_netlink_unrefp) sd_netlink *rtnl = NULL; + ContainerStatus container_status; char last_char = 0; + int ifi = 0; + ssize_t l; + + if (arg_userns_mode == USER_NAMESPACE_PICK) { + /* When we shall pick the UID/GID range, let's first lock /etc/passwd, so that we can safely + * check with getpwuid() if the specific user already exists. Note that /etc might be + * read-only, in which case this will fail with EROFS. But that's really OK, as in that case we + * can be reasonably sure that no users are going to be added. Note that getpwuid() checks are + * really just an extra safety net. We kinda assume that the UID range we allocate from is + * really ours. */ + + etc_passwd_lock = take_etc_passwd_lock(NULL); + if (etc_passwd_lock < 0 && etc_passwd_lock != -EROFS) { + log_error_errno(r, "Failed to take /etc/passwd lock: %m"); + goto finish; + } + } r = barrier_create(&barrier); if (r < 0) { @@ -3340,7 +3684,12 @@ int main(int argc, char *argv[]) { goto finish; } - if (arg_userns) + if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, uuid_socket_pair) < 0) { + r = log_error_errno(errno, "Failed to create id socket pair: %m"); + goto finish; + } + + if (arg_userns_mode != USER_NAMESPACE_NO) if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, uid_shift_socket_pair) < 0) { r = log_error_errno(errno, "Failed to create uid shift socket pair: %m"); goto finish; @@ -3380,6 +3729,7 @@ int main(int argc, char *argv[]) { kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]); rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]); pid_socket_pair[0] = safe_close(pid_socket_pair[0]); + uuid_socket_pair[0] = safe_close(uuid_socket_pair[0]); uid_shift_socket_pair[0] = safe_close(uid_shift_socket_pair[0]); (void) reset_all_signal_handlers(); @@ -3394,6 +3744,7 @@ int main(int argc, char *argv[]) { interactive, secondary, pid_socket_pair[1], + 
uuid_socket_pair[1], kmsg_socket_pair[1], rtnl_socket_pair[1], uid_shift_socket_pair[1], @@ -3411,8 +3762,46 @@ int main(int argc, char *argv[]) { kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]); rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]); pid_socket_pair[1] = safe_close(pid_socket_pair[1]); + uuid_socket_pair[1] = safe_close(uuid_socket_pair[1]); uid_shift_socket_pair[1] = safe_close(uid_shift_socket_pair[1]); + if (arg_userns_mode != USER_NAMESPACE_NO) { + /* The child just let us know the UID shift it might have read from the image. */ + l = recv(uid_shift_socket_pair[0], &arg_uid_shift, sizeof(arg_uid_shift), 0); + if (l < 0) { + r = log_error_errno(errno, "Failed to read UID shift: %m"); + goto finish; + } + if (l != sizeof(arg_uid_shift)) { + log_error("Short read while reading UID shift."); + r = EIO; + goto finish; + } + + if (arg_userns_mode == USER_NAMESPACE_PICK) { + /* If we are supposed to pick the UID shift, let's try to use the shift read from the + * image, but if that's already in use, pick a new one, and report back to the child, + * which one we now picked. */ + + r = uid_shift_pick(&arg_uid_shift, &uid_shift_lock); + if (r < 0) { + log_error_errno(r, "Failed to pick suitable UID/GID range: %m"); + goto finish; + } + + l = send(uid_shift_socket_pair[0], &arg_uid_shift, sizeof(arg_uid_shift), MSG_NOSIGNAL); + if (l < 0) { + r = log_error_errno(errno, "Failed to send UID shift: %m"); + goto finish; + } + if (l != sizeof(arg_uid_shift)) { + log_error("Short write while writing UID shift."); + r = -EIO; + goto finish; + } + } + } + /* Wait for the outer child. 
*/ r = wait_for_terminate_and_warn("namespace helper", pid, NULL); if (r < 0) @@ -3435,26 +3824,27 @@ int main(int argc, char *argv[]) { goto finish; } + /* We also retrieve container UUID in case it was generated by outer child */ + l = recv(uuid_socket_pair[0], &arg_uuid, sizeof(arg_uuid), 0); + if (l < 0) { + r = log_error_errno(errno, "Failed to read container machine ID: %m"); + goto finish; + } + if (l != sizeof(arg_uuid)) { + log_error("Short read while reading container machined ID."); + r = EIO; + goto finish; + } + log_debug("Init process invoked as PID " PID_FMT, pid); - if (arg_userns) { + if (arg_userns_mode != USER_NAMESPACE_NO) { if (!barrier_place_and_sync(&barrier)) { /* #1 */ log_error("Child died too early."); r = -ESRCH; goto finish; } - l = recv(uid_shift_socket_pair[0], &arg_uid_shift, sizeof(arg_uid_shift), 0); - if (l < 0) { - r = log_error_errno(errno, "Failed to read UID shift: %m"); - goto finish; - } - if (l != sizeof(arg_uid_shift)) { - log_error("Short read while reading UID shift."); - r = EIO; - goto finish; - } - r = setup_uid_map(pid); if (r < 0) goto finish; @@ -3469,14 +3859,23 @@ int main(int argc, char *argv[]) { goto finish; if (arg_network_veth) { - r = setup_veth(arg_machine, pid, veth_name, !!arg_network_bridge); + r = setup_veth(arg_machine, pid, veth_name, + arg_network_bridge || arg_network_zone); if (r < 0) goto finish; else if (r > 0) ifi = r; if (arg_network_bridge) { - r = setup_bridge(veth_name, arg_network_bridge); + /* Add the interface to a bridge */ + r = setup_bridge(veth_name, arg_network_bridge, false); + if (r < 0) + goto finish; + if (r > 0) + ifi = r; + } else if (arg_network_zone) { + /* Add the interface to a bridge, possibly creating it */ + r = setup_bridge(veth_name, arg_network_zone, true); if (r < 0) goto finish; if (r > 0) @@ -3488,6 +3887,12 @@ int main(int argc, char *argv[]) { if (r < 0) goto finish; + /* We created the primary and extra veth links now; let's remember this, so that we know to + 
remove them later on. Note that we don't bother with removing veth links that were created + here when their setup failed half-way, because in that case the kernel should be able to + remove them on its own, since they cannot be referenced by anything yet. */ + veth_created = true; + r = setup_macvlan(arg_machine, pid, arg_network_macvlan); if (r < 0) goto finish; @@ -3552,6 +3957,10 @@ int main(int argc, char *argv[]) { goto finish; } + /* At this point we have made use of the UID we picked, and thus nss-mymachines will make them appear + * in getpwuid(), thus we can release the /etc/passwd lock. */ + etc_passwd_lock = safe_close(etc_passwd_lock); + sd_notifyf(false, "READY=1\n" "STATUS=Container running.\n" @@ -3619,7 +4028,7 @@ int main(int argc, char *argv[]) { /* We failed to wait for the container, or the * container exited abnormally */ goto finish; - else if (r > 0 || container_status == CONTAINER_TERMINATED){ + else if (r > 0 || container_status == CONTAINER_TERMINATED) { /* The container exited with a non-zero * status, or with zero status and no reboot * was requested. */ @@ -3646,6 +4055,9 @@ int main(int argc, char *argv[]) { } expose_port_flush(arg_expose_ports, &exposed); + + (void) remove_veth_links(veth_name, arg_network_veth_extra); + veth_created = false; } finish: @@ -3679,6 +4091,10 @@ finish: expose_port_flush(arg_expose_ports, &exposed); + if (veth_created) + (void) remove_veth_links(veth_name, arg_network_veth_extra); + (void) remove_bridge(arg_network_zone); + free(arg_directory); free(arg_template); free(arg_image); diff --git a/src/systemd-nspawn/test-patch-uid.c b/src/systemd-nspawn/test-patch-uid.c new file mode 100644 index 0000000000..11c5321788 --- /dev/null +++ b/src/systemd-nspawn/test-patch-uid.c @@ -0,0 +1,61 @@ +/*** + This file is part of systemd. 
+
+  Copyright 2016 Lennart Poettering
+
+  systemd is free software; you can redistribute it and/or modify it
+  under the terms of the GNU Lesser General Public License as published by
+  the Free Software Foundation; either version 2.1 of the License, or
+  (at your option) any later version.
+
+  systemd is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+  Lesser General Public License for more details.
+
+  You should have received a copy of the GNU Lesser General Public License
+  along with systemd; If not, see <http://www.gnu.org/licenses/>.
+***/
+
+#include <stdlib.h>
+
+#include "log.h"
+#include "nspawn-patch-uid.h"
+#include "user-util.h"
+#include "util.h"
+
+int main(int argc, char *argv[]) { /* manual test; usage: test-patch-uid PATH SHIFT RANGE */
+        uid_t shift, range;
+        int r;
+
+        log_set_max_level(LOG_DEBUG);
+        log_parse_environment();
+        log_open();
+
+        if (argc != 4) { /* program name plus exactly three arguments */
+                log_error("Expected PATH SHIFT RANGE parameters.");
+                return EXIT_FAILURE;
+        }
+
+        r = parse_uid(argv[2], &shift);
+        if (r < 0) {
+                log_error_errno(r, "Failed to parse UID shift %s.", argv[2]);
+                return EXIT_FAILURE;
+        }
+
+        r = parse_uid(argv[3], &range); /* range is a uid_t too, so parse it the same way as the shift */
+        if (r < 0) {
+                log_error_errno(r, "Failed to parse UID range %s.", argv[3]);
+                return EXIT_FAILURE;
+        }
+
+        r = path_patch_uid(argv[1], shift, range); /* a negative result is handled as an errno-style failure */
+        if (r < 0) {
+                log_error_errno(r, "Failed to patch directory tree: %m");
+                return EXIT_FAILURE;
+        }
+
+        log_info("Changed: %s", yes_no(r)); /* non-negative result is printed through yes_no() */
+
+        return EXIT_SUCCESS;
+}
|