summaryrefslogtreecommitdiff
path: root/src/systemd-nspawn
diff options
context:
space:
mode:
authorLuke Shumaker <lukeshu@sbcglobal.net>2016-06-12 08:43:34 -0400
committerLuke Shumaker <lukeshu@sbcglobal.net>2016-06-12 08:43:34 -0400
commit670b77ddfab0f4eddbe539964aba83d446d48129 (patch)
tree5b159fe9bd52169e05cdc60db5a48a5c5ac9602a /src/systemd-nspawn
parent23708daf3ba69ba9880102b4f720a3842883332e (diff)
parent34dbdee3b2f122d2ef903a368b172e75f962b66a (diff)
Merge branch 'lukeshu/postmove' into 'lukeshu/master'
Diffstat (limited to 'src/systemd-nspawn')
-rw-r--r--src/systemd-nspawn/Makefile17
-rw-r--r--src/systemd-nspawn/nspawn-cgroup.c9
-rw-r--r--src/systemd-nspawn/nspawn-gperf.gperf7
-rw-r--r--src/systemd-nspawn/nspawn-mount.c25
-rw-r--r--src/systemd-nspawn/nspawn-network.c184
-rw-r--r--src/systemd-nspawn/nspawn-network.h5
-rw-r--r--src/systemd-nspawn/nspawn-patch-uid.c469
-rw-r--r--src/systemd-nspawn/nspawn-patch-uid.h23
-rw-r--r--src/systemd-nspawn/nspawn-register.c1
-rw-r--r--src/systemd-nspawn/nspawn-settings.c124
-rw-r--r--src/systemd-nspawn/nspawn-settings.h17
-rw-r--r--src/systemd-nspawn/nspawn.c592
-rw-r--r--src/systemd-nspawn/test-patch-uid.c61
13 files changed, 1408 insertions, 126 deletions
diff --git a/src/systemd-nspawn/Makefile b/src/systemd-nspawn/Makefile
index 380266ea7f..9702abcbc5 100644
--- a/src/systemd-nspawn/Makefile
+++ b/src/systemd-nspawn/Makefile
@@ -41,10 +41,14 @@ systemd_nspawn_SOURCES = \
src/nspawn/nspawn-setuid.h \
src/nspawn/nspawn-stub-pid1.c \
src/nspawn/nspawn-stub-pid1.h \
+ src/nspawn/nspawn-patch-uid.c \
+ src/nspawn/nspawn-patch-uid.h \
src/core/mount-setup.c \
src/core/mount-setup.h \
src/core/loopback-setup.c \
- src/core/loopback-setup.h
+ src/core/loopback-setup.h \
+ src/core/machine-id-setup.c \
+ src/core/machine-id-setup.h
nodist_systemd_nspawn_SOURCES = \
src/nspawn/nspawn-gperf.c
@@ -66,6 +70,17 @@ systemd_nspawn_LDADD += \
libfirewall.la
endif # HAVE_LIBIPTC
+test_patch_uid_SOURCES = \
+ src/nspawn/nspawn-patch-uid.c \
+ src/nspawn/nspawn-patch-uid.h \
+ src/nspawn/test-patch-uid.c
+
+test_patch_uid_LDADD = \
+ libshared.la
+
+manual_tests += \
+ test-patch-uid
+
bin_PROGRAMS += systemd-nspawn
systemd_nspawn_LDADD += libsystemd.la # was hidden by libshared->libsystemd
systemd_nspawn_LDADD += libbasic.la # was hidden by libshared->libsystemd->libbasic
diff --git a/src/systemd-nspawn/nspawn-cgroup.c b/src/systemd-nspawn/nspawn-cgroup.c
index 1db5ba7116..f50f1ad6c2 100644
--- a/src/systemd-nspawn/nspawn-cgroup.c
+++ b/src/systemd-nspawn/nspawn-cgroup.c
@@ -55,8 +55,7 @@ int chown_cgroup(pid_t pid, uid_t uid_shift) {
"cgroup.events",
"cgroup.clone_children",
"cgroup.controllers",
- "cgroup.subtree_control",
- "cgroup.populated")
+ "cgroup.subtree_control")
if (fchownat(fd, fn, uid_shift, uid_shift, 0) < 0)
log_full_errno(errno == ENOENT ? LOG_DEBUG : LOG_WARNING, errno,
"Failed to chown() cgroup file %s, ignoring: %m", fn);
@@ -73,7 +72,7 @@ int sync_cgroup(pid_t pid, bool unified_requested) {
unified = cg_unified();
if (unified < 0)
- return log_error_errno(unified, "Failed to determine whether the unified hierachy is used: %m");
+ return log_error_errno(unified, "Failed to determine whether the unified hierarchy is used: %m");
if ((unified > 0) == unified_requested)
return 0;
@@ -94,7 +93,7 @@ int sync_cgroup(pid_t pid, bool unified_requested) {
if (unified)
r = mount("cgroup", tree, "cgroup", MS_NOSUID|MS_NOEXEC|MS_NODEV, "none,name=systemd,xattr");
else
- r = mount("cgroup", tree, "cgroup", MS_NOSUID|MS_NOEXEC|MS_NODEV, "__DEVEL__sane_behavior");
+ r = mount("cgroup", tree, "cgroup2", MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL);
if (r < 0) {
r = log_error_errno(errno, "Failed to mount unified hierarchy: %m");
goto finish;
@@ -135,7 +134,7 @@ int create_subcgroup(pid_t pid, bool unified_requested) {
unified = cg_unified();
if (unified < 0)
- return log_error_errno(unified, "Failed to determine whether the unified hierachy is used: %m");
+ return log_error_errno(unified, "Failed to determine whether the unified hierarchy is used: %m");
if (unified == 0)
return 0;
diff --git a/src/systemd-nspawn/nspawn-gperf.gperf b/src/systemd-nspawn/nspawn-gperf.gperf
index 116655cdd2..2b5d452662 100644
--- a/src/systemd-nspawn/nspawn-gperf.gperf
+++ b/src/systemd-nspawn/nspawn-gperf.gperf
@@ -16,7 +16,7 @@ struct ConfigPerfItem;
%includes
%%
Exec.Boot, config_parse_boot, 0, 0
-Exec.ProcessTwo, config_parse_pid2, 0, 0,
+Exec.ProcessTwo, config_parse_pid2, 0, 0
Exec.Parameters, config_parse_strv, 0, offsetof(Settings, parameters)
Exec.Environment, config_parse_strv, 0, offsetof(Settings, environment)
Exec.User, config_parse_string, 0, offsetof(Settings, user)
@@ -26,16 +26,19 @@ Exec.KillSignal, config_parse_signal, 0, offsetof(Settings,
Exec.Personality, config_parse_personality, 0, offsetof(Settings, personality)
Exec.MachineID, config_parse_id128, 0, offsetof(Settings, machine_id)
Exec.WorkingDirectory, config_parse_path, 0, offsetof(Settings, working_directory)
+Exec.PrivateUsers, config_parse_private_users, 0, 0
Files.ReadOnly, config_parse_tristate, 0, offsetof(Settings, read_only)
Files.Volatile, config_parse_volatile_mode, 0, offsetof(Settings, volatile_mode)
Files.Bind, config_parse_bind, 0, 0
Files.BindReadOnly, config_parse_bind, 1, 0
Files.TemporaryFileSystem, config_parse_tmpfs, 0, 0
+Files.PrivateUsersChown, config_parse_tristate, 0, offsetof(Settings, userns_chown)
Network.Private, config_parse_tristate, 0, offsetof(Settings, private_network)
Network.Interface, config_parse_strv, 0, offsetof(Settings, network_interfaces)
Network.MACVLAN, config_parse_strv, 0, offsetof(Settings, network_macvlan)
Network.IPVLAN, config_parse_strv, 0, offsetof(Settings, network_ipvlan)
Network.VirtualEthernet, config_parse_tristate, 0, offsetof(Settings, network_veth)
Network.VirtualEthernetExtra, config_parse_veth_extra, 0, 0
-Network.Bridge, config_parse_string, 0, offsetof(Settings, network_bridge)
+Network.Bridge, config_parse_ifname, 0, offsetof(Settings, network_bridge)
+Network.Zone, config_parse_network_zone, 0, 0
Network.Port, config_parse_expose_port, 0, 0
diff --git a/src/systemd-nspawn/nspawn-mount.c b/src/systemd-nspawn/nspawn-mount.c
index 70cca15278..8e2d2d543c 100644
--- a/src/systemd-nspawn/nspawn-mount.c
+++ b/src/systemd-nspawn/nspawn-mount.c
@@ -438,21 +438,22 @@ static int mount_bind(const char *dest, CustomMount *m) {
r = mkdir_parents_label(where, 0755);
if (r < 0)
return log_error_errno(r, "Failed to make parents of %s: %m", where);
+
+ /* Create the mount point. Any non-directory file can be
+ * mounted on any non-directory file (regular, fifo, socket,
+ * char, block).
+ */
+ if (S_ISDIR(source_st.st_mode))
+ r = mkdir_label(where, 0755);
+ else
+ r = touch(where);
+ if (r < 0)
+ return log_error_errno(r, "Failed to create mount point %s: %m", where);
+
} else {
return log_error_errno(errno, "Failed to stat %s: %m", where);
}
- /* Create the mount point. Any non-directory file can be
- * mounted on any non-directory file (regular, fifo, socket,
- * char, block).
- */
- if (S_ISDIR(source_st.st_mode))
- r = mkdir_label(where, 0755);
- else
- r = touch(where);
- if (r < 0 && r != -EEXIST)
- return log_error_errno(r, "Failed to create mount point %s: %m", where);
-
if (mount(m->source, where, NULL, mount_flags, mount_opts) < 0)
return log_error_errno(errno, "mount(%s) failed: %m", where);
@@ -750,7 +751,7 @@ static int mount_unified_cgroups(const char *dest) {
return -EINVAL;
}
- if (mount("cgroup", p, "cgroup", MS_NOSUID|MS_NOEXEC|MS_NODEV, "__DEVEL__sane_behavior") < 0)
+ if (mount("cgroup", p, "cgroup2", MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL) < 0)
return log_error_errno(errno, "Failed to mount unified cgroup hierarchy to %s: %m", p);
return 0;
diff --git a/src/systemd-nspawn/nspawn-network.c b/src/systemd-nspawn/nspawn-network.c
index 58d6035ddb..917827eac1 100644
--- a/src/systemd-nspawn/nspawn-network.c
+++ b/src/systemd-nspawn/nspawn-network.c
@@ -26,8 +26,11 @@
#include "alloc-util.h"
#include "ether-addr-util.h"
+#include "lockfile-util.h"
#include "nspawn-network.h"
#include "siphash24.h"
+#include "socket-util.h"
+#include "stat-util.h"
#include "string-util.h"
#include "udev-util.h"
#include "util.h"
@@ -38,6 +41,30 @@
#define VETH_EXTRA_CONTAINER_HASH_KEY SD_ID128_MAKE(af,50,17,61,ce,f9,4d,35,84,0d,2b,20,54,be,ce,59)
#define MACVLAN_HASH_KEY SD_ID128_MAKE(00,13,6d,bc,66,83,44,81,bb,0c,f9,51,1f,24,a6,6f)
+/* Sends an RTM_DELLINK request for the interface called "name" over the rtnetlink connection "rtnl".
+ *
+ * Returns 1 when the request succeeded, 0 when there was nothing to do (empty name, or the kernel answered
+ * -ENODEV because the link no longer exists), and otherwise whatever log_error_errno() returns for the failure. */
+static int remove_one_link(sd_netlink *rtnl, const char *name) {
+        _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *req = NULL;
+        int q;
+
+        /* No name, no link to delete */
+        if (isempty(name))
+                return 0;
+
+        q = sd_rtnl_message_new_link(rtnl, &req, RTM_DELLINK, 0);
+        if (q < 0)
+                return log_error_errno(q, "Failed to allocate netlink message: %m");
+
+        q = sd_netlink_message_append_string(req, IFLA_IFNAME, name);
+        if (q < 0)
+                return log_error_errno(q, "Failed to add netlink interface name: %m");
+
+        q = sd_netlink_call(rtnl, req, 0, NULL);
+        if (q >= 0)
+                return 1;
+        if (q == -ENODEV) /* Already gone */
+                return 0;
+
+        return log_error_errno(q, "Failed to remove interface %s: %m", name);
+}
+
static int generate_mac(
const char *machine_name,
struct ether_addr *mac,
@@ -231,51 +258,155 @@ int setup_veth_extra(
if (r < 0)
return r;
- idx ++;
+ idx++;
}
return 0;
}
-int setup_bridge(const char *veth_name, const char *bridge_name) {
+static int join_bridge(sd_netlink *rtnl, const char *veth_name, const char *bridge_name) {
_cleanup_(sd_netlink_message_unrefp) sd_netlink_message *m = NULL;
- _cleanup_(sd_netlink_unrefp) sd_netlink *rtnl = NULL;
int r, bridge_ifi;
+ assert(rtnl);
assert(veth_name);
assert(bridge_name);
bridge_ifi = (int) if_nametoindex(bridge_name);
if (bridge_ifi <= 0)
- return log_error_errno(errno, "Failed to resolve interface %s: %m", bridge_name);
-
- r = sd_netlink_open(&rtnl);
- if (r < 0)
- return log_error_errno(r, "Failed to connect to netlink: %m");
+ return -errno;
r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, 0);
if (r < 0)
- return log_error_errno(r, "Failed to allocate netlink message: %m");
+ return r;
r = sd_rtnl_message_link_set_flags(m, IFF_UP, IFF_UP);
if (r < 0)
- return log_error_errno(r, "Failed to set IFF_UP flag: %m");
+ return r;
r = sd_netlink_message_append_string(m, IFLA_IFNAME, veth_name);
if (r < 0)
- return log_error_errno(r, "Failed to add netlink interface name field: %m");
+ return r;
r = sd_netlink_message_append_u32(m, IFLA_MASTER, bridge_ifi);
if (r < 0)
- return log_error_errno(r, "Failed to add netlink master field: %m");
+ return r;
r = sd_netlink_call(rtnl, m, 0, NULL);
if (r < 0)
- return log_error_errno(r, "Failed to add veth interface to bridge: %m");
+ return r;
return bridge_ifi;
}
+/* Asks the kernel (RTM_NEWLINK) to create a new link named "bridge_name" whose link info kind is "bridge".
+ *
+ * Nothing is logged here; the first failing netlink helper's negative return value is passed up unchanged.
+ * Returns 0 on success. */
+static int create_bridge(sd_netlink *rtnl, const char *bridge_name) {
+        _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *req = NULL;
+        int q;
+
+        q = sd_rtnl_message_new_link(rtnl, &req, RTM_NEWLINK, 0);
+        if (q < 0)
+                return q;
+
+        q = sd_netlink_message_append_string(req, IFLA_IFNAME, bridge_name);
+        if (q < 0)
+                return q;
+
+        /* IFLA_LINKINFO { IFLA_INFO_DATA("bridge") { } }: two nested containers, opened here ... */
+        q = sd_netlink_message_open_container(req, IFLA_LINKINFO);
+        if (q < 0)
+                return q;
+
+        q = sd_netlink_message_open_container_union(req, IFLA_INFO_DATA, "bridge");
+        if (q < 0)
+                return q;
+
+        /* ... and closed again in reverse order, without adding any bridge-specific attributes. */
+        q = sd_netlink_message_close_container(req);
+        if (q < 0)
+                return q;
+
+        q = sd_netlink_message_close_container(req);
+        if (q < 0)
+                return q;
+
+        q = sd_netlink_call(rtnl, req, 0, NULL);
+        return q < 0 ? q : 0;
+}
+
+/* Attaches the interface "veth_name" to the bridge "bridge_name" and returns the bridge's interface index.
+ *
+ * If "create" is true and the bridge does not exist (join_bridge() reports -ENODEV), the bridge is created and
+ * joining is retried, a bounded number of times. Failures are logged and returned as the value of
+ * log_error_errno(). */
+int setup_bridge(const char *veth_name, const char *bridge_name, bool create) {
+        _cleanup_release_lock_file_ LockFile zone_lock = LOCK_FILE_INIT;
+        _cleanup_(sd_netlink_unrefp) sd_netlink *nl = NULL;
+        unsigned attempts;
+        int q;
+
+        assert(veth_name);
+        assert(bridge_name);
+
+        q = sd_netlink_open(&nl);
+        if (q < 0)
+                return log_error_errno(q, "Failed to connect to netlink: %m");
+
+        if (create) {
+                /* We take a system-wide lock here, so that we can safely check whether there's still a member in the
+                 * bridge before removing it, without risking interference from other nspawn instances. */
+
+                q = make_lock_file("/run/systemd/nspawn-network-zone", LOCK_EX, &zone_lock);
+                if (q < 0)
+                        return log_error_errno(q, "Failed to take network zone lock: %m");
+        }
+
+        /* The attempt counter only advances after a successful create_bridge(), and bounds the loop so that we
+         * cannot spin forever if the bridge keeps vanishing. */
+        for (attempts = 0;; attempts++) {
+                bool may_create;
+                int ifi;
+
+                ifi = join_bridge(nl, veth_name, bridge_name);
+                if (ifi >= 0)
+                        return ifi;
+
+                may_create = ifi == -ENODEV && create && attempts <= 10;
+                if (!may_create)
+                        return log_error_errno(ifi, "Failed to add interface %s to bridge %s: %m", veth_name, bridge_name);
+
+                /* The bridge doesn't exist yet. Let's create it, then go around and try joining again. */
+                q = create_bridge(nl, bridge_name);
+                if (q < 0)
+                        return log_error_errno(q, "Failed to create bridge interface %s: %m", bridge_name);
+        }
+}
+
+/* Removes the bridge "bridge_name", but only if /sys/class/net/<bridge>/brif is an empty directory, i.e. the
+ * bridge currently has no member interfaces.
+ *
+ * Returns 0 if nothing was removed (empty name, bridge already gone, or still populated), the result of
+ * remove_one_link() if removal was attempted, and the value of log_error_errno() on other failures. */
+int remove_bridge(const char *bridge_name) {
+        _cleanup_release_lock_file_ LockFile zone_lock = LOCK_FILE_INIT;
+        _cleanup_(sd_netlink_unrefp) sd_netlink *nl = NULL;
+        const char *brif_dir;
+        int q, empty;
+
+        if (isempty(bridge_name))
+                return 0;
+
+        /* Same lock file as setup_bridge() takes when it may create a bridge */
+        q = make_lock_file("/run/systemd/nspawn-network-zone", LOCK_EX, &zone_lock);
+        if (q < 0)
+                return log_error_errno(q, "Failed to take network zone lock: %m");
+
+        brif_dir = strjoina("/sys/class/net/", bridge_name, "/brif");
+
+        empty = dir_is_empty(brif_dir);
+        if (empty == -ENOENT) /* Already gone? */
+                return 0;
+        if (empty < 0)
+                return log_error_errno(empty, "Can't detect if bridge %s is empty: %m", bridge_name);
+        if (!empty) /* Still populated, leave it around */
+                return 0;
+
+        q = sd_netlink_open(&nl);
+        if (q < 0)
+                return log_error_errno(q, "Failed to connect to netlink: %m");
+
+        return remove_one_link(nl, bridge_name);
+}
+
static int parse_interface(struct udev *udev, const char *name) {
_cleanup_udev_device_unref_ struct udev_device *d = NULL;
char ifi_str[2 + DECIMAL_STR_MAX(int)];
@@ -514,13 +645,13 @@ int veth_extra_parse(char ***l, const char *p) {
r = extract_first_word(&p, &a, ":", EXTRACT_DONT_COALESCE_SEPARATORS);
if (r < 0)
return r;
- if (r == 0 || isempty(a))
+ if (r == 0 || !ifname_valid(a))
return -EINVAL;
r = extract_first_word(&p, &b, ":", EXTRACT_DONT_COALESCE_SEPARATORS);
if (r < 0)
return r;
- if (r == 0 || isempty(b)) {
+ if (r == 0 || !ifname_valid(b)) {
free(b);
b = strdup(a);
if (!b)
@@ -537,3 +668,26 @@ int veth_extra_parse(char ***l, const char *p) {
a = b = NULL;
return 0;
}
+
+/* Explicitly deletes the link "primary" and the first link name of every pair in "pairs".
+ *
+ * In some cases the kernel might pin the veth links between host and container even after the namespace died,
+ * hence they are removed by hand. Individual removal failures are ignored (remove_one_link() logs them); only a
+ * failure to open the netlink connection is reported to the caller. Returns 0 otherwise. */
+int remove_veth_links(const char *primary, char **pairs) {
+        _cleanup_(sd_netlink_unrefp) sd_netlink *nl = NULL;
+        char **first, **second;
+        int q;
+
+        /* Nothing to clean up at all? Then don't even open a netlink socket. */
+        if (isempty(primary) && strv_isempty(pairs))
+                return 0;
+
+        q = sd_netlink_open(&nl);
+        if (q < 0)
+                return log_error_errno(q, "Failed to connect to netlink: %m");
+
+        (void) remove_one_link(nl, primary);
+
+        /* Only the first element of each pair is passed on; the second is not used here. */
+        STRV_FOREACH_PAIR(first, second, pairs)
+                (void) remove_one_link(nl, *first);
+
+        return 0;
+}
diff --git a/src/systemd-nspawn/nspawn-network.h b/src/systemd-nspawn/nspawn-network.h
index 9ab1606d1c..3d8861e1e5 100644
--- a/src/systemd-nspawn/nspawn-network.h
+++ b/src/systemd-nspawn/nspawn-network.h
@@ -26,7 +26,8 @@
int setup_veth(const char *machine_name, pid_t pid, char iface_name[IFNAMSIZ], bool bridge);
int setup_veth_extra(const char *machine_name, pid_t pid, char **pairs);
-int setup_bridge(const char *veth_name, const char *bridge_name);
+int setup_bridge(const char *veth_name, const char *bridge_name, bool create);
+int remove_bridge(const char *bridge_name);
int setup_macvlan(const char *machine_name, pid_t pid, char **ifaces);
int setup_ipvlan(const char *machine_name, pid_t pid, char **ifaces);
@@ -34,3 +35,5 @@ int setup_ipvlan(const char *machine_name, pid_t pid, char **ifaces);
int move_network_interfaces(pid_t pid, char **ifaces);
int veth_extra_parse(char ***l, const char *p);
+
+int remove_veth_links(const char *primary, char **pairs);
diff --git a/src/systemd-nspawn/nspawn-patch-uid.c b/src/systemd-nspawn/nspawn-patch-uid.c
new file mode 100644
index 0000000000..c7382d412d
--- /dev/null
+++ b/src/systemd-nspawn/nspawn-patch-uid.c
@@ -0,0 +1,469 @@
+/***
+ This file is part of systemd.
+
+ Copyright 2016 Lennart Poettering
+
+ systemd is free software; you can redistribute it and/or modify it
+ under the terms of the GNU Lesser General Public License as published by
+ the Free Software Foundation; either version 2.1 of the License, or
+ (at your option) any later version.
+
+ systemd is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public License
+ along with systemd; If not, see <http://www.gnu.org/licenses/>.
+***/
+
+#include <fcntl.h>
+#include <linux/magic.h>
+#ifdef HAVE_ACL
+#include <sys/acl.h>
+#endif
+#include <sys/stat.h>
+#include <sys/vfs.h>
+#include <unistd.h>
+
+#include "acl-util.h"
+#include "dirent-util.h"
+#include "fd-util.h"
+#include "missing.h"
+#include "nspawn-patch-uid.h"
+#include "stat-util.h"
+#include "stdio-util.h"
+#include "string-util.h"
+#include "strv.h"
+#include "user-util.h"
+
+#ifdef HAVE_ACL
+
+/* Reads the ACL of the given type, either of the inode "fd" refers to (name == NULL) or of the entry "name"
+ * inside the directory "fd".
+ *
+ * On success stores the ACL in *ret and returns 0; on failure returns -errno and leaves *ret alone. */
+static int get_acl(int fd, const char *name, acl_type_t type, acl_t *ret) {
+        char proc_path[strlen("/proc/self/fd/") + DECIMAL_STR_MAX(int) + 1];
+        acl_t result;
+
+        assert(fd >= 0);
+        assert(ret);
+
+        if (!name && type == ACL_TYPE_ACCESS)
+                /* The access ACL of the fd itself can be queried directly */
+                result = acl_get_fd(fd);
+        else if (!name) {
+                /* Any other ACL type of the fd itself: go through its /proc/self/fd/ path */
+                xsprintf(proc_path, "/proc/self/fd/%i", fd);
+                result = acl_get_file(proc_path, type);
+        } else {
+                _cleanup_close_ int entry_fd = -1;
+
+                /* A directory entry: pin it with O_PATH (never following symlinks), then use its
+                 * /proc/self/fd/ path */
+                entry_fd = openat(fd, name, O_PATH|O_CLOEXEC|O_NOFOLLOW);
+                if (entry_fd < 0)
+                        return -errno;
+
+                xsprintf(proc_path, "/proc/self/fd/%i", entry_fd);
+                result = acl_get_file(proc_path, type);
+        }
+        if (!result)
+                return -errno;
+
+        *ret = result;
+        return 0;
+}
+
+/* Writes "acl" as the ACL of the given type, either onto the inode "fd" refers to (name == NULL) or onto the
+ * entry "name" inside the directory "fd". Returns 0 on success and -errno on failure. */
+static int set_acl(int fd, const char *name, acl_type_t type, acl_t acl) {
+        char proc_path[strlen("/proc/self/fd/") + DECIMAL_STR_MAX(int) + 1];
+        int q;
+
+        assert(fd >= 0);
+        assert(acl);
+
+        if (!name && type == ACL_TYPE_ACCESS)
+                /* The access ACL of the fd itself can be set directly */
+                q = acl_set_fd(fd, acl);
+        else if (!name) {
+                /* Any other ACL type of the fd itself: go through its /proc/self/fd/ path */
+                xsprintf(proc_path, "/proc/self/fd/%i", fd);
+                q = acl_set_file(proc_path, type, acl);
+        } else {
+                _cleanup_close_ int entry_fd = -1;
+
+                /* A directory entry: pin it with O_PATH (never following symlinks), then use its
+                 * /proc/self/fd/ path */
+                entry_fd = openat(fd, name, O_PATH|O_CLOEXEC|O_NOFOLLOW);
+                if (entry_fd < 0)
+                        return -errno;
+
+                xsprintf(proc_path, "/proc/self/fd/%i", entry_fd);
+                q = acl_set_file(proc_path, type, acl);
+        }
+        if (q < 0)
+                return -errno;
+
+        return 0;
+}
+
+/* Computes a version of "acl" in which the qualifier (UID or GID) of every ACL_USER and ACL_GROUP entry is
+ * rewritten to "shift | (qualifier & 0xFFFF)".
+ *
+ * Returns > 0 and stores a newly allocated ACL in *ret if at least one entry had to change. Returns 0 and stores
+ * NULL in *ret if no entry needs changing. Returns a negative errno-style value on failure, in which case *ret is
+ * not written. */
+static int shift_acl(acl_t acl, uid_t shift, acl_t *ret) {
+        _cleanup_(acl_freep) acl_t copy = NULL;
+        acl_entry_t i;
+        int r;
+
+        assert(acl);
+        assert(ret);
+
+        r = acl_get_entry(acl, ACL_FIRST_ENTRY, &i);
+        if (r < 0)
+                return -errno;
+        while (r > 0) {
+                uid_t new_uid;
+                bool modify = false;
+                acl_tag_t tag;
+
+                if (acl_get_tag_type(i, &tag) < 0)
+                        return -errno;
+
+                if (IN_SET(tag, ACL_USER, ACL_GROUP)) {
+                        uid_t *qualifier, old_uid;
+
+                        /* We don't distinguish here between uid_t and gid_t, let's make sure the compiler checks that
+                         * this is actually OK */
+                        assert_cc(sizeof(uid_t) == sizeof(gid_t));
+
+                        qualifier = acl_get_qualifier(i);
+                        if (!qualifier)
+                                return -errno;
+
+                        /* acl_get_qualifier() returns an allocated copy which the caller must release with
+                         * acl_free(). Grab the value and release it immediately, so that none of the exits and
+                         * restarts below can leak it. */
+                        old_uid = *qualifier;
+                        acl_free(qualifier);
+
+                        new_uid = shift | (old_uid & UINT32_C(0xFFFF));
+                        if (!uid_is_valid(new_uid))
+                                return -EINVAL;
+
+                        modify = new_uid != old_uid;
+                        if (modify && !copy) {
+                                int n;
+
+                                /* There's no copy of the ACL yet? if so, let's create one, and start the loop from the
+                                 * beginning, so that we copy all entries, starting from the first, this time. */
+
+                                n = acl_entries(acl);
+                                if (n < 0)
+                                        return -errno;
+
+                                copy = acl_init(n);
+                                if (!copy)
+                                        return -errno;
+
+                                /* Seek back to the beginning */
+                                r = acl_get_entry(acl, ACL_FIRST_ENTRY, &i);
+                                if (r < 0)
+                                        return -errno;
+                                continue;
+                        }
+                }
+
+                /* Once a copy exists, every entry (changed or not) is duplicated into it */
+                if (copy) {
+                        acl_entry_t new_entry;
+
+                        if (acl_create_entry(&copy, &new_entry) < 0)
+                                return -errno;
+
+                        if (acl_copy_entry(new_entry, i) < 0)
+                                return -errno;
+
+                        if (modify)
+                                if (acl_set_qualifier(new_entry, &new_uid) < 0)
+                                        return -errno;
+                }
+
+                r = acl_get_entry(acl, ACL_NEXT_ENTRY, &i);
+                if (r < 0)
+                        return -errno;
+        }
+
+        /* Hand the copy (possibly NULL) to the caller and disarm the cleanup handler */
+        *ret = copy;
+        copy = NULL;
+
+        return !!*ret;
+}
+
+/* Shifts the UIDs/GIDs referenced in the ACLs of an inode: the access ACL always, and the default ACL too if "st"
+ * says the inode is a directory. The inode is "fd" itself when name is NULL, otherwise the entry "name" in the
+ * directory "fd".
+ *
+ * Returns > 0 if any ACL was rewritten, 0 if nothing changed (symlinks are skipped, as is the case where reading
+ * the access ACL yields -EOPNOTSUPP), and a negative errno-style value on failure. */
+static int patch_acls(int fd, const char *name, const struct stat *st, uid_t shift) {
+        _cleanup_(acl_freep) acl_t current = NULL, adjusted = NULL;
+        bool modified = false;
+        int q;
+
+        assert(fd >= 0);
+        assert(st);
+
+        /* ACLs are not supported on symlinks, there's no point in trying */
+        if (S_ISLNK(st->st_mode))
+                return 0;
+
+        /* First round: the access ACL */
+        q = get_acl(fd, name, ACL_TYPE_ACCESS, &current);
+        if (q == -EOPNOTSUPP)
+                return 0;
+        if (q < 0)
+                return q;
+
+        q = shift_acl(current, shift, &adjusted);
+        if (q < 0)
+                return q;
+        if (q > 0) {
+                q = set_acl(fd, name, ACL_TYPE_ACCESS, adjusted);
+                if (q < 0)
+                        return q;
+
+                modified = true;
+        }
+
+        /* Only directories get a second round */
+        if (!S_ISDIR(st->st_mode))
+                return modified;
+
+        /* Release the access ACLs by hand, so that the two variables can be reused for the default ACL */
+        acl_free(current);
+        acl_free(adjusted);
+
+        current = adjusted = NULL;
+
+        q = get_acl(fd, name, ACL_TYPE_DEFAULT, &current);
+        if (q < 0)
+                return q;
+
+        q = shift_acl(current, shift, &adjusted);
+        if (q < 0)
+                return q;
+        if (q > 0) {
+                q = set_acl(fd, name, ACL_TYPE_DEFAULT, adjusted);
+                if (q < 0)
+                        return q;
+
+                modified = true;
+        }
+
+        return modified;
+}
+
+#else
+
+/* Variant used when HAVE_ACL is not defined: there are no ACLs to patch, so this always reports "nothing
+ * changed" (0). */
+static int patch_acls(int fd, const char *name, const struct stat *st, uid_t shift) {
+ return 0;
+}
+
+#endif
+
+/* Shifts the owner, group and ACLs of one inode to "shift | (id & 0xFFFF)". The inode is "fd" itself when name is
+ * NULL, otherwise the entry "name" in the directory "fd" (symlinks are never followed). "st" must be the stat data
+ * of that inode.
+ *
+ * Returns > 0 if ownership or ACLs were changed, 0 if everything was already right, and a negative errno-style
+ * value on failure (-EINVAL if the shifted UID or GID would not be valid). */
+static int patch_fd(int fd, const char *name, const struct stat *st, uid_t shift) {
+        uid_t new_uid;
+        gid_t new_gid;
+        bool changed = false;
+        int r;
+
+        assert(fd >= 0);
+        assert(st);
+
+        new_uid = shift | (st->st_uid & UINT32_C(0xFFFF));
+        new_gid = (gid_t) shift | (st->st_gid & UINT32_C(0xFFFF));
+
+        if (!uid_is_valid(new_uid) || !gid_is_valid(new_gid))
+                return -EINVAL;
+
+        if (st->st_uid != new_uid || st->st_gid != new_gid) {
+                if (name)
+                        r = fchownat(fd, name, new_uid, new_gid, AT_SYMLINK_NOFOLLOW);
+                else
+                        r = fchown(fd, new_uid, new_gid);
+                if (r < 0)
+                        return -errno;
+
+                /* The Linux kernel alters the mode in some cases of chown(). Let's undo this. */
+                if (name) {
+                        if (!S_ISLNK(st->st_mode))
+                                r = fchmodat(fd, name, st->st_mode, 0);
+                        else
+                                /* Nothing to restore for a symlink entry. In particular we must not fall back
+                                 * to fchmod(fd) here: fd is the parent directory, and that call would stamp
+                                 * the symlink's mode onto the directory. */
+                                r = 0;
+                } else
+                        r = fchmod(fd, st->st_mode);
+                if (r < 0)
+                        return -errno;
+
+                changed = true;
+        }
+
+        r = patch_acls(fd, name, st, shift);
+        if (r < 0)
+                return r;
+
+        return r > 0 || changed;
+}
+
+/* Checks, via fstatfs(), whether "fd" lives on one of a fixed list of file system types (procfs, sysfs, cgroup,
+ * debugfs, devpts, ...). Returns 1 if so, 0 if not, and -errno if fstatfs() fails. */
+static int is_procfs_sysfs_or_suchlike(int fd) {
+        struct statfs fs_info;
+        bool on_listed_fs;
+
+        assert(fd >= 0);
+
+        if (fstatfs(fd, &fs_info) < 0)
+                return -errno;
+
+        on_listed_fs =
+                F_TYPE_EQUAL(fs_info.f_type, BINFMTFS_MAGIC) ||
+                F_TYPE_EQUAL(fs_info.f_type, CGROUP_SUPER_MAGIC) ||
+                F_TYPE_EQUAL(fs_info.f_type, CGROUP2_SUPER_MAGIC) ||
+                F_TYPE_EQUAL(fs_info.f_type, DEBUGFS_MAGIC) ||
+                F_TYPE_EQUAL(fs_info.f_type, DEVPTS_SUPER_MAGIC) ||
+                F_TYPE_EQUAL(fs_info.f_type, EFIVARFS_MAGIC) ||
+                F_TYPE_EQUAL(fs_info.f_type, HUGETLBFS_MAGIC) ||
+                F_TYPE_EQUAL(fs_info.f_type, MQUEUE_MAGIC) ||
+                F_TYPE_EQUAL(fs_info.f_type, PROC_SUPER_MAGIC) ||
+                F_TYPE_EQUAL(fs_info.f_type, PSTOREFS_MAGIC) ||
+                F_TYPE_EQUAL(fs_info.f_type, SELINUX_MAGIC) ||
+                F_TYPE_EQUAL(fs_info.f_type, SMACK_MAGIC) ||
+                F_TYPE_EQUAL(fs_info.f_type, SYSFS_MAGIC);
+
+        return on_listed_fs;
+}
+
+/* Patches ownership and ACLs of the inode behind "fd" (via patch_fd()) and, if "st" describes a directory, of
+ * everything below it.
+ *
+ * If donate_fd is true this function owns "fd": it is either handed to the DIR stream or closed at "finish". If
+ * donate_fd is false, "fd" is left open and a duplicate is used for directory iteration.
+ *
+ * Returns > 0 if anything was changed, 0 if nothing was changed (including when the walk stops at a
+ * procfs/sysfs-like file system, or skips a read-only subtree below the top level), and a negative errno-style
+ * value on failure. -EROFS on the top-level directory is returned to the caller rather than skipped. */
+static int recurse_fd(int fd, bool donate_fd, const struct stat *st, uid_t shift, bool is_toplevel) {
+ bool changed = false;
+ int r;
+
+ assert(fd >= 0);
+
+ /* We generally want to permit crossing of mount boundaries when patching the UIDs/GIDs. However, we
+ * probably shouldn't do this for /proc and /sys if that is already mounted into place. Hence, let's
+ * stop the recursion when we hit a procfs or sysfs file system. */
+ r = is_procfs_sysfs_or_suchlike(fd);
+ if (r < 0)
+ goto finish;
+ if (r > 0) {
+ r = 0; /* don't recurse */
+ goto finish;
+ }
+
+ /* Patch the inode itself first (name == NULL means "fd itself") */
+ r = patch_fd(fd, NULL, st, shift);
+ if (r == -EROFS) {
+ _cleanup_free_ char *name = NULL;
+
+ if (!is_toplevel) {
+ /* When we hit a read-only subtree we simply skip it, but log about it. */
+ (void) fd_get_path(fd, &name);
+ log_debug("Skippping read-only file or directory %s.", strna(name));
+ r = 0;
+ }
+
+ /* For the top level, r is still -EROFS here and is reported to the caller */
+ goto finish;
+ }
+ if (r < 0)
+ goto finish;
+
+ if (S_ISDIR(st->st_mode)) {
+ _cleanup_closedir_ DIR *d = NULL;
+ struct dirent *de;
+
+ if (!donate_fd) {
+ int copy;
+
+ /* The caller keeps its fd, so iterate over a duplicate that we own instead */
+ copy = fcntl(fd, F_DUPFD_CLOEXEC, 3);
+ if (copy < 0) {
+ r = -errno;
+ goto finish;
+ }
+
+ fd = copy;
+ donate_fd = true;
+ }
+
+ d = fdopendir(fd);
+ if (!d) {
+ r = -errno;
+ goto finish;
+ }
+ /* The fd now belongs to the DIR stream; make sure "finish" does not close it a second time */
+ fd = -1;
+
+ FOREACH_DIRENT_ALL(de, d, r = -errno; goto finish) {
+ struct stat fst;
+
+ if (STR_IN_SET(de->d_name, ".", ".."))
+ continue;
+
+ if (fstatat(dirfd(d), de->d_name, &fst, AT_SYMLINK_NOFOLLOW) < 0) {
+ r = -errno;
+ goto finish;
+ }
+
+ if (S_ISDIR(fst.st_mode)) {
+ int subdir_fd;
+
+ subdir_fd = openat(dirfd(d), de->d_name, O_RDONLY|O_NONBLOCK|O_DIRECTORY|O_CLOEXEC|O_NOFOLLOW|O_NOATIME);
+ if (subdir_fd < 0) {
+ r = -errno;
+ goto finish;
+
+ }
+
+ /* subdir_fd is donated: the recursive call takes care of closing it */
+ r = recurse_fd(subdir_fd, true, &fst, shift, false);
+ if (r < 0)
+ goto finish;
+ if (r > 0)
+ changed = true;
+
+ } else {
+ /* Non-directories are patched by name, relative to the directory fd */
+ r = patch_fd(dirfd(d), de->d_name, &fst, shift);
+ if (r < 0)
+ goto finish;
+ if (r > 0)
+ changed = true;
+ }
+ }
+ }
+
+ r = changed;
+
+finish:
+ if (donate_fd)
+ safe_close(fd);
+
+ return r;
+}
+
+/* Validates the requested UID shift/range and, if the tree rooted at "fd" is not already in that range, patches it
+ * recursively via recurse_fd().
+ *
+ * If donate_fd is true, ownership of "fd" passes to this function and it is closed on every path (here, or inside
+ * recurse_fd()). If false, "fd" is left open for the caller.
+ *
+ * Returns > 0 if something was changed, 0 if nothing had to be done, -EOPNOTSUPP for an unsupported shift or
+ * range, -EBADE if the top-level UID and GID carry different upper 16 bits, or another negative errno-style
+ * value on failure. */
+static int fd_patch_uid_internal(int fd, bool donate_fd, uid_t shift, uid_t range) {
+        struct stat st;
+        int r;
+
+        assert(fd >= 0);
+
+        /* Recursively adjusts the UID/GIDs of all files of a directory tree. This is used to automatically fix up an
+         * OS tree to the used user namespace UID range. Note that this automatic adjustment only works for UID ranges
+         * following the concept that the upper 16bit of a UID identify the container, and the lower 16bit are the actual
+         * UID within the container. */
+
+        if ((shift & 0xFFFF) != 0) {
+                /* We only support containers where the shift starts at a 2^16 boundary */
+                r = -EOPNOTSUPP;
+                goto finish;
+        }
+
+        if (range != 0x10000) {
+                /* We only support containers with 16bit UID ranges for the patching logic */
+                r = -EOPNOTSUPP;
+                goto finish;
+        }
+
+        if (fstat(fd, &st) < 0) {
+                r = -errno;
+                goto finish;
+        }
+
+        if ((uint32_t) st.st_uid >> 16 != (uint32_t) st.st_gid >> 16) {
+                /* We only support containers where the uid/gid container ID match */
+                r = -EBADE;
+                goto finish;
+        }
+
+        /* Try to detect if the range is already right. Of course, this a pretty drastic optimization, as we assume
+         * that if the top-level dir has the right upper 16bit assigned, then everything below will have too... */
+        if (((uint32_t) (st.st_uid ^ shift) >> 16) == 0) {
+                /* Leave through "finish" rather than returning directly, so that a donated fd gets closed */
+                r = 0;
+                goto finish;
+        }
+
+        /* recurse_fd() takes over responsibility for a donated fd from here on */
+        return recurse_fd(fd, donate_fd, &st, shift, true);
+
+finish:
+        if (donate_fd)
+                safe_close(fd);
+
+        return r;
+}
+
+/* Patches the UIDs/GIDs of the tree rooted at the already opened "fd". The fd is not donated (donate_fd is
+ * false), so it is still open when this returns. Return values are those of fd_patch_uid_internal(). */
+int fd_patch_uid(int fd, uid_t shift, uid_t range) {
+ return fd_patch_uid_internal(fd, false, shift, range);
+}
+
+/* Opens the directory "path" (without following a symlink in the final component) and patches the UIDs/GIDs of
+ * the tree below it. Returns -errno if the directory cannot be opened, otherwise the result of
+ * fd_patch_uid_internal(). */
+int path_patch_uid(const char *path, uid_t shift, uid_t range) {
+        int root_fd = open(path, O_RDONLY|O_NONBLOCK|O_DIRECTORY|O_CLOEXEC|O_NOFOLLOW|O_NOATIME);
+
+        if (root_fd < 0)
+                return -errno;
+
+        /* The fd is donated: the callee is responsible for closing it */
+        return fd_patch_uid_internal(root_fd, true, shift, range);
+}
diff --git a/src/systemd-nspawn/nspawn-patch-uid.h b/src/systemd-nspawn/nspawn-patch-uid.h
new file mode 100644
index 0000000000..55d0990016
--- /dev/null
+++ b/src/systemd-nspawn/nspawn-patch-uid.h
@@ -0,0 +1,23 @@
+/***
+ This file is part of systemd.
+
+ Copyright 2016 Lennart Poettering
+
+ systemd is free software; you can redistribute it and/or modify it
+ under the terms of the GNU Lesser General Public License as published by
+ the Free Software Foundation; either version 2.1 of the License, or
+ (at your option) any later version.
+
+ systemd is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public License
+ along with systemd; If not, see <http://www.gnu.org/licenses/>.
+***/
+
+#pragma once
+
+#include <sys/types.h>
+
+/* Recursively shifts the UIDs/GIDs (and ACL entries) of the directory tree rooted at the open file descriptor
+ * "fd" into the range starting at "shift". The descriptor stays open. Only shifts that are a multiple of 0x10000
+ * with a range of exactly 0x10000 are supported (-EOPNOTSUPP otherwise). Returns > 0 if something was changed, 0
+ * if not, negative errno-style value on failure. */
+int fd_patch_uid(int fd, uid_t shift, uid_t range);
+
+/* Same as fd_patch_uid(), but operates on the directory named by "path". */
+int path_patch_uid(const char *path, uid_t shift, uid_t range);
diff --git a/src/systemd-nspawn/nspawn-register.c b/src/systemd-nspawn/nspawn-register.c
index de1433a5e2..08cbff9731 100644
--- a/src/systemd-nspawn/nspawn-register.c
+++ b/src/systemd-nspawn/nspawn-register.c
@@ -20,6 +20,7 @@
#include <systemd/sd-bus.h>
#include "bus-error.h" /* for bus_error_message */
+#include "bus-unit-util.h"
#include "bus-util.h"
#include "nspawn-register.h"
#include "stat-util.h"
diff --git a/src/systemd-nspawn/nspawn-settings.c b/src/systemd-nspawn/nspawn-settings.c
index 4fb0054698..5f1522cfb6 100644
--- a/src/systemd-nspawn/nspawn-settings.c
+++ b/src/systemd-nspawn/nspawn-settings.c
@@ -24,7 +24,10 @@
#include "nspawn-settings.h"
#include "parse-util.h"
#include "process-util.h"
+#include "socket-util.h"
+#include "string-util.h"
#include "strv.h"
+#include "user-util.h"
#include "util.h"
int settings_load(FILE *f, const char *path, Settings **ret) {
@@ -40,9 +43,13 @@ int settings_load(FILE *f, const char *path, Settings **ret) {
s->start_mode = _START_MODE_INVALID;
s->personality = PERSONALITY_INVALID;
+ s->userns_mode = _USER_NAMESPACE_MODE_INVALID;
+ s->uid_shift = UID_INVALID;
+ s->uid_range = UID_INVALID;
s->read_only = -1;
s->volatile_mode = _VOLATILE_MODE_INVALID;
+ s->userns_chown = -1;
s->private_network = -1;
s->network_veth = -1;
@@ -59,6 +66,16 @@ int settings_load(FILE *f, const char *path, Settings **ret) {
if (r < 0)
return r;
+ /* Make sure that if userns_mode is set, userns_chown is set to something appropriate, and vice versa. Either
+ * both fields shall be initialized or neither. */
+ if (s->userns_mode == USER_NAMESPACE_PICK)
+ s->userns_chown = true;
+ else if (s->userns_mode != _USER_NAMESPACE_MODE_INVALID && s->userns_chown < 0)
+ s->userns_chown = false;
+
+ if (s->userns_chown >= 0 && s->userns_mode == _USER_NAMESPACE_MODE_INVALID)
+ s->userns_mode = USER_NAMESPACE_NO;
+
*ret = s;
s = NULL;
@@ -80,6 +97,7 @@ Settings* settings_free(Settings *s) {
strv_free(s->network_ipvlan);
strv_free(s->network_veth_extra);
free(s->network_bridge);
+ free(s->network_zone);
expose_port_free_all(s->expose_ports);
custom_mount_free_all(s->custom_mounts, s->n_custom_mounts);
@@ -95,6 +113,7 @@ bool settings_private_network(Settings *s) {
s->private_network > 0 ||
s->network_veth > 0 ||
s->network_bridge ||
+ s->network_zone ||
s->network_interfaces ||
s->network_macvlan ||
s->network_ipvlan ||
@@ -106,7 +125,8 @@ bool settings_network_veth(Settings *s) {
return
s->network_veth > 0 ||
- s->network_bridge;
+ s->network_bridge ||
+ s->network_zone;
}
DEFINE_CONFIG_PARSE_ENUM(config_parse_volatile_mode, volatile_mode, VolatileMode, "Failed to parse volatile mode");
@@ -303,6 +323,38 @@ int config_parse_veth_extra(
return 0;
}
+/* Config parser callback for Network.Zone=. Prefixes the value with "vz-" and, if the result is a valid interface
+ * name, stores it in Settings.network_zone (replacing any earlier value).
+ *
+ * "data" must point to the Settings object. An invalid name is logged and ignored (returns 0); only running out
+ * of memory is reported as an error. */
+int config_parse_network_zone(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        Settings *settings = data;
+        _cleanup_free_ char *j = NULL;
+
+        assert(filename);
+        assert(lvalue);
+        assert(rvalue);
+
+        j = strappend("vz-", rvalue);
+        if (!j)
+                /* Allocation failure must not be mistaken for (and logged as) an invalid zone name */
+                return log_oom();
+
+        if (!ifname_valid(j)) {
+                /* There is no errno to report here, hence no %m in the message */
+                log_syntax(unit, LOG_ERR, filename, line, 0, "Invalid network zone name %s, ignoring.", rvalue);
+                return 0;
+        }
+
+        free(settings->network_zone);
+        settings->network_zone = j;
+        j = NULL; /* ownership moved into the Settings object */
+
+        return 0;
+}
+
int config_parse_boot(
const char *unit,
const char *filename,
@@ -392,3 +444,73 @@ conflict:
log_syntax(unit, LOG_ERR, filename, line, r, "Conflicting Boot= or ProcessTwo= setting found. Ignoring.");
return 0;
}
+
+/* Config parser callback for the private users (user namespace) setting.
+ *
+ * Accepted values of rvalue:
+ *   - a boolean false:  user namespacing off
+ *   - a boolean true:   user namespacing on, UID range is read from the root directory
+ *   - "pick":           user namespacing on, UID range is picked randomly
+ *   - SHIFT[:RANGE]:    user namespacing on with an explicitly configured UID shift and, optionally, range
+ *                       (the range defaults to 0x10000)
+ *
+ * On success the result is written to settings->userns_mode, settings->uid_shift and settings->uid_range.
+ * Unparsable values are logged and ignored, leaving the settings untouched. Always returns 0. */
+int config_parse_private_users(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        Settings *settings = data;
+        int r;
+
+        assert(filename);
+        assert(lvalue);
+        assert(rvalue);
+
+        r = parse_boolean(rvalue);
+        if (r == 0) {
+                /* no: User namespacing off */
+                settings->userns_mode = USER_NAMESPACE_NO;
+                settings->uid_shift = UID_INVALID;
+                settings->uid_range = UINT32_C(0x10000);
+        } else if (r > 0) {
+                /* yes: User namespacing on, UID range is read from root dir */
+                settings->userns_mode = USER_NAMESPACE_FIXED;
+                settings->uid_shift = UID_INVALID;
+                settings->uid_range = UINT32_C(0x10000);
+        } else if (streq(rvalue, "pick")) {
+                /* pick: User namespacing on, UID range is picked randomly */
+                settings->userns_mode = USER_NAMESPACE_PICK;
+                settings->uid_shift = UID_INVALID;
+                settings->uid_range = UINT32_C(0x10000);
+        } else {
+                const char *range, *shift;
+                uid_t sh, rn;
+
+                /* anything else: User namespacing on, UID range is explicitly configured */
+
+                range = strchr(rvalue, ':');
+                if (range) {
+                        /* Split "SHIFT:RANGE": copy the part before the colon, parse the part after it. */
+                        shift = strndupa(rvalue, range - rvalue);
+                        range++;
+
+                        r = safe_atou32(range, &rn);
+                        if (r < 0 || rn <= 0) {
+                                log_syntax(unit, LOG_ERR, filename, line, r, "UID/GID range invalid, ignoring: %s", range);
+                                return 0;
+                        }
+                } else {
+                        shift = rvalue;
+                        rn = UINT32_C(0x10000);
+                }
+
+                r = parse_uid(shift, &sh);
+                if (r < 0) {
+                        /* Report the shift string that failed to parse. "range" is NULL when no colon
+                         * was given, and is the wrong value to show in any case. */
+                        log_syntax(unit, LOG_ERR, filename, line, r, "UID/GID shift invalid, ignoring: %s", shift);
+                        return 0;
+                }
+
+                settings->userns_mode = USER_NAMESPACE_FIXED;
+                settings->uid_shift = sh;
+                settings->uid_range = rn;
+        }
+
+        return 0;
+}
diff --git a/src/systemd-nspawn/nspawn-settings.h b/src/systemd-nspawn/nspawn-settings.h
index a017405cd9..1c47e37912 100644
--- a/src/systemd-nspawn/nspawn-settings.h
+++ b/src/systemd-nspawn/nspawn-settings.h
@@ -33,6 +33,14 @@ typedef enum StartMode {
_START_MODE_INVALID = -1
} StartMode;
+/* How user namespacing is set up for the container. */
+typedef enum UserNamespaceMode {
+ /* User namespacing is off */
+ USER_NAMESPACE_NO,
+ /* User namespacing is on; the UID range is either read from the root directory or explicitly configured */
+ USER_NAMESPACE_FIXED,
+ /* User namespacing is on; the UID range is picked randomly */
+ USER_NAMESPACE_PICK,
+ _USER_NAMESPACE_MODE_MAX,
+ /* Marks a mode that has not been set */
+ _USER_NAMESPACE_MODE_INVALID = -1,
+} UserNamespaceMode;
+
typedef enum SettingsMask {
SETTING_START_MODE = 1 << 0,
SETTING_ENVIRONMENT = 1 << 1,
@@ -47,7 +55,8 @@ typedef enum SettingsMask {
SETTING_VOLATILE_MODE = 1 << 10,
SETTING_CUSTOM_MOUNTS = 1 << 11,
SETTING_WORKING_DIRECTORY = 1 << 12,
- _SETTINGS_MASK_ALL = (1 << 13) -1
+ SETTING_USERNS = 1 << 13,
+ _SETTINGS_MASK_ALL = (1 << 14) -1
} SettingsMask;
typedef struct Settings {
@@ -62,17 +71,21 @@ typedef struct Settings {
unsigned long personality;
sd_id128_t machine_id;
char *working_directory;
+ UserNamespaceMode userns_mode;
+ uid_t uid_shift, uid_range;
/* [Image] */
int read_only;
VolatileMode volatile_mode;
CustomMount *custom_mounts;
unsigned n_custom_mounts;
+ int userns_chown;
/* [Network] */
int private_network;
int network_veth;
char *network_bridge;
+ char *network_zone;
char **network_interfaces;
char **network_macvlan;
char **network_ipvlan;
@@ -97,5 +110,7 @@ int config_parse_volatile_mode(const char *unit, const char *filename, unsigned
int config_parse_bind(const char *unit, const char *filename, unsigned line, const char *section, unsigned section_line, const char *lvalue, int ltype, const char *rvalue, void *data, void *userdata);
int config_parse_tmpfs(const char *unit, const char *filename, unsigned line, const char *section, unsigned section_line, const char *lvalue, int ltype, const char *rvalue, void *data, void *userdata);
int config_parse_veth_extra(const char *unit, const char *filename, unsigned line, const char *section, unsigned section_line, const char *lvalue, int ltype, const char *rvalue, void *data, void *userdata);
+int config_parse_network_zone(const char *unit, const char *filename, unsigned line, const char *section, unsigned section_line, const char *lvalue, int ltype, const char *rvalue, void *data, void *userdata);
int config_parse_boot(const char *unit, const char *filename, unsigned line, const char *section, unsigned section_line, const char *lvalue, int ltype, const char *rvalue, void *data, void *userdata);
int config_parse_pid2(const char *unit, const char *filename, unsigned line, const char *section, unsigned section_line, const char *lvalue, int ltype, const char *rvalue, void *data, void *userdata);
+int config_parse_private_users(const char *unit, const char *filename, unsigned line, const char *section, unsigned section_line, const char *lvalue, int ltype, const char *rvalue, void *data, void *userdata);
diff --git a/src/systemd-nspawn/nspawn.c b/src/systemd-nspawn/nspawn.c
index 1de527b57b..6390197646 100644
--- a/src/systemd-nspawn/nspawn.c
+++ b/src/systemd-nspawn/nspawn.c
@@ -22,7 +22,9 @@
#endif
#include <errno.h>
#include <getopt.h>
+#include <grp.h>
#include <linux/loop.h>
+#include <pwd.h>
#include <sched.h>
#ifdef HAVE_SECCOMP
#include <seccomp.h>
@@ -64,6 +66,7 @@
#include "hostname-util.h"
#include "log.h"
#include "loopback-setup.h"
+#include "machine-id-setup.h"
#include "machine-image.h"
#include "macro.h"
#include "missing.h"
@@ -73,6 +76,7 @@
#include "nspawn-expose-ports.h"
#include "nspawn-mount.h"
#include "nspawn-network.h"
+#include "nspawn-patch-uid.h"
#include "nspawn-register.h"
#include "nspawn-settings.h"
#include "nspawn-setuid.h"
@@ -86,6 +90,7 @@
#ifdef HAVE_SECCOMP
#include "seccomp-util.h"
#endif
+#include "selinux-util.h"
#include "signal-util.h"
#include "socket-util.h"
#include "stat-util.h"
@@ -98,6 +103,11 @@
#include "user-util.h"
#include "util.h"
+/* Note that devpts's gid= parameter parses GIDs as signed values, hence we stay away from the upper half of the 32bit
+ * UID range here */
+#define UID_SHIFT_PICK_MIN ((uid_t) UINT32_C(0x00080000))
+#define UID_SHIFT_PICK_MAX ((uid_t) UINT32_C(0x6FFF0000))
+
typedef enum ContainerStatus {
CONTAINER_TERMINATED,
CONTAINER_REBOOTED
@@ -165,13 +175,15 @@ static char **arg_network_ipvlan = NULL;
static bool arg_network_veth = false;
static char **arg_network_veth_extra = NULL;
static char *arg_network_bridge = NULL;
+static char *arg_network_zone = NULL;
static unsigned long arg_personality = PERSONALITY_INVALID;
static char *arg_image = NULL;
static VolatileMode arg_volatile_mode = VOLATILE_NO;
static ExposePort *arg_expose_ports = NULL;
static char **arg_property = NULL;
+static UserNamespaceMode arg_userns_mode = USER_NAMESPACE_NO;
static uid_t arg_uid_shift = UID_INVALID, arg_uid_range = 0x10000U;
-static bool arg_userns = false;
+static bool arg_userns_chown = false;
static int arg_kill_signal = 0;
static bool arg_unified_cgroup_hierarchy = false;
static SettingsMask arg_settings_mask = 0;
@@ -199,8 +211,10 @@ static void help(void) {
" --uuid=UUID Set a specific machine UUID for the container\n"
" -S --slice=SLICE Place the container in the specified slice\n"
" --property=NAME=VALUE Set scope unit property\n"
+ " -U --private-users=pick Run within user namespace, pick UID/GID range automatically\n"
" --private-users[=UIDBASE[:NUIDS]]\n"
- " Run within user namespace\n"
+ " Run within user namespace, user configured UID/GID range\n"
+ " --private-user-chown Adjust OS tree file ownership for private UID/GID range\n"
" --private-network Disable network in container\n"
" --network-interface=INTERFACE\n"
" Assign an existing network interface to the\n"
@@ -220,6 +234,8 @@ static void help(void) {
" Add a virtual Ethernet connection between host\n"
" and container and add it to an existing bridge on\n"
" the host\n"
+ " --network-zone=NAME Add a virtual Ethernet connection to the container,\n"
+ " and add it to an automatically managed bridge interface\n"
" -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
" Expose a container IP port on the host\n"
" -Z --selinux-context=SECLABEL\n"
@@ -247,7 +263,7 @@ static void help(void) {
" the container\n"
" --overlay-ro=PATH[:PATH...]:PATH\n"
" Similar, but creates a read-only overlay mount\n"
- " --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
+ " -E --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
" --share-system Share system namespaces with host\n"
" --register=BOOLEAN Register container as machine\n"
" --keep-unit Do not register a scope for the machine, reuse\n"
@@ -269,9 +285,15 @@ static int custom_mounts_prepare(void) {
for (i = 0; i < arg_n_custom_mounts; i++) {
CustomMount *m = &arg_custom_mounts[i];
- if (arg_userns && arg_uid_shift == UID_INVALID && path_equal(m->destination, "/")) {
- log_error("--private-users with automatic UID shift may not be combined with custom root mounts.");
- return -EINVAL;
+ if (path_equal(m->destination, "/") && arg_userns_mode != USER_NAMESPACE_NO) {
+
+ if (arg_userns_chown) {
+ log_error("--private-users-chown may not be combined with custom root mounts.");
+ return -EINVAL;
+ } else if (arg_uid_shift == UID_INVALID) {
+ log_error("--private-users with automatic UID shift may not be combined with custom root mounts.");
+ return -EINVAL;
+ }
}
if (m->type != CUSTOM_MOUNT_OVERLAY)
@@ -330,7 +352,6 @@ static int parse_argv(int argc, char *argv[]) {
ARG_TMPFS,
ARG_OVERLAY,
ARG_OVERLAY_RO,
- ARG_SETENV,
ARG_SHARE_SYSTEM,
ARG_REGISTER,
ARG_KEEP_UNIT,
@@ -338,6 +359,7 @@ static int parse_argv(int argc, char *argv[]) {
ARG_NETWORK_MACVLAN,
ARG_NETWORK_IPVLAN,
ARG_NETWORK_BRIDGE,
+ ARG_NETWORK_ZONE,
ARG_NETWORK_VETH_EXTRA,
ARG_PERSONALITY,
ARG_VOLATILE,
@@ -347,6 +369,7 @@ static int parse_argv(int argc, char *argv[]) {
ARG_KILL_SIGNAL,
ARG_SETTINGS,
ARG_CHDIR,
+ ARG_PRIVATE_USERS_CHOWN,
};
static const struct option options[] = {
@@ -371,7 +394,7 @@ static int parse_argv(int argc, char *argv[]) {
{ "overlay-ro", required_argument, NULL, ARG_OVERLAY_RO },
{ "machine", required_argument, NULL, 'M' },
{ "slice", required_argument, NULL, 'S' },
- { "setenv", required_argument, NULL, ARG_SETENV },
+ { "setenv", required_argument, NULL, 'E' },
{ "selinux-context", required_argument, NULL, 'Z' },
{ "selinux-apifs-context", required_argument, NULL, 'L' },
{ "quiet", no_argument, NULL, 'q' },
@@ -384,12 +407,14 @@ static int parse_argv(int argc, char *argv[]) {
{ "network-veth", no_argument, NULL, 'n' },
{ "network-veth-extra", required_argument, NULL, ARG_NETWORK_VETH_EXTRA},
{ "network-bridge", required_argument, NULL, ARG_NETWORK_BRIDGE },
+ { "network-zone", required_argument, NULL, ARG_NETWORK_ZONE },
{ "personality", required_argument, NULL, ARG_PERSONALITY },
{ "image", required_argument, NULL, 'i' },
{ "volatile", optional_argument, NULL, ARG_VOLATILE },
{ "port", required_argument, NULL, 'p' },
{ "property", required_argument, NULL, ARG_PROPERTY },
{ "private-users", optional_argument, NULL, ARG_PRIVATE_USERS },
+ { "private-users-chown", optional_argument, NULL, ARG_PRIVATE_USERS_CHOWN},
{ "kill-signal", required_argument, NULL, ARG_KILL_SIGNAL },
{ "settings", required_argument, NULL, ARG_SETTINGS },
{ "chdir", required_argument, NULL, ARG_CHDIR },
@@ -404,7 +429,7 @@ static int parse_argv(int argc, char *argv[]) {
assert(argc >= 0);
assert(argv);
- while ((c = getopt_long(argc, argv, "+hD:u:abL:M:jS:Z:qi:xp:n", options, NULL)) >= 0)
+ while ((c = getopt_long(argc, argv, "+hD:u:abL:M:jS:Z:qi:xp:nU", options, NULL)) >= 0)
switch (c) {
@@ -445,7 +470,35 @@ static int parse_argv(int argc, char *argv[]) {
arg_settings_mask |= SETTING_USER;
break;
+ case ARG_NETWORK_ZONE: {
+ char *j;
+
+ j = strappend("vz-", optarg);
+ if (!j)
+ return log_oom();
+
+ if (!ifname_valid(j)) {
+ log_error("Network zone name not valid: %s", j);
+ free(j);
+ return -EINVAL;
+ }
+
+ free(arg_network_zone);
+ arg_network_zone = j;
+
+ arg_network_veth = true;
+ arg_private_network = true;
+ arg_settings_mask |= SETTING_NETWORK;
+ break;
+ }
+
case ARG_NETWORK_BRIDGE:
+
+ if (!ifname_valid(optarg)) {
+ log_error("Bridge interface name not valid: %s", optarg);
+ return -EINVAL;
+ }
+
r = free_and_strdup(&arg_network_bridge, optarg);
if (r < 0)
return log_oom();
@@ -468,6 +521,12 @@ static int parse_argv(int argc, char *argv[]) {
break;
case ARG_NETWORK_INTERFACE:
+
+ if (!ifname_valid(optarg)) {
+ log_error("Network interface name not valid: %s", optarg);
+ return -EINVAL;
+ }
+
if (strv_extend(&arg_network_interfaces, optarg) < 0)
return log_oom();
@@ -476,6 +535,12 @@ static int parse_argv(int argc, char *argv[]) {
break;
case ARG_NETWORK_MACVLAN:
+
+ if (!ifname_valid(optarg)) {
+ log_error("MACVLAN network interface name not valid: %s", optarg);
+ return -EINVAL;
+ }
+
if (strv_extend(&arg_network_macvlan, optarg) < 0)
return log_oom();
@@ -484,6 +549,12 @@ static int parse_argv(int argc, char *argv[]) {
break;
case ARG_NETWORK_IPVLAN:
+
+ if (!ifname_valid(optarg)) {
+ log_error("IPVLAN network interface name not valid: %s", optarg);
+ return -EINVAL;
+ }
+
if (strv_extend(&arg_network_ipvlan, optarg) < 0)
return log_oom();
@@ -560,7 +631,7 @@ static int parse_argv(int argc, char *argv[]) {
case ARG_CAPABILITY:
case ARG_DROP_CAPABILITY: {
p = optarg;
- for(;;) {
+ for (;;) {
_cleanup_free_ char *t = NULL;
r = extract_first_word(&p, &t, ",", 0);
@@ -708,7 +779,7 @@ static int parse_argv(int argc, char *argv[]) {
break;
}
- case ARG_SETENV: {
+ case 'E': {
char **n;
if (!env_assignment_is_valid(optarg)) {
@@ -795,10 +866,29 @@ static int parse_argv(int argc, char *argv[]) {
break;
case ARG_PRIVATE_USERS:
- if (optarg) {
+
+ r = optarg ? parse_boolean(optarg) : 1;
+ if (r == 0) {
+ /* no: User namespacing off */
+ arg_userns_mode = USER_NAMESPACE_NO;
+ arg_uid_shift = UID_INVALID;
+ arg_uid_range = UINT32_C(0x10000);
+ } else if (r > 0) {
+ /* yes: User namespacing on, UID range is read from root dir */
+ arg_userns_mode = USER_NAMESPACE_FIXED;
+ arg_uid_shift = UID_INVALID;
+ arg_uid_range = UINT32_C(0x10000);
+ } else if (streq(optarg, "pick")) {
+ /* pick: User namespacing on, UID range is picked randomly */
+ arg_userns_mode = USER_NAMESPACE_PICK;
+ arg_uid_shift = UID_INVALID;
+ arg_uid_range = UINT32_C(0x10000);
+ } else {
_cleanup_free_ char *buffer = NULL;
const char *range, *shift;
+ /* anything else: User namespacing on, UID range is explicitly configured */
+
range = strchr(optarg, ':');
if (range) {
buffer = strndup(optarg, range - optarg);
@@ -818,9 +908,28 @@ static int parse_argv(int argc, char *argv[]) {
log_error("Failed to parse UID: %s", optarg);
return -EINVAL;
}
+
+ arg_userns_mode = USER_NAMESPACE_FIXED;
}
- arg_userns = true;
+ arg_settings_mask |= SETTING_USERNS;
+ break;
+
+ case 'U':
+ if (userns_supported()) {
+ arg_userns_mode = USER_NAMESPACE_PICK;
+ arg_uid_shift = UID_INVALID;
+ arg_uid_range = UINT32_C(0x10000);
+
+ arg_settings_mask |= SETTING_USERNS;
+ }
+
+ break;
+
+ case ARG_PRIVATE_USERS_CHOWN:
+ arg_userns_chown = true;
+
+ arg_settings_mask |= SETTING_USERNS;
break;
case ARG_KILL_SIGNAL:
@@ -891,6 +1000,9 @@ static int parse_argv(int argc, char *argv[]) {
if (arg_share_system)
arg_register = false;
+ if (arg_userns_mode == USER_NAMESPACE_PICK)
+ arg_userns_chown = true;
+
if (arg_start_mode != START_PID1 && arg_share_system) {
log_error("--boot and --share-system may not be combined.");
return -EINVAL;
@@ -931,8 +1043,20 @@ static int parse_argv(int argc, char *argv[]) {
return -EINVAL;
}
- if (arg_userns && access("/proc/self/uid_map", F_OK) < 0)
- return log_error_errno(EOPNOTSUPP, "--private-users= is not supported, kernel compiled without user namespace support.");
+ if (arg_userns_mode != USER_NAMESPACE_NO && !userns_supported()) {
+ log_error("--private-users= is not supported, kernel compiled without user namespace support.");
+ return -EOPNOTSUPP;
+ }
+
+ if (arg_userns_chown && arg_read_only) {
+ log_error("--read-only and --private-users-chown may not be combined.");
+ return -EINVAL;
+ }
+
+ if (arg_network_bridge && arg_network_zone) {
+ log_error("--network-bridge= and --network-zone= may not be combined.");
+ return -EINVAL;
+ }
if (argc > optind) {
arg_parameters = strv_copy(argv + optind);
@@ -975,6 +1099,13 @@ static int verify_arguments(void) {
return -EINVAL;
}
+#ifndef HAVE_LIBIPTC
+ if (arg_expose_ports) {
+ log_error("--port= is not supported, compiled without libiptc support.");
+ return -EOPNOTSUPP;
+ }
+#endif
+
if (arg_start_mode == START_BOOT && arg_kill_signal <= 0)
arg_kill_signal = SIGRTMIN+3;
@@ -984,7 +1115,7 @@ static int verify_arguments(void) {
static int userns_lchown(const char *p, uid_t uid, gid_t gid) {
assert(p);
- if (!arg_userns)
+ if (arg_userns_mode == USER_NAMESPACE_NO)
return 0;
if (uid == UID_INVALID && gid == GID_INVALID)
@@ -1366,11 +1497,11 @@ static int setup_hostname(void) {
}
static int setup_journal(const char *directory) {
- sd_id128_t machine_id, this_id;
- _cleanup_free_ char *b = NULL, *d = NULL;
- const char *etc_machine_id, *p, *q;
+ sd_id128_t this_id;
+ _cleanup_free_ char *d = NULL;
+ const char *p, *q;
bool try;
- char *id;
+ char id[33];
int r;
/* Don't link journals in ephemeral mode */
@@ -1382,30 +1513,13 @@ static int setup_journal(const char *directory) {
try = arg_link_journal_try || arg_link_journal == LINK_AUTO;
- etc_machine_id = prefix_roota(directory, "/etc/machine-id");
-
- r = read_one_line_file(etc_machine_id, &b);
- if (r == -ENOENT && try)
- return 0;
- else if (r < 0)
- return log_error_errno(r, "Failed to read machine ID from %s: %m", etc_machine_id);
-
- id = strstrip(b);
- if (isempty(id) && try)
- return 0;
-
- /* Verify validity */
- r = sd_id128_from_string(id, &machine_id);
- if (r < 0)
- return log_error_errno(r, "Failed to parse machine ID from %s: %m", etc_machine_id);
-
r = sd_id128_get_machine(&this_id);
if (r < 0)
return log_error_errno(r, "Failed to retrieve machine ID: %m");
- if (sd_id128_equal(machine_id, this_id)) {
+ if (sd_id128_equal(arg_uuid, this_id)) {
log_full(try ? LOG_WARNING : LOG_ERR,
- "Host and machine ids are equal (%s): refusing to link journals", id);
+ "Host and machine ids are equal (%s): refusing to link journals", sd_id128_to_string(arg_uuid, id));
if (try)
return 0;
return -EEXIST;
@@ -1423,6 +1537,8 @@ static int setup_journal(const char *directory) {
if (r < 0)
return log_error_errno(r, "Failed to create /var/log/journal: %m");
+ (void) sd_id128_to_string(arg_uuid, id);
+
p = strjoina("/var/log/journal/", id);
q = prefix_roota(directory, p);
@@ -1487,7 +1603,7 @@ static int setup_journal(const char *directory) {
}
if (arg_link_journal == LINK_HOST) {
- /* don't create parents here -- if the host doesn't have
+ /* don't create parents here — if the host doesn't have
* permanent journal set up, don't force it here */
if (mkdir(p, 0755) < 0 && errno != EEXIST) {
@@ -1596,7 +1712,6 @@ static int setup_seccomp(void) {
}
}
-
/*
Audit is broken in containers, much of the userspace audit
hookup will fail if running inside a container. We don't
@@ -2192,6 +2307,61 @@ static int mount_device(const char *what, const char *where, const char *directo
#endif
}
+/* Establishes the machine ID of the container in arg_uuid and applies it to the tree.
+ *
+ * If /etc/machine-id below the given directory holds a non-empty line, that ID is parsed into arg_uuid.
+ * If the line is empty, arg_uuid is kept when already set and randomized otherwise. The resulting ID is
+ * then passed to machine_id_setup().
+ *
+ * Returns 0 on success, or a negative errno-style value (already logged) on failure. */
+static int setup_machine_id(const char *directory) {
+        _cleanup_free_ char *line = NULL;
+        const char *path, *id;
+        int r;
+
+        path = prefix_roota(directory, "/etc/machine-id");
+
+        r = read_one_line_file(path, &line);
+        if (r < 0)
+                return log_error_errno(r, "Failed to read machine ID from %s: %m", path);
+
+        id = strstrip(line);
+
+        if (isempty(id)) {
+                /* Nothing stored in the image: generate a fresh ID unless one was set already. */
+                if (sd_id128_is_null(arg_uuid)) {
+                        r = sd_id128_randomize(&arg_uuid);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to generate random machine ID: %m");
+                }
+        } else {
+                /* The ID stored in the image overrides whatever arg_uuid held before. */
+                r = sd_id128_from_string(id, &arg_uuid);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to parse machine ID from %s: %m", path);
+        }
+
+        r = machine_id_setup(directory, arg_uuid);
+        if (r < 0)
+                return log_error_errno(r, "Failed to setup machine ID: %m");
+
+        return 0;
+}
+
+/* Adjusts file ownership of the OS tree below the given directory to the given UID/GID shift and range.
+ *
+ * Does nothing and returns 0 unless user namespacing is enabled and arg_userns_chown is set.
+ *
+ * Returns a negative errno-style value (already logged) on failure. Otherwise returns the result of
+ * path_patch_uid(): 0 if the tree was already owned by the right range, positive if it was patched. */
+static int recursive_chown(const char *directory, uid_t shift, uid_t range) {
+        int r;
+
+        assert(directory);
+
+        if (arg_userns_mode == USER_NAMESPACE_NO || !arg_userns_chown)
+                return 0;
+
+        /* Honour the caller-supplied shift and range rather than reading the globals directly. */
+        r = path_patch_uid(directory, shift, range);
+        if (r == -EOPNOTSUPP)
+                return log_error_errno(r, "Automatic UID/GID adjusting is only supported for UID/GID ranges starting at multiples of 2^16 with a range of 2^16.");
+        if (r == -EBADE)
+                return log_error_errno(r, "Upper 16 bits of root directory UID and GID do not match.");
+        if (r < 0)
+                return log_error_errno(r, "Failed to adjust UID/GID shift of OS tree: %m");
+        if (r == 0)
+                log_debug("Root directory of image is already owned by the right UID/GID range, skipping recursive chown operation.");
+        else
+                log_debug("Patched directory tree to match UID/GID range.");
+
+        return r;
+}
+
static int mount_devices(
const char *where,
const char *root_device, bool root_device_rw,
@@ -2409,7 +2579,7 @@ static int determine_names(void) {
static int determine_uid_shift(const char *directory) {
int r;
- if (!arg_userns) {
+ if (arg_userns_mode == USER_NAMESPACE_NO) {
arg_uid_shift = 0;
return 0;
}
@@ -2436,7 +2606,6 @@ static int determine_uid_shift(const char *directory) {
return -EINVAL;
}
- log_info("Using user namespaces with base " UID_FMT " and range " UID_FMT ".", arg_uid_shift, arg_uid_range);
return 0;
}
@@ -2449,6 +2618,7 @@ static int inner_child(
FDSet *fds) {
_cleanup_free_ char *home = NULL;
+ char as_uuid[37];
unsigned n_env = 1;
const char *envp[] = {
"PATH=" DEFAULT_PATH_SPLIT_USR,
@@ -2472,7 +2642,7 @@ static int inner_child(
cg_unified_flush();
- if (arg_userns) {
+ if (arg_userns_mode != USER_NAMESPACE_NO) {
/* Tell the parent, that it now can write the UID map. */
(void) barrier_place(barrier); /* #1 */
@@ -2483,7 +2653,14 @@ static int inner_child(
}
}
- r = mount_all(NULL, arg_userns, true, arg_uid_shift, arg_private_network, arg_uid_range, arg_selinux_apifs_context);
+ r = mount_all(NULL,
+ arg_userns_mode != USER_NAMESPACE_NO,
+ true,
+ arg_private_network,
+ arg_uid_shift,
+ arg_uid_range,
+ arg_selinux_apifs_context);
+
if (r < 0)
return r;
@@ -2559,19 +2736,17 @@ static int inner_child(
envp[n_env] = strv_find_prefix(environ, "TERM=");
if (envp[n_env])
- n_env ++;
+ n_env++;
if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
(asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
(asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0))
return log_oom();
- if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
- char as_uuid[37];
+ assert(!sd_id128_equal(arg_uuid, SD_ID128_NULL));
- if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_format_as_uuid(arg_uuid, as_uuid)) < 0)
- return log_oom();
- }
+ if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_format_as_uuid(arg_uuid, as_uuid)) < 0)
+ return log_oom();
if (fdset_size(fds) > 0) {
r = fdset_cloexec(fds, false);
@@ -2622,12 +2797,10 @@ static int inner_child(
/* Automatically search for the init system */
- m = 1 + strv_length(arg_parameters);
- a = newa(char*, m + 1);
- if (strv_isempty(arg_parameters))
- a[1] = NULL;
- else
- memcpy(a + 1, arg_parameters, m * sizeof(char*));
+ m = strv_length(arg_parameters);
+ a = newa(char*, m + 2);
+ memcpy_safe(a + 1, arg_parameters, m * sizeof(char*));
+ a[1 + m] = NULL;
a[0] = (char*) "/usr/lib/systemd/systemd";
execve(a[0], a, env_use);
@@ -2641,7 +2814,8 @@ static int inner_child(
execvpe(arg_parameters[0], arg_parameters, env_use);
else {
if (!arg_chdir)
- chdir(home ?: "/root");
+ /* If we cannot change the directory, we'll end up in /, that is expected. */
+ (void) chdir(home ?: "/root");
execle("/bin/bash", "-bash", NULL, env_use);
execle("/bin/sh", "-sh", NULL, env_use);
@@ -2662,6 +2836,7 @@ static int outer_child(
bool interactive,
bool secondary,
int pid_socket,
+ int uuid_socket,
int kmsg_socket,
int rtnl_socket,
int uid_shift_socket,
@@ -2675,6 +2850,7 @@ static int outer_child(
assert(directory);
assert(console);
assert(pid_socket >= 0);
+ assert(uuid_socket >= 0);
assert(kmsg_socket >= 0);
cg_unified_flush();
@@ -2723,7 +2899,8 @@ static int outer_child(
if (r < 0)
return r;
- if (arg_userns) {
+ if (arg_userns_mode != USER_NAMESPACE_NO) {
+ /* Let the parent know which UID shift we read from the image */
l = send(uid_shift_socket, &arg_uid_shift, sizeof(arg_uid_shift), MSG_NOSIGNAL);
if (l < 0)
return log_error_errno(errno, "Failed to send UID shift: %m");
@@ -2731,17 +2908,49 @@ static int outer_child(
log_error("Short write while sending UID shift.");
return -EIO;
}
+
+ if (arg_userns_mode == USER_NAMESPACE_PICK) {
+ /* When we are supposed to pick the UID shift, the parent will check now whether the UID shift
+ * we just read from the image is available. If yes, it will send the UID shift back to us, if
+ * not it will pick a different one, and send it back to us. */
+
+ l = recv(uid_shift_socket, &arg_uid_shift, sizeof(arg_uid_shift), 0);
+ if (l < 0)
+ return log_error_errno(errno, "Failed to recv UID shift: %m");
+ if (l != sizeof(arg_uid_shift)) {
+ log_error("Short read while recieving UID shift.");
+ return -EIO;
+ }
+ }
+
+ log_info("Selected user namespace base " UID_FMT " and range " UID_FMT ".", arg_uid_shift, arg_uid_range);
}
/* Turn directory into bind mount */
if (mount(directory, directory, NULL, MS_BIND|MS_REC, NULL) < 0)
return log_error_errno(errno, "Failed to make bind mount: %m");
- r = setup_volatile(directory, arg_volatile_mode, arg_userns, arg_uid_shift, arg_uid_range, arg_selinux_context);
+ r = recursive_chown(directory, arg_uid_shift, arg_uid_range);
if (r < 0)
return r;
- r = setup_volatile_state(directory, arg_volatile_mode, arg_userns, arg_uid_shift, arg_uid_range, arg_selinux_context);
+ r = setup_volatile(
+ directory,
+ arg_volatile_mode,
+ arg_userns_mode != USER_NAMESPACE_NO,
+ arg_uid_shift,
+ arg_uid_range,
+ arg_selinux_context);
+ if (r < 0)
+ return r;
+
+ r = setup_volatile_state(
+ directory,
+ arg_volatile_mode,
+ arg_userns_mode != USER_NAMESPACE_NO,
+ arg_uid_shift,
+ arg_uid_range,
+ arg_selinux_context);
if (r < 0)
return r;
@@ -2755,7 +2964,13 @@ static int outer_child(
return log_error_errno(r, "Failed to make tree read-only: %m");
}
- r = mount_all(directory, arg_userns, false, arg_private_network, arg_uid_shift, arg_uid_range, arg_selinux_apifs_context);
+ r = mount_all(directory,
+ arg_userns_mode != USER_NAMESPACE_NO,
+ false,
+ arg_private_network,
+ arg_uid_shift,
+ arg_uid_range,
+ arg_selinux_apifs_context);
if (r < 0)
return r;
@@ -2789,15 +3004,32 @@ static int outer_child(
if (r < 0)
return r;
+ r = setup_machine_id(directory);
+ if (r < 0)
+ return r;
+
r = setup_journal(directory);
if (r < 0)
return r;
- r = mount_custom(directory, arg_custom_mounts, arg_n_custom_mounts, arg_userns, arg_uid_shift, arg_uid_range, arg_selinux_apifs_context);
+ r = mount_custom(
+ directory,
+ arg_custom_mounts,
+ arg_n_custom_mounts,
+ arg_userns_mode != USER_NAMESPACE_NO,
+ arg_uid_shift,
+ arg_uid_range,
+ arg_selinux_apifs_context);
if (r < 0)
return r;
- r = mount_cgroups(directory, arg_unified_cgroup_hierarchy, arg_userns, arg_uid_shift, arg_uid_range, arg_selinux_apifs_context);
+ r = mount_cgroups(
+ directory,
+ arg_unified_cgroup_hierarchy,
+ arg_userns_mode != USER_NAMESPACE_NO,
+ arg_uid_shift,
+ arg_uid_range,
+ arg_selinux_apifs_context);
if (r < 0)
return r;
@@ -2808,12 +3040,13 @@ static int outer_child(
pid = raw_clone(SIGCHLD|CLONE_NEWNS|
(arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS) |
(arg_private_network ? CLONE_NEWNET : 0) |
- (arg_userns ? CLONE_NEWUSER : 0),
+ (arg_userns_mode != USER_NAMESPACE_NO ? CLONE_NEWUSER : 0),
NULL);
if (pid < 0)
return log_error_errno(errno, "Failed to fork inner child: %m");
if (pid == 0) {
pid_socket = safe_close(pid_socket);
+ uuid_socket = safe_close(uuid_socket);
uid_shift_socket = safe_close(uid_shift_socket);
/* The inner child has all namespaces that are
@@ -2835,13 +3068,77 @@ static int outer_child(
return -EIO;
}
+ l = send(uuid_socket, &arg_uuid, sizeof(arg_uuid), MSG_NOSIGNAL);
+ if (l < 0)
+ return log_error_errno(errno, "Failed to send machine ID: %m");
+ if (l != sizeof(arg_uuid)) {
+ log_error("Short write while sending machine ID.");
+ return -EIO;
+ }
+
pid_socket = safe_close(pid_socket);
+ uuid_socket = safe_close(uuid_socket);
kmsg_socket = safe_close(kmsg_socket);
rtnl_socket = safe_close(rtnl_socket);
return 0;
}
+/* Picks a free UID shift for a container and locks it.
+ *
+ * Starts with the value passed in *shift. A candidate is accepted if it lies within
+ * [UID_SHIFT_PICK_MIN, UID_SHIFT_PICK_MAX], has its lower 16 bits clear, its lock file under
+ * /run/systemd/nspawn-uid/ can be taken, and neither its first UID/GID nor the one at offset 0xFFFE is
+ * known to the user/group database. Otherwise a new random candidate is generated and tried.
+ *
+ * On success returns 0, stores the chosen shift in *shift and hands the held lock to *ret_lock_file.
+ * Returns -EBUSY once the attempt budget is used up, or the error from make_lock_file(). */
+static int uid_shift_pick(uid_t *shift, LockFile *ret_lock_file) {
+        unsigned attempts_left = 100;
+        uid_t c;
+        int r;
+
+        assert(shift);
+        assert(ret_lock_file);
+        assert(arg_userns_mode == USER_NAMESPACE_PICK);
+        assert(arg_uid_range == 0x10000U);
+
+        c = *shift;
+
+        (void) mkdir("/run/systemd/nspawn-uid", 0755);
+
+        for (;;) {
+                char path[strlen("/run/systemd/nspawn-uid/") + DECIMAL_STR_MAX(uid_t) + 1];
+                /* Scoped to one iteration: a lock taken on a rejected candidate is released before the
+                 * next attempt. */
+                _cleanup_release_lock_file_ LockFile lock = LOCK_FILE_INIT;
+                bool eligible;
+
+                if (--attempts_left == 0)
+                        return -EBUSY;
+
+                eligible =
+                        c >= UID_SHIFT_PICK_MIN &&
+                        c <= UID_SHIFT_PICK_MAX &&
+                        (c & UINT32_C(0xFFFF)) == 0;
+
+                if (eligible) {
+                        xsprintf(path, "/run/systemd/nspawn-uid/" UID_FMT, c);
+                        r = make_lock_file(path, LOCK_EX|LOCK_NB, &lock);
+                        if (r < 0 && r != -EBUSY)
+                                return r;
+
+                        /* -EBUSY means another nspawn instance holds this range already. Otherwise do
+                         * some superficial checks on whether the range is known to the user database. */
+                        if (r >= 0 &&
+                            !getpwuid(c) &&
+                            !getpwuid(c + UINT32_C(0xFFFE)) &&
+                            !getgrgid(c) &&
+                            !getgrgid(c + UINT32_C(0xFFFE))) {
+
+                                /* Transfer the lock to the caller and disarm the local cleanup. */
+                                *ret_lock_file = lock;
+                                lock = (struct LockFile) LOCK_FILE_INIT;
+                                *shift = c;
+                                return 0;
+                        }
+                }
+
+                /* Candidate rejected: draw a new random one from the pick range, aligned to 2^16. */
+                random_bytes(&c, sizeof(c));
+                c = (c % (UID_SHIFT_PICK_MAX - UID_SHIFT_PICK_MIN)) + UID_SHIFT_PICK_MIN;
+                c &= (uid_t) UINT32_C(0xFFFF0000);
+        }
+}
+
static int setup_uid_map(pid_t pid) {
char uid_map[strlen("/proc//uid_map") + DECIMAL_STR_MAX(uid_t) + 1], line[DECIMAL_STR_MAX(uid_t)*3+3+1];
int r;
@@ -3028,6 +3325,7 @@ static int load_settings(void) {
(settings->private_network >= 0 ||
settings->network_veth >= 0 ||
settings->network_bridge ||
+ settings->network_zone ||
settings->network_interfaces ||
settings->network_macvlan ||
settings->network_ipvlan ||
@@ -3058,6 +3356,10 @@ static int load_settings(void) {
free(arg_network_bridge);
arg_network_bridge = settings->network_bridge;
settings->network_bridge = NULL;
+
+ free(arg_network_zone);
+ arg_network_zone = settings->network_zone;
+ settings->network_zone = NULL;
}
}
@@ -3073,6 +3375,19 @@ static int load_settings(void) {
}
}
+ if ((arg_settings_mask & SETTING_USERNS) == 0 &&
+ settings->userns_mode != _USER_NAMESPACE_MODE_INVALID) {
+
+ if (!arg_settings_trusted)
+ log_warning("Ignoring PrivateUsers= and PrivateUsersChown= settings, file %s is not trusted.", p);
+ else {
+ arg_userns_mode = settings->userns_mode;
+ arg_uid_shift = settings->uid_shift;
+ arg_uid_range = settings->uid_range;
+ arg_userns_chown = settings->userns_chown;
+ }
+ }
+
return 0;
}
@@ -3083,14 +3398,14 @@ int main(int argc, char *argv[]) {
_cleanup_close_ int master = -1, image_fd = -1;
_cleanup_fdset_free_ FDSet *fds = NULL;
int r, n_fd_passed, loop_nr = -1;
- char veth_name[IFNAMSIZ];
+ char veth_name[IFNAMSIZ] = "";
bool secondary = false, remove_subvol = false;
sigset_t mask_chld;
pid_t pid = 0;
int ret = EXIT_SUCCESS;
union in_addr_union exposed = {};
_cleanup_release_lock_file_ LockFile tree_global_lock = LOCK_FILE_INIT, tree_local_lock = LOCK_FILE_INIT;
- bool interactive;
+ bool interactive, veth_created = false;
log_parse_environment();
log_open();
@@ -3285,6 +3600,12 @@ int main(int argc, char *argv[]) {
goto finish;
}
+ if (arg_selinux_apifs_context) {
+ r = mac_selinux_apply(console, arg_selinux_apifs_context);
+ if (r < 0)
+ goto finish;
+ }
+
if (unlockpt(master) < 0) {
r = log_error_errno(errno, "Failed to unlock tty: %m");
goto finish;
@@ -3305,19 +3626,42 @@ int main(int argc, char *argv[]) {
}
for (;;) {
- _cleanup_close_pair_ int kmsg_socket_pair[2] = { -1, -1 }, rtnl_socket_pair[2] = { -1, -1 }, pid_socket_pair[2] = { -1, -1 }, uid_shift_socket_pair[2] = { -1, -1 };
- ContainerStatus container_status;
- _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
static const struct sigaction sa = {
.sa_handler = nop_signal_handler,
.sa_flags = SA_NOCLDSTOP,
};
- int ifi = 0;
- ssize_t l;
+
+ _cleanup_release_lock_file_ LockFile uid_shift_lock = LOCK_FILE_INIT;
+ _cleanup_close_ int etc_passwd_lock = -1;
+ _cleanup_close_pair_ int
+ kmsg_socket_pair[2] = { -1, -1 },
+ rtnl_socket_pair[2] = { -1, -1 },
+ pid_socket_pair[2] = { -1, -1 },
+ uuid_socket_pair[2] = { -1, -1 },
+ uid_shift_socket_pair[2] = { -1, -1 };
+ _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
_cleanup_(sd_event_unrefp) sd_event *event = NULL;
_cleanup_(pty_forward_freep) PTYForward *forward = NULL;
_cleanup_(sd_netlink_unrefp) sd_netlink *rtnl = NULL;
+ ContainerStatus container_status;
char last_char = 0;
+ int ifi = 0;
+ ssize_t l;
+
+                if (arg_userns_mode == USER_NAMESPACE_PICK) {
+                        /* When we shall pick the UID/GID range, let's first lock /etc/passwd, so that we can safely
+                         * check with getpwuid() if the specific user already exists. /etc might be read-only, in
+                         * which case this fails with EROFS; that's OK, as then we can be reasonably sure that no
+                         * users are going to be added. The getpwuid() checks are just an extra safety net anyway, we
+                         * kinda assume that the UID range we allocate from is really ours. Any other failure is
+                         * fatal: propagate the lock error itself through r, like the other error paths here do. */
+
+                        etc_passwd_lock = take_etc_passwd_lock(NULL);
+                        if (etc_passwd_lock < 0 && etc_passwd_lock != -EROFS) {
+                                r = log_error_errno(etc_passwd_lock, "Failed to take /etc/passwd lock: %m");
+                                goto finish;
+                        }
+                }
r = barrier_create(&barrier);
if (r < 0) {
@@ -3340,7 +3684,12 @@ int main(int argc, char *argv[]) {
goto finish;
}
- if (arg_userns)
+ if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, uuid_socket_pair) < 0) {
+ r = log_error_errno(errno, "Failed to create id socket pair: %m");
+ goto finish;
+ }
+
+ if (arg_userns_mode != USER_NAMESPACE_NO)
if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, uid_shift_socket_pair) < 0) {
r = log_error_errno(errno, "Failed to create uid shift socket pair: %m");
goto finish;
@@ -3380,6 +3729,7 @@ int main(int argc, char *argv[]) {
kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
pid_socket_pair[0] = safe_close(pid_socket_pair[0]);
+ uuid_socket_pair[0] = safe_close(uuid_socket_pair[0]);
uid_shift_socket_pair[0] = safe_close(uid_shift_socket_pair[0]);
(void) reset_all_signal_handlers();
@@ -3394,6 +3744,7 @@ int main(int argc, char *argv[]) {
interactive,
secondary,
pid_socket_pair[1],
+ uuid_socket_pair[1],
kmsg_socket_pair[1],
rtnl_socket_pair[1],
uid_shift_socket_pair[1],
@@ -3411,8 +3762,46 @@ int main(int argc, char *argv[]) {
kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
pid_socket_pair[1] = safe_close(pid_socket_pair[1]);
+ uuid_socket_pair[1] = safe_close(uuid_socket_pair[1]);
uid_shift_socket_pair[1] = safe_close(uid_shift_socket_pair[1]);
+                if (arg_userns_mode != USER_NAMESPACE_NO) {
+                        /* The child just let us know the UID shift it might have read from the image. */
+                        l = recv(uid_shift_socket_pair[0], &arg_uid_shift, sizeof(arg_uid_shift), 0);
+                        if (l < 0) {
+                                r = log_error_errno(errno, "Failed to read UID shift: %m");
+                                goto finish;
+                        }
+                        if (l != sizeof(arg_uid_shift)) {
+                                log_error("Short read while reading UID shift.");
+                                r = -EIO; /* negative errno, matching the short-write path below */
+                                goto finish;
+                        }
+
+                        if (arg_userns_mode == USER_NAMESPACE_PICK) {
+                                /* If we are supposed to pick the UID shift, let's try to use the shift read from the
+                                 * image, but if that's already in use, pick a new one, and report back to the child,
+                                 * which one we now picked. */
+
+                                r = uid_shift_pick(&arg_uid_shift, &uid_shift_lock);
+                                if (r < 0) {
+                                        log_error_errno(r, "Failed to pick suitable UID/GID range: %m");
+                                        goto finish;
+                                }
+
+                                l = send(uid_shift_socket_pair[0], &arg_uid_shift, sizeof(arg_uid_shift), MSG_NOSIGNAL);
+                                if (l < 0) {
+                                        r = log_error_errno(errno, "Failed to send UID shift: %m");
+                                        goto finish;
+                                }
+                                if (l != sizeof(arg_uid_shift)) {
+                                        log_error("Short write while writing UID shift.");
+                                        r = -EIO;
+                                        goto finish;
+                                }
+                        }
+                }
+
/* Wait for the outer child. */
r = wait_for_terminate_and_warn("namespace helper", pid, NULL);
if (r < 0)
@@ -3435,26 +3824,27 @@ int main(int argc, char *argv[]) {
goto finish;
}
+                /* We also retrieve the container UUID, in case it was generated by the outer child. */
+                l = recv(uuid_socket_pair[0], &arg_uuid, sizeof(arg_uuid), 0);
+                if (l < 0) {
+                        r = log_error_errno(errno, "Failed to read container machine ID: %m");
+                        goto finish;
+                }
+                if (l != sizeof(arg_uuid)) {
+                        log_error("Short read while reading container machine ID.");
+                        r = -EIO; /* negative errno, like the other error paths in this function */
+                        goto finish;
+                }
+
log_debug("Init process invoked as PID " PID_FMT, pid);
- if (arg_userns) {
+ if (arg_userns_mode != USER_NAMESPACE_NO) {
if (!barrier_place_and_sync(&barrier)) { /* #1 */
log_error("Child died too early.");
r = -ESRCH;
goto finish;
}
- l = recv(uid_shift_socket_pair[0], &arg_uid_shift, sizeof(arg_uid_shift), 0);
- if (l < 0) {
- r = log_error_errno(errno, "Failed to read UID shift: %m");
- goto finish;
- }
- if (l != sizeof(arg_uid_shift)) {
- log_error("Short read while reading UID shift.");
- r = EIO;
- goto finish;
- }
-
r = setup_uid_map(pid);
if (r < 0)
goto finish;
@@ -3469,14 +3859,23 @@ int main(int argc, char *argv[]) {
goto finish;
if (arg_network_veth) {
- r = setup_veth(arg_machine, pid, veth_name, !!arg_network_bridge);
+ r = setup_veth(arg_machine, pid, veth_name,
+ arg_network_bridge || arg_network_zone);
if (r < 0)
goto finish;
else if (r > 0)
ifi = r;
if (arg_network_bridge) {
- r = setup_bridge(veth_name, arg_network_bridge);
+ /* Add the interface to a bridge */
+ r = setup_bridge(veth_name, arg_network_bridge, false);
+ if (r < 0)
+ goto finish;
+ if (r > 0)
+ ifi = r;
+ } else if (arg_network_zone) {
+ /* Add the interface to a bridge, possibly creating it */
+ r = setup_bridge(veth_name, arg_network_zone, true);
if (r < 0)
goto finish;
if (r > 0)
@@ -3488,6 +3887,12 @@ int main(int argc, char *argv[]) {
if (r < 0)
goto finish;
+ /* We created the primary and extra veth links now; let's remember this, so that we know to
+ remove them later on. Note that we don't bother with removing veth links that were created
+ here when their setup failed half-way, because in that case the kernel should be able to
+ remove them on its own, since they cannot be referenced by anything yet. */
+ veth_created = true;
+
r = setup_macvlan(arg_machine, pid, arg_network_macvlan);
if (r < 0)
goto finish;
@@ -3552,6 +3957,10 @@ int main(int argc, char *argv[]) {
goto finish;
}
+ /* At this point we have made use of the UID we picked, and thus nss-mymachines will make them appear
+ * in getpwuid(), thus we can release the /etc/passwd lock. */
+ etc_passwd_lock = safe_close(etc_passwd_lock);
+
sd_notifyf(false,
"READY=1\n"
"STATUS=Container running.\n"
@@ -3619,7 +4028,7 @@ int main(int argc, char *argv[]) {
/* We failed to wait for the container, or the
* container exited abnormally */
goto finish;
- else if (r > 0 || container_status == CONTAINER_TERMINATED){
+ else if (r > 0 || container_status == CONTAINER_TERMINATED) {
/* The container exited with a non-zero
* status, or with zero status and no reboot
* was requested. */
@@ -3646,6 +4055,9 @@ int main(int argc, char *argv[]) {
}
expose_port_flush(arg_expose_ports, &exposed);
+
+ (void) remove_veth_links(veth_name, arg_network_veth_extra);
+ veth_created = false;
}
finish:
@@ -3679,6 +4091,10 @@ finish:
expose_port_flush(arg_expose_ports, &exposed);
+ if (veth_created)
+ (void) remove_veth_links(veth_name, arg_network_veth_extra);
+ (void) remove_bridge(arg_network_zone);
+
free(arg_directory);
free(arg_template);
free(arg_image);
diff --git a/src/systemd-nspawn/test-patch-uid.c b/src/systemd-nspawn/test-patch-uid.c
new file mode 100644
index 0000000000..11c5321788
--- /dev/null
+++ b/src/systemd-nspawn/test-patch-uid.c
@@ -0,0 +1,61 @@
+/***
+ This file is part of systemd.
+
+ Copyright 2016 Lennart Poettering
+
+ systemd is free software; you can redistribute it and/or modify it
+ under the terms of the GNU Lesser General Public License as published by
+ the Free Software Foundation; either version 2.1 of the License, or
+ (at your option) any later version.
+
+ systemd is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public License
+ along with systemd; If not, see <http://www.gnu.org/licenses/>.
+***/
+
+#include <stdlib.h>
+
+#include "log.h"
+#include "nspawn-patch-uid.h"
+#include "user-util.h"
+#include "util.h"
+
+int main(int argc, char *argv[]) {
+        uid_t shift, range;
+        int r;
+
+        /* Command-line driver for path_patch_uid(): takes PATH SHIFT RANGE and logs "Changed: yes/no". */
+        log_set_max_level(LOG_DEBUG);
+        log_parse_environment();
+        log_open();
+
+        if (argc != 4) {
+                log_error("Expected PATH SHIFT RANGE parameters.");
+                return EXIT_FAILURE;
+        }
+
+        r = parse_uid(argv[2], &shift);
+        if (r < 0) {
+                log_error_errno(r, "Failed to parse UID shift %s.", argv[2]);
+                return EXIT_FAILURE;
+        }
+
+        r = parse_uid(argv[3], &range); /* range is a uid_t as well, so parse it the same way as the shift */
+        if (r < 0) {
+                log_error_errno(r, "Failed to parse UID range %s.", argv[3]);
+                return EXIT_FAILURE;
+        }
+
+        r = path_patch_uid(argv[1], shift, range);
+        if (r < 0) {
+                log_error_errno(r, "Failed to patch directory tree: %m");
+                return EXIT_FAILURE;
+        }
+
+        log_info("Changed: %s", yes_no(r));
+        return EXIT_SUCCESS;
+}