summaryrefslogtreecommitdiff
path: root/src/systemd-nspawn
diff options
context:
space:
mode:
authorLuke Shumaker <lukeshu@sbcglobal.net>2016-09-14 18:33:57 -0400
committerLuke Shumaker <lukeshu@sbcglobal.net>2016-09-14 18:33:57 -0400
commit3c72c8d3ee67388336aca58c5afa3fb93a9c24c0 (patch)
treed072df7fee0f5906fad88c08398b2fe887cbc064 /src/systemd-nspawn
parente51613a3291342c6006edda8783755fb8994fd75 (diff)
parent6ba6ca19507add38549e07058c57489a8cd98cd1 (diff)
Merge branch 'notsystemd/postmove' into notsystemd/master
# Conflicts: # src/grp-journal/systemd-journald/Makefile # src/grp-login/systemd-logind/Makefile # src/grp-machine/grp-import/systemd-export/Makefile # src/grp-machine/grp-import/systemd-import/Makefile # src/grp-machine/grp-import/systemd-pull/Makefile # src/grp-machine/systemd-machined/Makefile # src/grp-network/libnetworkd-core/Makefile # src/grp-resolve/libbasic-dns/Makefile # src/grp-resolve/systemd-resolved/Makefile # src/grp-utils/systemd-path/Makefile # src/libshared/src/Makefile # src/libsystemd-network/include/systemd-network/sd-ndisc.h # src/libsystemd/Makefile # src/libsystemd/src/test.mk # src/libudev/Makefile # src/systemd-dbus1-generator/Makefile # src/systemd-nspawn/nspawn.c Signed-off-by: Luke Shumaker <lukeshu@sbcglobal.net>
Diffstat (limited to 'src/systemd-nspawn')
-rw-r--r--src/systemd-nspawn/Makefile24
-rw-r--r--src/systemd-nspawn/nspawn-cgroup.c2
-rw-r--r--src/systemd-nspawn/nspawn-gperf.gperf1
-rw-r--r--src/systemd-nspawn/nspawn-mount.c25
-rw-r--r--src/systemd-nspawn/nspawn-network.c2
-rw-r--r--src/systemd-nspawn/nspawn-patch-uid.c24
-rw-r--r--src/systemd-nspawn/nspawn-register.c11
-rw-r--r--src/systemd-nspawn/nspawn-seccomp.c198
-rw-r--r--src/systemd-nspawn/nspawn-seccomp.h24
-rw-r--r--src/systemd-nspawn/nspawn-settings.h4
-rw-r--r--src/systemd-nspawn/nspawn-setuid.c7
-rw-r--r--src/systemd-nspawn/nspawn.c432
-rw-r--r--src/systemd-nspawn/systemd-nspawn.completion.bash7
-rw-r--r--src/systemd-nspawn/systemd-nspawn.completion.zsh1
-rw-r--r--src/systemd-nspawn/systemd-nspawn.tmpfiles2
-rw-r--r--src/systemd-nspawn/systemd-nspawn.xml166
-rw-r--r--src/systemd-nspawn/systemd-nspawn@.service.in11
17 files changed, 637 insertions, 304 deletions
diff --git a/src/systemd-nspawn/Makefile b/src/systemd-nspawn/Makefile
index 2fe9bcc143..0c008808d8 100644
--- a/src/systemd-nspawn/Makefile
+++ b/src/systemd-nspawn/Makefile
@@ -36,6 +36,8 @@ systemd_nspawn_SOURCES = \
src/nspawn/nspawn-expose-ports.h \
src/nspawn/nspawn-cgroup.c \
src/nspawn/nspawn-cgroup.h \
+ src/nspawn/nspawn-seccomp.c \
+ src/nspawn/nspawn-seccomp.h \
src/nspawn/nspawn-register.c \
src/nspawn/nspawn-register.h \
src/nspawn/nspawn-setuid.c \
@@ -47,9 +49,7 @@ systemd_nspawn_SOURCES = \
src/core/mount-setup.c \
src/core/mount-setup.h \
src/core/loopback-setup.c \
- src/core/loopback-setup.h \
- src/core/machine-id-setup.c \
- src/core/machine-id-setup.h
+ src/core/loopback-setup.h
nodist_systemd_nspawn_SOURCES = \
src/nspawn/nspawn-gperf.c
@@ -59,12 +59,17 @@ gperf_gperf_sources += \
systemd_nspawn_CFLAGS = \
$(AM_CFLAGS) \
+ $(ACL_CFLAGS) \
$(BLKID_CFLAGS) \
- $(SECCOMP_CFLAGS)
+ $(SECCOMP_CFLAGS) \
+ $(SELINUX_CFLAGS)
systemd_nspawn_LDADD = \
- libshared.la \
- $(BLKID_LIBS)
+ libsystemd-shared.la \
+ $(ACL_LIBS) \
+ $(BLKID_LIBS) \
+ $(SECCOMP_LIBS) \
+ $(SELINUX_LIBS)
ifneq ($(HAVE_LIBIPTC),)
systemd_nspawn_LDADD += \
@@ -76,8 +81,13 @@ test_patch_uid_SOURCES = \
src/nspawn/nspawn-patch-uid.h \
src/nspawn/test-patch-uid.c
+test_patch_uid_CFLAGS = \
+ $(AM_CFLAGS) \
+ $(ACL_CFLAGS)
+
test_patch_uid_LDADD = \
- libshared.la
+ libsystemd-shared.la \
+ $(ACL_LIBS)
manual_tests += \
test-patch-uid
diff --git a/src/systemd-nspawn/nspawn-cgroup.c b/src/systemd-nspawn/nspawn-cgroup.c
index d0e2de2345..9fa46fadbd 100644
--- a/src/systemd-nspawn/nspawn-cgroup.c
+++ b/src/systemd-nspawn/nspawn-cgroup.c
@@ -124,7 +124,7 @@ int create_subcgroup(pid_t pid, bool unified_requested) {
int unified, r;
CGroupMask supported;
- /* In the unified hierarchy inner nodes may only only contain
+ /* In the unified hierarchy inner nodes may only contain
* subgroups, but not processes. Hence, if we running in the
* unified hierarchy and the container does the same, and we
* did not create a scope unit for the container move us and
diff --git a/src/systemd-nspawn/nspawn-gperf.gperf b/src/systemd-nspawn/nspawn-gperf.gperf
index dd3c1b3b88..332063e19b 100644
--- a/src/systemd-nspawn/nspawn-gperf.gperf
+++ b/src/systemd-nspawn/nspawn-gperf.gperf
@@ -29,6 +29,7 @@ Exec.Personality, config_parse_personality, 0, offsetof(Settings,
Exec.MachineID, config_parse_id128, 0, offsetof(Settings, machine_id)
Exec.WorkingDirectory, config_parse_path, 0, offsetof(Settings, working_directory)
Exec.PrivateUsers, config_parse_private_users, 0, 0
+Exec.NotifyReady, config_parse_bool, 0, offsetof(Settings, notify_ready)
Files.ReadOnly, config_parse_tristate, 0, offsetof(Settings, read_only)
Files.Volatile, config_parse_volatile_mode, 0, offsetof(Settings, volatile_mode)
Files.Bind, config_parse_bind, 0, 0
diff --git a/src/systemd-nspawn/nspawn-mount.c b/src/systemd-nspawn/nspawn-mount.c
index d8ca696f21..ea1d591415 100644
--- a/src/systemd-nspawn/nspawn-mount.c
+++ b/src/systemd-nspawn/nspawn-mount.c
@@ -299,18 +299,19 @@ int mount_all(const char *dest,
} MountPoint;
static const MountPoint mount_table[] = {
- { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true, true, false },
- { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, true, true, false }, /* Bind mount first */
- { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, true, true, false }, /* Then, make it r/o */
- { "tmpfs", "/sys", "tmpfs", "mode=755", MS_NOSUID|MS_NOEXEC|MS_NODEV, true, false, true },
- { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true, false, false },
- { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true, false, false },
- { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true, false, false },
- { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true, false, false },
- { "tmpfs", "/tmp", "tmpfs", "mode=1777", MS_STRICTATIME, true, false, false },
+ { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true, true, false },
+ { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, true, true, false }, /* Bind mount first ...*/
+ { "/proc/sys/net", "/proc/sys/net", NULL, NULL, MS_BIND, true, true, true }, /* (except for this) */
+ { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, true, true, false }, /* ... then, make it r/o */
+ { "tmpfs", "/sys", "tmpfs", "mode=755", MS_NOSUID|MS_NOEXEC|MS_NODEV, true, false, true },
+ { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true, false, false },
+ { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true, false, false },
+ { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true, false, false },
+ { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true, false, false },
+ { "tmpfs", "/tmp", "tmpfs", "mode=1777", MS_STRICTATIME, true, false, false },
#ifdef HAVE_SELINUX
- { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, false, false, false }, /* Bind mount first */
- { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, false, false, false }, /* Then, make it r/o */
+ { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, false, false, false }, /* Bind mount first */
+ { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, false, false, false }, /* Then, make it r/o */
#endif
};
@@ -344,7 +345,7 @@ int mount_all(const char *dest,
if (mount_table[k].fatal)
return log_error_errno(r, "Failed to create directory %s: %m", where);
- log_warning_errno(r, "Failed to create directory %s: %m", where);
+ log_debug_errno(r, "Failed to create directory %s: %m", where);
continue;
}
diff --git a/src/systemd-nspawn/nspawn-network.c b/src/systemd-nspawn/nspawn-network.c
index 1ae0a52f03..b188491000 100644
--- a/src/systemd-nspawn/nspawn-network.c
+++ b/src/systemd-nspawn/nspawn-network.c
@@ -351,7 +351,7 @@ int setup_bridge(const char *veth_name, const char *bridge_name, bool create) {
if (create) {
/* We take a system-wide lock here, so that we can safely check whether there's still a member in the
- * bridge before removing it, without risking interferance from other nspawn instances. */
+ * bridge before removing it, without risking interference from other nspawn instances. */
r = make_lock_file("/run/systemd/nspawn-network-zone", LOCK_EX, &bridge_lock);
if (r < 0)
diff --git a/src/systemd-nspawn/nspawn-patch-uid.c b/src/systemd-nspawn/nspawn-patch-uid.c
index 28d2801c69..dc703165ff 100644
--- a/src/systemd-nspawn/nspawn-patch-uid.c
+++ b/src/systemd-nspawn/nspawn-patch-uid.c
@@ -265,9 +265,12 @@ static int patch_fd(int fd, const char *name, const struct stat *st, uid_t shift
return -errno;
/* The Linux kernel alters the mode in some cases of chown(). Let's undo this. */
- if (name && !S_ISLNK(st->st_mode))
- r = fchmodat(fd, name, st->st_mode, 0);
- else
+ if (name) {
+ if (!S_ISLNK(st->st_mode))
+ r = fchmodat(fd, name, st->st_mode, 0);
+ else /* AT_SYMLINK_NOFOLLOW is not available for fchmodat() */
+ r = 0;
+ } else
r = fchmod(fd, st->st_mode);
if (r < 0)
return -errno;
@@ -282,7 +285,13 @@ static int patch_fd(int fd, const char *name, const struct stat *st, uid_t shift
return r > 0 || changed;
}
-static int is_procfs_sysfs_or_suchlike(int fd) {
+/*
+ * Check if the filesystem is fully compatible with user namespaces or
+ * UID/GID patching. Some filesystems in this list can be fully mounted inside
+ * user namespaces, however their inodes may relate to host resources or only
+ * valid in the global user namespace, therefore no patching should be applied.
+ */
+static int is_fs_fully_userns_compatible(int fd) {
struct statfs sfs;
assert(fd >= 0);
@@ -302,6 +311,9 @@ static int is_procfs_sysfs_or_suchlike(int fd) {
F_TYPE_EQUAL(sfs.f_type, PSTOREFS_MAGIC) ||
F_TYPE_EQUAL(sfs.f_type, SELINUX_MAGIC) ||
F_TYPE_EQUAL(sfs.f_type, SMACK_MAGIC) ||
+ F_TYPE_EQUAL(sfs.f_type, SECURITYFS_MAGIC) ||
+ F_TYPE_EQUAL(sfs.f_type, BPF_FS_MAGIC) ||
+ F_TYPE_EQUAL(sfs.f_type, TRACEFS_MAGIC) ||
F_TYPE_EQUAL(sfs.f_type, SYSFS_MAGIC);
}
@@ -313,8 +325,8 @@ static int recurse_fd(int fd, bool donate_fd, const struct stat *st, uid_t shift
/* We generally want to permit crossing of mount boundaries when patching the UIDs/GIDs. However, we
* probably shouldn't do this for /proc and /sys if that is already mounted into place. Hence, let's
- * stop the recursion when we hit a procfs or sysfs file system. */
- r = is_procfs_sysfs_or_suchlike(fd);
+ * stop the recursion when we hit procfs, sysfs or some other special file systems. */
+ r = is_fs_fully_userns_compatible(fd);
if (r < 0)
goto finish;
if (r > 0) {
diff --git a/src/systemd-nspawn/nspawn-register.c b/src/systemd-nspawn/nspawn-register.c
index 3f174b6f24..12e29153f9 100644
--- a/src/systemd-nspawn/nspawn-register.c
+++ b/src/systemd-nspawn/nspawn-register.c
@@ -105,7 +105,7 @@ int register_machine(
return bus_log_create_error(r);
}
- r = sd_bus_message_append(m, "(sv)", "DevicePolicy", "s", "strict");
+ r = sd_bus_message_append(m, "(sv)", "DevicePolicy", "s", "closed");
if (r < 0)
return bus_log_create_error(r);
@@ -113,26 +113,19 @@ int register_machine(
* systemd-nspawn@.service, to keep the device
* policies in sync regardless if we are run with or
* without the --keep-unit switch. */
- r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 9,
+ r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 2,
/* Allow the container to
* access and create the API
* device nodes, so that
* PrivateDevices= in the
* container can work
* fine */
- "/dev/null", "rwm",
- "/dev/zero", "rwm",
- "/dev/full", "rwm",
- "/dev/random", "rwm",
- "/dev/urandom", "rwm",
- "/dev/tty", "rwm",
"/dev/net/tun", "rwm",
/* Allow the container
* access to ptys. However,
* do not permit the
* container to ever create
* these device nodes. */
- "/dev/pts/ptmx", "rw",
"char-pts", "rw");
if (r < 0)
return bus_log_create_error(r);
diff --git a/src/systemd-nspawn/nspawn-seccomp.c b/src/systemd-nspawn/nspawn-seccomp.c
new file mode 100644
index 0000000000..3efdf16a80
--- /dev/null
+++ b/src/systemd-nspawn/nspawn-seccomp.c
@@ -0,0 +1,198 @@
+/***
+ This file is part of systemd.
+
+ Copyright 2016 Lennart Poettering
+
+ systemd is free software; you can redistribute it and/or modify it
+ under the terms of the GNU Lesser General Public License as published by
+ the Free Software Foundation; either version 2.1 of the License, or
+ (at your option) any later version.
+
+ systemd is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public License
+ along with systemd; If not, see <http://www.gnu.org/licenses/>.
+***/
+
+#include <errno.h>
+#include <sys/capability.h>
+#include <sys/types.h>
+
+#include <linux/netlink.h>
+
+#ifdef HAVE_SECCOMP
+#include <seccomp.h>
+#endif
+
+#include "basic/log.h"
+
+#ifdef HAVE_SECCOMP
+#include "shared/seccomp-util.h"
+#endif
+
+#include "nspawn-seccomp.h"
+
+#ifdef HAVE_SECCOMP
+
+static int seccomp_add_default_syscall_filter(scmp_filter_ctx ctx,
+ uint64_t cap_list_retain) {
+ unsigned i;
+ int r;
+ static const struct {
+ uint64_t capability;
+ int syscall_num;
+ } blacklist[] = {
+ { 0, SCMP_SYS(_sysctl) }, /* obsolete syscall */
+ { 0, SCMP_SYS(add_key) }, /* keyring is not namespaced */
+ { 0, SCMP_SYS(afs_syscall) }, /* obsolete syscall */
+ { 0, SCMP_SYS(bdflush) },
+#ifdef __NR_bpf
+ { 0, SCMP_SYS(bpf) },
+#endif
+ { 0, SCMP_SYS(break) }, /* obsolete syscall */
+ { 0, SCMP_SYS(create_module) }, /* obsolete syscall */
+ { 0, SCMP_SYS(ftime) }, /* obsolete syscall */
+ { 0, SCMP_SYS(get_kernel_syms) }, /* obsolete syscall */
+ { 0, SCMP_SYS(getpmsg) }, /* obsolete syscall */
+ { 0, SCMP_SYS(gtty) }, /* obsolete syscall */
+#ifdef __NR_kexec_file_load
+ { 0, SCMP_SYS(kexec_file_load) },
+#endif
+ { 0, SCMP_SYS(kexec_load) },
+ { 0, SCMP_SYS(keyctl) }, /* keyring is not namespaced */
+ { 0, SCMP_SYS(lock) }, /* obsolete syscall */
+ { 0, SCMP_SYS(lookup_dcookie) },
+ { 0, SCMP_SYS(mpx) }, /* obsolete syscall */
+ { 0, SCMP_SYS(nfsservctl) }, /* obsolete syscall */
+ { 0, SCMP_SYS(open_by_handle_at) },
+ { 0, SCMP_SYS(perf_event_open) },
+ { 0, SCMP_SYS(prof) }, /* obsolete syscall */
+ { 0, SCMP_SYS(profil) }, /* obsolete syscall */
+ { 0, SCMP_SYS(putpmsg) }, /* obsolete syscall */
+ { 0, SCMP_SYS(query_module) }, /* obsolete syscall */
+ { 0, SCMP_SYS(quotactl) },
+ { 0, SCMP_SYS(request_key) }, /* keyring is not namespaced */
+ { 0, SCMP_SYS(security) }, /* obsolete syscall */
+ { 0, SCMP_SYS(sgetmask) }, /* obsolete syscall */
+ { 0, SCMP_SYS(ssetmask) }, /* obsolete syscall */
+ { 0, SCMP_SYS(stty) }, /* obsolete syscall */
+ { 0, SCMP_SYS(swapoff) },
+ { 0, SCMP_SYS(swapon) },
+ { 0, SCMP_SYS(sysfs) }, /* obsolete syscall */
+ { 0, SCMP_SYS(tuxcall) }, /* obsolete syscall */
+ { 0, SCMP_SYS(ulimit) }, /* obsolete syscall */
+ { 0, SCMP_SYS(uselib) }, /* obsolete syscall */
+ { 0, SCMP_SYS(ustat) }, /* obsolete syscall */
+ { 0, SCMP_SYS(vserver) }, /* obsolete syscall */
+ { CAP_SYSLOG, SCMP_SYS(syslog) },
+ { CAP_SYS_MODULE, SCMP_SYS(delete_module) },
+ { CAP_SYS_MODULE, SCMP_SYS(finit_module) },
+ { CAP_SYS_MODULE, SCMP_SYS(init_module) },
+ { CAP_SYS_PACCT, SCMP_SYS(acct) },
+ { CAP_SYS_PTRACE, SCMP_SYS(process_vm_readv) },
+ { CAP_SYS_PTRACE, SCMP_SYS(process_vm_writev) },
+ { CAP_SYS_PTRACE, SCMP_SYS(ptrace) },
+ { CAP_SYS_RAWIO, SCMP_SYS(ioperm) },
+ { CAP_SYS_RAWIO, SCMP_SYS(iopl) },
+ { CAP_SYS_RAWIO, SCMP_SYS(pciconfig_iobase) },
+ { CAP_SYS_RAWIO, SCMP_SYS(pciconfig_read) },
+ { CAP_SYS_RAWIO, SCMP_SYS(pciconfig_write) },
+#ifdef __NR_s390_pci_mmio_read
+ { CAP_SYS_RAWIO, SCMP_SYS(s390_pci_mmio_read) },
+#endif
+#ifdef __NR_s390_pci_mmio_write
+ { CAP_SYS_RAWIO, SCMP_SYS(s390_pci_mmio_write) },
+#endif
+ { CAP_SYS_TIME, SCMP_SYS(adjtimex) },
+ { CAP_SYS_TIME, SCMP_SYS(clock_adjtime) },
+ { CAP_SYS_TIME, SCMP_SYS(clock_settime) },
+ { CAP_SYS_TIME, SCMP_SYS(settimeofday) },
+ { CAP_SYS_TIME, SCMP_SYS(stime) },
+ };
+
+ for (i = 0; i < ELEMENTSOF(blacklist); i++) {
+ if (blacklist[i].capability != 0 && (cap_list_retain & (1ULL << blacklist[i].capability)))
+ continue;
+
+ r = seccomp_rule_add(ctx, SCMP_ACT_ERRNO(EPERM), blacklist[i].syscall_num, 0);
+ if (r == -EFAULT)
+ continue; /* unknown syscall */
+ if (r < 0)
+ return log_error_errno(r, "Failed to block syscall: %m");
+ }
+
+ return 0;
+}
+
+int setup_seccomp(uint64_t cap_list_retain) {
+ scmp_filter_ctx seccomp;
+ int r;
+
+ seccomp = seccomp_init(SCMP_ACT_ALLOW);
+ if (!seccomp)
+ return log_oom();
+
+ r = seccomp_add_secondary_archs(seccomp);
+ if (r < 0) {
+ log_error_errno(r, "Failed to add secondary archs to seccomp filter: %m");
+ goto finish;
+ }
+
+ r = seccomp_add_default_syscall_filter(seccomp, cap_list_retain);
+ if (r < 0)
+ goto finish;
+
+ /*
+ Audit is broken in containers, much of the userspace audit
+ hookup will fail if running inside a container. We don't
+ care and just turn off creation of audit sockets.
+
+ This will make socket(AF_NETLINK, *, NETLINK_AUDIT) fail
+ with EAFNOSUPPORT which audit userspace uses as indication
+ that audit is disabled in the kernel.
+ */
+
+ r = seccomp_rule_add(
+ seccomp,
+ SCMP_ACT_ERRNO(EAFNOSUPPORT),
+ SCMP_SYS(socket),
+ 2,
+ SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
+ SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
+ if (r < 0) {
+ log_error_errno(r, "Failed to add audit seccomp rule: %m");
+ goto finish;
+ }
+
+ r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
+ if (r < 0) {
+ log_error_errno(r, "Failed to unset NO_NEW_PRIVS: %m");
+ goto finish;
+ }
+
+ r = seccomp_load(seccomp);
+ if (r == -EINVAL) {
+ log_debug_errno(r, "Kernel is probably not configured with CONFIG_SECCOMP. Disabling seccomp audit filter: %m");
+ r = 0;
+ goto finish;
+ }
+ if (r < 0) {
+ log_error_errno(r, "Failed to install seccomp audit filter: %m");
+ goto finish;
+ }
+
+finish:
+ seccomp_release(seccomp);
+ return r;
+}
+
+#else
+
+int setup_seccomp(uint64_t cap_list_retain) {
+ return 0;
+}
+
+#endif
diff --git a/src/systemd-nspawn/nspawn-seccomp.h b/src/systemd-nspawn/nspawn-seccomp.h
new file mode 100644
index 0000000000..5bde16faf9
--- /dev/null
+++ b/src/systemd-nspawn/nspawn-seccomp.h
@@ -0,0 +1,24 @@
+#pragma once
+
+/***
+ This file is part of systemd.
+
+ Copyright 2016 Lennart Poettering
+
+ systemd is free software; you can redistribute it and/or modify it
+ under the terms of the GNU Lesser General Public License as published by
+ the Free Software Foundation; either version 2.1 of the License, or
+ (at your option) any later version.
+
+ systemd is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public License
+ along with systemd; If not, see <http://www.gnu.org/licenses/>.
+***/
+
+#include <sys/types.h>
+
+int setup_seccomp(uint64_t cap_list_retain);
diff --git a/src/systemd-nspawn/nspawn-settings.h b/src/systemd-nspawn/nspawn-settings.h
index 95e70d5fa4..8a88647df2 100644
--- a/src/systemd-nspawn/nspawn-settings.h
+++ b/src/systemd-nspawn/nspawn-settings.h
@@ -57,7 +57,8 @@ typedef enum SettingsMask {
SETTING_CUSTOM_MOUNTS = 1 << 11,
SETTING_WORKING_DIRECTORY = 1 << 12,
SETTING_USERNS = 1 << 13,
- _SETTINGS_MASK_ALL = (1 << 14) -1
+ SETTING_NOTIFY_READY = 1 << 14,
+ _SETTINGS_MASK_ALL = (1 << 15) -1
} SettingsMask;
typedef struct Settings {
@@ -74,6 +75,7 @@ typedef struct Settings {
char *working_directory;
UserNamespaceMode userns_mode;
uid_t uid_shift, uid_range;
+ bool notify_ready;
/* [Image] */
int read_only;
diff --git a/src/systemd-nspawn/nspawn-setuid.c b/src/systemd-nspawn/nspawn-setuid.c
index 38ca9e2e24..360781c24c 100644
--- a/src/systemd-nspawn/nspawn-setuid.c
+++ b/src/systemd-nspawn/nspawn-setuid.c
@@ -125,14 +125,12 @@ int change_uid_gid(const char *user, char **_home) {
fd = -1;
if (!fgets(line, sizeof(line), f)) {
-
if (!ferror(f)) {
log_error("Failed to resolve user %s.", user);
return -ESRCH;
}
- log_error_errno(errno, "Failed to read from getent: %m");
- return -errno;
+ return log_error_errno(errno, "Failed to read from getent: %m");
}
truncate_nl(line);
@@ -215,8 +213,7 @@ int change_uid_gid(const char *user, char **_home) {
return -ESRCH;
}
- log_error_errno(errno, "Failed to read from getent: %m");
- return -errno;
+ return log_error_errno(errno, "Failed to read from getent: %m");
}
truncate_nl(line);
diff --git a/src/systemd-nspawn/nspawn.c b/src/systemd-nspawn/nspawn.c
index 5537a526f0..fb25ec7afa 100644
--- a/src/systemd-nspawn/nspawn.c
+++ b/src/systemd-nspawn/nspawn.c
@@ -27,9 +27,6 @@
#include <sched.h>
#include <linux/loop.h>
-#ifdef HAVE_SECCOMP
-#include <seccomp.h>
-#endif
#ifdef HAVE_SELINUX
#include <selinux/selinux.h>
#endif
@@ -57,7 +54,6 @@
#include "basic/copy.h"
#include "basic/env-util.h"
#include "basic/fd-util.h"
-#include "basic/fdset.h"
#include "basic/fileio.h"
#include "basic/formats-util.h"
#include "basic/fs-util.h"
@@ -71,45 +67,39 @@
#include "basic/path-util.h"
#include "basic/process-util.h"
#include "basic/random-util.h"
+#include "basic/raw-clone.h"
#include "basic/rm-rf.h"
#include "shared/base-filesystem.h"
#include "shared/dev-setup.h"
+#include "shared/fdset.h"
#include "shared/gpt.h"
#include "shared/machine-image.h"
#include "shared/ptyfwd.h"
+#include "shared/udev-util.h"
#include "loopback-setup.h"
-#include "machine-id-setup.h"
#include "nspawn-cgroup.h"
#include "nspawn-expose-ports.h"
#include "nspawn-mount.h"
#include "nspawn-network.h"
#include "nspawn-patch-uid.h"
#include "nspawn-register.h"
+#include "nspawn-seccomp.h"
#include "nspawn-settings.h"
#include "nspawn-setuid.h"
#include "nspawn-stub-pid1.h"
-#ifdef HAVE_SECCOMP
-#include "shared/seccomp-util.h"
-#endif
-#include "basic/selinux-util.h"
-#include "basic/signal-util.h"
-#include "basic/socket-util.h"
-#include "basic/stat-util.h"
-#include "basic/stdio-util.h"
-#include "basic/string-util.h"
-#include "basic/strv.h"
-#include "basic/terminal-util.h"
-#include "basic/umask-util.h"
-#include "basic/user-util.h"
-#include "basic/util.h"
-#include "shared/udev-util.h"
/* Note that devpts's gid= parameter parses GIDs as signed values, hence we stay away from the upper half of the 32bit
- * UID range here */
+ * UID range here. We leave a bit of room at the lower end and a lot of room at the upper end, so that other subsystems
+ * may have their own allocation ranges too. */
#define UID_SHIFT_PICK_MIN ((uid_t) UINT32_C(0x00080000))
#define UID_SHIFT_PICK_MAX ((uid_t) UINT32_C(0x6FFF0000))
+/* nspawn is listening on the socket at the path in the constant nspawn_notify_socket_path
+ * nspawn_notify_socket_path is relative to the container
+ * the init process in the container pid can send messages to nspawn following the sd_notify(3) protocol */
+#define NSPAWN_NOTIFY_SOCKET_PATH "/run/systemd/nspawn/notify"
+
typedef enum ContainerStatus {
CONTAINER_TERMINATED,
CONTAINER_REBOOTED
@@ -137,7 +127,9 @@ static StartMode arg_start_mode = START_PID1;
static bool arg_ephemeral = false;
static LinkJournal arg_link_journal = LINK_AUTO;
static bool arg_link_journal_try = false;
-static uint64_t arg_retain =
+static uint64_t arg_caps_retain =
+ (1ULL << CAP_AUDIT_CONTROL) |
+ (1ULL << CAP_AUDIT_WRITE) |
(1ULL << CAP_CHOWN) |
(1ULL << CAP_DAC_OVERRIDE) |
(1ULL << CAP_DAC_READ_SEARCH) |
@@ -147,23 +139,21 @@ static uint64_t arg_retain =
(1ULL << CAP_KILL) |
(1ULL << CAP_LEASE) |
(1ULL << CAP_LINUX_IMMUTABLE) |
+ (1ULL << CAP_MKNOD) |
(1ULL << CAP_NET_BIND_SERVICE) |
(1ULL << CAP_NET_BROADCAST) |
(1ULL << CAP_NET_RAW) |
- (1ULL << CAP_SETGID) |
(1ULL << CAP_SETFCAP) |
+ (1ULL << CAP_SETGID) |
(1ULL << CAP_SETPCAP) |
(1ULL << CAP_SETUID) |
(1ULL << CAP_SYS_ADMIN) |
+ (1ULL << CAP_SYS_BOOT) |
(1ULL << CAP_SYS_CHROOT) |
(1ULL << CAP_SYS_NICE) |
(1ULL << CAP_SYS_PTRACE) |
- (1ULL << CAP_SYS_TTY_CONFIG) |
(1ULL << CAP_SYS_RESOURCE) |
- (1ULL << CAP_SYS_BOOT) |
- (1ULL << CAP_AUDIT_WRITE) |
- (1ULL << CAP_AUDIT_CONTROL) |
- (1ULL << CAP_MKNOD);
+ (1ULL << CAP_SYS_TTY_CONFIG);
static CustomMount *arg_custom_mounts = NULL;
static unsigned arg_n_custom_mounts = 0;
static char **arg_setenv = NULL;
@@ -192,6 +182,7 @@ static SettingsMask arg_settings_mask = 0;
static int arg_settings_trusted = -1;
static char **arg_parameters = NULL;
static const char *arg_container_service_name = "systemd-nspawn";
+static bool arg_notify_ready = false;
static void help(void) {
printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
@@ -272,10 +263,11 @@ static void help(void) {
" the service unit nspawn is running in\n"
" --volatile[=MODE] Run the system in volatile mode\n"
" --settings=BOOLEAN Load additional settings from .nspawn file\n"
+ " --notify-ready=BOOLEAN Receive notifications from the container's init process,\n"
+ " accepted values: yes and no\n"
, program_invocation_short_name);
}
-
static int custom_mounts_prepare(void) {
unsigned i;
int r;
@@ -372,6 +364,7 @@ static int parse_argv(int argc, char *argv[]) {
ARG_SETTINGS,
ARG_CHDIR,
ARG_PRIVATE_USERS_CHOWN,
+ ARG_NOTIFY_READY,
};
static const struct option options[] = {
@@ -420,6 +413,7 @@ static int parse_argv(int argc, char *argv[]) {
{ "kill-signal", required_argument, NULL, ARG_KILL_SIGNAL },
{ "settings", required_argument, NULL, ARG_SETTINGS },
{ "chdir", required_argument, NULL, ARG_CHDIR },
+ { "notify-ready", required_argument, NULL, ARG_NOTIFY_READY },
{}
};
@@ -589,9 +583,12 @@ static int parse_argv(int argc, char *argv[]) {
case ARG_UUID:
r = sd_id128_from_string(optarg, &arg_uuid);
- if (r < 0) {
- log_error("Invalid UUID: %s", optarg);
- return r;
+ if (r < 0)
+ return log_error_errno(r, "Invalid UUID: %s", optarg);
+
+ if (sd_id128_is_null(arg_uuid)) {
+ log_error("Machine UUID may not be all zeroes.");
+ return -EINVAL;
}
arg_settings_mask |= SETTING_MACHINE_ID;
@@ -992,6 +989,16 @@ static int parse_argv(int argc, char *argv[]) {
arg_settings_mask |= SETTING_WORKING_DIRECTORY;
break;
+ case ARG_NOTIFY_READY:
+ r = parse_boolean(optarg);
+ if (r < 0) {
+ log_error("%s is not a valid notify mode. Valid modes are: yes, no, and ready.", optarg);
+ return -EINVAL;
+ }
+ arg_notify_ready = r;
+ arg_settings_mask |= SETTING_NOTIFY_READY;
+ break;
+
case '?':
return -EINVAL;
@@ -1076,7 +1083,7 @@ static int parse_argv(int argc, char *argv[]) {
if (mask_all_settings)
arg_settings_mask = _SETTINGS_MASK_ALL;
- arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
+ arg_caps_retain = (arg_caps_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
r = detect_unified_cgroup_hierarchy();
if (r < 0)
@@ -1251,20 +1258,9 @@ static int setup_resolv_conf(const char *dest) {
return 0;
}
-static char* id128_format_as_uuid(sd_id128_t id, char s[37]) {
- assert(s);
-
- snprintf(s, 37,
- "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
- SD_ID128_FORMAT_VAL(id));
-
- return s;
-}
-
static int setup_boot_id(const char *dest) {
+ sd_id128_t rnd = SD_ID128_NULL;
const char *from, *to;
- sd_id128_t rnd = {};
- char as_uuid[37];
int r;
if (arg_share_system)
@@ -1280,18 +1276,16 @@ static int setup_boot_id(const char *dest) {
if (r < 0)
return log_error_errno(r, "Failed to generate random boot id: %m");
- id128_format_as_uuid(rnd, as_uuid);
-
- r = write_string_file(from, as_uuid, WRITE_STRING_FILE_CREATE);
+ r = id128_write(from, ID128_UUID, rnd, false);
if (r < 0)
return log_error_errno(r, "Failed to write boot id: %m");
if (mount(from, to, NULL, MS_BIND, NULL) < 0)
r = log_error_errno(errno, "Failed to bind mount boot id: %m");
else if (mount(NULL, to, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL) < 0)
- log_warning_errno(errno, "Failed to make boot id read-only: %m");
+ log_warning_errno(errno, "Failed to make boot id read-only, ignoring: %m");
- unlink(from);
+ (void) unlink(from);
return r;
}
@@ -1633,7 +1627,7 @@ static int setup_journal(const char *directory) {
}
static int drop_capabilities(void) {
- return capability_bounding_set_drop(arg_retain, false);
+ return capability_bounding_set_drop(arg_caps_retain, false);
}
static int reset_audit_loginuid(void) {
@@ -1668,99 +1662,6 @@ static int reset_audit_loginuid(void) {
return 0;
}
-static int setup_seccomp(void) {
-
-#ifdef HAVE_SECCOMP
- static const struct {
- uint64_t capability;
- int syscall_num;
- } blacklist[] = {
- { CAP_SYS_RAWIO, SCMP_SYS(iopl) },
- { CAP_SYS_RAWIO, SCMP_SYS(ioperm) },
- { CAP_SYS_BOOT, SCMP_SYS(kexec_load) },
- { CAP_SYS_ADMIN, SCMP_SYS(swapon) },
- { CAP_SYS_ADMIN, SCMP_SYS(swapoff) },
- { CAP_SYS_ADMIN, SCMP_SYS(open_by_handle_at) },
- { CAP_SYS_MODULE, SCMP_SYS(init_module) },
- { CAP_SYS_MODULE, SCMP_SYS(finit_module) },
- { CAP_SYS_MODULE, SCMP_SYS(delete_module) },
- { CAP_SYSLOG, SCMP_SYS(syslog) },
- };
-
- scmp_filter_ctx seccomp;
- unsigned i;
- int r;
-
- seccomp = seccomp_init(SCMP_ACT_ALLOW);
- if (!seccomp)
- return log_oom();
-
- r = seccomp_add_secondary_archs(seccomp);
- if (r < 0) {
- log_error_errno(r, "Failed to add secondary archs to seccomp filter: %m");
- goto finish;
- }
-
- for (i = 0; i < ELEMENTSOF(blacklist); i++) {
- if (arg_retain & (1ULL << blacklist[i].capability))
- continue;
-
- r = seccomp_rule_add(seccomp, SCMP_ACT_ERRNO(EPERM), blacklist[i].syscall_num, 0);
- if (r == -EFAULT)
- continue; /* unknown syscall */
- if (r < 0) {
- log_error_errno(r, "Failed to block syscall: %m");
- goto finish;
- }
- }
-
- /*
- Audit is broken in containers, much of the userspace audit
- hookup will fail if running inside a container. We don't
- care and just turn off creation of audit sockets.
-
- This will make socket(AF_NETLINK, *, NETLINK_AUDIT) fail
- with EAFNOSUPPORT which audit userspace uses as indication
- that audit is disabled in the kernel.
- */
-
- r = seccomp_rule_add(
- seccomp,
- SCMP_ACT_ERRNO(EAFNOSUPPORT),
- SCMP_SYS(socket),
- 2,
- SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
- SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
- if (r < 0) {
- log_error_errno(r, "Failed to add audit seccomp rule: %m");
- goto finish;
- }
-
- r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
- if (r < 0) {
- log_error_errno(r, "Failed to unset NO_NEW_PRIVS: %m");
- goto finish;
- }
-
- r = seccomp_load(seccomp);
- if (r == -EINVAL) {
- log_debug_errno(r, "Kernel is probably not configured with CONFIG_SECCOMP. Disabling seccomp audit filter: %m");
- r = 0;
- goto finish;
- }
- if (r < 0) {
- log_error_errno(r, "Failed to install seccomp audit filter: %m");
- goto finish;
- }
-
-finish:
- seccomp_release(seccomp);
- return r;
-#else
- return 0;
-#endif
-
-}
static int setup_propagate(const char *root) {
const char *p, *q;
@@ -2310,33 +2211,37 @@ static int mount_device(const char *what, const char *where, const char *directo
}
static int setup_machine_id(const char *directory) {
+ const char *etc_machine_id;
+ sd_id128_t id;
int r;
- const char *etc_machine_id, *t;
- _cleanup_free_ char *s = NULL;
- etc_machine_id = prefix_roota(directory, "/etc/machine-id");
+ /* If the UUID in the container is already set, then that's what counts, and we use. If it isn't set, and the
+ * caller passed --uuid=, then we'll pass it in the $container_uuid env var to PID 1 of the container. The
+ * assumption is that PID 1 will then write it to /etc/machine-id to make it persistent. If --uuid= is not
+ * passed we generate a random UUID, and pass it via $container_uuid. In effect this means that /etc/machine-id
+ * in the container and our idea of the container UUID will always be in sync (at least if PID 1 in the
+ * container behaves nicely). */
- r = read_one_line_file(etc_machine_id, &s);
- if (r < 0)
- return log_error_errno(r, "Failed to read machine ID from %s: %m", etc_machine_id);
+ etc_machine_id = prefix_roota(directory, "/etc/machine-id");
- t = strstrip(s);
+ r = id128_read(etc_machine_id, ID128_PLAIN, &id);
+ if (r < 0) {
+ if (!IN_SET(r, -ENOENT, -ENOMEDIUM)) /* If the file is missing or empty, we don't mind */
+ return log_error_errno(r, "Failed to read machine ID from container image: %m");
- if (!isempty(t)) {
- r = sd_id128_from_string(t, &arg_uuid);
- if (r < 0)
- return log_error_errno(r, "Failed to parse machine ID from %s: %m", etc_machine_id);
- } else {
if (sd_id128_is_null(arg_uuid)) {
r = sd_id128_randomize(&arg_uuid);
if (r < 0)
- return log_error_errno(r, "Failed to generate random machine ID: %m");
+ return log_error_errno(r, "Failed to acquire randomized machine UUID: %m");
+ }
+ } else {
+ if (sd_id128_is_null(id)) {
+ log_error("Machine ID in container image is zero, refusing.");
+ return -EINVAL;
}
- }
- r = machine_id_setup(directory, arg_uuid);
- if (r < 0)
- return log_error_errno(r, "Failed to setup machine ID: %m");
+ arg_uuid = id;
+ }
return 0;
}
@@ -2447,10 +2352,9 @@ static int wait_for_container(pid_t pid, ContainerStatus *container) {
switch (status.si_code) {
case CLD_EXITED:
- if (status.si_status == 0) {
+ if (status.si_status == 0)
log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s exited successfully.", arg_machine);
-
- } else
+ else
log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s failed with error code %i.", arg_machine, status.si_status);
*container = CONTAINER_TERMINATED;
@@ -2458,13 +2362,11 @@ static int wait_for_container(pid_t pid, ContainerStatus *container) {
case CLD_KILLED:
if (status.si_status == SIGINT) {
-
log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s has been shut down.", arg_machine);
*container = CONTAINER_TERMINATED;
return 0;
} else if (status.si_status == SIGHUP) {
-
log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s is being rebooted.", arg_machine);
*container = CONTAINER_REBOOTED;
return 0;
@@ -2480,8 +2382,6 @@ static int wait_for_container(pid_t pid, ContainerStatus *container) {
log_error("Container %s failed due to unknown reason.", arg_machine);
return -EIO;
}
-
- return r;
}
static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
@@ -2632,6 +2532,7 @@ static int inner_child(
NULL, /* container_uuid */
NULL, /* LISTEN_FDS */
NULL, /* LISTEN_PID */
+ NULL, /* NOTIFY_SOCKET */
NULL
};
@@ -2725,7 +2626,7 @@ static int inner_child(
#ifdef HAVE_SELINUX
if (arg_selinux_context)
- if (setexeccon((security_context_t) arg_selinux_context) < 0)
+ if (setexeccon(arg_selinux_context) < 0)
return log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
#endif
@@ -2745,9 +2646,9 @@ static int inner_child(
(asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0))
return log_oom();
- assert(!sd_id128_equal(arg_uuid, SD_ID128_NULL));
+ assert(!sd_id128_is_null(arg_uuid));
- if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_format_as_uuid(arg_uuid, as_uuid)) < 0)
+ if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_to_uuid_string(arg_uuid, as_uuid)) < 0)
return log_oom();
if (fdset_size(fds) > 0) {
@@ -2759,6 +2660,8 @@ static int inner_child(
(asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0))
return log_oom();
}
+ if (asprintf((char **)(envp + n_env++), "NOTIFY_SOCKET=%s", NSPAWN_NOTIFY_SOCKET_PATH) < 0)
+ return log_oom();
env_use = strv_env_merge(2, envp, arg_setenv);
if (!env_use)
@@ -2828,6 +2731,37 @@ static int inner_child(
return log_error_errno(r, "execv() failed: %m");
}
+static int setup_sd_notify_child(void) {
+ static const int one = 1;
+ int fd = -1;
+ union sockaddr_union sa = {
+ .sa.sa_family = AF_UNIX,
+ };
+ int r;
+
+ fd = socket(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0);
+ if (fd < 0)
+ return log_error_errno(errno, "Failed to allocate notification socket: %m");
+
+ (void) mkdir_parents(NSPAWN_NOTIFY_SOCKET_PATH, 0755);
+ (void) unlink(NSPAWN_NOTIFY_SOCKET_PATH);
+
+ strncpy(sa.un.sun_path, NSPAWN_NOTIFY_SOCKET_PATH, sizeof(sa.un.sun_path)-1);
+ r = bind(fd, &sa.sa, SOCKADDR_UN_LEN(sa.un));
+ if (r < 0) {
+ safe_close(fd);
+ return log_error_errno(errno, "bind(%s) failed: %m", sa.un.sun_path);
+ }
+
+ r = setsockopt(fd, SOL_SOCKET, SO_PASSCRED, &one, sizeof(one));
+ if (r < 0) {
+ safe_close(fd);
+ return log_error_errno(errno, "SO_PASSCRED failed: %m");
+ }
+
+ return fd;
+}
+
static int outer_child(
Barrier *barrier,
const char *directory,
@@ -2839,6 +2773,7 @@ static int outer_child(
bool secondary,
int pid_socket,
int uuid_socket,
+ int notify_socket,
int kmsg_socket,
int rtnl_socket,
int uid_shift_socket,
@@ -2847,12 +2782,14 @@ static int outer_child(
pid_t pid;
ssize_t l;
int r;
+ _cleanup_close_ int fd = -1;
assert(barrier);
assert(directory);
assert(console);
assert(pid_socket >= 0);
assert(uuid_socket >= 0);
+ assert(notify_socket >= 0);
assert(kmsg_socket >= 0);
cg_unified_flush();
@@ -2920,7 +2857,7 @@ static int outer_child(
if (l < 0)
return log_error_errno(errno, "Failed to recv UID shift: %m");
if (l != sizeof(arg_uid_shift)) {
- log_error("Short read while recieving UID shift.");
+ log_error("Short read while receiving UID shift.");
return -EIO;
}
}
@@ -2994,7 +2931,7 @@ static int outer_child(
if (r < 0)
return r;
- r = setup_seccomp();
+ r = setup_seccomp(arg_caps_retain);
if (r < 0)
return r;
@@ -3039,16 +2976,20 @@ static int outer_child(
if (r < 0)
return log_error_errno(r, "Failed to move root directory: %m");
+ fd = setup_sd_notify_child();
+ if (fd < 0)
+ return fd;
+
pid = raw_clone(SIGCHLD|CLONE_NEWNS|
(arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS) |
(arg_private_network ? CLONE_NEWNET : 0) |
- (arg_userns_mode != USER_NAMESPACE_NO ? CLONE_NEWUSER : 0),
- NULL);
+ (arg_userns_mode != USER_NAMESPACE_NO ? CLONE_NEWUSER : 0));
if (pid < 0)
return log_error_errno(errno, "Failed to fork inner child: %m");
if (pid == 0) {
pid_socket = safe_close(pid_socket);
uuid_socket = safe_close(uuid_socket);
+ notify_socket = safe_close(notify_socket);
uid_shift_socket = safe_close(uid_shift_socket);
/* The inner child has all namespaces that are
@@ -3078,8 +3019,13 @@ static int outer_child(
return -EIO;
}
+ l = send_one_fd(notify_socket, fd, 0);
+ if (l < 0)
+ return log_error_errno(errno, "Failed to send notify fd: %m");
+
pid_socket = safe_close(pid_socket);
uuid_socket = safe_close(uuid_socket);
+ notify_socket = safe_close(notify_socket);
kmsg_socket = safe_close(kmsg_socket);
rtnl_socket = safe_close(rtnl_socket);
@@ -3162,6 +3108,95 @@ static int setup_uid_map(pid_t pid) {
return 0;
}
+static int nspawn_dispatch_notify_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata) {
+ char buf[NOTIFY_BUFFER_MAX+1];
+ char *p = NULL;
+ struct iovec iovec = {
+ .iov_base = buf,
+ .iov_len = sizeof(buf)-1,
+ };
+ union {
+ struct cmsghdr cmsghdr;
+ uint8_t buf[CMSG_SPACE(sizeof(struct ucred)) +
+ CMSG_SPACE(sizeof(int) * NOTIFY_FD_MAX)];
+ } control = {};
+ struct msghdr msghdr = {
+ .msg_iov = &iovec,
+ .msg_iovlen = 1,
+ .msg_control = &control,
+ .msg_controllen = sizeof(control),
+ };
+ struct cmsghdr *cmsg;
+ struct ucred *ucred = NULL;
+ ssize_t n;
+ pid_t inner_child_pid;
+ _cleanup_strv_free_ char **tags = NULL;
+
+ assert(userdata);
+
+ inner_child_pid = PTR_TO_PID(userdata);
+
+ if (revents != EPOLLIN) {
+ log_warning("Got unexpected poll event for notify fd.");
+ return 0;
+ }
+
+ n = recvmsg(fd, &msghdr, MSG_DONTWAIT|MSG_CMSG_CLOEXEC);
+ if (n < 0) {
+ if (errno == EAGAIN || errno == EINTR)
+ return 0;
+
+ return log_warning_errno(errno, "Couldn't read notification socket: %m");
+ }
+ cmsg_close_all(&msghdr);
+
+ CMSG_FOREACH(cmsg, &msghdr) {
+ if (cmsg->cmsg_level == SOL_SOCKET &&
+ cmsg->cmsg_type == SCM_CREDENTIALS &&
+ cmsg->cmsg_len == CMSG_LEN(sizeof(struct ucred))) {
+
+ ucred = (struct ucred*) CMSG_DATA(cmsg);
+ }
+ }
+
+ if (!ucred || ucred->pid != inner_child_pid) {
+ log_warning("Received notify message without valid credentials. Ignoring.");
+ return 0;
+ }
+
+ if ((size_t) n >= sizeof(buf)) {
+ log_warning("Received notify message exceeded maximum size. Ignoring.");
+ return 0;
+ }
+
+ buf[n] = 0;
+ tags = strv_split(buf, "\n\r");
+ if (!tags)
+ return log_oom();
+
+ if (strv_find(tags, "READY=1"))
+ sd_notifyf(false, "READY=1\n");
+
+ p = strv_find_startswith(tags, "STATUS=");
+ if (p)
+ sd_notifyf(false, "STATUS=Container running: %s", p);
+
+ return 0;
+}
+
+static int setup_sd_notify_parent(sd_event *event, int fd, pid_t *inner_child_pid) {
+ int r;
+ sd_event_source *notify_event_source;
+
+ r = sd_event_add_io(event, &notify_event_source, fd, EPOLLIN, nspawn_dispatch_notify_fd, inner_child_pid);
+ if (r < 0)
+ return log_error_errno(r, "Failed to allocate notify event source: %m");
+
+ (void) sd_event_source_set_description(notify_event_source, "nspawn-notify");
+
+ return 0;
+}
+
static int load_settings(void) {
_cleanup_(settings_freep) Settings *settings = NULL;
_cleanup_fclose_ FILE *f = NULL;
@@ -3278,9 +3313,9 @@ static int load_settings(void) {
if (settings->capability != 0)
log_warning("Ignoring Capability= setting, file %s is not trusted.", p);
} else
- arg_retain |= plus;
+ arg_caps_retain |= plus;
- arg_retain &= ~settings->drop_capability;
+ arg_caps_retain &= ~settings->drop_capability;
}
if ((arg_settings_mask & SETTING_KILL_SIGNAL) == 0 &&
@@ -3390,6 +3425,9 @@ static int load_settings(void) {
}
}
+ if ((arg_settings_mask & SETTING_NOTIFY_READY) == 0)
+ arg_notify_ready = settings->notify_ready;
+
return 0;
}
@@ -3504,7 +3542,7 @@ int main(int argc, char *argv[]) {
}
if (r < 0) {
log_error_errno(r, "Failed to lock %s: %m", arg_directory);
- return r;
+ goto finish;
}
if (arg_template) {
@@ -3640,7 +3678,9 @@ int main(int argc, char *argv[]) {
rtnl_socket_pair[2] = { -1, -1 },
pid_socket_pair[2] = { -1, -1 },
uuid_socket_pair[2] = { -1, -1 },
+ notify_socket_pair[2] = { -1, -1 },
uid_shift_socket_pair[2] = { -1, -1 };
+ _cleanup_close_ int notify_socket= -1;
_cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
_cleanup_(sd_event_unrefp) sd_event *event = NULL;
_cleanup_(pty_forward_freep) PTYForward *forward = NULL;
@@ -3691,6 +3731,11 @@ int main(int argc, char *argv[]) {
goto finish;
}
+ if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, notify_socket_pair) < 0) {
+ r = log_error_errno(errno, "Failed to create notify socket pair: %m");
+ goto finish;
+ }
+
if (arg_userns_mode != USER_NAMESPACE_NO)
if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, uid_shift_socket_pair) < 0) {
r = log_error_errno(errno, "Failed to create uid shift socket pair: %m");
@@ -3712,7 +3757,7 @@ int main(int argc, char *argv[]) {
goto finish;
}
- pid = raw_clone(SIGCHLD|CLONE_NEWNS, NULL);
+ pid = raw_clone(SIGCHLD|CLONE_NEWNS);
if (pid < 0) {
if (errno == EINVAL)
r = log_error_errno(errno, "clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
@@ -3732,6 +3777,7 @@ int main(int argc, char *argv[]) {
rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
pid_socket_pair[0] = safe_close(pid_socket_pair[0]);
uuid_socket_pair[0] = safe_close(uuid_socket_pair[0]);
+ notify_socket_pair[0] = safe_close(notify_socket_pair[0]);
uid_shift_socket_pair[0] = safe_close(uid_shift_socket_pair[0]);
(void) reset_all_signal_handlers();
@@ -3747,6 +3793,7 @@ int main(int argc, char *argv[]) {
secondary,
pid_socket_pair[1],
uuid_socket_pair[1],
+ notify_socket_pair[1],
kmsg_socket_pair[1],
rtnl_socket_pair[1],
uid_shift_socket_pair[1],
@@ -3765,6 +3812,7 @@ int main(int argc, char *argv[]) {
rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
pid_socket_pair[1] = safe_close(pid_socket_pair[1]);
uuid_socket_pair[1] = safe_close(uuid_socket_pair[1]);
+ notify_socket_pair[1] = safe_close(notify_socket_pair[1]);
uid_shift_socket_pair[1] = safe_close(uid_shift_socket_pair[1]);
if (arg_userns_mode != USER_NAMESPACE_NO) {
@@ -3838,6 +3886,13 @@ int main(int argc, char *argv[]) {
goto finish;
}
+ /* We also retrieve the socket used for notifications generated by outer child */
+ notify_socket = receive_one_fd(notify_socket_pair[0], 0);
+ if (notify_socket < 0) {
+ r = log_error_errno(errno, "Failed to receive notification socket from the outer child: %m");
+ goto finish;
+ }
+
log_debug("Init process invoked as PID " PID_FMT, pid);
if (arg_userns_mode != USER_NAMESPACE_NO) {
@@ -3952,6 +4007,16 @@ int main(int argc, char *argv[]) {
goto finish;
}
+ r = sd_event_new(&event);
+ if (r < 0) {
+ log_error_errno(r, "Failed to get default event source: %m");
+ goto finish;
+ }
+
+ r = setup_sd_notify_parent(event, notify_socket, PID_TO_PTR(pid));
+ if (r < 0)
+ goto finish;
+
/* Let the child know that we are ready and wait that the child is completely ready now. */
if (!barrier_place_and_sync(&barrier)) { /* #4 */
log_error("Child died too early.");
@@ -3964,15 +4029,10 @@ int main(int argc, char *argv[]) {
etc_passwd_lock = safe_close(etc_passwd_lock);
sd_notifyf(false,
- "READY=1\n"
"STATUS=Container running.\n"
"X_NSPAWN_LEADER_PID=" PID_FMT, pid);
-
- r = sd_event_new(&event);
- if (r < 0) {
- log_error_errno(r, "Failed to get default event source: %m");
- goto finish;
- }
+ if (!arg_notify_ready)
+ sd_notify(false, "READY=1\n");
if (arg_kill_signal > 0) {
/* Try to kill the init system on SIGINT or SIGTERM */
diff --git a/src/systemd-nspawn/systemd-nspawn.completion.bash b/src/systemd-nspawn/systemd-nspawn.completion.bash
index 0cf249d8ce..ea4a5e1f43 100644
--- a/src/systemd-nspawn/systemd-nspawn.completion.bash
+++ b/src/systemd-nspawn/systemd-nspawn.completion.bash
@@ -60,7 +60,8 @@ _systemd_nspawn() {
[ARG]='-D --directory -u --user --uuid --capability --drop-capability --link-journal --bind --bind-ro -M --machine
-S --slice --setenv -Z --selinux-context -L --selinux-apifs-context --register --network-interface --network-bridge
--personality -i --image --tmpfs --volatile
- --network-macvlan --kill-signal --template'
+ --network-macvlan --kill-signal --template
+ --notify-ready'
)
_init_completion || return
@@ -139,6 +140,10 @@ _systemd_nspawn() {
_signals
return
;;
+ --notify-ready)
+ comps='yes no'
+ return
+ ;;
esac
COMPREPLY=( $(compgen -W '$comps' -- "$cur") )
return 0
diff --git a/src/systemd-nspawn/systemd-nspawn.completion.zsh b/src/systemd-nspawn/systemd-nspawn.completion.zsh
index 3e0f667909..77b2e7cd7c 100644
--- a/src/systemd-nspawn/systemd-nspawn.completion.zsh
+++ b/src/systemd-nspawn/systemd-nspawn.completion.zsh
@@ -46,4 +46,5 @@ _arguments \
'--keep-unit[Instead of creating a transient scope unit to run the container in, simply register the service or scope unit systemd-nspawn has been invoked in with systemd-machined(8).]' \
'--personality=[Control the architecture ("personality") reported by uname(2) in the container.]:architecture:(x86 x86-64)' \
'--volatile=[Run the system in volatile mode.]:volatile:(no yes state)' \
+ "--notify-ready=[Control when the ready notification is sent]:options:(yes no)" \
'*:: : _normal'
diff --git a/src/systemd-nspawn/systemd-nspawn.tmpfiles b/src/systemd-nspawn/systemd-nspawn.tmpfiles
index 9fa3878d6b..78bd1c670e 100644
--- a/src/systemd-nspawn/systemd-nspawn.tmpfiles
+++ b/src/systemd-nspawn/systemd-nspawn.tmpfiles
@@ -10,7 +10,7 @@
Q /var/lib/machines 0700 - - -
# Remove old temporary snapshots, but only at boot. Ideally we'd have
-# "self-destroying" btrfs snapshots that go away if the last last
+# "self-destroying" btrfs snapshots that go away if the last
# reference to it does. To mimic a scheme like this at least remove
# the old snapshots on fresh boots, where we know they cannot be
# referenced anymore. Note that we actually remove all temporary files
diff --git a/src/systemd-nspawn/systemd-nspawn.xml b/src/systemd-nspawn/systemd-nspawn.xml
index 476cc2932f..69d2f6ff7d 100644
--- a/src/systemd-nspawn/systemd-nspawn.xml
+++ b/src/systemd-nspawn/systemd-nspawn.xml
@@ -67,69 +67,80 @@
<refsect1>
<title>Description</title>
- <para><command>systemd-nspawn</command> may be used to run a
- command or OS in a light-weight namespace container. In many ways
- it is similar to
- <citerefentry project='man-pages'><refentrytitle>chroot</refentrytitle><manvolnum>1</manvolnum></citerefentry>,
- but more powerful since it fully virtualizes the file system
- hierarchy, as well as the process tree, the various IPC subsystems
- and the host and domain name.</para>
-
- <para><command>systemd-nspawn</command> limits access to various
- kernel interfaces in the container to read-only, such as
- <filename>/sys</filename>, <filename>/proc/sys</filename> or
- <filename>/sys/fs/selinux</filename>. Network interfaces and the
- system clock may not be changed from within the container. Device
- nodes may not be created. The host system cannot be rebooted and
- kernel modules may not be loaded from within the container.</para>
-
- <para>Note that even though these security precautions are taken
- <command>systemd-nspawn</command> is not suitable for fully secure
- container setups. Many of the security features may be
- circumvented and are hence primarily useful to avoid accidental
- changes to the host system from the container.</para>
-
- <para>In contrast to
- <citerefentry project='man-pages'><refentrytitle>chroot</refentrytitle><manvolnum>1</manvolnum></citerefentry> <command>systemd-nspawn</command>
- may be used to boot full Linux-based operating systems in a
+ <para><command>systemd-nspawn</command> may be used to run a command or OS in a light-weight namespace
+ container. In many ways it is similar to <citerefentry
+ project='man-pages'><refentrytitle>chroot</refentrytitle><manvolnum>1</manvolnum></citerefentry>, but more powerful
+ since it fully virtualizes the file system hierarchy, as well as the process tree, the various IPC subsystems and
+ the host and domain name.</para>
+
+ <para><command>systemd-nspawn</command> may be invoked on any directory tree containing an operating system tree,
+ using the <option>--directory=</option> command line option. By using the <option>--machine=</option> option an OS
+ tree is automatically searched for in a couple of locations, most importantly in
+ <filename>/var/lib/machines</filename>, the suggested directory to place container images installed on the
+ system.</para>
+
+ <para>In contrast to <citerefentry
+ project='man-pages'><refentrytitle>chroot</refentrytitle><manvolnum>1</manvolnum></citerefentry> <command>systemd-nspawn</command>
+ may be used to boot full Linux-based operating systems in a container.</para>
+
+ <para><command>systemd-nspawn</command> limits access to various kernel interfaces in the container to read-only,
+ such as <filename>/sys</filename>, <filename>/proc/sys</filename> or <filename>/sys/fs/selinux</filename>. The
+ host's network interfaces and the system clock may not be changed from within the container. Device nodes may not
+ be created. The host system cannot be rebooted and kernel modules may not be loaded from within the
container.</para>
- <para>Use a tool like
- <citerefentry project='mankier'><refentrytitle>dnf</refentrytitle><manvolnum>8</manvolnum></citerefentry>,
- <citerefentry project='die-net'><refentrytitle>debootstrap</refentrytitle><manvolnum>8</manvolnum></citerefentry>,
- or
- <citerefentry project='archlinux'><refentrytitle>pacman</refentrytitle><manvolnum>8</manvolnum></citerefentry>
- to set up an OS directory tree suitable as file system hierarchy
- for <command>systemd-nspawn</command> containers.</para>
-
- <para>Note that <command>systemd-nspawn</command> will mount file
- systems private to the container to <filename>/dev</filename>,
- <filename>/run</filename> and similar. These will not be visible
- outside of the container, and their contents will be lost when the
- container exits.</para>
-
- <para>Note that running two <command>systemd-nspawn</command>
- containers from the same directory tree will not make processes in
- them see each other. The PID namespace separation of the two
- containers is complete and the containers will share very few
- runtime objects except for the underlying file system. Use
- <citerefentry><refentrytitle>machinectl</refentrytitle><manvolnum>1</manvolnum></citerefentry>'s
- <command>login</command> command to request an additional login
- prompt in a running container.</para>
-
- <para><command>systemd-nspawn</command> implements the
- <ulink
- url="http://www.freedesktop.org/wiki/Software/systemd/ContainerInterface">Container
- Interface</ulink> specification.</para>
-
- <para>As a safety check <command>systemd-nspawn</command> will
- verify the existence of <filename>/usr/lib/os-release</filename>
- or <filename>/etc/os-release</filename> in the container tree
- before starting the container (see
- <citerefentry><refentrytitle>os-release</refentrytitle><manvolnum>5</manvolnum></citerefentry>).
- It might be necessary to add this file to the container tree
- manually if the OS of the container is too old to contain this
+ <para>Use a tool like <citerefentry
+ project='mankier'><refentrytitle>dnf</refentrytitle><manvolnum>8</manvolnum></citerefentry>, <citerefentry
+ project='die-net'><refentrytitle>debootstrap</refentrytitle><manvolnum>8</manvolnum></citerefentry>, or
+ <citerefentry project='archlinux'><refentrytitle>pacman</refentrytitle><manvolnum>8</manvolnum></citerefentry> to
+ set up an OS directory tree suitable as file system hierarchy for <command>systemd-nspawn</command> containers. See
+ the Examples section below for details on suitable invocation of these commands.</para>
+
+ <para>As a safety check <command>systemd-nspawn</command> will verify the existence of
+ <filename>/usr/lib/os-release</filename> or <filename>/etc/os-release</filename> in the container tree before
+ starting the container (see
+ <citerefentry><refentrytitle>os-release</refentrytitle><manvolnum>5</manvolnum></citerefentry>). It might be
+ necessary to add this file to the container tree manually if the OS of the container is too old to contain this
file out-of-the-box.</para>
+
+ <para><command>systemd-nspawn</command> may be invoked directly from the interactive command line or run as system
+ service in the background. In this mode each container instance runs as its own service instance; a default
+ template unit file <filename>systemd-nspawn@.service</filename> is provided to make this easy, taking the container
+ name as instance identifier. Note that different default options apply when <command>systemd-nspawn</command> is
+ invoked by the template unit file than interactively on the command line. Most importantly the template unit file
+ makes use of the <option>--boot</option> which is not the default in case <command>systemd-nspawn</command> is
+ invoked from the interactive command line. Further differences with the defaults are documented along with the
+ various supported options below.</para>
+
+ <para>The <citerefentry><refentrytitle>machinectl</refentrytitle><manvolnum>1</manvolnum></citerefentry> tool may
+ be used to execute a number of operations on containers. In particular it provides easy-to-use commands to run
+ containers as system services using the <filename>systemd-nspawn@.service</filename> template unit
+ file.</para>
+
+ <para>Along with each container a settings file with the <filename>.nspawn</filename> suffix may exist, containing
+ additional settings to apply when running the container. See
+ <citerefentry><refentrytitle>systemd.nspawn</refentrytitle><manvolnum>5</manvolnum></citerefentry> for
+ details. Settings files override the default options used by the <filename>systemd-nspawn@.service</filename>
+ template unit file, making it usually unnecessary to alter this template file directly.</para>
+
+ <para>Note that <command>systemd-nspawn</command> will mount file systems private to the container to
+ <filename>/dev</filename>, <filename>/run</filename> and similar. These will not be visible outside of the
+ container, and their contents will be lost when the container exits.</para>
+
+ <para>Note that running two <command>systemd-nspawn</command> containers from the same directory tree will not make
+ processes in them see each other. The PID namespace separation of the two containers is complete and the containers
+ will share very few runtime objects except for the underlying file system. Use
+ <citerefentry><refentrytitle>machinectl</refentrytitle><manvolnum>1</manvolnum></citerefentry>'s
+ <command>login</command> or <command>shell</command> commands to request an additional login session in a running
+ container.</para>
+
+ <para><command>systemd-nspawn</command> implements the <ulink
+ url="http://www.freedesktop.org/wiki/Software/systemd/ContainerInterface">Container Interface</ulink>
+ specification.</para>
+
+ <para>While running, containers invoked with <command>systemd-nspawn</command> are registered with the
+ <citerefentry><refentrytitle>systemd-machined</refentrytitle><manvolnum>8</manvolnum></citerefentry> service that
+ keeps track of running containers, and provides programming interfaces to interact with them.</para>
</refsect1>
<refsect1>
@@ -139,7 +150,7 @@
are used as arguments for the init binary. Otherwise,
<replaceable>COMMAND</replaceable> specifies the program to launch
in the container, and the remaining arguments are used as
- arguments for this program. If <option>-b</option> is not used and
+ arguments for this program. If <option>--boot</option> is not used and
no arguments are specified, a shell is launched in the
container.</para>
@@ -310,6 +321,9 @@
</tbody>
</tgroup>
</table>
+
+ <para>Note that <option>--boot</option> is the default mode of operation if the
+ <filename>systemd-nspawn@.service</filename> template unit file is used.</para>
</listitem>
</varlistentry>
@@ -446,7 +460,10 @@
<listitem><para>If the kernel supports the user namespaces feature, equivalent to
<option>--private-users=pick</option>, otherwise equivalent to
- <option>--private-users=no</option>.</para></listitem>
+ <option>--private-users=no</option>.</para>
+
+ <para>Note that <option>-U</option> is the default if the <filename>systemd-nspawn@.service</filename> template unit
+ file is used.</para></listitem>
</varlistentry>
<varlistentry>
@@ -540,6 +557,9 @@
assignment via DHCP. In case <filename>systemd-networkd</filename> is running on both the host and inside the
container, automatic IP communication from the container to the host is thus available, with further
connectivity to the external network.</para>
+
+ <para>Note that <option>--network-veth</option> is the default if the
+ <filename>systemd-nspawn@.service</filename> template unit file is used.</para>
</listitem>
</varlistentry>
@@ -705,7 +725,10 @@
Effectively, booting a container once with
<literal>guest</literal> or <literal>host</literal> will link
the journal persistently if further on the default of
- <literal>auto</literal> is used.</para></listitem>
+ <literal>auto</literal> is used.</para>
+
+ <para>Note that <option>--link-journal=try-guest</option> is the default if the
+ <filename>systemd-nspawn@.service</filename> template unit file is used.</para></listitem>
</varlistentry>
<varlistentry>
@@ -910,8 +933,8 @@
<literal>tmpfs</literal> instance, and
<filename>/usr</filename> from the OS tree is mounted into it
in read-only mode (the system thus starts up with read-only OS
- resources, but pristine state and configuration, any changes
- to the either are lost on shutdown). When the mode parameter
+ image, but pristine state and configuration, any changes
+ are lost on shutdown). When the mode parameter
is specified as <option>state</option>, the OS tree is
mounted read-only, but <filename>/var</filename> is mounted as
a <literal>tmpfs</literal> instance into it (the system thus
@@ -980,6 +1003,19 @@
effect.</para></listitem>
</varlistentry>
+ <varlistentry>
+ <term><option>--notify-ready=</option></term>
+
+ <listitem><para>Configures support for notifications from the container's init process.
+ <option>--notify-ready=</option> takes a boolean (<option>no</option> and <option>yes</option>).
+ With option <option>no</option> systemd-nspawn notifies systemd
+ with a <literal>READY=1</literal> message when the init process is created.
+ With option <option>yes</option> systemd-nspawn waits for the
+ <literal>READY=1</literal> message from the init process in the container
+ before sending its own to systemd. For more details about notifications
+ see <citerefentry><refentrytitle>sd_notify</refentrytitle><manvolnum>3</manvolnum></citerefentry>).</para></listitem>
+ </varlistentry>
+
<xi:include href="standard-options.xml" xpointer="help" />
<xi:include href="standard-options.xml" xpointer="version" />
</variablelist>
diff --git a/src/systemd-nspawn/systemd-nspawn@.service.in b/src/systemd-nspawn/systemd-nspawn@.service.in
index ea28941507..c8141639b6 100644
--- a/src/systemd-nspawn/systemd-nspawn@.service.in
+++ b/src/systemd-nspawn/systemd-nspawn@.service.in
@@ -20,20 +20,13 @@ RestartForceExitStatus=133
SuccessExitStatus=133
Slice=machine.slice
Delegate=yes
-TasksMax=8192
+TasksMax=16384
# Enforce a strict device policy, similar to the one nspawn configures
# when it allocates its own scope unit. Make sure to keep these
# policies in sync if you change them!
-DevicePolicy=strict
-DeviceAllow=/dev/null rwm
-DeviceAllow=/dev/zero rwm
-DeviceAllow=/dev/full rwm
-DeviceAllow=/dev/random rwm
-DeviceAllow=/dev/urandom rwm
-DeviceAllow=/dev/tty rwm
+DevicePolicy=closed
DeviceAllow=/dev/net/tun rwm
-DeviceAllow=/dev/pts/ptmx rw
DeviceAllow=char-pts rw
# nspawn itself needs access to /dev/loop-control and /dev/loop, to