diff options
author | Luke Shumaker <lukeshu@sbcglobal.net> | 2016-09-14 18:33:57 -0400 |
---|---|---|
committer | Luke Shumaker <lukeshu@sbcglobal.net> | 2016-09-14 18:33:57 -0400 |
commit | 3c72c8d3ee67388336aca58c5afa3fb93a9c24c0 (patch) | |
tree | d072df7fee0f5906fad88c08398b2fe887cbc064 /src/systemd-nspawn | |
parent | e51613a3291342c6006edda8783755fb8994fd75 (diff) | |
parent | 6ba6ca19507add38549e07058c57489a8cd98cd1 (diff) |
Merge branch 'notsystemd/postmove' into notsystemd/master
# Conflicts:
# src/grp-journal/systemd-journald/Makefile
# src/grp-login/systemd-logind/Makefile
# src/grp-machine/grp-import/systemd-export/Makefile
# src/grp-machine/grp-import/systemd-import/Makefile
# src/grp-machine/grp-import/systemd-pull/Makefile
# src/grp-machine/systemd-machined/Makefile
# src/grp-network/libnetworkd-core/Makefile
# src/grp-resolve/libbasic-dns/Makefile
# src/grp-resolve/systemd-resolved/Makefile
# src/grp-utils/systemd-path/Makefile
# src/libshared/src/Makefile
# src/libsystemd-network/include/systemd-network/sd-ndisc.h
# src/libsystemd/Makefile
# src/libsystemd/src/test.mk
# src/libudev/Makefile
# src/systemd-dbus1-generator/Makefile
# src/systemd-nspawn/nspawn.c
Signed-off-by: Luke Shumaker <lukeshu@sbcglobal.net>
Diffstat (limited to 'src/systemd-nspawn')
-rw-r--r-- | src/systemd-nspawn/Makefile | 24 | ||||
-rw-r--r-- | src/systemd-nspawn/nspawn-cgroup.c | 2 | ||||
-rw-r--r-- | src/systemd-nspawn/nspawn-gperf.gperf | 1 | ||||
-rw-r--r-- | src/systemd-nspawn/nspawn-mount.c | 25 | ||||
-rw-r--r-- | src/systemd-nspawn/nspawn-network.c | 2 | ||||
-rw-r--r-- | src/systemd-nspawn/nspawn-patch-uid.c | 24 | ||||
-rw-r--r-- | src/systemd-nspawn/nspawn-register.c | 11 | ||||
-rw-r--r-- | src/systemd-nspawn/nspawn-seccomp.c | 198 | ||||
-rw-r--r-- | src/systemd-nspawn/nspawn-seccomp.h | 24 | ||||
-rw-r--r-- | src/systemd-nspawn/nspawn-settings.h | 4 | ||||
-rw-r--r-- | src/systemd-nspawn/nspawn-setuid.c | 7 | ||||
-rw-r--r-- | src/systemd-nspawn/nspawn.c | 432 | ||||
-rw-r--r-- | src/systemd-nspawn/systemd-nspawn.completion.bash | 7 | ||||
-rw-r--r-- | src/systemd-nspawn/systemd-nspawn.completion.zsh | 1 | ||||
-rw-r--r-- | src/systemd-nspawn/systemd-nspawn.tmpfiles | 2 | ||||
-rw-r--r-- | src/systemd-nspawn/systemd-nspawn.xml | 166 | ||||
-rw-r--r-- | src/systemd-nspawn/systemd-nspawn@.service.in | 11 |
17 files changed, 637 insertions, 304 deletions
diff --git a/src/systemd-nspawn/Makefile b/src/systemd-nspawn/Makefile index 2fe9bcc143..0c008808d8 100644 --- a/src/systemd-nspawn/Makefile +++ b/src/systemd-nspawn/Makefile @@ -36,6 +36,8 @@ systemd_nspawn_SOURCES = \ src/nspawn/nspawn-expose-ports.h \ src/nspawn/nspawn-cgroup.c \ src/nspawn/nspawn-cgroup.h \ + src/nspawn/nspawn-seccomp.c \ + src/nspawn/nspawn-seccomp.h \ src/nspawn/nspawn-register.c \ src/nspawn/nspawn-register.h \ src/nspawn/nspawn-setuid.c \ @@ -47,9 +49,7 @@ systemd_nspawn_SOURCES = \ src/core/mount-setup.c \ src/core/mount-setup.h \ src/core/loopback-setup.c \ - src/core/loopback-setup.h \ - src/core/machine-id-setup.c \ - src/core/machine-id-setup.h + src/core/loopback-setup.h nodist_systemd_nspawn_SOURCES = \ src/nspawn/nspawn-gperf.c @@ -59,12 +59,17 @@ gperf_gperf_sources += \ systemd_nspawn_CFLAGS = \ $(AM_CFLAGS) \ + $(ACL_CFLAGS) \ $(BLKID_CFLAGS) \ - $(SECCOMP_CFLAGS) + $(SECCOMP_CFLAGS) \ + $(SELINUX_CFLAGS) systemd_nspawn_LDADD = \ - libshared.la \ - $(BLKID_LIBS) + libsystemd-shared.la \ + $(ACL_LIBS) \ + $(BLKID_LIBS) \ + $(SECCOMP_LIBS) \ + $(SELINUX_LIBS) ifneq ($(HAVE_LIBIPTC),) systemd_nspawn_LDADD += \ @@ -76,8 +81,13 @@ test_patch_uid_SOURCES = \ src/nspawn/nspawn-patch-uid.h \ src/nspawn/test-patch-uid.c +test_patch_uid_CFLAGS = \ + $(AM_CFLAGS) \ + $(ACL_CFLAGS) + test_patch_uid_LDADD = \ - libshared.la + libsystemd-shared.la \ + $(ACL_LIBS) manual_tests += \ test-patch-uid diff --git a/src/systemd-nspawn/nspawn-cgroup.c b/src/systemd-nspawn/nspawn-cgroup.c index d0e2de2345..9fa46fadbd 100644 --- a/src/systemd-nspawn/nspawn-cgroup.c +++ b/src/systemd-nspawn/nspawn-cgroup.c @@ -124,7 +124,7 @@ int create_subcgroup(pid_t pid, bool unified_requested) { int unified, r; CGroupMask supported; - /* In the unified hierarchy inner nodes may only only contain + /* In the unified hierarchy inner nodes may only contain * subgroups, but not processes. Hence, if we running in the * unified hierarchy and the container does the same, and we * did not create a scope unit for the container move us and diff --git a/src/systemd-nspawn/nspawn-gperf.gperf b/src/systemd-nspawn/nspawn-gperf.gperf index dd3c1b3b88..332063e19b 100644 --- a/src/systemd-nspawn/nspawn-gperf.gperf +++ b/src/systemd-nspawn/nspawn-gperf.gperf @@ -29,6 +29,7 @@ Exec.Personality, config_parse_personality, 0, offsetof(Settings, Exec.MachineID, config_parse_id128, 0, offsetof(Settings, machine_id) Exec.WorkingDirectory, config_parse_path, 0, offsetof(Settings, working_directory) Exec.PrivateUsers, config_parse_private_users, 0, 0 +Exec.NotifyReady, config_parse_bool, 0, offsetof(Settings, notify_ready) Files.ReadOnly, config_parse_tristate, 0, offsetof(Settings, read_only) Files.Volatile, config_parse_volatile_mode, 0, offsetof(Settings, volatile_mode) Files.Bind, config_parse_bind, 0, 0 diff --git a/src/systemd-nspawn/nspawn-mount.c b/src/systemd-nspawn/nspawn-mount.c index d8ca696f21..ea1d591415 100644 --- a/src/systemd-nspawn/nspawn-mount.c +++ b/src/systemd-nspawn/nspawn-mount.c @@ -299,18 +299,19 @@ int mount_all(const char *dest, } MountPoint; static const MountPoint mount_table[] = { - { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true, true, false }, - { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, true, true, false }, /* Bind mount first */ - { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, true, true, false }, /* Then, make it r/o */ - { "tmpfs", "/sys", "tmpfs", "mode=755", MS_NOSUID|MS_NOEXEC|MS_NODEV, true, false, true }, - { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true, false, false }, - { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true, false, false }, - { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true, false, false }, - { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true, false, false }, - { "tmpfs", "/tmp", "tmpfs", "mode=1777", MS_STRICTATIME, true, false, false }, + { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true, true, false }, + { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, true, true, false }, /* Bind mount first ...*/ + { "/proc/sys/net", "/proc/sys/net", NULL, NULL, MS_BIND, true, true, true }, /* (except for this) */ + { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, true, true, false }, /* ... then, make it r/o */ + { "tmpfs", "/sys", "tmpfs", "mode=755", MS_NOSUID|MS_NOEXEC|MS_NODEV, true, false, true }, + { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true, false, false }, + { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true, false, false }, + { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true, false, false }, + { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true, false, false }, + { "tmpfs", "/tmp", "tmpfs", "mode=1777", MS_STRICTATIME, true, false, false }, #ifdef HAVE_SELINUX - { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, false, false, false }, /* Bind mount first */ - { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, false, false, false }, /* Then, make it r/o */ + { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, false, false, false }, /* Bind mount first */ + { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, false, false, false }, /* Then, make it r/o */ #endif }; @@ -344,7 +345,7 @@ int mount_all(const char *dest, if (mount_table[k].fatal) return log_error_errno(r, "Failed to create directory %s: %m", where); - log_warning_errno(r, "Failed to create directory %s: %m", where); + log_debug_errno(r, "Failed to create directory %s: %m", where); continue; } diff --git a/src/systemd-nspawn/nspawn-network.c b/src/systemd-nspawn/nspawn-network.c index 1ae0a52f03..b188491000 100644 --- a/src/systemd-nspawn/nspawn-network.c +++ b/src/systemd-nspawn/nspawn-network.c @@ -351,7 +351,7 @@ int setup_bridge(const char *veth_name, const char *bridge_name, bool create) { if (create) { /* We take a system-wide lock here, so that we can safely check whether there's still a member in the - * bridge before removing it, without risking interferance from other nspawn instances. */ + * bridge before removing it, without risking interference from other nspawn instances. */ r = make_lock_file("/run/systemd/nspawn-network-zone", LOCK_EX, &bridge_lock); if (r < 0) diff --git a/src/systemd-nspawn/nspawn-patch-uid.c b/src/systemd-nspawn/nspawn-patch-uid.c index 28d2801c69..dc703165ff 100644 --- a/src/systemd-nspawn/nspawn-patch-uid.c +++ b/src/systemd-nspawn/nspawn-patch-uid.c @@ -265,9 +265,12 @@ static int patch_fd(int fd, const char *name, const struct stat *st, uid_t shift return -errno; /* The Linux kernel alters the mode in some cases of chown(). Let's undo this. */ - if (name && !S_ISLNK(st->st_mode)) - r = fchmodat(fd, name, st->st_mode, 0); - else + if (name) { + if (!S_ISLNK(st->st_mode)) + r = fchmodat(fd, name, st->st_mode, 0); + else /* AT_SYMLINK_NOFOLLOW is not available for fchmodat() */ + r = 0; + } else r = fchmod(fd, st->st_mode); if (r < 0) return -errno; @@ -282,7 +285,13 @@ static int patch_fd(int fd, const char *name, const struct stat *st, uid_t shift return r > 0 || changed; } -static int is_procfs_sysfs_or_suchlike(int fd) { +/* + * Check if the filesystem is fully compatible with user namespaces or + * UID/GID patching. Some filesystems in this list can be fully mounted inside + * user namespaces, however their inodes may relate to host resources or only + * valid in the global user namespace, therefore no patching should be applied. + */ +static int is_fs_fully_userns_compatible(int fd) { struct statfs sfs; assert(fd >= 0); @@ -302,6 +311,9 @@ static int is_procfs_sysfs_or_suchlike(int fd) { F_TYPE_EQUAL(sfs.f_type, PSTOREFS_MAGIC) || F_TYPE_EQUAL(sfs.f_type, SELINUX_MAGIC) || F_TYPE_EQUAL(sfs.f_type, SMACK_MAGIC) || + F_TYPE_EQUAL(sfs.f_type, SECURITYFS_MAGIC) || + F_TYPE_EQUAL(sfs.f_type, BPF_FS_MAGIC) || + F_TYPE_EQUAL(sfs.f_type, TRACEFS_MAGIC) || F_TYPE_EQUAL(sfs.f_type, SYSFS_MAGIC); } @@ -313,8 +325,8 @@ static int recurse_fd(int fd, bool donate_fd, const struct stat *st, uid_t shift /* We generally want to permit crossing of mount boundaries when patching the UIDs/GIDs. However, we * probably shouldn't do this for /proc and /sys if that is already mounted into place. Hence, let's - * stop the recursion when we hit a procfs or sysfs file system. */ - r = is_procfs_sysfs_or_suchlike(fd); + * stop the recursion when we hit procfs, sysfs or some other special file systems. */ + r = is_fs_fully_userns_compatible(fd); if (r < 0) goto finish; if (r > 0) { diff --git a/src/systemd-nspawn/nspawn-register.c b/src/systemd-nspawn/nspawn-register.c index 3f174b6f24..12e29153f9 100644 --- a/src/systemd-nspawn/nspawn-register.c +++ b/src/systemd-nspawn/nspawn-register.c @@ -105,7 +105,7 @@ int register_machine( return bus_log_create_error(r); } - r = sd_bus_message_append(m, "(sv)", "DevicePolicy", "s", "strict"); + r = sd_bus_message_append(m, "(sv)", "DevicePolicy", "s", "closed"); if (r < 0) return bus_log_create_error(r); @@ -113,26 +113,19 @@ int register_machine( * systemd-nspawn@.service, to keep the device * policies in sync regardless if we are run with or * without the --keep-unit switch. */ - r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 9, + r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 2, /* Allow the container to * access and create the API * device nodes, so that * PrivateDevices= in the * container can work * fine */ - "/dev/null", "rwm", - "/dev/zero", "rwm", - "/dev/full", "rwm", - "/dev/random", "rwm", - "/dev/urandom", "rwm", - "/dev/tty", "rwm", "/dev/net/tun", "rwm", /* Allow the container * access to ptys. However, * do not permit the * container to ever create * these device nodes. */ - "/dev/pts/ptmx", "rw", "char-pts", "rw"); if (r < 0) return bus_log_create_error(r); diff --git a/src/systemd-nspawn/nspawn-seccomp.c b/src/systemd-nspawn/nspawn-seccomp.c new file mode 100644 index 0000000000..3efdf16a80 --- /dev/null +++ b/src/systemd-nspawn/nspawn-seccomp.c @@ -0,0 +1,198 @@ +/*** + This file is part of systemd. + + Copyright 2016 Lennart Poettering + + systemd is free software; you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published by + the Free Software Foundation; either version 2.1 of the License, or + (at your option) any later version. + + systemd is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with systemd; If not, see <http://www.gnu.org/licenses/>. +***/ + +#include <errno.h> +#include <sys/capability.h> +#include <sys/types.h> + +#include <linux/netlink.h> + +#ifdef HAVE_SECCOMP +#include <seccomp.h> +#endif + +#include "basic/log.h" + +#ifdef HAVE_SECCOMP +#include "shared/seccomp-util.h" +#endif + +#include "nspawn-seccomp.h" + +#ifdef HAVE_SECCOMP + +static int seccomp_add_default_syscall_filter(scmp_filter_ctx ctx, + uint64_t cap_list_retain) { + unsigned i; + int r; + static const struct { + uint64_t capability; + int syscall_num; + } blacklist[] = { + { 0, SCMP_SYS(_sysctl) }, /* obsolete syscall */ + { 0, SCMP_SYS(add_key) }, /* keyring is not namespaced */ + { 0, SCMP_SYS(afs_syscall) }, /* obsolete syscall */ + { 0, SCMP_SYS(bdflush) }, +#ifdef __NR_bpf + { 0, SCMP_SYS(bpf) }, +#endif + { 0, SCMP_SYS(break) }, /* obsolete syscall */ + { 0, SCMP_SYS(create_module) }, /* obsolete syscall */ + { 0, SCMP_SYS(ftime) }, /* obsolete syscall */ + { 0, SCMP_SYS(get_kernel_syms) }, /* obsolete syscall */ + { 0, SCMP_SYS(getpmsg) }, /* obsolete syscall */ + { 0, SCMP_SYS(gtty) }, /* obsolete syscall */ +#ifdef __NR_kexec_file_load + { 0, SCMP_SYS(kexec_file_load) }, +#endif + { 0, SCMP_SYS(kexec_load) }, + { 0, SCMP_SYS(keyctl) }, /* keyring is not namespaced */ + { 0, SCMP_SYS(lock) }, /* obsolete syscall */ + { 0, SCMP_SYS(lookup_dcookie) }, + { 0, SCMP_SYS(mpx) }, /* obsolete syscall */ + { 0, SCMP_SYS(nfsservctl) }, /* obsolete syscall */ + { 0, SCMP_SYS(open_by_handle_at) }, + { 0, SCMP_SYS(perf_event_open) }, + { 0, SCMP_SYS(prof) }, /* obsolete syscall */ + { 0, SCMP_SYS(profil) }, /* obsolete syscall */ + { 0, SCMP_SYS(putpmsg) }, /* obsolete syscall */ + { 0, SCMP_SYS(query_module) }, /* obsolete syscall */ + { 0, SCMP_SYS(quotactl) }, + { 0, SCMP_SYS(request_key) }, /* keyring is not namespaced */ + { 0, SCMP_SYS(security) }, /* obsolete syscall */ + { 0, SCMP_SYS(sgetmask) }, /* obsolete syscall */ + { 0, SCMP_SYS(ssetmask) }, /* obsolete syscall */ + { 0, SCMP_SYS(stty) }, /* obsolete syscall */ + { 0, SCMP_SYS(swapoff) }, + { 0, SCMP_SYS(swapon) }, + { 0, SCMP_SYS(sysfs) }, /* obsolete syscall */ + { 0, SCMP_SYS(tuxcall) }, /* obsolete syscall */ + { 0, SCMP_SYS(ulimit) }, /* obsolete syscall */ + { 0, SCMP_SYS(uselib) }, /* obsolete syscall */ + { 0, SCMP_SYS(ustat) }, /* obsolete syscall */ + { 0, SCMP_SYS(vserver) }, /* obsolete syscall */ + { CAP_SYSLOG, SCMP_SYS(syslog) }, + { CAP_SYS_MODULE, SCMP_SYS(delete_module) }, + { CAP_SYS_MODULE, SCMP_SYS(finit_module) }, + { CAP_SYS_MODULE, SCMP_SYS(init_module) }, + { CAP_SYS_PACCT, SCMP_SYS(acct) }, + { CAP_SYS_PTRACE, SCMP_SYS(process_vm_readv) }, + { CAP_SYS_PTRACE, SCMP_SYS(process_vm_writev) }, + { CAP_SYS_PTRACE, SCMP_SYS(ptrace) }, + { CAP_SYS_RAWIO, SCMP_SYS(ioperm) }, + { CAP_SYS_RAWIO, SCMP_SYS(iopl) }, + { CAP_SYS_RAWIO, SCMP_SYS(pciconfig_iobase) }, + { CAP_SYS_RAWIO, SCMP_SYS(pciconfig_read) }, + { CAP_SYS_RAWIO, SCMP_SYS(pciconfig_write) }, +#ifdef __NR_s390_pci_mmio_read + { CAP_SYS_RAWIO, SCMP_SYS(s390_pci_mmio_read) }, +#endif +#ifdef __NR_s390_pci_mmio_write + { CAP_SYS_RAWIO, SCMP_SYS(s390_pci_mmio_write) }, +#endif + { CAP_SYS_TIME, SCMP_SYS(adjtimex) }, + { CAP_SYS_TIME, SCMP_SYS(clock_adjtime) }, + { CAP_SYS_TIME, SCMP_SYS(clock_settime) }, + { CAP_SYS_TIME, SCMP_SYS(settimeofday) }, + { CAP_SYS_TIME, SCMP_SYS(stime) }, + }; + + for (i = 0; i < ELEMENTSOF(blacklist); i++) { + if (blacklist[i].capability != 0 && (cap_list_retain & (1ULL << blacklist[i].capability))) + continue; + + r = seccomp_rule_add(ctx, SCMP_ACT_ERRNO(EPERM), blacklist[i].syscall_num, 0); + if (r == -EFAULT) + continue; /* unknown syscall */ + if (r < 0) + return log_error_errno(r, "Failed to block syscall: %m"); + } + + return 0; +} + +int setup_seccomp(uint64_t cap_list_retain) { + scmp_filter_ctx seccomp; + int r; + + seccomp = seccomp_init(SCMP_ACT_ALLOW); + if (!seccomp) + return log_oom(); + + r = seccomp_add_secondary_archs(seccomp); + if (r < 0) { + log_error_errno(r, "Failed to add secondary archs to seccomp filter: %m"); + goto finish; + } + + r = seccomp_add_default_syscall_filter(seccomp, cap_list_retain); + if (r < 0) + goto finish; + + /* + Audit is broken in containers, much of the userspace audit + hookup will fail if running inside a container. We don't + care and just turn off creation of audit sockets. + + This will make socket(AF_NETLINK, *, NETLINK_AUDIT) fail + with EAFNOSUPPORT which audit userspace uses as indication + that audit is disabled in the kernel. + */ + + r = seccomp_rule_add( + seccomp, + SCMP_ACT_ERRNO(EAFNOSUPPORT), + SCMP_SYS(socket), + 2, + SCMP_A0(SCMP_CMP_EQ, AF_NETLINK), + SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT)); + if (r < 0) { + log_error_errno(r, "Failed to add audit seccomp rule: %m"); + goto finish; + } + + r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0); + if (r < 0) { + log_error_errno(r, "Failed to unset NO_NEW_PRIVS: %m"); + goto finish; + } + + r = seccomp_load(seccomp); + if (r == -EINVAL) { + log_debug_errno(r, "Kernel is probably not configured with CONFIG_SECCOMP. Disabling seccomp audit filter: %m"); + r = 0; + goto finish; + } + if (r < 0) { + log_error_errno(r, "Failed to install seccomp audit filter: %m"); + goto finish; + } + +finish: + seccomp_release(seccomp); + return r; +} + +#else + +int setup_seccomp(uint64_t cap_list_retain) { + return 0; +} + +#endif diff --git a/src/systemd-nspawn/nspawn-seccomp.h b/src/systemd-nspawn/nspawn-seccomp.h new file mode 100644 index 0000000000..5bde16faf9 --- /dev/null +++ b/src/systemd-nspawn/nspawn-seccomp.h @@ -0,0 +1,24 @@ +#pragma once + +/*** + This file is part of systemd. + + Copyright 2016 Lennart Poettering + + systemd is free software; you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published by + the Free Software Foundation; either version 2.1 of the License, or + (at your option) any later version. + + systemd is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with systemd; If not, see <http://www.gnu.org/licenses/>. +***/ + +#include <sys/types.h> + +int setup_seccomp(uint64_t cap_list_retain); diff --git a/src/systemd-nspawn/nspawn-settings.h b/src/systemd-nspawn/nspawn-settings.h index 95e70d5fa4..8a88647df2 100644 --- a/src/systemd-nspawn/nspawn-settings.h +++ b/src/systemd-nspawn/nspawn-settings.h @@ -57,7 +57,8 @@ typedef enum SettingsMask { SETTING_CUSTOM_MOUNTS = 1 << 11, SETTING_WORKING_DIRECTORY = 1 << 12, SETTING_USERNS = 1 << 13, - _SETTINGS_MASK_ALL = (1 << 14) -1 + SETTING_NOTIFY_READY = 1 << 14, + _SETTINGS_MASK_ALL = (1 << 15) -1 } SettingsMask; typedef struct Settings { @@ -74,6 +75,7 @@ typedef struct Settings { char *working_directory; UserNamespaceMode userns_mode; uid_t uid_shift, uid_range; + bool notify_ready; /* [Image] */ int read_only; diff --git a/src/systemd-nspawn/nspawn-setuid.c b/src/systemd-nspawn/nspawn-setuid.c index 38ca9e2e24..360781c24c 100644 --- a/src/systemd-nspawn/nspawn-setuid.c +++ b/src/systemd-nspawn/nspawn-setuid.c @@ -125,14 +125,12 @@ int change_uid_gid(const char *user, char **_home) { fd = -1; if (!fgets(line, sizeof(line), f)) { - if (!ferror(f)) { log_error("Failed to resolve user %s.", user); return -ESRCH; } - log_error_errno(errno, "Failed to read from getent: %m"); - return -errno; + return log_error_errno(errno, "Failed to read from getent: %m"); } truncate_nl(line); @@ -215,8 +213,7 @@ int change_uid_gid(const char *user, char **_home) { return -ESRCH; } - log_error_errno(errno, "Failed to read from getent: %m"); - return -errno; + return log_error_errno(errno, "Failed to read from getent: %m"); } truncate_nl(line); diff --git a/src/systemd-nspawn/nspawn.c b/src/systemd-nspawn/nspawn.c index 5537a526f0..fb25ec7afa 100644 --- a/src/systemd-nspawn/nspawn.c +++ b/src/systemd-nspawn/nspawn.c @@ -27,9 +27,6 @@ #include <sched.h> #include <linux/loop.h> -#ifdef HAVE_SECCOMP -#include <seccomp.h> -#endif #ifdef HAVE_SELINUX #include <selinux/selinux.h> #endif @@ -57,7 +54,6 @@ #include "basic/copy.h" #include "basic/env-util.h" #include "basic/fd-util.h" -#include "basic/fdset.h" #include "basic/fileio.h" #include "basic/formats-util.h" #include "basic/fs-util.h" @@ -71,45 +67,39 @@ #include "basic/path-util.h" #include "basic/process-util.h" #include "basic/random-util.h" +#include "basic/raw-clone.h" #include "basic/rm-rf.h" #include "shared/base-filesystem.h" #include "shared/dev-setup.h" +#include "shared/fdset.h" #include "shared/gpt.h" #include "shared/machine-image.h" #include "shared/ptyfwd.h" +#include "shared/udev-util.h" #include "loopback-setup.h" -#include "machine-id-setup.h" #include "nspawn-cgroup.h" #include "nspawn-expose-ports.h" #include "nspawn-mount.h" #include "nspawn-network.h" #include "nspawn-patch-uid.h" #include "nspawn-register.h" +#include "nspawn-seccomp.h" #include "nspawn-settings.h" #include "nspawn-setuid.h" #include "nspawn-stub-pid1.h" -#ifdef HAVE_SECCOMP -#include "shared/seccomp-util.h" -#endif -#include "basic/selinux-util.h" -#include "basic/signal-util.h" -#include "basic/socket-util.h" -#include "basic/stat-util.h" -#include "basic/stdio-util.h" -#include "basic/string-util.h" -#include "basic/strv.h" -#include "basic/terminal-util.h" -#include "basic/umask-util.h" -#include "basic/user-util.h" -#include "basic/util.h" -#include "shared/udev-util.h" /* Note that devpts's gid= parameter parses GIDs as signed values, hence we stay away from the upper half of the 32bit - * UID range here */ + * UID range here. We leave a bit of room at the lower end and a lot of room at the upper end, so that other subsystems + * may have their own allocation ranges too. */ #define UID_SHIFT_PICK_MIN ((uid_t) UINT32_C(0x00080000)) #define UID_SHIFT_PICK_MAX ((uid_t) UINT32_C(0x6FFF0000)) +/* nspawn is listening on the socket at the path in the constant nspawn_notify_socket_path + * nspawn_notify_socket_path is relative to the container + * the init process in the container pid can send messages to nspawn following the sd_notify(3) protocol */ +#define NSPAWN_NOTIFY_SOCKET_PATH "/run/systemd/nspawn/notify" + typedef enum ContainerStatus { CONTAINER_TERMINATED, CONTAINER_REBOOTED @@ -137,7 +127,9 @@ static StartMode arg_start_mode = START_PID1; static bool arg_ephemeral = false; static LinkJournal arg_link_journal = LINK_AUTO; static bool arg_link_journal_try = false; -static uint64_t arg_retain = +static uint64_t arg_caps_retain = + (1ULL << CAP_AUDIT_CONTROL) | + (1ULL << CAP_AUDIT_WRITE) | (1ULL << CAP_CHOWN) | (1ULL << CAP_DAC_OVERRIDE) | (1ULL << CAP_DAC_READ_SEARCH) | @@ -147,23 +139,21 @@ static uint64_t arg_retain = (1ULL << CAP_KILL) | (1ULL << CAP_LEASE) | (1ULL << CAP_LINUX_IMMUTABLE) | + (1ULL << CAP_MKNOD) | (1ULL << CAP_NET_BIND_SERVICE) | (1ULL << CAP_NET_BROADCAST) | (1ULL << CAP_NET_RAW) | - (1ULL << CAP_SETGID) | (1ULL << CAP_SETFCAP) | + (1ULL << CAP_SETGID) | (1ULL << CAP_SETPCAP) | (1ULL << CAP_SETUID) | (1ULL << CAP_SYS_ADMIN) | + (1ULL << CAP_SYS_BOOT) | (1ULL << CAP_SYS_CHROOT) | (1ULL << CAP_SYS_NICE) | (1ULL << CAP_SYS_PTRACE) | - (1ULL << CAP_SYS_TTY_CONFIG) | (1ULL << CAP_SYS_RESOURCE) | - (1ULL << CAP_SYS_BOOT) | - (1ULL << CAP_AUDIT_WRITE) | - (1ULL << CAP_AUDIT_CONTROL) | - (1ULL << CAP_MKNOD); + (1ULL << CAP_SYS_TTY_CONFIG); static CustomMount *arg_custom_mounts = NULL; static unsigned arg_n_custom_mounts = 0; static char **arg_setenv = NULL; @@ -192,6 +182,7 @@ static SettingsMask arg_settings_mask = 0; static int arg_settings_trusted = -1; static char **arg_parameters = NULL; static const char *arg_container_service_name = "systemd-nspawn"; +static bool arg_notify_ready = false; static void help(void) { printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n" @@ -272,10 +263,11 @@ static void help(void) { " the service unit nspawn is running in\n" " --volatile[=MODE] Run the system in volatile mode\n" " --settings=BOOLEAN Load additional settings from .nspawn file\n" + " --notify-ready=BOOLEAN Receive notifications from the container's init process,\n" + " accepted values: yes and no\n" , program_invocation_short_name); } - static int custom_mounts_prepare(void) { unsigned i; int r; @@ -372,6 +364,7 @@ static int parse_argv(int argc, char *argv[]) { ARG_SETTINGS, ARG_CHDIR, ARG_PRIVATE_USERS_CHOWN, + ARG_NOTIFY_READY, }; static const struct option options[] = { @@ -420,6 +413,7 @@ static int parse_argv(int argc, char *argv[]) { { "kill-signal", required_argument, NULL, ARG_KILL_SIGNAL }, { "settings", required_argument, NULL, ARG_SETTINGS }, { "chdir", required_argument, NULL, ARG_CHDIR }, + { "notify-ready", required_argument, NULL, ARG_NOTIFY_READY }, {} }; @@ -589,9 +583,12 @@ static int parse_argv(int argc, char *argv[]) { case ARG_UUID: r = sd_id128_from_string(optarg, &arg_uuid); - if (r < 0) { - log_error("Invalid UUID: %s", optarg); - return r; + if (r < 0) + return log_error_errno(r, "Invalid UUID: %s", optarg); + + if (sd_id128_is_null(arg_uuid)) { + log_error("Machine UUID may not be all zeroes."); + return -EINVAL; } arg_settings_mask |= SETTING_MACHINE_ID; @@ -992,6 +989,16 @@ static int parse_argv(int argc, char *argv[]) { arg_settings_mask |= SETTING_WORKING_DIRECTORY; break; + case ARG_NOTIFY_READY: + r = parse_boolean(optarg); + if (r < 0) { + log_error("%s is not a valid notify mode. Valid modes are: yes, no, and ready.", optarg); + return -EINVAL; + } + arg_notify_ready = r; + arg_settings_mask |= SETTING_NOTIFY_READY; + break; + case '?': return -EINVAL; @@ -1076,7 +1083,7 @@ static int parse_argv(int argc, char *argv[]) { if (mask_all_settings) arg_settings_mask = _SETTINGS_MASK_ALL; - arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus; + arg_caps_retain = (arg_caps_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus; r = detect_unified_cgroup_hierarchy(); if (r < 0) @@ -1251,20 +1258,9 @@ static int setup_resolv_conf(const char *dest) { return 0; } -static char* id128_format_as_uuid(sd_id128_t id, char s[37]) { - assert(s); - - snprintf(s, 37, - "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x", - SD_ID128_FORMAT_VAL(id)); - - return s; -} - static int setup_boot_id(const char *dest) { + sd_id128_t rnd = SD_ID128_NULL; const char *from, *to; - sd_id128_t rnd = {}; - char as_uuid[37]; int r; if (arg_share_system) @@ -1280,18 +1276,16 @@ static int setup_boot_id(const char *dest) { if (r < 0) return log_error_errno(r, "Failed to generate random boot id: %m"); - id128_format_as_uuid(rnd, as_uuid); - - r = write_string_file(from, as_uuid, WRITE_STRING_FILE_CREATE); + r = id128_write(from, ID128_UUID, rnd, false); if (r < 0) return log_error_errno(r, "Failed to write boot id: %m"); if (mount(from, to, NULL, MS_BIND, NULL) < 0) r = log_error_errno(errno, "Failed to bind mount boot id: %m"); else if (mount(NULL, to, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL) < 0) - log_warning_errno(errno, "Failed to make boot id read-only: %m"); + log_warning_errno(errno, "Failed to make boot id read-only, ignoring: %m"); - unlink(from); + (void) unlink(from); return r; } @@ -1633,7 +1627,7 @@ static int setup_journal(const char *directory) { } static int drop_capabilities(void) { - return capability_bounding_set_drop(arg_retain, false); + return capability_bounding_set_drop(arg_caps_retain, false); } static int reset_audit_loginuid(void) { @@ -1668,99 +1662,6 @@ static int reset_audit_loginuid(void) { return 0; } -static int setup_seccomp(void) { - -#ifdef HAVE_SECCOMP - static const struct { - uint64_t capability; - int syscall_num; - } blacklist[] = { - { CAP_SYS_RAWIO, SCMP_SYS(iopl) }, - { CAP_SYS_RAWIO, SCMP_SYS(ioperm) }, - { CAP_SYS_BOOT, SCMP_SYS(kexec_load) }, - { CAP_SYS_ADMIN, SCMP_SYS(swapon) }, - { CAP_SYS_ADMIN, SCMP_SYS(swapoff) }, - { CAP_SYS_ADMIN, SCMP_SYS(open_by_handle_at) }, - { CAP_SYS_MODULE, SCMP_SYS(init_module) }, - { CAP_SYS_MODULE, SCMP_SYS(finit_module) }, - { CAP_SYS_MODULE, SCMP_SYS(delete_module) }, - { CAP_SYSLOG, SCMP_SYS(syslog) }, - }; - - scmp_filter_ctx seccomp; - unsigned i; - int r; - - seccomp = seccomp_init(SCMP_ACT_ALLOW); - if (!seccomp) - return log_oom(); - - r = seccomp_add_secondary_archs(seccomp); - if (r < 0) { - log_error_errno(r, "Failed to add secondary archs to seccomp filter: %m"); - goto finish; - } - - for (i = 0; i < ELEMENTSOF(blacklist); i++) { - if (arg_retain & (1ULL << blacklist[i].capability)) - continue; - - r = seccomp_rule_add(seccomp, SCMP_ACT_ERRNO(EPERM), blacklist[i].syscall_num, 0); - if (r == -EFAULT) - continue; /* unknown syscall */ - if (r < 0) { - log_error_errno(r, "Failed to block syscall: %m"); - goto finish; - } - } - - /* - Audit is broken in containers, much of the userspace audit - hookup will fail if running inside a container. We don't - care and just turn off creation of audit sockets. - - This will make socket(AF_NETLINK, *, NETLINK_AUDIT) fail - with EAFNOSUPPORT which audit userspace uses as indication - that audit is disabled in the kernel. - */ - - r = seccomp_rule_add( - seccomp, - SCMP_ACT_ERRNO(EAFNOSUPPORT), - SCMP_SYS(socket), - 2, - SCMP_A0(SCMP_CMP_EQ, AF_NETLINK), - SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT)); - if (r < 0) { - log_error_errno(r, "Failed to add audit seccomp rule: %m"); - goto finish; - } - - r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0); - if (r < 0) { - log_error_errno(r, "Failed to unset NO_NEW_PRIVS: %m"); - goto finish; - } - - r = seccomp_load(seccomp); - if (r == -EINVAL) { - log_debug_errno(r, "Kernel is probably not configured with CONFIG_SECCOMP. Disabling seccomp audit filter: %m"); - r = 0; - goto finish; - } - if (r < 0) { - log_error_errno(r, "Failed to install seccomp audit filter: %m"); - goto finish; - } - -finish: - seccomp_release(seccomp); - return r; -#else - return 0; -#endif - -} static int setup_propagate(const char *root) { const char *p, *q; @@ -2310,33 +2211,37 @@ static int mount_device(const char *what, const char *where, const char *directo } static int setup_machine_id(const char *directory) { + const char *etc_machine_id; + sd_id128_t id; int r; - const char *etc_machine_id, *t; - _cleanup_free_ char *s = NULL; - etc_machine_id = prefix_roota(directory, "/etc/machine-id"); + /* If the UUID in the container is already set, then that's what counts, and we use. If it isn't set, and the + * caller passed --uuid=, then we'll pass it in the $container_uuid env var to PID 1 of the container. The + * assumption is that PID 1 will then write it to /etc/machine-id to make it persistent. If --uuid= is not + * passed we generate a random UUID, and pass it via $container_uuid. In effect this means that /etc/machine-id + * in the container and our idea of the container UUID will always be in sync (at least if PID 1 in the + * container behaves nicely). */ - r = read_one_line_file(etc_machine_id, &s); - if (r < 0) - return log_error_errno(r, "Failed to read machine ID from %s: %m", etc_machine_id); + etc_machine_id = prefix_roota(directory, "/etc/machine-id"); - t = strstrip(s); + r = id128_read(etc_machine_id, ID128_PLAIN, &id); + if (r < 0) { + if (!IN_SET(r, -ENOENT, -ENOMEDIUM)) /* If the file is missing or empty, we don't mind */ + return log_error_errno(r, "Failed to read machine ID from container image: %m"); - if (!isempty(t)) { - r = sd_id128_from_string(t, &arg_uuid); - if (r < 0) - return log_error_errno(r, "Failed to parse machine ID from %s: %m", etc_machine_id); - } else { if (sd_id128_is_null(arg_uuid)) { r = sd_id128_randomize(&arg_uuid); if (r < 0) - return log_error_errno(r, "Failed to generate random machine ID: %m"); + return log_error_errno(r, "Failed to acquire randomized machine UUID: %m"); + } + } else { + if (sd_id128_is_null(id)) { + log_error("Machine ID in container image is zero, refusing."); + return -EINVAL; } - } - r = machine_id_setup(directory, arg_uuid); - if (r < 0) - return log_error_errno(r, "Failed to setup machine ID: %m"); + arg_uuid = id; + } return 0; } @@ -2447,10 +2352,9 @@ static int wait_for_container(pid_t pid, ContainerStatus *container) { switch (status.si_code) { case CLD_EXITED: - if (status.si_status == 0) { + if (status.si_status == 0) log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s exited successfully.", arg_machine); - - } else + else log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s failed with error code %i.", arg_machine, status.si_status); *container = CONTAINER_TERMINATED; @@ -2458,13 +2362,11 @@ static int wait_for_container(pid_t pid, ContainerStatus *container) { case CLD_KILLED: if (status.si_status == SIGINT) { - log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s has been shut down.", arg_machine); *container = CONTAINER_TERMINATED; return 0; } else if (status.si_status == SIGHUP) { - log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s is being rebooted.", arg_machine); *container = CONTAINER_REBOOTED; return 0; @@ -2480,8 +2382,6 @@ static int wait_for_container(pid_t pid, ContainerStatus *container) { log_error("Container %s failed due to unknown reason.", arg_machine); return -EIO; } - - return r; } static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) { @@ -2632,6 +2532,7 @@ static int inner_child( NULL, /* container_uuid */ NULL, /* LISTEN_FDS */ NULL, /* LISTEN_PID */ + NULL, /* NOTIFY_SOCKET */ NULL }; @@ -2725,7 +2626,7 @@ static int inner_child( #ifdef HAVE_SELINUX if (arg_selinux_context) - if (setexeccon((security_context_t) arg_selinux_context) < 0) + if (setexeccon(arg_selinux_context) < 0) return log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context); #endif @@ -2745,9 +2646,9 @@ static int inner_child( (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) return log_oom(); - assert(!sd_id128_equal(arg_uuid, SD_ID128_NULL)); + assert(!sd_id128_is_null(arg_uuid)); - if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_format_as_uuid(arg_uuid, as_uuid)) < 0) + if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_to_uuid_string(arg_uuid, as_uuid)) < 0) return log_oom(); if (fdset_size(fds) > 0) { @@ -2759,6 +2660,8 @@ static int inner_child( (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0)) return log_oom(); } + if (asprintf((char **)(envp + n_env++), "NOTIFY_SOCKET=%s", NSPAWN_NOTIFY_SOCKET_PATH) < 0) + return log_oom(); env_use = strv_env_merge(2, envp, arg_setenv); if (!env_use) @@ -2828,6 +2731,37 @@ static int inner_child( return log_error_errno(r, "execv() failed: %m"); } +static int setup_sd_notify_child(void) { + static const int one = 1; + int fd = -1; + union sockaddr_union sa = { + .sa.sa_family = AF_UNIX, + }; + int r; + + fd = socket(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0); + if (fd < 0) + return log_error_errno(errno, "Failed to allocate notification socket: %m"); + + (void) mkdir_parents(NSPAWN_NOTIFY_SOCKET_PATH, 0755); + (void) unlink(NSPAWN_NOTIFY_SOCKET_PATH); + + strncpy(sa.un.sun_path, NSPAWN_NOTIFY_SOCKET_PATH, sizeof(sa.un.sun_path)-1); + r = bind(fd, &sa.sa, SOCKADDR_UN_LEN(sa.un)); + if (r < 0) { + safe_close(fd); + return log_error_errno(errno, "bind(%s) failed: %m", sa.un.sun_path); + } + + r = setsockopt(fd, SOL_SOCKET, SO_PASSCRED, &one, sizeof(one)); + if (r < 0) { + safe_close(fd); + return log_error_errno(errno, "SO_PASSCRED failed: %m"); + } + + return fd; +} + static int outer_child( Barrier *barrier, const char *directory, @@ -2839,6 +2773,7 @@ static int outer_child( bool secondary, int pid_socket, int uuid_socket, + int notify_socket, int kmsg_socket, int rtnl_socket, int uid_shift_socket, @@ -2847,12 +2782,14 @@ static int outer_child( pid_t pid; ssize_t l; int r; + _cleanup_close_ int fd = -1; assert(barrier); assert(directory); assert(console); assert(pid_socket >= 0); assert(uuid_socket >= 0); + assert(notify_socket >= 0); assert(kmsg_socket >= 0); cg_unified_flush(); @@ -2920,7 +2857,7 @@ static int outer_child( if (l < 0) return log_error_errno(errno, "Failed to recv UID shift: %m"); if (l != sizeof(arg_uid_shift)) { - log_error("Short read while recieving UID shift."); + log_error("Short read while receiving UID shift."); return -EIO; } } @@ -2994,7 +2931,7 @@ static int outer_child( if (r < 0) return r; - r = setup_seccomp(); + r = setup_seccomp(arg_caps_retain); if (r < 0) return r; @@ -3039,16 +2976,20 @@ static int outer_child( if (r < 0) return log_error_errno(r, "Failed to move root directory: %m"); + fd = setup_sd_notify_child(); + if (fd < 0) + return fd; + pid = raw_clone(SIGCHLD|CLONE_NEWNS| (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS) | (arg_private_network ? CLONE_NEWNET : 0) | - (arg_userns_mode != USER_NAMESPACE_NO ? CLONE_NEWUSER : 0), - NULL); + (arg_userns_mode != USER_NAMESPACE_NO ? CLONE_NEWUSER : 0)); if (pid < 0) return log_error_errno(errno, "Failed to fork inner child: %m"); if (pid == 0) { pid_socket = safe_close(pid_socket); uuid_socket = safe_close(uuid_socket); + notify_socket = safe_close(notify_socket); uid_shift_socket = safe_close(uid_shift_socket); /* The inner child has all namespaces that are @@ -3078,8 +3019,13 @@ static int outer_child( return -EIO; } + l = send_one_fd(notify_socket, fd, 0); + if (l < 0) + return log_error_errno(errno, "Failed to send notify fd: %m"); + pid_socket = safe_close(pid_socket); uuid_socket = safe_close(uuid_socket); + notify_socket = safe_close(notify_socket); kmsg_socket = safe_close(kmsg_socket); rtnl_socket = safe_close(rtnl_socket); @@ -3162,6 +3108,95 @@ static int setup_uid_map(pid_t pid) { return 0; } +static int nspawn_dispatch_notify_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata) { + char buf[NOTIFY_BUFFER_MAX+1]; + char *p = NULL; + struct iovec iovec = { + .iov_base = buf, + .iov_len = sizeof(buf)-1, + }; + union { + struct cmsghdr cmsghdr; + uint8_t buf[CMSG_SPACE(sizeof(struct ucred)) + + CMSG_SPACE(sizeof(int) * NOTIFY_FD_MAX)]; + } control = {}; + struct msghdr msghdr = { + .msg_iov = &iovec, + .msg_iovlen = 1, + .msg_control = &control, + .msg_controllen = sizeof(control), + }; + struct cmsghdr *cmsg; + struct ucred *ucred = NULL; + ssize_t n; + pid_t inner_child_pid; + _cleanup_strv_free_ char **tags = NULL; + + assert(userdata); + + inner_child_pid = PTR_TO_PID(userdata); + + if (revents != EPOLLIN) { + log_warning("Got unexpected poll event for notify fd."); + return 0; + } + + n = recvmsg(fd, &msghdr, MSG_DONTWAIT|MSG_CMSG_CLOEXEC); + if (n < 0) { + if (errno == EAGAIN || errno == EINTR) + return 0; + + return log_warning_errno(errno, "Couldn't read notification socket: %m"); + } + cmsg_close_all(&msghdr); + + CMSG_FOREACH(cmsg, &msghdr) { + if (cmsg->cmsg_level == SOL_SOCKET && + cmsg->cmsg_type == SCM_CREDENTIALS && + cmsg->cmsg_len == CMSG_LEN(sizeof(struct ucred))) { + + ucred = (struct ucred*) CMSG_DATA(cmsg); + } + } + + if (!ucred || ucred->pid != inner_child_pid) { + log_warning("Received notify message without valid credentials. Ignoring."); + return 0; + } + + if ((size_t) n >= sizeof(buf)) { + log_warning("Received notify message exceeded maximum size. Ignoring."); + return 0; + } + + buf[n] = 0; + tags = strv_split(buf, "\n\r"); + if (!tags) + return log_oom(); + + if (strv_find(tags, "READY=1")) + sd_notifyf(false, "READY=1\n"); + + p = strv_find_startswith(tags, "STATUS="); + if (p) + sd_notifyf(false, "STATUS=Container running: %s", p); + + return 0; +} + +static int setup_sd_notify_parent(sd_event *event, int fd, pid_t *inner_child_pid) { + int r; + sd_event_source *notify_event_source; + + r = sd_event_add_io(event, ¬ify_event_source, fd, EPOLLIN, nspawn_dispatch_notify_fd, inner_child_pid); + if (r < 0) + return log_error_errno(r, "Failed to allocate notify event source: %m"); + + (void) sd_event_source_set_description(notify_event_source, "nspawn-notify"); + + return 0; +} + static int load_settings(void) { _cleanup_(settings_freep) Settings *settings = NULL; _cleanup_fclose_ FILE *f = NULL; @@ -3278,9 +3313,9 @@ static int load_settings(void) { if (settings->capability != 0) log_warning("Ignoring Capability= setting, file %s is not trusted.", p); } else - arg_retain |= plus; + arg_caps_retain |= plus; - arg_retain &= ~settings->drop_capability; + arg_caps_retain &= ~settings->drop_capability; } if ((arg_settings_mask & SETTING_KILL_SIGNAL) == 0 && @@ -3390,6 +3425,9 @@ static int load_settings(void) { } } + if ((arg_settings_mask & SETTING_NOTIFY_READY) == 0) + arg_notify_ready = settings->notify_ready; + return 0; } @@ -3504,7 +3542,7 @@ int main(int argc, char *argv[]) { } if (r < 0) { log_error_errno(r, "Failed to lock %s: %m", arg_directory); - return r; + goto finish; } if (arg_template) { @@ -3640,7 +3678,9 @@ int main(int argc, char *argv[]) { rtnl_socket_pair[2] = { -1, -1 }, pid_socket_pair[2] = { -1, -1 }, uuid_socket_pair[2] = { -1, -1 }, + notify_socket_pair[2] = { -1, -1 }, uid_shift_socket_pair[2] = { -1, -1 }; + _cleanup_close_ int notify_socket= -1; _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL; _cleanup_(sd_event_unrefp) sd_event *event = NULL; _cleanup_(pty_forward_freep) PTYForward *forward = NULL; @@ -3691,6 +3731,11 @@ int main(int argc, char *argv[]) { goto finish; } + if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, notify_socket_pair) < 0) { + r = log_error_errno(errno, "Failed to create notify socket pair: %m"); + goto finish; + } + if (arg_userns_mode != USER_NAMESPACE_NO) if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, uid_shift_socket_pair) < 0) { r = log_error_errno(errno, "Failed to create uid shift socket pair: %m"); @@ -3712,7 +3757,7 @@ int main(int argc, char *argv[]) { goto finish; } - pid = raw_clone(SIGCHLD|CLONE_NEWNS, NULL); + pid = raw_clone(SIGCHLD|CLONE_NEWNS); if (pid < 0) { if (errno == EINVAL) r = log_error_errno(errno, "clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m"); @@ -3732,6 +3777,7 @@ int main(int argc, char *argv[]) { rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]); pid_socket_pair[0] = safe_close(pid_socket_pair[0]); uuid_socket_pair[0] = safe_close(uuid_socket_pair[0]); + notify_socket_pair[0] = safe_close(notify_socket_pair[0]); uid_shift_socket_pair[0] = safe_close(uid_shift_socket_pair[0]); (void) reset_all_signal_handlers(); @@ -3747,6 +3793,7 @@ int main(int argc, char *argv[]) { secondary, pid_socket_pair[1], uuid_socket_pair[1], + notify_socket_pair[1], kmsg_socket_pair[1], rtnl_socket_pair[1], uid_shift_socket_pair[1], @@ -3765,6 +3812,7 @@ int main(int argc, char *argv[]) { rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]); pid_socket_pair[1] = safe_close(pid_socket_pair[1]); uuid_socket_pair[1] = safe_close(uuid_socket_pair[1]); + notify_socket_pair[1] = safe_close(notify_socket_pair[1]); uid_shift_socket_pair[1] = safe_close(uid_shift_socket_pair[1]); if (arg_userns_mode != USER_NAMESPACE_NO) { @@ -3838,6 +3886,13 @@ int main(int argc, char *argv[]) { goto finish; } + /* We also retrieve the socket used for notifications generated by outer child */ + notify_socket = receive_one_fd(notify_socket_pair[0], 0); + if (notify_socket < 0) { + r = log_error_errno(errno, "Failed to receive notification socket from the outer child: %m"); + goto finish; + } + log_debug("Init process invoked as PID " PID_FMT, pid); if (arg_userns_mode != USER_NAMESPACE_NO) { @@ -3952,6 +4007,16 @@ int main(int argc, char *argv[]) { goto finish; } + r = sd_event_new(&event); + if (r < 0) { + log_error_errno(r, "Failed to get default event source: %m"); + goto finish; + } + + r = setup_sd_notify_parent(event, notify_socket, PID_TO_PTR(pid)); + if (r < 0) + goto finish; + /* Let the child know that we are ready and wait that the child is completely ready now. */ if (!barrier_place_and_sync(&barrier)) { /* #4 */ log_error("Child died too early."); @@ -3964,15 +4029,10 @@ int main(int argc, char *argv[]) { etc_passwd_lock = safe_close(etc_passwd_lock); sd_notifyf(false, - "READY=1\n" "STATUS=Container running.\n" "X_NSPAWN_LEADER_PID=" PID_FMT, pid); - - r = sd_event_new(&event); - if (r < 0) { - log_error_errno(r, "Failed to get default event source: %m"); - goto finish; - } + if (!arg_notify_ready) + sd_notify(false, "READY=1\n"); if (arg_kill_signal > 0) { /* Try to kill the init system on SIGINT or SIGTERM */ diff --git a/src/systemd-nspawn/systemd-nspawn.completion.bash b/src/systemd-nspawn/systemd-nspawn.completion.bash index 0cf249d8ce..ea4a5e1f43 100644 --- a/src/systemd-nspawn/systemd-nspawn.completion.bash +++ b/src/systemd-nspawn/systemd-nspawn.completion.bash @@ -60,7 +60,8 @@ _systemd_nspawn() { [ARG]='-D --directory -u --user --uuid --capability --drop-capability --link-journal --bind --bind-ro -M --machine -S --slice --setenv -Z --selinux-context -L --selinux-apifs-context --register --network-interface --network-bridge --personality -i --image --tmpfs --volatile - --network-macvlan --kill-signal --template' + --network-macvlan --kill-signal --template + --notify-ready' ) _init_completion || return @@ -139,6 +140,10 @@ _systemd_nspawn() { _signals return ;; + --notify-ready) + comps='yes no' + return + ;; esac COMPREPLY=( $(compgen -W '$comps' -- "$cur") ) return 0 diff --git a/src/systemd-nspawn/systemd-nspawn.completion.zsh b/src/systemd-nspawn/systemd-nspawn.completion.zsh index 3e0f667909..77b2e7cd7c 100644 --- a/src/systemd-nspawn/systemd-nspawn.completion.zsh +++ b/src/systemd-nspawn/systemd-nspawn.completion.zsh @@ -46,4 +46,5 @@ _arguments \ '--keep-unit[Instead of creating a transient scope unit to run the container in, simply register the service or scope unit systemd-nspawn has been invoked in with systemd-machined(8).]' \ '--personality=[Control the architecture ("personality") reported by uname(2) in the container.]:architecture:(x86 x86-64)' \ '--volatile=[Run the system in volatile mode.]:volatile:(no yes state)' \ + "--notify-ready=[Control when the ready notification is sent]:options:(yes no)" \ '*:: : _normal' diff --git a/src/systemd-nspawn/systemd-nspawn.tmpfiles b/src/systemd-nspawn/systemd-nspawn.tmpfiles index 9fa3878d6b..78bd1c670e 100644 --- a/src/systemd-nspawn/systemd-nspawn.tmpfiles +++ b/src/systemd-nspawn/systemd-nspawn.tmpfiles @@ -10,7 +10,7 @@ Q /var/lib/machines 0700 - - - # Remove old temporary snapshots, but only at boot. Ideally we'd have -# "self-destroying" btrfs snapshots that go away if the last last +# "self-destroying" btrfs snapshots that go away if the last # reference to it does. To mimic a scheme like this at least remove # the old snapshots on fresh boots, where we know they cannot be # referenced anymore. Note that we actually remove all temporary files diff --git a/src/systemd-nspawn/systemd-nspawn.xml b/src/systemd-nspawn/systemd-nspawn.xml index 476cc2932f..69d2f6ff7d 100644 --- a/src/systemd-nspawn/systemd-nspawn.xml +++ b/src/systemd-nspawn/systemd-nspawn.xml @@ -67,69 +67,80 @@ <refsect1> <title>Description</title> - <para><command>systemd-nspawn</command> may be used to run a - command or OS in a light-weight namespace container. In many ways - it is similar to - <citerefentry project='man-pages'><refentrytitle>chroot</refentrytitle><manvolnum>1</manvolnum></citerefentry>, - but more powerful since it fully virtualizes the file system - hierarchy, as well as the process tree, the various IPC subsystems - and the host and domain name.</para> - - <para><command>systemd-nspawn</command> limits access to various - kernel interfaces in the container to read-only, such as - <filename>/sys</filename>, <filename>/proc/sys</filename> or - <filename>/sys/fs/selinux</filename>. Network interfaces and the - system clock may not be changed from within the container. Device - nodes may not be created. The host system cannot be rebooted and - kernel modules may not be loaded from within the container.</para> - - <para>Note that even though these security precautions are taken - <command>systemd-nspawn</command> is not suitable for fully secure - container setups. Many of the security features may be - circumvented and are hence primarily useful to avoid accidental - changes to the host system from the container.</para> - - <para>In contrast to - <citerefentry project='man-pages'><refentrytitle>chroot</refentrytitle><manvolnum>1</manvolnum></citerefentry> <command>systemd-nspawn</command> - may be used to boot full Linux-based operating systems in a + <para><command>systemd-nspawn</command> may be used to run a command or OS in a light-weight namespace + container. In many ways it is similar to <citerefentry + project='man-pages'><refentrytitle>chroot</refentrytitle><manvolnum>1</manvolnum></citerefentry>, but more powerful + since it fully virtualizes the file system hierarchy, as well as the process tree, the various IPC subsystems and + the host and domain name.</para> + + <para><command>systemd-nspawn</command> may be invoked on any directory tree containing an operating system tree, + using the <option>--directory=</option> command line option. By using the <option>--machine=</option> option an OS + tree is automatically searched for in a couple of locations, most importantly in + <filename>/var/lib/machines</filename>, the suggested directory to place container images installed on the + system.</para> + + <para>In contrast to <citerefentry + project='man-pages'><refentrytitle>chroot</refentrytitle><manvolnum>1</manvolnum></citerefentry> <command>systemd-nspawn</command> + may be used to boot full Linux-based operating systems in a container.</para> + + <para><command>systemd-nspawn</command> limits access to various kernel interfaces in the container to read-only, + such as <filename>/sys</filename>, <filename>/proc/sys</filename> or <filename>/sys/fs/selinux</filename>. The + host's network interfaces and the system clock may not be changed from within the container. Device nodes may not + be created. The host system cannot be rebooted and kernel modules may not be loaded from within the container.</para> - <para>Use a tool like - <citerefentry project='mankier'><refentrytitle>dnf</refentrytitle><manvolnum>8</manvolnum></citerefentry>, - <citerefentry project='die-net'><refentrytitle>debootstrap</refentrytitle><manvolnum>8</manvolnum></citerefentry>, - or - <citerefentry project='archlinux'><refentrytitle>pacman</refentrytitle><manvolnum>8</manvolnum></citerefentry> - to set up an OS directory tree suitable as file system hierarchy - for <command>systemd-nspawn</command> containers.</para> - - <para>Note that <command>systemd-nspawn</command> will mount file - systems private to the container to <filename>/dev</filename>, - <filename>/run</filename> and similar. These will not be visible - outside of the container, and their contents will be lost when the - container exits.</para> - - <para>Note that running two <command>systemd-nspawn</command> - containers from the same directory tree will not make processes in - them see each other. The PID namespace separation of the two - containers is complete and the containers will share very few - runtime objects except for the underlying file system. Use - <citerefentry><refentrytitle>machinectl</refentrytitle><manvolnum>1</manvolnum></citerefentry>'s - <command>login</command> command to request an additional login - prompt in a running container.</para> - - <para><command>systemd-nspawn</command> implements the - <ulink - url="http://www.freedesktop.org/wiki/Software/systemd/ContainerInterface">Container - Interface</ulink> specification.</para> - - <para>As a safety check <command>systemd-nspawn</command> will - verify the existence of <filename>/usr/lib/os-release</filename> - or <filename>/etc/os-release</filename> in the container tree - before starting the container (see - <citerefentry><refentrytitle>os-release</refentrytitle><manvolnum>5</manvolnum></citerefentry>). - It might be necessary to add this file to the container tree - manually if the OS of the container is too old to contain this + <para>Use a tool like <citerefentry + project='mankier'><refentrytitle>dnf</refentrytitle><manvolnum>8</manvolnum></citerefentry>, <citerefentry + project='die-net'><refentrytitle>debootstrap</refentrytitle><manvolnum>8</manvolnum></citerefentry>, or + <citerefentry project='archlinux'><refentrytitle>pacman</refentrytitle><manvolnum>8</manvolnum></citerefentry> to + set up an OS directory tree suitable as file system hierarchy for <command>systemd-nspawn</command> containers. See + the Examples section below for details on suitable invocation of these commands.</para> + + <para>As a safety check <command>systemd-nspawn</command> will verify the existence of + <filename>/usr/lib/os-release</filename> or <filename>/etc/os-release</filename> in the container tree before + starting the container (see + <citerefentry><refentrytitle>os-release</refentrytitle><manvolnum>5</manvolnum></citerefentry>). It might be + necessary to add this file to the container tree manually if the OS of the container is too old to contain this file out-of-the-box.</para> + + <para><command>systemd-nspawn</command> may be invoked directly from the interactive command line or run as system + service in the background. In this mode each container instance runs as its own service instance; a default + template unit file <filename>systemd-nspawn@.service</filename> is provided to make this easy, taking the container + name as instance identifier. Note that different default options apply when <command>systemd-nspawn</command> is + invoked by the template unit file than interactively on the command line. Most importantly the template unit file + makes use of the <option>--boot</option> which is not the default in case <command>systemd-nspawn</command> is + invoked from the interactive command line. Further differences with the defaults are documented along with the + various supported options below.</para> + + <para>The <citerefentry><refentrytitle>machinectl</refentrytitle><manvolnum>1</manvolnum></citerefentry> tool may + be used to execute a number of operations on containers. In particular it provides easy-to-use commands to run + containers as system services using the <filename>systemd-nspawn@.service</filename> template unit + file.</para> + + <para>Along with each container a settings file with the <filename>.nspawn</filename> suffix may exist, containing + additional settings to apply when running the container. See + <citerefentry><refentrytitle>systemd.nspawn</refentrytitle><manvolnum>5</manvolnum></citerefentry> for + details. Settings files override the default options used by the <filename>systemd-nspawn@.service</filename> + template unit file, making it usually unnecessary to alter this template file directly.</para> + + <para>Note that <command>systemd-nspawn</command> will mount file systems private to the container to + <filename>/dev</filename>, <filename>/run</filename> and similar. These will not be visible outside of the + container, and their contents will be lost when the container exits.</para> + + <para>Note that running two <command>systemd-nspawn</command> containers from the same directory tree will not make + processes in them see each other. The PID namespace separation of the two containers is complete and the containers + will share very few runtime objects except for the underlying file system. Use + <citerefentry><refentrytitle>machinectl</refentrytitle><manvolnum>1</manvolnum></citerefentry>'s + <command>login</command> or <command>shell</command> commands to request an additional login session in a running + container.</para> + + <para><command>systemd-nspawn</command> implements the <ulink + url="http://www.freedesktop.org/wiki/Software/systemd/ContainerInterface">Container Interface</ulink> + specification.</para> + + <para>While running, containers invoked with <command>systemd-nspawn</command> are registered with the + <citerefentry><refentrytitle>systemd-machined</refentrytitle><manvolnum>8</manvolnum></citerefentry> service that + keeps track of running containers, and provides programming interfaces to interact with them.</para> </refsect1> <refsect1> @@ -139,7 +150,7 @@ are used as arguments for the init binary. Otherwise, <replaceable>COMMAND</replaceable> specifies the program to launch in the container, and the remaining arguments are used as - arguments for this program. If <option>-b</option> is not used and + arguments for this program. If <option>--boot</option> is not used and no arguments are specified, a shell is launched in the container.</para> @@ -310,6 +321,9 @@ </tbody> </tgroup> </table> + + <para>Note that <option>--boot</option> is the default mode of operation if the + <filename>systemd-nspawn@.service</filename> template unit file is used.</para> </listitem> </varlistentry> @@ -446,7 +460,10 @@ <listitem><para>If the kernel supports the user namespaces feature, equivalent to <option>--private-users=pick</option>, otherwise equivalent to - <option>--private-users=no</option>.</para></listitem> + <option>--private-users=no</option>.</para> + + <para>Note that <option>-U</option> is the default if the <filename>systemd-nspawn@.service</filename> template unit + file is used.</para></listitem> </varlistentry> <varlistentry> @@ -540,6 +557,9 @@ assignment via DHCP. In case <filename>systemd-networkd</filename> is running on both the host and inside the container, automatic IP communication from the container to the host is thus available, with further connectivity to the external network.</para> + + <para>Note that <option>--network-veth</option> is the default if the + <filename>systemd-nspawn@.service</filename> template unit file is used.</para> </listitem> </varlistentry> @@ -705,7 +725,10 @@ Effectively, booting a container once with <literal>guest</literal> or <literal>host</literal> will link the journal persistently if further on the default of - <literal>auto</literal> is used.</para></listitem> + <literal>auto</literal> is used.</para> + + <para>Note that <option>--link-journal=try-guest</option> is the default if the + <filename>systemd-nspawn@.service</filename> template unit file is used.</para></listitem> </varlistentry> <varlistentry> @@ -910,8 +933,8 @@ <literal>tmpfs</literal> instance, and <filename>/usr</filename> from the OS tree is mounted into it in read-only mode (the system thus starts up with read-only OS - resources, but pristine state and configuration, any changes - to the either are lost on shutdown). When the mode parameter + image, but pristine state and configuration, any changes + are lost on shutdown). When the mode parameter is specified as <option>state</option>, the OS tree is mounted read-only, but <filename>/var</filename> is mounted as a <literal>tmpfs</literal> instance into it (the system thus @@ -980,6 +1003,19 @@ effect.</para></listitem> </varlistentry> + <varlistentry> + <term><option>--notify-ready=</option></term> + + <listitem><para>Configures support for notifications from the container's init process. + <option>--notify-ready=</option> takes a boolean (<option>no</option> and <option>yes</option>). + With option <option>no</option> systemd-nspawn notifies systemd + with a <literal>READY=1</literal> message when the init process is created. + With option <option>yes</option> systemd-nspawn waits for the + <literal>READY=1</literal> message from the init process in the container + before sending its own to systemd. For more details about notifications + see <citerefentry><refentrytitle>sd_notify</refentrytitle><manvolnum>3</manvolnum></citerefentry>).</para></listitem> + </varlistentry> + <xi:include href="standard-options.xml" xpointer="help" /> <xi:include href="standard-options.xml" xpointer="version" /> </variablelist> diff --git a/src/systemd-nspawn/systemd-nspawn@.service.in b/src/systemd-nspawn/systemd-nspawn@.service.in index ea28941507..c8141639b6 100644 --- a/src/systemd-nspawn/systemd-nspawn@.service.in +++ b/src/systemd-nspawn/systemd-nspawn@.service.in @@ -20,20 +20,13 @@ RestartForceExitStatus=133 SuccessExitStatus=133 Slice=machine.slice Delegate=yes -TasksMax=8192 +TasksMax=16384 # Enforce a strict device policy, similar to the one nspawn configures # when it allocates its own scope unit. Make sure to keep these # policies in sync if you change them! -DevicePolicy=strict -DeviceAllow=/dev/null rwm -DeviceAllow=/dev/zero rwm -DeviceAllow=/dev/full rwm -DeviceAllow=/dev/random rwm -DeviceAllow=/dev/urandom rwm -DeviceAllow=/dev/tty rwm +DevicePolicy=closed DeviceAllow=/dev/net/tun rwm -DeviceAllow=/dev/pts/ptmx rw DeviceAllow=char-pts rw # nspawn itself needs access to /dev/loop-control and /dev/loop, to |