diff options
Diffstat (limited to 'src/nspawn')
-rw-r--r-- | src/nspawn/nspawn-patch-uid.c | 15 | ||||
-rw-r--r-- | src/nspawn/nspawn-seccomp.c | 143 | ||||
-rw-r--r-- | src/nspawn/nspawn-seccomp.h | 24 | ||||
-rw-r--r-- | src/nspawn/nspawn.c | 112 |
4 files changed, 186 insertions, 108 deletions
diff --git a/src/nspawn/nspawn-patch-uid.c b/src/nspawn/nspawn-patch-uid.c index c7382d412d..cc79597c95 100644 --- a/src/nspawn/nspawn-patch-uid.c +++ b/src/nspawn/nspawn-patch-uid.c @@ -280,7 +280,13 @@ static int patch_fd(int fd, const char *name, const struct stat *st, uid_t shift return r > 0 || changed; } -static int is_procfs_sysfs_or_suchlike(int fd) { +/* + * Check if the filesystem is fully compatible with user namespaces or + * UID/GID patching. Some filesystems in this list can be fully mounted inside + * user namespaces, however their inodes may relate to host resources or only + * valid in the global user namespace, therefore no patching should be applied. + */ +static int is_fs_fully_userns_compatible(int fd) { struct statfs sfs; assert(fd >= 0); @@ -300,6 +306,9 @@ static int is_procfs_sysfs_or_suchlike(int fd) { F_TYPE_EQUAL(sfs.f_type, PSTOREFS_MAGIC) || F_TYPE_EQUAL(sfs.f_type, SELINUX_MAGIC) || F_TYPE_EQUAL(sfs.f_type, SMACK_MAGIC) || + F_TYPE_EQUAL(sfs.f_type, SECURITYFS_MAGIC) || + F_TYPE_EQUAL(sfs.f_type, BPF_FS_MAGIC) || + F_TYPE_EQUAL(sfs.f_type, TRACEFS_MAGIC) || F_TYPE_EQUAL(sfs.f_type, SYSFS_MAGIC); } @@ -311,8 +320,8 @@ static int recurse_fd(int fd, bool donate_fd, const struct stat *st, uid_t shift /* We generally want to permit crossing of mount boundaries when patching the UIDs/GIDs. However, we * probably shouldn't do this for /proc and /sys if that is already mounted into place. Hence, let's - * stop the recursion when we hit a procfs or sysfs file system. */ - r = is_procfs_sysfs_or_suchlike(fd); + * stop the recursion when we hit procfs, sysfs or some other special file systems. */ + r = is_fs_fully_userns_compatible(fd); if (r < 0) goto finish; if (r > 0) { diff --git a/src/nspawn/nspawn-seccomp.c b/src/nspawn/nspawn-seccomp.c new file mode 100644 index 0000000000..2d145b68a7 --- /dev/null +++ b/src/nspawn/nspawn-seccomp.c @@ -0,0 +1,143 @@ +/*** + This file is part of systemd. + + Copyright 2016 Lennart Poettering + + systemd is free software; you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published by + the Free Software Foundation; either version 2.1 of the License, or + (at your option) any later version. + + systemd is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with systemd; If not, see <http://www.gnu.org/licenses/>. +***/ + +#include <errno.h> +#include <linux/netlink.h> +#include <sys/capability.h> +#include <sys/types.h> + +#ifdef HAVE_SECCOMP +#include <seccomp.h> +#endif + +#include "log.h" + +#ifdef HAVE_SECCOMP +#include "seccomp-util.h" +#endif + +#include "nspawn-seccomp.h" + +#ifdef HAVE_SECCOMP + +static int seccomp_add_default_syscall_filter(scmp_filter_ctx ctx, + uint64_t cap_list_retain) { + unsigned i; + int r; + static const struct { + uint64_t capability; + int syscall_num; + } blacklist[] = { + { CAP_SYS_RAWIO, SCMP_SYS(iopl) }, + { CAP_SYS_RAWIO, SCMP_SYS(ioperm) }, + { CAP_SYS_BOOT, SCMP_SYS(kexec_load) }, + { CAP_SYS_ADMIN, SCMP_SYS(swapon) }, + { CAP_SYS_ADMIN, SCMP_SYS(swapoff) }, + { CAP_SYS_ADMIN, SCMP_SYS(open_by_handle_at) }, + { CAP_SYS_MODULE, SCMP_SYS(init_module) }, + { CAP_SYS_MODULE, SCMP_SYS(finit_module) }, + { CAP_SYS_MODULE, SCMP_SYS(delete_module) }, + { CAP_SYSLOG, SCMP_SYS(syslog) }, + }; + + for (i = 0; i < ELEMENTSOF(blacklist); i++) { + if (cap_list_retain & (1ULL << blacklist[i].capability)) + continue; + + r = seccomp_rule_add(ctx, SCMP_ACT_ERRNO(EPERM), blacklist[i].syscall_num, 0); + if (r == -EFAULT) + continue; /* unknown syscall */ + if (r < 0) { + log_error_errno(r, "Failed to block syscall: %m"); + return r; + } + } + + return 0; +} + +int setup_seccomp(uint64_t cap_list_retain) { + scmp_filter_ctx seccomp; + int r; + + seccomp = seccomp_init(SCMP_ACT_ALLOW); + if (!seccomp) + return log_oom(); + + r = seccomp_add_secondary_archs(seccomp); + if (r < 0) { + log_error_errno(r, "Failed to add secondary archs to seccomp filter: %m"); + goto finish; + } + + r = seccomp_add_default_syscall_filter(seccomp, cap_list_retain); + if (r < 0) + goto finish; + + /* + Audit is broken in containers, much of the userspace audit + hookup will fail if running inside a container. We don't + care and just turn off creation of audit sockets. + + This will make socket(AF_NETLINK, *, NETLINK_AUDIT) fail + with EAFNOSUPPORT which audit userspace uses as indication + that audit is disabled in the kernel. + */ + + r = seccomp_rule_add( + seccomp, + SCMP_ACT_ERRNO(EAFNOSUPPORT), + SCMP_SYS(socket), + 2, + SCMP_A0(SCMP_CMP_EQ, AF_NETLINK), + SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT)); + if (r < 0) { + log_error_errno(r, "Failed to add audit seccomp rule: %m"); + goto finish; + } + + r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0); + if (r < 0) { + log_error_errno(r, "Failed to unset NO_NEW_PRIVS: %m"); + goto finish; + } + + r = seccomp_load(seccomp); + if (r == -EINVAL) { + log_debug_errno(r, "Kernel is probably not configured with CONFIG_SECCOMP. Disabling seccomp audit filter: %m"); + r = 0; + goto finish; + } + if (r < 0) { + log_error_errno(r, "Failed to install seccomp audit filter: %m"); + goto finish; + } + +finish: + seccomp_release(seccomp); + return r; +} + +#else + +int setup_seccomp(uint64_t cap_list_retain) { + return 0; +} + +#endif diff --git a/src/nspawn/nspawn-seccomp.h b/src/nspawn/nspawn-seccomp.h new file mode 100644 index 0000000000..5bde16faf9 --- /dev/null +++ b/src/nspawn/nspawn-seccomp.h @@ -0,0 +1,24 @@ +#pragma once + +/*** + This file is part of systemd. + + Copyright 2016 Lennart Poettering + + systemd is free software; you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published by + the Free Software Foundation; either version 2.1 of the License, or + (at your option) any later version. + + systemd is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with systemd; If not, see <http://www.gnu.org/licenses/>. +***/ + +#include <sys/types.h> + +int setup_seccomp(uint64_t cap_list_retain); diff --git a/src/nspawn/nspawn.c b/src/nspawn/nspawn.c index ac11bcea5a..b421c182ce 100644 --- a/src/nspawn/nspawn.c +++ b/src/nspawn/nspawn.c @@ -26,9 +26,6 @@ #include <linux/loop.h> #include <pwd.h> #include <sched.h> -#ifdef HAVE_SECCOMP -#include <seccomp.h> -#endif #ifdef HAVE_SELINUX #include <selinux/selinux.h> #endif @@ -82,15 +79,13 @@ #include "nspawn-settings.h" #include "nspawn-setuid.h" #include "nspawn-stub-pid1.h" +#include "nspawn-seccomp.h" #include "parse-util.h" #include "path-util.h" #include "process-util.h" #include "ptyfwd.h" #include "random-util.h" #include "rm-rf.h" -#ifdef HAVE_SECCOMP -#include "seccomp-util.h" -#endif #include "selinux-util.h" #include "signal-util.h" #include "socket-util.h" @@ -136,7 +131,7 @@ static StartMode arg_start_mode = START_PID1; static bool arg_ephemeral = false; static LinkJournal arg_link_journal = LINK_AUTO; static bool arg_link_journal_try = false; -static uint64_t arg_retain = +static uint64_t arg_caps_retain = (1ULL << CAP_CHOWN) | (1ULL << CAP_DAC_OVERRIDE) | (1ULL << CAP_DAC_READ_SEARCH) | @@ -1075,7 +1070,7 @@ static int parse_argv(int argc, char *argv[]) { if (mask_all_settings) arg_settings_mask = _SETTINGS_MASK_ALL; - arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus; + arg_caps_retain = (arg_caps_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus; r = detect_unified_cgroup_hierarchy(); if (r < 0) @@ -1632,7 +1627,7 @@ static int setup_journal(const char *directory) { } static int drop_capabilities(void) { - return capability_bounding_set_drop(arg_retain, false); + return capability_bounding_set_drop(arg_caps_retain, false); } static int reset_audit_loginuid(void) { @@ -1667,99 +1662,6 @@ static int reset_audit_loginuid(void) { return 0; } -static int setup_seccomp(void) { - -#ifdef HAVE_SECCOMP - static const struct { - uint64_t capability; - int syscall_num; - } blacklist[] = { - { CAP_SYS_RAWIO, SCMP_SYS(iopl) }, - { CAP_SYS_RAWIO, SCMP_SYS(ioperm) }, - { CAP_SYS_BOOT, SCMP_SYS(kexec_load) }, - { CAP_SYS_ADMIN, SCMP_SYS(swapon) }, - { CAP_SYS_ADMIN, SCMP_SYS(swapoff) }, - { CAP_SYS_ADMIN, SCMP_SYS(open_by_handle_at) }, - { CAP_SYS_MODULE, SCMP_SYS(init_module) }, - { CAP_SYS_MODULE, SCMP_SYS(finit_module) }, - { CAP_SYS_MODULE, SCMP_SYS(delete_module) }, - { CAP_SYSLOG, SCMP_SYS(syslog) }, - }; - - scmp_filter_ctx seccomp; - unsigned i; - int r; - - seccomp = seccomp_init(SCMP_ACT_ALLOW); - if (!seccomp) - return log_oom(); - - r = seccomp_add_secondary_archs(seccomp); - if (r < 0) { - log_error_errno(r, "Failed to add secondary archs to seccomp filter: %m"); - goto finish; - } - - for (i = 0; i < ELEMENTSOF(blacklist); i++) { - if (arg_retain & (1ULL << blacklist[i].capability)) - continue; - - r = seccomp_rule_add(seccomp, SCMP_ACT_ERRNO(EPERM), blacklist[i].syscall_num, 0); - if (r == -EFAULT) - continue; /* unknown syscall */ - if (r < 0) { - log_error_errno(r, "Failed to block syscall: %m"); - goto finish; - } - } - - /* - Audit is broken in containers, much of the userspace audit - hookup will fail if running inside a container. We don't - care and just turn off creation of audit sockets. - - This will make socket(AF_NETLINK, *, NETLINK_AUDIT) fail - with EAFNOSUPPORT which audit userspace uses as indication - that audit is disabled in the kernel. - */ - - r = seccomp_rule_add( - seccomp, - SCMP_ACT_ERRNO(EAFNOSUPPORT), - SCMP_SYS(socket), - 2, - SCMP_A0(SCMP_CMP_EQ, AF_NETLINK), - SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT)); - if (r < 0) { - log_error_errno(r, "Failed to add audit seccomp rule: %m"); - goto finish; - } - - r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0); - if (r < 0) { - log_error_errno(r, "Failed to unset NO_NEW_PRIVS: %m"); - goto finish; - } - - r = seccomp_load(seccomp); - if (r == -EINVAL) { - log_debug_errno(r, "Kernel is probably not configured with CONFIG_SECCOMP. Disabling seccomp audit filter: %m"); - r = 0; - goto finish; - } - if (r < 0) { - log_error_errno(r, "Failed to install seccomp audit filter: %m"); - goto finish; - } - -finish: - seccomp_release(seccomp); - return r; -#else - return 0; -#endif - -} static int setup_propagate(const char *root) { const char *p, *q; @@ -2988,7 +2890,7 @@ static int outer_child( if (r < 0) return r; - r = setup_seccomp(); + r = setup_seccomp(arg_caps_retain); if (r < 0) return r; @@ -3272,9 +3174,9 @@ static int load_settings(void) { if (settings->capability != 0) log_warning("Ignoring Capability= setting, file %s is not trusted.", p); } else - arg_retain |= plus; + arg_caps_retain |= plus; - arg_retain &= ~settings->drop_capability; + arg_caps_retain &= ~settings->drop_capability; } if ((arg_settings_mask & SETTING_KILL_SIGNAL) == 0 && |