/*** This file is part of systemd. Copyright 2016 Lennart Poettering systemd is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. systemd is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with systemd; If not, see <http://www.gnu.org/licenses/>. ***/ #include <errno.h> #include <linux/netlink.h> #include <sys/capability.h> #include <sys/types.h> #ifdef HAVE_SECCOMP #include <seccomp.h> #endif #include "log.h" #ifdef HAVE_SECCOMP #include "seccomp-util.h" #endif #include "nspawn-seccomp.h" #ifdef HAVE_SECCOMP static int seccomp_add_default_syscall_filter(scmp_filter_ctx ctx, uint64_t cap_list_retain) { unsigned i; int r; static const struct { uint64_t capability; int syscall_num; } blacklist[] = { { 0, SCMP_SYS(_sysctl) }, /* obsolete syscall */ { 0, SCMP_SYS(add_key) }, /* keyring is not namespaced */ { 0, SCMP_SYS(afs_syscall) }, /* obsolete syscall */ { 0, SCMP_SYS(bdflush) }, #ifdef __NR_bpf { 0, SCMP_SYS(bpf) }, #endif { 0, SCMP_SYS(break) }, /* obsolete syscall */ { 0, SCMP_SYS(create_module) }, /* obsolete syscall */ { 0, SCMP_SYS(ftime) }, /* obsolete syscall */ { 0, SCMP_SYS(get_kernel_syms) }, /* obsolete syscall */ { 0, SCMP_SYS(getpmsg) }, /* obsolete syscall */ { 0, SCMP_SYS(gtty) }, /* obsolete syscall */ #ifdef __NR_kexec_file_load { 0, SCMP_SYS(kexec_file_load) }, #endif { 0, SCMP_SYS(kexec_load) }, { 0, SCMP_SYS(keyctl) }, /* keyring is not namespaced */ { 0, SCMP_SYS(lock) }, /* obsolete syscall */ { 0, SCMP_SYS(lookup_dcookie) }, { 0, SCMP_SYS(mpx) }, /* obsolete syscall */ { 0, SCMP_SYS(nfsservctl) }, /* obsolete syscall */ { 0, SCMP_SYS(open_by_handle_at) }, { 0, SCMP_SYS(perf_event_open) }, { 0, SCMP_SYS(prof) }, /* obsolete syscall */ { 0, SCMP_SYS(profil) }, /* obsolete syscall */ { 0, SCMP_SYS(putpmsg) }, /* obsolete syscall */ { 0, SCMP_SYS(query_module) }, /* obsolete syscall */ { 0, SCMP_SYS(quotactl) }, { 0, SCMP_SYS(request_key) }, /* keyring is not namespaced */ { 0, SCMP_SYS(security) }, /* obsolete syscall */ { 0, SCMP_SYS(sgetmask) }, /* obsolete syscall */ { 0, SCMP_SYS(ssetmask) }, /* obsolete syscall */ { 0, SCMP_SYS(stty) }, /* obsolete syscall */ { 0, SCMP_SYS(swapoff) }, { 0, SCMP_SYS(swapon) }, { 0, SCMP_SYS(sysfs) }, /* obsolete syscall */ { 0, SCMP_SYS(tuxcall) }, /* obsolete syscall */ { 0, SCMP_SYS(ulimit) }, /* obsolete syscall */ { 0, SCMP_SYS(uselib) }, /* obsolete syscall */ { 0, SCMP_SYS(ustat) }, /* obsolete syscall */ { 0, SCMP_SYS(vserver) }, /* obsolete syscall */ { CAP_SYSLOG, SCMP_SYS(syslog) }, { CAP_SYS_MODULE, SCMP_SYS(delete_module) }, { CAP_SYS_MODULE, SCMP_SYS(finit_module) }, { CAP_SYS_MODULE, SCMP_SYS(init_module) }, { CAP_SYS_PACCT, SCMP_SYS(acct) }, { CAP_SYS_PTRACE, SCMP_SYS(process_vm_readv) }, { CAP_SYS_PTRACE, SCMP_SYS(process_vm_writev) }, { CAP_SYS_PTRACE, SCMP_SYS(ptrace) }, { CAP_SYS_RAWIO, SCMP_SYS(ioperm) }, { CAP_SYS_RAWIO, SCMP_SYS(iopl) }, { CAP_SYS_RAWIO, SCMP_SYS(pciconfig_iobase) }, { CAP_SYS_RAWIO, SCMP_SYS(pciconfig_read) }, { CAP_SYS_RAWIO, SCMP_SYS(pciconfig_write) }, #ifdef __NR_s390_pci_mmio_read { CAP_SYS_RAWIO, SCMP_SYS(s390_pci_mmio_read) }, #endif #ifdef __NR_s390_pci_mmio_write { CAP_SYS_RAWIO, SCMP_SYS(s390_pci_mmio_write) }, #endif { CAP_SYS_TIME, SCMP_SYS(adjtimex) }, { CAP_SYS_TIME, SCMP_SYS(clock_adjtime) }, { CAP_SYS_TIME, SCMP_SYS(clock_settime) }, { CAP_SYS_TIME, SCMP_SYS(settimeofday) }, { CAP_SYS_TIME, SCMP_SYS(stime) }, }; for (i = 0; i < ELEMENTSOF(blacklist); i++) { if (blacklist[i].capability != 0 && (cap_list_retain & (1ULL << blacklist[i].capability))) continue; r = seccomp_rule_add(ctx, SCMP_ACT_ERRNO(EPERM), blacklist[i].syscall_num, 0); if (r == -EFAULT) continue; /* unknown syscall */ if (r < 0) return log_error_errno(r, "Failed to block syscall: %m"); } return 0; } int setup_seccomp(uint64_t cap_list_retain) { scmp_filter_ctx seccomp; int r; if (!is_seccomp_available()) { log_debug("SECCOMP features not detected in the kernel, disabling SECCOMP audit filter"); return 0; } seccomp = seccomp_init(SCMP_ACT_ALLOW); if (!seccomp) return log_oom(); r = seccomp_add_secondary_archs(seccomp); if (r < 0) { log_error_errno(r, "Failed to add secondary archs to seccomp filter: %m"); goto finish; } r = seccomp_add_default_syscall_filter(seccomp, cap_list_retain); if (r < 0) goto finish; /* Audit is broken in containers, much of the userspace audit hookup will fail if running inside a container. We don't care and just turn off creation of audit sockets. This will make socket(AF_NETLINK, *, NETLINK_AUDIT) fail with EAFNOSUPPORT which audit userspace uses as indication that audit is disabled in the kernel. */ r = seccomp_rule_add( seccomp, SCMP_ACT_ERRNO(EAFNOSUPPORT), SCMP_SYS(socket), 2, SCMP_A0(SCMP_CMP_EQ, AF_NETLINK), SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT)); if (r < 0) { log_error_errno(r, "Failed to add audit seccomp rule: %m"); goto finish; } r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0); if (r < 0) { log_error_errno(r, "Failed to unset NO_NEW_PRIVS: %m"); goto finish; } r = seccomp_load(seccomp); if (r < 0) { log_error_errno(r, "Failed to install seccomp audit filter: %m"); goto finish; } finish: seccomp_release(seccomp); return r; } #else int setup_seccomp(uint64_t cap_list_retain) { return 0; } #endif