diff options
Diffstat (limited to 'src/libsystemd-shared/src/seccomp-util.c')
-rw-r--r-- | src/libsystemd-shared/src/seccomp-util.c | 579 |
1 files changed, 579 insertions, 0 deletions
diff --git a/src/libsystemd-shared/src/seccomp-util.c b/src/libsystemd-shared/src/seccomp-util.c new file mode 100644 index 0000000000..bcb55e3777 --- /dev/null +++ b/src/libsystemd-shared/src/seccomp-util.c @@ -0,0 +1,579 @@ +/*** + This file is part of systemd. + + Copyright 2014 Lennart Poettering + + systemd is free software; you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published by + the Free Software Foundation; either version 2.1 of the License, or + (at your option) any later version. + + systemd is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with systemd; If not, see <http://www.gnu.org/licenses/>. +***/ + +#include <errno.h> +#include <seccomp.h> +#include <stddef.h> +#include <sys/prctl.h> + +#include <linux/seccomp.h> + +#include "systemd-basic/macro.h" +#include "systemd-basic/string-util.h" +#include "systemd-basic/util.h" +#include "systemd-shared/seccomp-util.h" + +const char* seccomp_arch_to_string(uint32_t c) { + /* Maintain order used in <seccomp.h>. + * + * Names used here should be the same as those used for ConditionArchitecture=, + * except for "subarchitectures" like x32. */ + + switch(c) { + case SCMP_ARCH_NATIVE: + return "native"; + case SCMP_ARCH_X86: + return "x86"; + case SCMP_ARCH_X86_64: + return "x86-64"; + case SCMP_ARCH_X32: + return "x32"; + case SCMP_ARCH_ARM: + return "arm"; + case SCMP_ARCH_AARCH64: + return "arm64"; + case SCMP_ARCH_MIPS: + return "mips"; + case SCMP_ARCH_MIPS64: + return "mips64"; + case SCMP_ARCH_MIPS64N32: + return "mips64-n32"; + case SCMP_ARCH_MIPSEL: + return "mips-le"; + case SCMP_ARCH_MIPSEL64: + return "mips64-le"; + case SCMP_ARCH_MIPSEL64N32: + return "mips64-le-n32"; + case SCMP_ARCH_PPC: + return "ppc"; + case SCMP_ARCH_PPC64: + return "ppc64"; + case SCMP_ARCH_PPC64LE: + return "ppc64-le"; + case SCMP_ARCH_S390: + return "s390"; + case SCMP_ARCH_S390X: + return "s390x"; + default: + return NULL; + } +} + +int seccomp_arch_from_string(const char *n, uint32_t *ret) { + if (!n) + return -EINVAL; + + assert(ret); + + if (streq(n, "native")) + *ret = SCMP_ARCH_NATIVE; + else if (streq(n, "x86")) + *ret = SCMP_ARCH_X86; + else if (streq(n, "x86-64")) + *ret = SCMP_ARCH_X86_64; + else if (streq(n, "x32")) + *ret = SCMP_ARCH_X32; + else if (streq(n, "arm")) + *ret = SCMP_ARCH_ARM; + else if (streq(n, "arm64")) + *ret = SCMP_ARCH_AARCH64; + else if (streq(n, "mips")) + *ret = SCMP_ARCH_MIPS; + else if (streq(n, "mips64")) + *ret = SCMP_ARCH_MIPS64; + else if (streq(n, "mips64-n32")) + *ret = SCMP_ARCH_MIPS64N32; + else if (streq(n, "mips-le")) + *ret = SCMP_ARCH_MIPSEL; + else if (streq(n, "mips64-le")) + *ret = SCMP_ARCH_MIPSEL64; + else if (streq(n, "mips64-le-n32")) + *ret = SCMP_ARCH_MIPSEL64N32; + else if (streq(n, "ppc")) + *ret = SCMP_ARCH_PPC; + else if (streq(n, "ppc64")) + *ret = SCMP_ARCH_PPC64; + else if (streq(n, "ppc64-le")) + *ret = SCMP_ARCH_PPC64LE; + else if (streq(n, "s390")) + *ret = SCMP_ARCH_S390; + else if (streq(n, "s390x")) + *ret = SCMP_ARCH_S390X; + else + return -EINVAL; + + return 0; +} + +int seccomp_init_conservative(scmp_filter_ctx *ret, uint32_t default_action) { + scmp_filter_ctx seccomp; + int r; + + /* Much like seccomp_init(), but tries to be a bit more conservative in its defaults: all secondary archs are + * added by default, and NNP is turned off. */ + + seccomp = seccomp_init(default_action); + if (!seccomp) + return -ENOMEM; + + r = seccomp_add_secondary_archs(seccomp); + if (r < 0) + goto finish; + + r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0); + if (r < 0) + goto finish; + + *ret = seccomp; + return 0; + +finish: + seccomp_release(seccomp); + return r; +} + +int seccomp_add_secondary_archs(scmp_filter_ctx ctx) { + + /* Add in all possible secondary archs we are aware of that + * this kernel might support. */ + + static const int seccomp_arches[] = { +#if defined(__i386__) || defined(__x86_64__) + SCMP_ARCH_X86, + SCMP_ARCH_X86_64, + SCMP_ARCH_X32, + +#elif defined(__arm__) || defined(__aarch64__) + SCMP_ARCH_ARM, + SCMP_ARCH_AARCH64, + +#elif defined(__arm__) || defined(__aarch64__) + SCMP_ARCH_ARM, + SCMP_ARCH_AARCH64, + +#elif defined(__mips__) || defined(__mips64__) + SCMP_ARCH_MIPS, + SCMP_ARCH_MIPS64, + SCMP_ARCH_MIPS64N32, + SCMP_ARCH_MIPSEL, + SCMP_ARCH_MIPSEL64, + SCMP_ARCH_MIPSEL64N32, + +#elif defined(__powerpc__) || defined(__powerpc64__) + SCMP_ARCH_PPC, + SCMP_ARCH_PPC64, + SCMP_ARCH_PPC64LE, + +#elif defined(__s390__) || defined(__s390x__) + SCMP_ARCH_S390, + SCMP_ARCH_S390X, +#endif + }; + + unsigned i; + int r; + + for (i = 0; i < ELEMENTSOF(seccomp_arches); i++) { + r = seccomp_arch_add(ctx, seccomp_arches[i]); + if (r < 0 && r != -EEXIST) + return r; + } + + return 0; +} + +static bool is_basic_seccomp_available(void) { + int r; + r = prctl(PR_GET_SECCOMP, 0, 0, 0, 0); + return r >= 0; +} + +static bool is_seccomp_filter_available(void) { + int r; + r = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, NULL, 0, 0); + return r < 0 && errno == EFAULT; +} + +bool is_seccomp_available(void) { + static int cached_enabled = -1; + if (cached_enabled < 0) + cached_enabled = is_basic_seccomp_available() && is_seccomp_filter_available(); + return cached_enabled; +} + +const SyscallFilterSet syscall_filter_sets[_SYSCALL_FILTER_SET_MAX] = { + [SYSCALL_FILTER_SET_BASIC_IO] = { + /* Basic IO */ + .name = "@basic-io", + .value = + "close\0" + "dup2\0" + "dup3\0" + "dup\0" + "lseek\0" + "pread64\0" + "preadv\0" + "pwrite64\0" + "pwritev\0" + "read\0" + "readv\0" + "write\0" + "writev\0" + }, + [SYSCALL_FILTER_SET_CLOCK] = { + /* Clock */ + .name = "@clock", + .value = + "adjtimex\0" + "clock_adjtime\0" + "clock_settime\0" + "settimeofday\0" + "stime\0" + }, + [SYSCALL_FILTER_SET_CPU_EMULATION] = { + /* CPU emulation calls */ + .name = "@cpu-emulation", + .value = + "modify_ldt\0" + "subpage_prot\0" + "switch_endian\0" + "vm86\0" + "vm86old\0" + }, + [SYSCALL_FILTER_SET_DEBUG] = { + /* Debugging/Performance Monitoring/Tracing */ + .name = "@debug", + .value = + "lookup_dcookie\0" + "perf_event_open\0" + "process_vm_readv\0" + "process_vm_writev\0" + "ptrace\0" + "rtas\0" +#ifdef __NR_s390_runtime_instr + "s390_runtime_instr\0" +#endif + "sys_debug_setcontext\0" + }, + [SYSCALL_FILTER_SET_DEFAULT] = { + /* Default list: the most basic of operations */ + .name = "@default", + .value = + "clock_getres\0" + "clock_gettime\0" + "clock_nanosleep\0" + "execve\0" + "exit\0" + "exit_group\0" + "getrlimit\0" /* make sure processes can query stack size and such */ + "gettimeofday\0" + "nanosleep\0" + "pause\0" + "rt_sigreturn\0" + "sigreturn\0" + "time\0" + }, + [SYSCALL_FILTER_SET_IO_EVENT] = { + /* Event loop use */ + .name = "@io-event", + .value = + "_newselect\0" + "epoll_create1\0" + "epoll_create\0" + "epoll_ctl\0" + "epoll_ctl_old\0" + "epoll_pwait\0" + "epoll_wait\0" + "epoll_wait_old\0" + "eventfd2\0" + "eventfd\0" + "poll\0" + "ppoll\0" + "pselect6\0" + "select\0" + }, + [SYSCALL_FILTER_SET_IPC] = { + /* Message queues, SYSV IPC or other IPC */ + .name = "@ipc", + .value = "ipc\0" + "memfd_create\0" + "mq_getsetattr\0" + "mq_notify\0" + "mq_open\0" + "mq_timedreceive\0" + "mq_timedsend\0" + "mq_unlink\0" + "msgctl\0" + "msgget\0" + "msgrcv\0" + "msgsnd\0" + "pipe2\0" + "pipe\0" + "process_vm_readv\0" + "process_vm_writev\0" + "semctl\0" + "semget\0" + "semop\0" + "semtimedop\0" + "shmat\0" + "shmctl\0" + "shmdt\0" + "shmget\0" + }, + [SYSCALL_FILTER_SET_KEYRING] = { + /* Keyring */ + .name = "@keyring", + .value = + "add_key\0" + "keyctl\0" + "request_key\0" + }, + [SYSCALL_FILTER_SET_MODULE] = { + /* Kernel module control */ + .name = "@module", + .value = + "delete_module\0" + "finit_module\0" + "init_module\0" + }, + [SYSCALL_FILTER_SET_MOUNT] = { + /* Mounting */ + .name = "@mount", + .value = + "chroot\0" + "mount\0" + "pivot_root\0" + "umount2\0" + "umount\0" + }, + [SYSCALL_FILTER_SET_NETWORK_IO] = { + /* Network or Unix socket IO, should not be needed if not network facing */ + .name = "@network-io", + .value = + "accept4\0" + "accept\0" + "bind\0" + "connect\0" + "getpeername\0" + "getsockname\0" + "getsockopt\0" + "listen\0" + "recv\0" + "recvfrom\0" + "recvmmsg\0" + "recvmsg\0" + "send\0" + "sendmmsg\0" + "sendmsg\0" + "sendto\0" + "setsockopt\0" + "shutdown\0" + "socket\0" + "socketcall\0" + "socketpair\0" + }, + [SYSCALL_FILTER_SET_OBSOLETE] = { + /* Unusual, obsolete or unimplemented, some unknown even to libseccomp */ + .name = "@obsolete", + .value = + "_sysctl\0" + "afs_syscall\0" + "break\0" + "create_module\0" + "ftime\0" + "get_kernel_syms\0" + "getpmsg\0" + "gtty\0" + "lock\0" + "mpx\0" + "prof\0" + "profil\0" + "putpmsg\0" + "query_module\0" + "security\0" + "sgetmask\0" + "ssetmask\0" + "stty\0" + "sysfs\0" + "tuxcall\0" + "ulimit\0" + "uselib\0" + "ustat\0" + "vserver\0" + }, + [SYSCALL_FILTER_SET_PRIVILEGED] = { + /* Nice grab-bag of all system calls which need superuser capabilities */ + .name = "@privileged", + .value = + "@clock\0" + "@module\0" + "@raw-io\0" + "acct\0" + "bdflush\0" + "bpf\0" + "capset\0" + "chown32\0" + "chown\0" + "chroot\0" + "fchown32\0" + "fchown\0" + "fchownat\0" + "kexec_file_load\0" + "kexec_load\0" + "lchown32\0" + "lchown\0" + "nfsservctl\0" + "pivot_root\0" + "quotactl\0" + "reboot\0" + "setdomainname\0" + "setfsuid32\0" + "setfsuid\0" + "setgroups32\0" + "setgroups\0" + "sethostname\0" + "setresuid32\0" + "setresuid\0" + "setreuid32\0" + "setreuid\0" + "setuid32\0" + "setuid\0" + "swapoff\0" + "swapon\0" + "_sysctl\0" + "vhangup\0" + }, + [SYSCALL_FILTER_SET_PROCESS] = { + /* Process control, execution, namespaces */ + .name = "@process", + .value = + "arch_prctl\0" + "clone\0" + "execveat\0" + "fork\0" + "kill\0" + "prctl\0" + "setns\0" + "tgkill\0" + "tkill\0" + "unshare\0" + "vfork\0" + }, + [SYSCALL_FILTER_SET_RAW_IO] = { + /* Raw I/O ports */ + .name = "@raw-io", + .value = + "ioperm\0" + "iopl\0" + "pciconfig_iobase\0" + "pciconfig_read\0" + "pciconfig_write\0" +#ifdef __NR_s390_pci_mmio_read + "s390_pci_mmio_read\0" +#endif +#ifdef __NR_s390_pci_mmio_write + "s390_pci_mmio_write\0" +#endif + }, + [SYSCALL_FILTER_SET_RESOURCES] = { + /* Alter resource settings */ + .name = "@resources", + .value = + "sched_setparam\0" + "sched_setscheduler\0" + "sched_setaffinity\0" + "setpriority\0" + "setrlimit\0" + "set_mempolicy\0" + "migrate_pages\0" + "move_pages\0" + "mbind\0" + "sched_setattr\0" + "prlimit64\0" + }, +}; + +const SyscallFilterSet *syscall_filter_set_find(const char *name) { + unsigned i; + + if (isempty(name) || name[0] != '@') + return NULL; + + for (i = 0; i < _SYSCALL_FILTER_SET_MAX; i++) + if (streq(syscall_filter_sets[i].name, name)) + return syscall_filter_sets + i; + + return NULL; +} + +int seccomp_add_syscall_filter_set(scmp_filter_ctx seccomp, const SyscallFilterSet *set, uint32_t action) { + const char *sys; + int r; + + assert(seccomp); + assert(set); + + NULSTR_FOREACH(sys, set->value) { + int id; + + if (sys[0] == '@') { + const SyscallFilterSet *other; + + other = syscall_filter_set_find(sys); + if (!other) + return -EINVAL; + + r = seccomp_add_syscall_filter_set(seccomp, other, action); + } else { + id = seccomp_syscall_resolve_name(sys); + if (id == __NR_SCMP_ERROR) + return -EINVAL; + + r = seccomp_rule_add(seccomp, action, id, 0); + } + if (r < 0) + return r; + } + + return 0; +} + +int seccomp_load_filter_set(uint32_t default_action, const SyscallFilterSet *set, uint32_t action) { + scmp_filter_ctx seccomp; + int r; + + assert(set); + + /* The one-stop solution: allocate a seccomp object, add a filter to it, and apply it */ + + r = seccomp_init_conservative(&seccomp, default_action); + if (r < 0) + return r; + + r = seccomp_add_syscall_filter_set(seccomp, set, action); + if (r < 0) + goto finish; + + r = seccomp_load(seccomp); + +finish: + seccomp_release(seccomp); + return r; + +} |