From 4a4485ae69bddf6cc01d4c50f3f53535c2d8fea4 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Wed, 17 Aug 2016 17:53:25 +0200 Subject: seccomp: make sure getrlimit() is among the default permitted syscalls A lot of basic code wants to know the stack size, and it is safe if they do, hence let's permit getrlimit() (but not setrlimit()) by default. See: #3970 --- src/shared/seccomp-util.c | 1 + 1 file changed, 1 insertion(+) (limited to 'src/shared/seccomp-util.c') diff --git a/src/shared/seccomp-util.c b/src/shared/seccomp-util.c index 8656d112b8..b549426e2b 100644 --- a/src/shared/seccomp-util.c +++ b/src/shared/seccomp-util.c @@ -127,6 +127,7 @@ const SystemCallFilterSet syscall_filter_sets[] = { "execve\0" "exit\0" "exit_group\0" + "getrlimit\0" /* make sure processes can query stack size and such */ "rt_sigreturn\0" "sigreturn\0" }, { -- cgit v1.2.3-54-g00ecf From 83f12b27d14853e7c89a326f7cd31a6c739d378e Mon Sep 17 00:00:00 2001 From: Felipe Sateler Date: Mon, 22 Aug 2016 16:40:58 -0300 Subject: core: do not fail at step SECCOMP if there is no kernel support (#4004) Fixes #3882 --- src/core/execute.c | 38 ++++++++++++++++++++++++++++++-------- src/core/main.c | 6 ++++++ src/shared/seccomp-util.c | 10 ++++++++++ src/shared/seccomp-util.h | 2 ++ src/test/test-execute.c | 11 ++++++++++- 5 files changed, 58 insertions(+), 9 deletions(-) (limited to 'src/shared/seccomp-util.c') diff --git a/src/core/execute.c b/src/core/execute.c index 0af8eb5a02..55f15d7e49 100644 --- a/src/core/execute.c +++ b/src/core/execute.c @@ -1074,7 +1074,17 @@ static void rename_process_from_path(const char *path) { #ifdef HAVE_SECCOMP -static int apply_seccomp(const ExecContext *c) { +static bool skip_seccomp_unavailable(const Unit* u, const char* msg) { + if (!is_seccomp_available()) { + log_open(); + log_unit_debug(u, "SECCOMP not detected in the kernel, skipping %s", msg); + log_close(); + return true; + } + return false; +} + +static int apply_seccomp(const Unit* u, const ExecContext *c) { uint32_t negative_action, action; scmp_filter_ctx *seccomp; Iterator i; @@ -1083,6 +1093,9 @@ static int apply_seccomp(const ExecContext *c) { assert(c); + if (skip_seccomp_unavailable(u, "syscall filtering")) + return 0; + negative_action = c->syscall_errno == 0 ? SCMP_ACT_KILL : SCMP_ACT_ERRNO(c->syscall_errno); seccomp = seccomp_init(c->syscall_whitelist ? negative_action : SCMP_ACT_ALLOW); @@ -1123,13 +1136,16 @@ finish: return r; } -static int apply_address_families(const ExecContext *c) { +static int apply_address_families(const Unit* u, const ExecContext *c) { scmp_filter_ctx *seccomp; Iterator i; int r; assert(c); + if (skip_seccomp_unavailable(u, "RestrictAddressFamilies=")) + return 0; + seccomp = seccomp_init(SCMP_ACT_ALLOW); if (!seccomp) return -ENOMEM; @@ -1244,12 +1260,15 @@ finish: return r; } -static int apply_memory_deny_write_execute(const ExecContext *c) { +static int apply_memory_deny_write_execute(const Unit* u, const ExecContext *c) { scmp_filter_ctx *seccomp; int r; assert(c); + if (skip_seccomp_unavailable(u, "MemoryDenyWriteExecute=")) + return 0; + seccomp = seccomp_init(SCMP_ACT_ALLOW); if (!seccomp) return -ENOMEM; @@ -1283,7 +1302,7 @@ finish: return r; } -static int apply_restrict_realtime(const ExecContext *c) { +static int apply_restrict_realtime(const Unit* u, const ExecContext *c) { static const int permitted_policies[] = { SCHED_OTHER, SCHED_BATCH, @@ -1296,6 +1315,9 @@ static int apply_restrict_realtime(const ExecContext *c) { assert(c); + if (skip_seccomp_unavailable(u, "RestrictRealtime=")) + return 0; + seccomp = seccomp_init(SCMP_ACT_ALLOW); if (!seccomp) return -ENOMEM; @@ -2403,7 +2425,7 @@ static int exec_child( #ifdef HAVE_SECCOMP if (use_address_families) { - r = apply_address_families(context); + r = apply_address_families(unit, context); if (r < 0) { *exit_status = EXIT_ADDRESS_FAMILIES; return r; @@ -2411,7 +2433,7 @@ static int exec_child( } if (context->memory_deny_write_execute) { - r = apply_memory_deny_write_execute(context); + r = apply_memory_deny_write_execute(unit, context); if (r < 0) { *exit_status = EXIT_SECCOMP; return r; @@ -2419,7 +2441,7 @@ static int exec_child( } if (context->restrict_realtime) { - r = apply_restrict_realtime(context); + r = apply_restrict_realtime(unit, context); if (r < 0) { *exit_status = EXIT_SECCOMP; return r; @@ -2427,7 +2449,7 @@ static int exec_child( } if (use_syscall_filter) { - r = apply_seccomp(context); + r = apply_seccomp(unit, context); if (r < 0) { *exit_status = EXIT_SECCOMP; return r; diff --git a/src/core/main.c b/src/core/main.c index 125cfb28f0..7d8322ebd8 100644 --- a/src/core/main.c +++ b/src/core/main.c @@ -72,6 +72,9 @@ #include "process-util.h" #include "raw-clone.h" #include "rlimit-util.h" +#ifdef HAVE_SECCOMP +#include "seccomp-util.h" +#endif #include "selinux-setup.h" #include "selinux-util.h" #include "signal-util.h" @@ -1186,6 +1189,9 @@ static int enforce_syscall_archs(Set *archs) { void *id; int r; + if (!is_seccomp_available()) + return 0; + seccomp = seccomp_init(SCMP_ACT_ALLOW); if (!seccomp) return log_oom(); diff --git a/src/shared/seccomp-util.c b/src/shared/seccomp-util.c index 8656d112b8..4667f508c7 100644 --- a/src/shared/seccomp-util.c +++ b/src/shared/seccomp-util.c @@ -21,6 +21,8 @@ #include #include +#include "alloc-util.h" +#include "fileio.h" #include "macro.h" #include "seccomp-util.h" #include "string-util.h" @@ -89,6 +91,14 @@ int seccomp_add_secondary_archs(scmp_filter_ctx *c) { } +bool is_seccomp_available(void) { + _cleanup_free_ char* field = NULL; + static int cached_enabled = -1; + if (cached_enabled < 0) + cached_enabled = get_proc_field("/proc/self/status", "Seccomp", "\n", &field) == 0; + return cached_enabled; +} + const SystemCallFilterSet syscall_filter_sets[] = { { /* Clock */ diff --git a/src/shared/seccomp-util.h b/src/shared/seccomp-util.h index be33eecb85..cca7c17912 100644 --- a/src/shared/seccomp-util.h +++ b/src/shared/seccomp-util.h @@ -27,6 +27,8 @@ int seccomp_arch_from_string(const char *n, uint32_t *ret); int seccomp_add_secondary_archs(scmp_filter_ctx *c); +bool is_seccomp_available(void); + typedef struct SystemCallFilterSet { const char *set_name; const char *value; diff --git a/src/test/test-execute.c b/src/test/test-execute.c index 1d24115b5c..05ec1d2eb1 100644 --- a/src/test/test-execute.c +++ b/src/test/test-execute.c @@ -30,6 +30,9 @@ #include "mkdir.h" #include "path-util.h" #include "rm-rf.h" +#ifdef HAVE_SECCOMP +#include "seccomp-util.h" +#endif #include "test-helper.h" #include "unit.h" #include "util.h" @@ -132,21 +135,27 @@ static void test_exec_privatedevices(Manager *m) { static void test_exec_systemcallfilter(Manager *m) { #ifdef HAVE_SECCOMP + if (!is_seccomp_available()) + return; test(m, "exec-systemcallfilter-not-failing.service", 0, CLD_EXITED); test(m, "exec-systemcallfilter-not-failing2.service", 0, CLD_EXITED); test(m, "exec-systemcallfilter-failing.service", SIGSYS, CLD_KILLED); test(m, "exec-systemcallfilter-failing2.service", SIGSYS, CLD_KILLED); + #endif } static void test_exec_systemcallerrornumber(Manager *m) { #ifdef HAVE_SECCOMP - test(m, "exec-systemcallerrornumber.service", 1, CLD_EXITED); + if (is_seccomp_available()) + test(m, "exec-systemcallerrornumber.service", 1, CLD_EXITED); #endif } static void test_exec_systemcall_system_mode_with_user(Manager *m) { #ifdef HAVE_SECCOMP + if (!is_seccomp_available()) + return; if (getpwnam("nobody")) test(m, "exec-systemcallfilter-system-user.service", 0, CLD_EXITED); else if (getpwnam("nfsnobody")) -- cgit v1.2.3-54-g00ecf From d347d9029c7ec6b30eaaab93649105d935061b55 Mon Sep 17 00:00:00 2001 From: Felipe Sateler Date: Wed, 31 Aug 2016 10:00:35 -0300 Subject: seccomp: also detect if seccomp filtering is enabled In https://github.com/systemd/systemd/pull/4004 , a runtime detection method for seccomp was added. However, it does not detect the case where CONFIG_SECCOMP=y but CONFIG_SECCOMP_FILTER=n. This is possible if the architecture does not support filtering yet. Add a check for that case too. While at it, change get_proc_field usage to use PR_GET_SECCOMP prctl, as that should save a few system calls and (unnecessary) allocations. Previously, reading of /proc/self/stat was done as recommended by prctl(2) as safer. However, given that we need to do the prctl call anyway, lets skip opening, reading and parsing the file. Code for checking inspired by https://outflux.net/teach-seccomp/autodetect.html --- src/core/execute.c | 2 +- src/shared/seccomp-util.c | 19 +++++++++++++++---- 2 files changed, 16 insertions(+), 5 deletions(-) (limited to 'src/shared/seccomp-util.c') diff --git a/src/core/execute.c b/src/core/execute.c index 55f15d7e49..2026137721 100644 --- a/src/core/execute.c +++ b/src/core/execute.c @@ -1077,7 +1077,7 @@ static void rename_process_from_path(const char *path) { static bool skip_seccomp_unavailable(const Unit* u, const char* msg) { if (!is_seccomp_available()) { log_open(); - log_unit_debug(u, "SECCOMP not detected in the kernel, skipping %s", msg); + log_unit_debug(u, "SECCOMP features not detected in the kernel, skipping %s", msg); log_close(); return true; } diff --git a/src/shared/seccomp-util.c b/src/shared/seccomp-util.c index 6c489284d1..2f42381fc1 100644 --- a/src/shared/seccomp-util.c +++ b/src/shared/seccomp-util.c @@ -20,9 +20,9 @@ #include #include #include +#include +#include -#include "alloc-util.h" -#include "fileio.h" #include "macro.h" #include "seccomp-util.h" #include "string-util.h" @@ -91,11 +91,22 @@ int seccomp_add_secondary_archs(scmp_filter_ctx *c) { } +static bool is_basic_seccomp_available(void) { + int r; + r = prctl(PR_GET_SECCOMP, 0, 0, 0, 0); + return r >= 0; +} + +static bool is_seccomp_filter_available(void) { + int r; + r = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, NULL, 0, 0); + return r < 0 && errno == EFAULT; +} + bool is_seccomp_available(void) { - _cleanup_free_ char* field = NULL; static int cached_enabled = -1; if (cached_enabled < 0) - cached_enabled = get_proc_field("/proc/self/status", "Seccomp", "\n", &field) == 0; + cached_enabled = is_basic_seccomp_available() && is_seccomp_filter_available(); return cached_enabled; } -- cgit v1.2.3-54-g00ecf From 6abfd30372f9440c3dc3cf2fcf6d3eee85037ccb Mon Sep 17 00:00:00 2001 From: hbrueckner Date: Wed, 5 Oct 2016 13:58:55 +0200 Subject: seccomp: add support for the s390 architecture (#4287) Add seccomp support for the s390 architecture (31-bit and 64-bit) to systemd. This requires libseccomp >= 2.3.1. --- README | 2 +- configure.ac | 2 +- man/systemd.exec.xml | 3 ++- src/shared/seccomp-util.c | 22 ++++++++++++++++++++++ 4 files changed, 26 insertions(+), 3 deletions(-) (limited to 'src/shared/seccomp-util.c') diff --git a/README b/README index fb6fd6381b..d610baaf76 100644 --- a/README +++ b/README @@ -120,7 +120,7 @@ REQUIREMENTS: libcap libmount >= 2.27.1 (from util-linux) (util-linux *must* be built with --enable-libmount-force-mountinfo) - libseccomp >= 1.0.0 (optional) + libseccomp >= 2.3.1 (optional) libblkid >= 2.24 (from util-linux) (optional) libkmod >= 15 (optional) PAM >= 1.1.2 (optional) diff --git a/configure.ac b/configure.ac index 4181483798..ccd212ef13 100644 --- a/configure.ac +++ b/configure.ac @@ -459,7 +459,7 @@ AM_CONDITIONAL(HAVE_LIBMOUNT, [test "$have_libmount" = "yes"]) have_seccomp=no AC_ARG_ENABLE(seccomp, AS_HELP_STRING([--disable-seccomp], [Disable optional SECCOMP support])) if test "x$enable_seccomp" != "xno"; then - PKG_CHECK_MODULES(SECCOMP, [libseccomp >= 1.0.0], + PKG_CHECK_MODULES(SECCOMP, [libseccomp >= 2.3.1], [AC_DEFINE(HAVE_SECCOMP, 1, [Define if seccomp is available]) have_seccomp=yes M4_DEFINES="$M4_DEFINES -DHAVE_SECCOMP"], diff --git a/man/systemd.exec.xml b/man/systemd.exec.xml index 2054267b90..5e6787338d 100644 --- a/man/systemd.exec.xml +++ b/man/systemd.exec.xml @@ -1334,7 +1334,8 @@ identifiers to include in the system call filter. The known architecture identifiers are x86, x86-64, x32, - arm as well as the special identifier + arm, s390, + s390x as well as the special identifier native. Only system calls of the specified architectures will be permitted to processes of this unit. This is an effective way to disable compatibility with diff --git a/src/shared/seccomp-util.c b/src/shared/seccomp-util.c index 2f42381fc1..8116c7671f 100644 --- a/src/shared/seccomp-util.c +++ b/src/shared/seccomp-util.c @@ -39,6 +39,10 @@ const char* seccomp_arch_to_string(uint32_t c) { return "x32"; if (c == SCMP_ARCH_ARM) return "arm"; + if (c == SCMP_ARCH_S390) + return "s390"; + if (c == SCMP_ARCH_S390X) + return "s390x"; return NULL; } @@ -59,6 +63,10 @@ int seccomp_arch_from_string(const char *n, uint32_t *ret) { *ret = SCMP_ARCH_X32; else if (streq(n, "arm")) *ret = SCMP_ARCH_ARM; + else if (streq(n, "s390")) + *ret = SCMP_ARCH_S390; + else if (streq(n, "s390x")) + *ret = SCMP_ARCH_S390X; else return -EINVAL; @@ -85,6 +93,20 @@ int seccomp_add_secondary_archs(scmp_filter_ctx *c) { if (r < 0 && r != -EEXIST) return r; +#elif defined(__s390__) || defined(__s390x__) + int r; + + /* Add in all possible secondary archs we are aware of that + * this kernel might support. */ + + r = seccomp_arch_add(c, SCMP_ARCH_S390); + if (r < 0 && r != -EEXIST) + return r; + + r = seccomp_arch_add(c, SCMP_ARCH_S390X); + if (r < 0 && r != -EEXIST) + return r; + #endif return 0; -- cgit v1.2.3-54-g00ecf From 8130926d32d76193e98ba783ba932816f276bfad Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Fri, 21 Oct 2016 21:50:05 +0200 Subject: core: rework syscall filter set handling A variety of fixes: - rename the SystemCallFilterSet structure to SyscallFilterSet. So far the main instance of it (the syscall_filter_sets[] array) used to abbreviate "SystemCall" as "Syscall". Let's stick to one of the two syntaxes, and not mix and match too wildly. Let's pick the shorter name in this case, as it is sufficiently well established to not confuse hackers reading this. - Export explicit indexes into the syscall_filter_sets[] array via an enum. This way, code that wants to make use of a specific filter set, can index it directly via the enum, instead of having to search for it. This makes apply_private_devices() in particular a lot simpler. - Provide two new helper calls in seccomp-util.c: syscall_filter_set_find() to find a set by its name, seccomp_add_syscall_filter_set() to add a set to a seccomp object. - Update SystemCallFilter= parser to use extract_first_word(). Let's work on deprecating FOREACH_WORD_QUOTED(). - Simplify apply_private_devices() using this functionality --- src/core/execute.c | 41 +-------------- src/core/load-fragment.c | 54 ++++++++++--------- src/shared/seccomp-util.c | 128 ++++++++++++++++++++++++++++++++++------------ src/shared/seccomp-util.h | 32 ++++++++++-- 4 files changed, 155 insertions(+), 100 deletions(-) (limited to 'src/shared/seccomp-util.c') diff --git a/src/core/execute.c b/src/core/execute.c index e63a12f934..18bb67cda9 100644 --- a/src/core/execute.c +++ b/src/core/execute.c @@ -1578,10 +1578,7 @@ finish: } static int apply_private_devices(Unit *u, const ExecContext *c) { - const SystemCallFilterSet *set; scmp_filter_ctx *seccomp; - const char *sys; - bool syscalls_found = false; int r; assert(c); @@ -1599,43 +1596,9 @@ static int apply_private_devices(Unit *u, const ExecContext *c) { if (r < 0) goto finish; - for (set = syscall_filter_sets; set->set_name; set++) - if (streq(set->set_name, "@raw-io")) { - syscalls_found = true; - break; - } - - /* We should never fail here */ - if (!syscalls_found) { - r = -EOPNOTSUPP; + r = seccomp_add_syscall_filter_set(seccomp, syscall_filter_sets + SYSCALL_FILTER_SET_RAW_IO, SCMP_ACT_ERRNO(EPERM)); + if (r < 0) goto finish; - } - - NULSTR_FOREACH(sys, set->value) { - int id; - bool add = true; - -#ifndef __NR_s390_pci_mmio_read - if (streq(sys, "s390_pci_mmio_read")) - add = false; -#endif -#ifndef __NR_s390_pci_mmio_write - if (streq(sys, "s390_pci_mmio_write")) - add = false; -#endif - - if (!add) - continue; - - id = seccomp_syscall_resolve_name(sys); - - r = seccomp_rule_add( - seccomp, - SCMP_ACT_ERRNO(EPERM), - id, 0); - if (r < 0) - goto finish; - } r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0); if (r < 0) diff --git a/src/core/load-fragment.c b/src/core/load-fragment.c index 6f68e23340..118b39c1cf 100644 --- a/src/core/load-fragment.c +++ b/src/core/load-fragment.c @@ -2618,6 +2618,7 @@ int config_parse_documentation(const char *unit, } #ifdef HAVE_SECCOMP + static int syscall_filter_parse_one( const char *unit, const char *filename, @@ -2628,27 +2629,29 @@ static int syscall_filter_parse_one( bool warn) { int r; - if (*t == '@') { - const SystemCallFilterSet *set; + if (t[0] == '@') { + const SyscallFilterSet *set; + const char *i; - for (set = syscall_filter_sets; set->set_name; set++) - if (streq(set->set_name, t)) { - const char *sys; + set = syscall_filter_set_find(t); + if (!set) { + if (warn) + log_syntax(unit, LOG_WARNING, filename, line, 0, "Don't know system call group, ignoring: %s", t); + return 0; + } - NULSTR_FOREACH(sys, set->value) { - r = syscall_filter_parse_one(unit, filename, line, c, invert, sys, false); - if (r < 0) - return r; - } - break; - } + NULSTR_FOREACH(i, set->value) { + r = syscall_filter_parse_one(unit, filename, line, c, invert, i, false); + if (r < 0) + return r; + } } else { int id; id = seccomp_syscall_resolve_name(t); if (id == __NR_SCMP_ERROR) { if (warn) - log_syntax(unit, LOG_ERR, filename, line, 0, "Failed to parse system call, ignoring: %s", t); + log_syntax(unit, LOG_WARNING, filename, line, 0, "Failed to parse system call, ignoring: %s", t); return 0; } @@ -2662,8 +2665,9 @@ static int syscall_filter_parse_one( if (r < 0) return log_oom(); } else - set_remove(c->syscall_filter, INT_TO_PTR(id + 1)); + (void) set_remove(c->syscall_filter, INT_TO_PTR(id + 1)); } + return 0; } @@ -2682,8 +2686,7 @@ int config_parse_syscall_filter( ExecContext *c = data; Unit *u = userdata; bool invert = false; - const char *word, *state; - size_t l; + const char *p; int r; assert(filename); @@ -2722,19 +2725,24 @@ int config_parse_syscall_filter( } } - FOREACH_WORD_QUOTED(word, l, rvalue, state) { - _cleanup_free_ char *t = NULL; + p = rvalue; + for (;;) { + _cleanup_free_ char *word = NULL; - t = strndup(word, l); - if (!t) + r = extract_first_word(&p, &word, NULL, 0); + if (r == 0) + break; + if (r == -ENOMEM) return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Invalid syntax, ignoring: %s", rvalue); + break; + } - r = syscall_filter_parse_one(unit, filename, line, c, invert, t, true); + r = syscall_filter_parse_one(unit, filename, line, c, invert, word, true); if (r < 0) return r; } - if (!isempty(state)) - log_syntax(unit, LOG_ERR, filename, line, 0, "Trailing garbage, ignoring."); /* Turn on NNP, but only if it wasn't configured explicitly * before, and only if we are in user mode. */ diff --git a/src/shared/seccomp-util.c b/src/shared/seccomp-util.c index 8116c7671f..1d51f3fd1f 100644 --- a/src/shared/seccomp-util.c +++ b/src/shared/seccomp-util.c @@ -26,6 +26,7 @@ #include "macro.h" #include "seccomp-util.h" #include "string-util.h" +#include "util.h" const char* seccomp_arch_to_string(uint32_t c) { @@ -132,28 +133,30 @@ bool is_seccomp_available(void) { return cached_enabled; } -const SystemCallFilterSet syscall_filter_sets[] = { - { +const SyscallFilterSet syscall_filter_sets[_SYSCALL_FILTER_SET_MAX] = { + [SYSCALL_FILTER_SET_CLOCK] = { /* Clock */ - .set_name = "@clock", + .name = "@clock", .value = "adjtimex\0" "clock_adjtime\0" "clock_settime\0" "settimeofday\0" "stime\0" - }, { + }, + [SYSCALL_FILTER_SET_CPU_EMULATION] = { /* CPU emulation calls */ - .set_name = "@cpu-emulation", + .name = "@cpu-emulation", .value = "modify_ldt\0" "subpage_prot\0" "switch_endian\0" "vm86\0" "vm86old\0" - }, { + }, + [SYSCALL_FILTER_SET_DEBUG] = { /* Debugging/Performance Monitoring/Tracing */ - .set_name = "@debug", + .name = "@debug", .value = "lookup_dcookie\0" "perf_event_open\0" @@ -161,11 +164,14 @@ const SystemCallFilterSet syscall_filter_sets[] = { "process_vm_writev\0" "ptrace\0" "rtas\0" +#ifdef __NR_s390_runtime_instr "s390_runtime_instr\0" +#endif "sys_debug_setcontext\0" - }, { + }, + [SYSCALL_FILTER_SET_DEFAULT] = { /* Default list */ - .set_name = "@default", + .name = "@default", .value = "execve\0" "exit\0" @@ -173,9 +179,10 @@ const SystemCallFilterSet syscall_filter_sets[] = { "getrlimit\0" /* make sure processes can query stack size and such */ "rt_sigreturn\0" "sigreturn\0" - }, { + }, + [SYSCALL_FILTER_SET_IO_EVENT] = { /* Event loop use */ - .set_name = "@io-event", + .name = "@io-event", .value = "_newselect\0" "epoll_create1\0" @@ -191,9 +198,10 @@ const SystemCallFilterSet syscall_filter_sets[] = { "ppoll\0" "pselect6\0" "select\0" - }, { + }, + [SYSCALL_FILTER_SET_IPC] = { /* Message queues, SYSV IPC or other IPC: unusual */ - .set_name = "@ipc", + .name = "@ipc", .value = "ipc\0" "mq_getsetattr\0" "mq_notify\0" @@ -215,23 +223,26 @@ const SystemCallFilterSet syscall_filter_sets[] = { "shmctl\0" "shmdt\0" "shmget\0" - }, { + }, + [SYSCALL_FILTER_SET_KEYRING] = { /* Keyring */ - .set_name = "@keyring", + .name = "@keyring", .value = "add_key\0" "keyctl\0" "request_key\0" - }, { + }, + [SYSCALL_FILTER_SET_MODULE] = { /* Kernel module control */ - .set_name = "@module", + .name = "@module", .value = "delete_module\0" "finit_module\0" "init_module\0" - }, { + }, + [SYSCALL_FILTER_SET_MOUNT] = { /* Mounting */ - .set_name = "@mount", + .name = "@mount", .value = "chroot\0" "mount\0" @@ -239,9 +250,10 @@ const SystemCallFilterSet syscall_filter_sets[] = { "pivot_root\0" "umount2\0" "umount\0" - }, { + }, + [SYSCALL_FILTER_SET_NETWORK_IO] = { /* Network or Unix socket IO, should not be needed if not network facing */ - .set_name = "@network-io", + .name = "@network-io", .value = "accept4\0" "accept\0" @@ -264,9 +276,10 @@ const SystemCallFilterSet syscall_filter_sets[] = { "socket\0" "socketcall\0" "socketpair\0" - }, { + }, + [SYSCALL_FILTER_SET_OBSOLETE] = { /* Unusual, obsolete or unimplemented, some unknown even to libseccomp */ - .set_name = "@obsolete", + .name = "@obsolete", .value = "_sysctl\0" "afs_syscall\0" @@ -292,9 +305,10 @@ const SystemCallFilterSet syscall_filter_sets[] = { "uselib\0" "ustat\0" "vserver\0" - }, { + }, + [SYSCALL_FILTER_SET_PRIVILEGED] = { /* Nice grab-bag of all system calls which need superuser capabilities */ - .set_name = "@privileged", + .name = "@privileged", .value = "@clock\0" "@module\0" @@ -333,9 +347,10 @@ const SystemCallFilterSet syscall_filter_sets[] = { "swapon\0" "sysctl\0" "vhangup\0" - }, { + }, + [SYSCALL_FILTER_SET_PROCESS] = { /* Process control, execution, namespaces */ - .set_name = "@process", + .name = "@process", .value = "arch_prctl\0" "clone\0" @@ -349,19 +364,66 @@ const SystemCallFilterSet syscall_filter_sets[] = { "tkill\0" "unshare\0" "vfork\0" - }, { + }, + [SYSCALL_FILTER_SET_RAW_IO] = { /* Raw I/O ports */ - .set_name = "@raw-io", + .name = "@raw-io", .value = "ioperm\0" "iopl\0" "pciconfig_iobase\0" "pciconfig_read\0" "pciconfig_write\0" +#ifdef __NR_s390_pci_mmio_read "s390_pci_mmio_read\0" +#endif +#ifdef __NR_s390_pci_mmio_write "s390_pci_mmio_write\0" - }, { - .set_name = NULL, - .value = NULL - } +#endif + }, }; + +const SyscallFilterSet *syscall_filter_set_find(const char *name) { + unsigned i; + + if (isempty(name) || name[0] != '@') + return NULL; + + for (i = 0; i < _SYSCALL_FILTER_SET_MAX; i++) + if (streq(syscall_filter_sets[i].name, name)) + return syscall_filter_sets + i; + + return NULL; +} + +int seccomp_add_syscall_filter_set(scmp_filter_ctx seccomp, const SyscallFilterSet *set, uint32_t action) { + const char *sys; + int r; + + assert(seccomp); + assert(set); + + NULSTR_FOREACH(sys, set->value) { + int id; + + if (sys[0] == '@') { + const SyscallFilterSet *other; + + other = syscall_filter_set_find(sys); + if (!other) + return -EINVAL; + + r = seccomp_add_syscall_filter_set(seccomp, other, action); + } else { + id = seccomp_syscall_resolve_name(sys); + if (id == __NR_SCMP_ERROR) + return -EINVAL; + + r = seccomp_rule_add(seccomp, action, id, 0); + } + if (r < 0) + return r; + } + + return 0; +} diff --git a/src/shared/seccomp-util.h b/src/shared/seccomp-util.h index cca7c17912..34fd49c122 100644 --- a/src/shared/seccomp-util.h +++ b/src/shared/seccomp-util.h @@ -29,9 +29,31 @@ int seccomp_add_secondary_archs(scmp_filter_ctx *c); bool is_seccomp_available(void); -typedef struct SystemCallFilterSet { - const char *set_name; +typedef struct SyscallFilterSet { + const char *name; const char *value; -} SystemCallFilterSet; - -extern const SystemCallFilterSet syscall_filter_sets[]; +} SyscallFilterSet; + +enum { + SYSCALL_FILTER_SET_CLOCK, + SYSCALL_FILTER_SET_CPU_EMULATION, + SYSCALL_FILTER_SET_DEBUG, + SYSCALL_FILTER_SET_DEFAULT, + SYSCALL_FILTER_SET_IO_EVENT, + SYSCALL_FILTER_SET_IPC, + SYSCALL_FILTER_SET_KEYRING, + SYSCALL_FILTER_SET_MODULE, + SYSCALL_FILTER_SET_MOUNT, + SYSCALL_FILTER_SET_NETWORK_IO, + SYSCALL_FILTER_SET_OBSOLETE, + SYSCALL_FILTER_SET_PRIVILEGED, + SYSCALL_FILTER_SET_PROCESS, + SYSCALL_FILTER_SET_RAW_IO, + _SYSCALL_FILTER_SET_MAX +}; + +extern const SyscallFilterSet syscall_filter_sets[]; + +const SyscallFilterSet *syscall_filter_set_find(const char *name); + +int seccomp_add_syscall_filter_set(scmp_filter_ctx seccomp, const SyscallFilterSet *set, uint32_t action); -- cgit v1.2.3-54-g00ecf From 8d7b0c8fd780e88ab5a6d1d79e09e27247245bee Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Fri, 21 Oct 2016 20:28:05 +0200 Subject: seccomp: add new seccomp_init_conservative() helper This adds a new seccomp_init_conservative() helper call that is mostly just a wrapper around seccomp_init(), but turns off NNP and adds in all secondary archs, for best compatibility with everything else. Pretty much all of our code used the very same constructs for these three steps, hence unifying this in one small function makes things a lot shorter. This also changes incorrect usage of the "scmp_filter_ctx" type at various places. libseccomp defines it as typedef to "void*", i.e. it is a pointer type (pretty poor choice already!) that casts implicitly to and from all other pointer types (even poorer choice: you defined a confusing type now, and don't even gain any bit of type safety through it...). A lot of the code assumed the type would refer to a structure, and hence aded additional "*" here and there. Remove that. --- src/core/execute.c | 88 ++++++++++----------------------------------- src/nspawn/nspawn-seccomp.c | 18 ++-------- src/shared/seccomp-util.c | 30 ++++++++++++++-- src/shared/seccomp-util.h | 4 ++- 4 files changed, 53 insertions(+), 87 deletions(-) (limited to 'src/shared/seccomp-util.c') diff --git a/src/core/execute.c b/src/core/execute.c index f435a079c7..668504c5cf 100644 --- a/src/core/execute.c +++ b/src/core/execute.c @@ -1197,7 +1197,7 @@ static bool skip_seccomp_unavailable(const Unit* u, const char* msg) { static int apply_seccomp(const Unit* u, const ExecContext *c) { uint32_t negative_action, action; - scmp_filter_ctx *seccomp; + scmp_filter_ctx seccomp; Iterator i; void *id; int r; @@ -1248,7 +1248,7 @@ finish: } static int apply_address_families(const Unit* u, const ExecContext *c) { - scmp_filter_ctx *seccomp; + scmp_filter_ctx seccomp; Iterator i; int r; @@ -1257,13 +1257,9 @@ static int apply_address_families(const Unit* u, const ExecContext *c) { if (skip_seccomp_unavailable(u, "RestrictAddressFamilies=")) return 0; - seccomp = seccomp_init(SCMP_ACT_ALLOW); - if (!seccomp) - return -ENOMEM; - - r = seccomp_add_secondary_archs(seccomp); + r = seccomp_init_conservative(&seccomp, SCMP_ACT_ALLOW); if (r < 0) - goto finish; + return r; if (c->address_families_whitelist) { int af, first = 0, last = 0; @@ -1360,10 +1356,6 @@ static int apply_address_families(const Unit* u, const ExecContext *c) { } } - r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0); - if (r < 0) - goto finish; - r = seccomp_load(seccomp); finish: @@ -1372,7 +1364,7 @@ finish: } static int apply_memory_deny_write_execute(const Unit* u, const ExecContext *c) { - scmp_filter_ctx *seccomp; + scmp_filter_ctx seccomp; int r; assert(c); @@ -1380,13 +1372,9 @@ static int apply_memory_deny_write_execute(const Unit* u, const ExecContext *c) if (skip_seccomp_unavailable(u, "MemoryDenyWriteExecute=")) return 0; - seccomp = seccomp_init(SCMP_ACT_ALLOW); - if (!seccomp) - return -ENOMEM; - - r = seccomp_add_secondary_archs(seccomp); + r = seccomp_init_conservative(&seccomp, SCMP_ACT_ALLOW); if (r < 0) - goto finish; + return r; r = seccomp_rule_add( seccomp, @@ -1406,10 +1394,6 @@ static int apply_memory_deny_write_execute(const Unit* u, const ExecContext *c) if (r < 0) goto finish; - r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0); - if (r < 0) - goto finish; - r = seccomp_load(seccomp); finish: @@ -1424,7 +1408,7 @@ static int apply_restrict_realtime(const Unit* u, const ExecContext *c) { SCHED_IDLE, }; - scmp_filter_ctx *seccomp; + scmp_filter_ctx seccomp; unsigned i; int r, p, max_policy = 0; @@ -1433,13 +1417,9 @@ static int apply_restrict_realtime(const Unit* u, const ExecContext *c) { if (skip_seccomp_unavailable(u, "RestrictRealtime=")) return 0; - seccomp = seccomp_init(SCMP_ACT_ALLOW); - if (!seccomp) - return -ENOMEM; - - r = seccomp_add_secondary_archs(seccomp); + r = seccomp_init_conservative(&seccomp, SCMP_ACT_ALLOW); if (r < 0) - goto finish; + return r; /* Determine the highest policy constant we want to allow */ for (i = 0; i < ELEMENTSOF(permitted_policies); i++) @@ -1483,10 +1463,6 @@ static int apply_restrict_realtime(const Unit* u, const ExecContext *c) { if (r < 0) goto finish; - r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0); - if (r < 0) - goto finish; - r = seccomp_load(seccomp); finish: @@ -1495,7 +1471,7 @@ finish: } static int apply_protect_sysctl(Unit *u, const ExecContext *c) { - scmp_filter_ctx *seccomp; + scmp_filter_ctx seccomp; int r; assert(c); @@ -1506,13 +1482,9 @@ static int apply_protect_sysctl(Unit *u, const ExecContext *c) { if (skip_seccomp_unavailable(u, "ProtectKernelTunables=")) return 0; - seccomp = seccomp_init(SCMP_ACT_ALLOW); - if (!seccomp) - return -ENOMEM; - - r = seccomp_add_secondary_archs(seccomp); + r = seccomp_init_conservative(&seccomp, SCMP_ACT_ALLOW); if (r < 0) - goto finish; + return r; r = seccomp_rule_add( seccomp, @@ -1522,10 +1494,6 @@ static int apply_protect_sysctl(Unit *u, const ExecContext *c) { if (r < 0) goto finish; - r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0); - if (r < 0) - goto finish; - r = seccomp_load(seccomp); finish: @@ -1534,9 +1502,7 @@ finish: } static int apply_protect_kernel_modules(Unit *u, const ExecContext *c) { - - scmp_filter_ctx *seccomp; - const char *sys; + scmp_filter_ctx seccomp; int r; assert(c); @@ -1546,22 +1512,14 @@ static int apply_protect_kernel_modules(Unit *u, const ExecContext *c) { if (skip_seccomp_unavailable(u, "ProtectKernelModules=")) return 0; - seccomp = seccomp_init(SCMP_ACT_ALLOW); - if (!seccomp) - return -ENOMEM; - - r = seccomp_add_secondary_archs(seccomp); + r = seccomp_init_conservative(&seccomp, SCMP_ACT_ALLOW); if (r < 0) - goto finish; + return r; r = seccomp_add_syscall_filter_set(seccomp, syscall_filter_sets + SYSCALL_FILTER_SET_MODULE, SCMP_ACT_ERRNO(EPERM)); if (r < 0) goto finish; - r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0); - if (r < 0) - goto finish; - r = seccomp_load(seccomp); finish: @@ -1570,7 +1528,7 @@ finish: } static int apply_private_devices(Unit *u, const ExecContext *c) { - scmp_filter_ctx *seccomp; + scmp_filter_ctx seccomp; int r; assert(c); @@ -1580,22 +1538,14 @@ static int apply_private_devices(Unit *u, const ExecContext *c) { if (skip_seccomp_unavailable(u, "PrivateDevices=")) return 0; - seccomp = seccomp_init(SCMP_ACT_ALLOW); - if (!seccomp) - return -ENOMEM; - - r = seccomp_add_secondary_archs(seccomp); + r = seccomp_init_conservative(&seccomp, SCMP_ACT_ALLOW); if (r < 0) - goto finish; + return r; r = seccomp_add_syscall_filter_set(seccomp, syscall_filter_sets + SYSCALL_FILTER_SET_RAW_IO, SCMP_ACT_ERRNO(EPERM)); if (r < 0) goto finish; - r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0); - if (r < 0) - goto finish; - r = seccomp_load(seccomp); finish: diff --git a/src/nspawn/nspawn-seccomp.c b/src/nspawn/nspawn-seccomp.c index 44a0b397ab..03a397d30c 100644 --- a/src/nspawn/nspawn-seccomp.c +++ b/src/nspawn/nspawn-seccomp.c @@ -135,15 +135,9 @@ int setup_seccomp(uint64_t cap_list_retain) { return 0; } - seccomp = seccomp_init(SCMP_ACT_ALLOW); - if (!seccomp) - return log_oom(); - - r = seccomp_add_secondary_archs(seccomp); - if (r < 0) { - log_error_errno(r, "Failed to add secondary archs to seccomp filter: %m"); - goto finish; - } + r = seccomp_init_conservative(&seccomp, SCMP_ACT_ALLOW); + if (r < 0) + return log_error_errno(r, "Failed to allocate seccomp object: %m"); r = seccomp_add_default_syscall_filter(seccomp, cap_list_retain); if (r < 0) @@ -171,12 +165,6 @@ int setup_seccomp(uint64_t cap_list_retain) { goto finish; } - r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0); - if (r < 0) { - log_error_errno(r, "Failed to unset NO_NEW_PRIVS: %m"); - goto finish; - } - r = seccomp_load(seccomp); if (r < 0) { log_error_errno(r, "Failed to install seccomp audit filter: %m"); diff --git a/src/shared/seccomp-util.c b/src/shared/seccomp-util.c index 1d51f3fd1f..0b9fa47c44 100644 --- a/src/shared/seccomp-util.c +++ b/src/shared/seccomp-util.c @@ -74,7 +74,34 @@ int seccomp_arch_from_string(const char *n, uint32_t *ret) { return 0; } -int seccomp_add_secondary_archs(scmp_filter_ctx *c) { +int seccomp_init_conservative(scmp_filter_ctx *ret, uint32_t default_action) { + scmp_filter_ctx seccomp; + int r; + + /* Much like seccomp_init(), but tries to be a bit more conservative in its defaults: all secondary archs are + * added by default, and NNP is turned off. */ + + seccomp = seccomp_init(default_action); + if (!seccomp) + return -ENOMEM; + + r = seccomp_add_secondary_archs(seccomp); + if (r < 0) + goto finish; + + r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0); + if (r < 0) + goto finish; + + *ret = seccomp; + return 0; + +finish: + seccomp_release(seccomp); + return r; +} + +int seccomp_add_secondary_archs(scmp_filter_ctx c) { #if defined(__i386__) || defined(__x86_64__) int r; @@ -111,7 +138,6 @@ int seccomp_add_secondary_archs(scmp_filter_ctx *c) { #endif return 0; - } static bool is_basic_seccomp_available(void) { diff --git a/src/shared/seccomp-util.h b/src/shared/seccomp-util.h index 34fd49c122..2de429a772 100644 --- a/src/shared/seccomp-util.h +++ b/src/shared/seccomp-util.h @@ -25,7 +25,9 @@ const char* seccomp_arch_to_string(uint32_t c); int seccomp_arch_from_string(const char *n, uint32_t *ret); -int seccomp_add_secondary_archs(scmp_filter_ctx *c); +int seccomp_init_conservative(scmp_filter_ctx *ret, uint32_t default_action); + +int seccomp_add_secondary_archs(scmp_filter_ctx c); bool is_seccomp_available(void); -- cgit v1.2.3-54-g00ecf From 60f547cf684d27e8c0e7ff44663650e90f9e0bcf Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Fri, 21 Oct 2016 21:15:43 +0200 Subject: seccomp: two fixes for the syscall set tables "oldumount()" is not a syscall, but simply a wrapper for it, the actual syscall nr is called "umount" (and the nr of umount() is called umount2 internally). "sysctl()" is not a syscall, but "_syscall()" is. Fix this in the table. Without these changes libseccomp cannot actually translate the tables in full. This wasn't noticed before as the code was written defensively for this case. --- src/shared/seccomp-util.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'src/shared/seccomp-util.c') diff --git a/src/shared/seccomp-util.c b/src/shared/seccomp-util.c index 0b9fa47c44..f1e9de05b2 100644 --- a/src/shared/seccomp-util.c +++ b/src/shared/seccomp-util.c @@ -272,7 +272,6 @@ const SyscallFilterSet syscall_filter_sets[_SYSCALL_FILTER_SET_MAX] = { .value = "chroot\0" "mount\0" - "oldumount\0" "pivot_root\0" "umount2\0" "umount\0" @@ -371,7 +370,7 @@ const SyscallFilterSet syscall_filter_sets[_SYSCALL_FILTER_SET_MAX] = { "setuid\0" "swapoff\0" "swapon\0" - "sysctl\0" + "_sysctl\0" "vhangup\0" }, [SYSCALL_FILTER_SET_PROCESS] = { -- cgit v1.2.3-54-g00ecf From a3be2849b2570482757f83181b999febbfc7bbef Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Fri, 21 Oct 2016 21:18:46 +0200 Subject: seccomp: add new helper call seccomp_load_filter_set() This allows us to unify most of the code in apply_protect_kernel_modules() and apply_private_devices(). --- src/core/execute.c | 34 ++-------------------------------- src/shared/seccomp-util.c | 24 ++++++++++++++++++++++++ src/shared/seccomp-util.h | 2 ++ 3 files changed, 28 insertions(+), 32 deletions(-) (limited to 'src/shared/seccomp-util.c') diff --git a/src/core/execute.c b/src/core/execute.c index 668504c5cf..5e7d7c25d7 100644 --- a/src/core/execute.c +++ b/src/core/execute.c @@ -1502,9 +1502,6 @@ finish: } static int apply_protect_kernel_modules(Unit *u, const ExecContext *c) { - scmp_filter_ctx seccomp; - int r; - assert(c); /* Turn off module syscalls on ProtectKernelModules=yes */ @@ -1512,25 +1509,10 @@ static int apply_protect_kernel_modules(Unit *u, const ExecContext *c) { if (skip_seccomp_unavailable(u, "ProtectKernelModules=")) return 0; - r = seccomp_init_conservative(&seccomp, SCMP_ACT_ALLOW); - if (r < 0) - return r; - - r = seccomp_add_syscall_filter_set(seccomp, syscall_filter_sets + SYSCALL_FILTER_SET_MODULE, SCMP_ACT_ERRNO(EPERM)); - if (r < 0) - goto finish; - - r = seccomp_load(seccomp); - -finish: - seccomp_release(seccomp); - return r; + return seccomp_load_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_MODULE, SCMP_ACT_ERRNO(EPERM)); } static int apply_private_devices(Unit *u, const ExecContext *c) { - scmp_filter_ctx seccomp; - int r; - assert(c); /* If PrivateDevices= is set, also turn off iopl and all @raw-io syscalls. */ @@ -1538,19 +1520,7 @@ static int apply_private_devices(Unit *u, const ExecContext *c) { if (skip_seccomp_unavailable(u, "PrivateDevices=")) return 0; - r = seccomp_init_conservative(&seccomp, SCMP_ACT_ALLOW); - if (r < 0) - return r; - - r = seccomp_add_syscall_filter_set(seccomp, syscall_filter_sets + SYSCALL_FILTER_SET_RAW_IO, SCMP_ACT_ERRNO(EPERM)); - if (r < 0) - goto finish; - - r = seccomp_load(seccomp); - -finish: - seccomp_release(seccomp); - return r; + return seccomp_load_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_RAW_IO, SCMP_ACT_ERRNO(EPERM)); } #endif diff --git a/src/shared/seccomp-util.c b/src/shared/seccomp-util.c index f1e9de05b2..6252cd16a6 100644 --- a/src/shared/seccomp-util.c +++ b/src/shared/seccomp-util.c @@ -452,3 +452,27 @@ int seccomp_add_syscall_filter_set(scmp_filter_ctx seccomp, const SyscallFilterS return 0; } + +int seccomp_load_filter_set(uint32_t default_action, const SyscallFilterSet *set, uint32_t action) { + scmp_filter_ctx seccomp; + int r; + + assert(set); + + /* The one-stop solution: allocate a seccomp object, add a filter to it, and apply it */ + + r = seccomp_init_conservative(&seccomp, default_action); + if (r < 0) + return r; + + r = seccomp_add_syscall_filter_set(seccomp, set, action); + if (r < 0) + goto finish; + + r = seccomp_load(seccomp); + +finish: + seccomp_release(seccomp); + return r; + +} diff --git a/src/shared/seccomp-util.h b/src/shared/seccomp-util.h index 2de429a772..667687b14f 100644 --- a/src/shared/seccomp-util.h +++ b/src/shared/seccomp-util.h @@ -59,3 +59,5 @@ extern const SyscallFilterSet syscall_filter_sets[]; const SyscallFilterSet *syscall_filter_set_find(const char *name); int seccomp_add_syscall_filter_set(scmp_filter_ctx seccomp, const SyscallFilterSet *set, uint32_t action); + +int seccomp_load_filter_set(uint32_t default_action, const SyscallFilterSet *set, uint32_t action); -- cgit v1.2.3-54-g00ecf From aa34055ffbc0d862333c47023b56ee55d813c2a6 Mon Sep 17 00:00:00 2001 From: Zbigniew Jędrzejewski-Szmek Date: Tue, 1 Nov 2016 11:33:18 -0400 Subject: seccomp: allow specifying arm64, mips, ppc (#4491) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit "Secondary arch" table for mips is entirely speculative… --- man/systemd.exec.xml | 41 +++++++-------- src/shared/seccomp-util.c | 129 +++++++++++++++++++++++++++++++++------------- src/test/test-seccomp.c | 33 ++++++++++++ 3 files changed, 145 insertions(+), 58 deletions(-) (limited to 'src/shared/seccomp-util.c') diff --git a/man/systemd.exec.xml b/man/systemd.exec.xml index d3a19c505d..11029ca186 100644 --- a/man/systemd.exec.xml +++ b/man/systemd.exec.xml @@ -1396,28 +1396,25 @@ SystemCallArchitectures= - Takes a space-separated list of architecture - identifiers to include in the system call filter. The known - architecture identifiers are x86, - x86-64, x32, - arm, s390, - s390x as well as the special identifier - native. Only system calls of the - specified architectures will be permitted to processes of this - unit. This is an effective way to disable compatibility with - non-native architectures for processes, for example to - prohibit execution of 32-bit x86 binaries on 64-bit x86-64 - systems. The special native identifier - implicitly maps to the native architecture of the system (or - more strictly: to the architecture the system manager is - compiled for). If running in user mode, or in system mode, - but without the CAP_SYS_ADMIN - capability (e.g. setting User=nobody), - NoNewPrivileges=yes is implied. Note - that setting this option to a non-empty list implies that - native is included too. By default, this - option is set to the empty list, i.e. no architecture system - call filtering is applied. + Takes a space-separated list of architecture identifiers to + include in the system call filter. The known architecture identifiers are the same + as for ConditionArchitecture= described in + systemd.unit5, + as well as x32, mips64-n32, + mips64-le-n32, and the special identifier + native. Only system calls of the specified architectures will + be permitted to processes of this unit. This is an effective way to disable + compatibility with non-native architectures for processes, for example to prohibit + execution of 32-bit x86 binaries on 64-bit x86-64 systems. The special + native identifier implicitly maps to the native architecture + of the system (or more strictly: to the architecture the system manager is + compiled for). If running in user mode, or in system mode, but without the + CAP_SYS_ADMIN capability (e.g. setting + User=nobody), NoNewPrivileges=yes is + implied. Note that setting this option to a non-empty list implies that + native is included too. By default, this option is set to the + empty list, i.e. no architecture system call filtering is applied. + diff --git a/src/shared/seccomp-util.c b/src/shared/seccomp-util.c index 6252cd16a6..1cbbb9d757 100644 --- a/src/shared/seccomp-util.c +++ b/src/shared/seccomp-util.c @@ -29,23 +29,49 @@ #include "util.h" const char* seccomp_arch_to_string(uint32_t c) { + /* Maintain order used in . + * + * Names used here should be the same as those used for ConditionArchitecture=, + * except for "subarchitectures" like x32. */ - if (c == SCMP_ARCH_NATIVE) + switch(c) { + case SCMP_ARCH_NATIVE: return "native"; - if (c == SCMP_ARCH_X86) + case SCMP_ARCH_X86: return "x86"; - if (c == SCMP_ARCH_X86_64) + case SCMP_ARCH_X86_64: return "x86-64"; - if (c == SCMP_ARCH_X32) + case SCMP_ARCH_X32: return "x32"; - if (c == SCMP_ARCH_ARM) + case SCMP_ARCH_ARM: return "arm"; - if (c == SCMP_ARCH_S390) + case SCMP_ARCH_AARCH64: + return "arm64"; + case SCMP_ARCH_MIPS: + return "mips"; + case SCMP_ARCH_MIPS64: + return "mips64"; + case SCMP_ARCH_MIPS64N32: + return "mips64-n32"; + case SCMP_ARCH_MIPSEL: + return "mips-le"; + case SCMP_ARCH_MIPSEL64: + return "mips64-le"; + case SCMP_ARCH_MIPSEL64N32: + return "mips64-le-n32"; + case SCMP_ARCH_PPC: + return "ppc"; + case SCMP_ARCH_PPC64: + return "ppc64"; + case SCMP_ARCH_PPC64LE: + return "ppc64-le"; + case SCMP_ARCH_S390: return "s390"; - if (c == SCMP_ARCH_S390X) + case SCMP_ARCH_S390X: return "s390x"; - - return NULL; + default: + return NULL; + } } int seccomp_arch_from_string(const char *n, uint32_t *ret) { @@ -64,6 +90,26 @@ int seccomp_arch_from_string(const char *n, uint32_t *ret) { *ret = SCMP_ARCH_X32; else if (streq(n, "arm")) *ret = SCMP_ARCH_ARM; + else if (streq(n, "arm64")) + *ret = SCMP_ARCH_AARCH64; + else if (streq(n, "mips")) + *ret = SCMP_ARCH_MIPS; + else if (streq(n, "mips64")) + *ret = SCMP_ARCH_MIPS64; + else if (streq(n, "mips64-n32")) + *ret = SCMP_ARCH_MIPS64N32; + else if (streq(n, "mips-le")) + *ret = SCMP_ARCH_MIPSEL; + else if (streq(n, "mips64-le")) + *ret = SCMP_ARCH_MIPSEL64; + else if (streq(n, "mips64-le-n32")) + *ret = SCMP_ARCH_MIPSEL64N32; + else if (streq(n, "ppc")) + *ret = SCMP_ARCH_PPC; + else if (streq(n, "ppc64")) + *ret = SCMP_ARCH_PPC64; + else if (streq(n, "ppc64-le")) + *ret = SCMP_ARCH_PPC64LE; else if (streq(n, "s390")) *ret = SCMP_ARCH_S390; else if (streq(n, "s390x")) @@ -101,41 +147,52 @@ finish: return r; } -int seccomp_add_secondary_archs(scmp_filter_ctx c) { - -#if defined(__i386__) || defined(__x86_64__) - int r; +int seccomp_add_secondary_archs(scmp_filter_ctx ctx) { /* Add in all possible secondary archs we are aware of that * this kernel might support. */ - r = seccomp_arch_add(c, SCMP_ARCH_X86); - if (r < 0 && r != -EEXIST) - return r; - - r = seccomp_arch_add(c, SCMP_ARCH_X86_64); - if (r < 0 && r != -EEXIST) - return r; - - r = seccomp_arch_add(c, SCMP_ARCH_X32); - if (r < 0 && r != -EEXIST) - return r; + static const int seccomp_arches[] = { +#if defined(__i386__) || defined(__x86_64__) + SCMP_ARCH_X86, + SCMP_ARCH_X86_64, + SCMP_ARCH_X32, + +#elif defined(__arm__) || defined(__aarch64__) + SCMP_ARCH_ARM, + SCMP_ARCH_AARCH64, + +#elif defined(__arm__) || defined(__aarch64__) + SCMP_ARCH_ARM, + SCMP_ARCH_AARCH64, + +#elif defined(__mips__) || defined(__mips64__) + SCMP_ARCH_MIPS, + SCMP_ARCH_MIPS64, + SCMP_ARCH_MIPS64N32, + SCMP_ARCH_MIPSEL, + SCMP_ARCH_MIPSEL64, + SCMP_ARCH_MIPSEL64N32, + +#elif defined(__powerpc__) || defined(__powerpc64__) + SCMP_ARCH_PPC, + SCMP_ARCH_PPC64, + SCMP_ARCH_PPC64LE, #elif defined(__s390__) || defined(__s390x__) - int r; - - /* Add in all possible secondary archs we are aware of that - * this kernel might support. */ - - r = seccomp_arch_add(c, SCMP_ARCH_S390); - if (r < 0 && r != -EEXIST) - return r; + SCMP_ARCH_S390, + SCMP_ARCH_S390X, +#endif + }; - r = seccomp_arch_add(c, SCMP_ARCH_S390X); - if (r < 0 && r != -EEXIST) - return r; + unsigned i; + int r; -#endif + for (i = 0; i < ELEMENTSOF(seccomp_arches); i++) { + r = seccomp_arch_add(ctx, seccomp_arches[i]); + if (r < 0 && r != -EEXIST) + return r; + } return 0; } diff --git a/src/test/test-seccomp.c b/src/test/test-seccomp.c index 0060ecdf02..43d1567288 100644 --- a/src/test/test-seccomp.c +++ b/src/test/test-seccomp.c @@ -25,6 +25,8 @@ #include "macro.h" #include "process-util.h" #include "seccomp-util.h" +#include "string-util.h" +#include "util.h" static void test_seccomp_arch_to_string(void) { uint32_t a, b; @@ -38,6 +40,36 @@ static void test_seccomp_arch_to_string(void) { assert_se(a == b); } +static void test_architecture_table(void) { + const char *n, *n2; + + NULSTR_FOREACH(n, + "native\0" + "x86\0" + "x86-64\0" + "x32\0" + "arm\0" + "arm64\0" + "mips\0" + "mips64\0" + "mips64-n32\0" + "mips-le\0" + "mips64-le\0" + "mips64-le-n32\0" + "ppc\0" + "ppc64\0" + "ppc64-le\0" + "s390\0" + "s390x\0") { + uint32_t c; + + assert_se(seccomp_arch_from_string(n, &c) >= 0); + n2 = seccomp_arch_to_string(c); + log_info("seccomp-arch: %s → 0x%"PRIx32" → %s", n, c, n2); + assert_se(streq_ptr(n, n2)); + } +} + static void test_syscall_filter_set_find(void) { assert_se(!syscall_filter_set_find(NULL)); assert_se(!syscall_filter_set_find("")); @@ -96,6 +128,7 @@ static void test_filter_sets(void) { int main(int argc, char *argv[]) { test_seccomp_arch_to_string(); + test_architecture_table(); test_syscall_filter_set_find(); test_filter_sets(); -- cgit v1.2.3-54-g00ecf From c79aff9a82abf361aea47b5c745ed9729c5f0212 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Tue, 25 Oct 2016 15:38:36 +0200 Subject: seccomp: add clock query and sleeping syscalls to "@default" group Timing and sleep are so basic operations, it makes very little sense to ever block them, hence don't. --- man/systemd.exec.xml | 38 ++++++++++++++------------------------ src/shared/seccomp-util.c | 9 ++++++++- 2 files changed, 22 insertions(+), 25 deletions(-) (limited to 'src/shared/seccomp-util.c') diff --git a/man/systemd.exec.xml b/man/systemd.exec.xml index 11029ca186..e7d8bb23a4 100644 --- a/man/systemd.exec.xml +++ b/man/systemd.exec.xml @@ -1255,30 +1255,20 @@ SystemCallFilter= - Takes a space-separated list of system call - names. If this setting is used, all system calls executed by - the unit processes except for the listed ones will result in - immediate process termination with the - SIGSYS signal (whitelisting). If the - first character of the list is ~, the - effect is inverted: only the listed system calls will result - in immediate process termination (blacklisting). If running in - user mode, or in system mode, but without the - CAP_SYS_ADMIN capability (e.g. setting - User=nobody), - NoNewPrivileges=yes is implied. This - feature makes use of the Secure Computing Mode 2 interfaces of - the kernel ('seccomp filtering') and is useful for enforcing a - minimal sandboxing environment. Note that the - execve, - rt_sigreturn, - sigreturn, - exit_group, exit - system calls are implicitly whitelisted and do not need to be - listed explicitly. This option may be specified more than once, - in which case the filter masks are merged. If the empty string - is assigned, the filter is reset, all prior assignments will - have no effect. This does not affect commands prefixed with +. + Takes a space-separated list of system call names. If this setting is used, all system calls + executed by the unit processes except for the listed ones will result in immediate process termination with the + SIGSYS signal (whitelisting). If the first character of the list is ~, + the effect is inverted: only the listed system calls will result in immediate process termination + (blacklisting). If running in user mode, or in system mode, but without the CAP_SYS_ADMIN + capability (e.g. setting User=nobody), NoNewPrivileges=yes is + implied. This feature makes use of the Secure Computing Mode 2 interfaces of the kernel ('seccomp filtering') + and is useful for enforcing a minimal sandboxing environment. Note that the execve, + exit, exit_group, getrlimit, + rt_sigreturn, sigreturn system calls and the system calls for + querying time and sleeping are implicitly whitelisted and do not need to be listed explicitly. This option may + be specified more than once, in which case the filter masks are merged. If the empty string is assigned, the + filter is reset, all prior assignments will have no effect. This does not affect commands prefixed with + +. If you specify both types of this option (i.e. whitelisting and blacklisting), the first encountered will diff --git a/src/shared/seccomp-util.c b/src/shared/seccomp-util.c index 1cbbb9d757..ad5782fb29 100644 --- a/src/shared/seccomp-util.c +++ b/src/shared/seccomp-util.c @@ -253,15 +253,22 @@ const SyscallFilterSet syscall_filter_sets[_SYSCALL_FILTER_SET_MAX] = { "sys_debug_setcontext\0" }, [SYSCALL_FILTER_SET_DEFAULT] = { - /* Default list */ + /* Default list: the most basic of operations */ .name = "@default", .value = + "clock_getres\0" + "clock_gettime\0" + "clock_nanosleep\0" "execve\0" "exit\0" "exit_group\0" "getrlimit\0" /* make sure processes can query stack size and such */ + "gettimeofday\0" + "nanosleep\0" + "pause\0" "rt_sigreturn\0" "sigreturn\0" + "time\0" }, [SYSCALL_FILTER_SET_IO_EVENT] = { /* Event loop use */ -- cgit v1.2.3-54-g00ecf From a8c157ff3081ee963adb0d046015abf9a271fa67 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Tue, 25 Oct 2016 15:42:10 +0200 Subject: seccomp: drop execve() from @process list The system call is already part in @default hence implicitly allowed anyway. Also, if it is actually blocked then systemd couldn't execute the service in question anymore, since the application of seccomp is immediately followed by it. --- man/systemd.exec.xml | 2 +- src/shared/seccomp-util.c | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) (limited to 'src/shared/seccomp-util.c') diff --git a/man/systemd.exec.xml b/man/systemd.exec.xml index e7d8bb23a4..d45e5362dc 100644 --- a/man/systemd.exec.xml +++ b/man/systemd.exec.xml @@ -1347,7 +1347,7 @@ @process - Process control, execution, namespaces (execve2, kill2, namespaces7, … + Process control, execution, namespaces (clone2, kill2, namespaces7, … @raw-io diff --git a/src/shared/seccomp-util.c b/src/shared/seccomp-util.c index ad5782fb29..70723e9e4e 100644 --- a/src/shared/seccomp-util.c +++ b/src/shared/seccomp-util.c @@ -443,7 +443,6 @@ const SyscallFilterSet syscall_filter_sets[_SYSCALL_FILTER_SET_MAX] = { .value = "arch_prctl\0" "clone\0" - "execve\0" "execveat\0" "fork\0" "kill\0" -- cgit v1.2.3-54-g00ecf From cd5bfd7e60c08cfad41bcf881f550c424b2f3e44 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Tue, 25 Oct 2016 15:43:31 +0200 Subject: seccomp: include pipes and memfd in @ipc These system calls clearly fall in the @ipc category, hence should be listed there, simply to avoid confusion and surprise by the user. --- man/systemd.exec.xml | 2 +- src/shared/seccomp-util.c | 5 ++++- 2 files changed, 5 insertions(+), 2 deletions(-) (limited to 'src/shared/seccomp-util.c') diff --git a/man/systemd.exec.xml b/man/systemd.exec.xml index d45e5362dc..466511aaf3 100644 --- a/man/systemd.exec.xml +++ b/man/systemd.exec.xml @@ -1319,7 +1319,7 @@ @ipc - SysV IPC, POSIX Message Queues or other IPC (mq_overview7, svipc7) + Pipes, SysV IPC, POSIX Message Queues and other IPC (mq_overview7, svipc7) @keyring diff --git a/src/shared/seccomp-util.c b/src/shared/seccomp-util.c index 70723e9e4e..e0a61aa358 100644 --- a/src/shared/seccomp-util.c +++ b/src/shared/seccomp-util.c @@ -290,9 +290,10 @@ const SyscallFilterSet syscall_filter_sets[_SYSCALL_FILTER_SET_MAX] = { "select\0" }, [SYSCALL_FILTER_SET_IPC] = { - /* Message queues, SYSV IPC or other IPC: unusual */ + /* Message queues, SYSV IPC or other IPC */ .name = "@ipc", .value = "ipc\0" + "memfd_create\0" "mq_getsetattr\0" "mq_notify\0" "mq_open\0" @@ -303,6 +304,8 @@ const SyscallFilterSet syscall_filter_sets[_SYSCALL_FILTER_SET_MAX] = { "msgget\0" "msgrcv\0" "msgsnd\0" + "pipe2\0" + "pipe\0" "process_vm_readv\0" "process_vm_writev\0" "semctl\0" -- cgit v1.2.3-54-g00ecf From 133ddbbeae74fc06173633605b3e612e934bc2dd Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Wed, 2 Nov 2016 08:46:18 -0600 Subject: seccomp: add two new syscall groups @resources contains various syscalls that alter resource limits and memory and scheduling parameters of processes. As such they are good candidates to block for most services. @basic-io contains a number of basic syscalls for I/O, similar to the list seccomp v1 permitted but slightly more complete. It should be useful for building basic whitelisting for minimal sandboxes --- man/systemd.exec.xml | 8 ++++++++ src/shared/seccomp-util.c | 34 ++++++++++++++++++++++++++++++++++ src/shared/seccomp-util.h | 2 ++ 3 files changed, 44 insertions(+) (limited to 'src/shared/seccomp-util.c') diff --git a/man/systemd.exec.xml b/man/systemd.exec.xml index 3b80bcccd0..7daa3ae78e 100644 --- a/man/systemd.exec.xml +++ b/man/systemd.exec.xml @@ -1301,6 +1301,10 @@ + + @basic-io + System calls for basic I/O: reading, writing, seeking, file descriptor duplication and closing (read2, write2, and related calls) + @clock System calls for changing the system clock (adjtimex2, settimeofday2, and related calls) @@ -1353,6 +1357,10 @@ @raw-io Raw I/O port access (ioperm2, iopl2, pciconfig_read(), …) + + @resources + System calls for changing resource limits, memory and scheduling parameters (setrlimit2, setpriority2, …) + diff --git a/src/shared/seccomp-util.c b/src/shared/seccomp-util.c index e0a61aa358..c9b24f1065 100644 --- a/src/shared/seccomp-util.c +++ b/src/shared/seccomp-util.c @@ -217,6 +217,24 @@ bool is_seccomp_available(void) { } const SyscallFilterSet syscall_filter_sets[_SYSCALL_FILTER_SET_MAX] = { + [SYSCALL_FILTER_SET_BASIC_IO] = { + /* Basic IO */ + .name = "@basic-io", + .value = + "close\0" + "dup2\0" + "dup3\0" + "dup\0" + "lseek\0" + "pread64\0" + "preadv\0" + "pwrite64\0" + "pwritev\0" + "read\0" + "readv\0" + "write\0" + "writev\0" + }, [SYSCALL_FILTER_SET_CLOCK] = { /* Clock */ .name = "@clock", @@ -472,6 +490,22 @@ const SyscallFilterSet syscall_filter_sets[_SYSCALL_FILTER_SET_MAX] = { "s390_pci_mmio_write\0" #endif }, + [SYSCALL_FILTER_SET_RESOURCES] = { + /* Alter resource settings */ + .name = "@resources", + .value = + "sched_setparam\0" + "sched_setscheduler\0" + "sched_setaffinity\0" + "setpriority\0" + "setrlimit\0" + "set_mempolicy\0" + "migrate_pages\0" + "move_pages\0" + "mbind\0" + "sched_setattr\0" + "prlimit64\0" + }, }; const SyscallFilterSet *syscall_filter_set_find(const char *name) { diff --git a/src/shared/seccomp-util.h b/src/shared/seccomp-util.h index 8050fc6fbf..8e209efef2 100644 --- a/src/shared/seccomp-util.h +++ b/src/shared/seccomp-util.h @@ -38,6 +38,7 @@ typedef struct SyscallFilterSet { } SyscallFilterSet; enum { + SYSCALL_FILTER_SET_BASIC_IO, SYSCALL_FILTER_SET_CLOCK, SYSCALL_FILTER_SET_CPU_EMULATION, SYSCALL_FILTER_SET_DEBUG, @@ -52,6 +53,7 @@ enum { SYSCALL_FILTER_SET_PRIVILEGED, SYSCALL_FILTER_SET_PROCESS, SYSCALL_FILTER_SET_RAW_IO, + SYSCALL_FILTER_SET_RESOURCES, _SYSCALL_FILTER_SET_MAX }; -- cgit v1.2.3-54-g00ecf