diff options
-rw-r--r-- | man/systemd.exec.xml | 8 | ||||
-rw-r--r-- | src/core/execute.c | 482 | ||||
-rw-r--r-- | src/core/main.c | 34 | ||||
-rw-r--r-- | src/nspawn/nspawn-seccomp.c | 113 | ||||
-rw-r--r-- | src/shared/seccomp-util.c | 686 | ||||
-rw-r--r-- | src/shared/seccomp-util.h | 27 | ||||
-rw-r--r-- | src/test/test-execute.c | 1 | ||||
-rw-r--r-- | src/test/test-seccomp.c | 272 |
8 files changed, 1048 insertions, 575 deletions
diff --git a/man/systemd.exec.xml b/man/systemd.exec.xml index 7bdac1c1d5..07da57e11a 100644 --- a/man/systemd.exec.xml +++ b/man/systemd.exec.xml @@ -1435,9 +1435,17 @@ <entry>Raw I/O port access (<citerefentry project='man-pages'><refentrytitle>ioperm</refentrytitle><manvolnum>2</manvolnum></citerefentry>, <citerefentry project='man-pages'><refentrytitle>iopl</refentrytitle><manvolnum>2</manvolnum></citerefentry>, <function>pciconfig_read()</function>, …)</entry> </row> <row> + <entry>@reboot</entry> + <entry>System calls for rebooting and reboot preparation (<citerefentry project='man-pages'><refentrytitle>reboot</refentrytitle><manvolnum>2</manvolnum></citerefentry>, <function>kexec()</function>, …)</entry> + </row> + <row> <entry>@resources</entry> <entry>System calls for changing resource limits, memory and scheduling parameters (<citerefentry project='man-pages'><refentrytitle>setrlimit</refentrytitle><manvolnum>2</manvolnum></citerefentry>, <citerefentry project='man-pages'><refentrytitle>setpriority</refentrytitle><manvolnum>2</manvolnum></citerefentry>, …)</entry> </row> + <row> + <entry>@swap</entry> + <entry>System calls for enabling/disabling swap devices (<citerefentry project='man-pages'><refentrytitle>swapon</refentrytitle><manvolnum>2</manvolnum></citerefentry>, <citerefentry project='man-pages'><refentrytitle>swapoff</refentrytitle><manvolnum>2</manvolnum></citerefentry>)</entry> + </row> </tbody> </tgroup> </table> diff --git a/src/core/execute.c b/src/core/execute.c index 3a7f997522..06a291fd39 100644 --- a/src/core/execute.c +++ b/src/core/execute.c @@ -1259,6 +1259,41 @@ static void rename_process_from_path(const char *path) { rename_process(process_name); } +static bool context_has_address_families(const ExecContext *c) { + assert(c); + + return c->address_families_whitelist || + !set_isempty(c->address_families); +} + +static bool context_has_syscall_filters(const ExecContext *c) { + assert(c); + + return c->syscall_whitelist || + 
!set_isempty(c->syscall_filter); +} + +static bool context_has_no_new_privileges(const ExecContext *c) { + assert(c); + + if (c->no_new_privileges) + return true; + + if (have_effective_cap(CAP_SYS_ADMIN)) /* if we are privileged, we don't need NNP */ + return false; + + /* We need NNP if we have any form of seccomp and are unprivileged */ + return context_has_address_families(c) || + c->memory_deny_write_execute || + c->restrict_realtime || + exec_context_restrict_namespaces_set(c) || + c->protect_kernel_tunables || + c->protect_kernel_modules || + c->private_devices || + context_has_syscall_filters(c) || + !set_isempty(c->syscall_archs); +} + #ifdef HAVE_SECCOMP static bool skip_seccomp_unavailable(const Unit* u, const char* msg) { @@ -1272,344 +1307,131 @@ static bool skip_seccomp_unavailable(const Unit* u, const char* msg) { return true; } -static int apply_seccomp(const Unit* u, const ExecContext *c) { - uint32_t negative_action, action; - scmp_filter_ctx seccomp; - Iterator i; - void *id; - int r; +static int apply_syscall_filter(const Unit* u, const ExecContext *c) { + uint32_t negative_action, default_action, action; + assert(u); assert(c); - if (skip_seccomp_unavailable(u, "syscall filtering")) + if (!context_has_syscall_filters(c)) return 0; - negative_action = c->syscall_errno == 0 ? SCMP_ACT_KILL : SCMP_ACT_ERRNO(c->syscall_errno); - - seccomp = seccomp_init(c->syscall_whitelist ? negative_action : SCMP_ACT_ALLOW); - if (!seccomp) - return -ENOMEM; - - if (c->syscall_archs) { + if (skip_seccomp_unavailable(u, "SystemCallFilter=")) + return 0; - SET_FOREACH(id, c->syscall_archs, i) { - r = seccomp_arch_add(seccomp, PTR_TO_UINT32(id) - 1); - if (r == -EEXIST) - continue; - if (r < 0) - goto finish; - } + negative_action = c->syscall_errno == 0 ? 
SCMP_ACT_KILL : SCMP_ACT_ERRNO(c->syscall_errno); + if (c->syscall_whitelist) { + default_action = negative_action; + action = SCMP_ACT_ALLOW; } else { - r = seccomp_add_secondary_archs(seccomp); - if (r < 0) - goto finish; + default_action = SCMP_ACT_ALLOW; + action = negative_action; } - action = c->syscall_whitelist ? SCMP_ACT_ALLOW : negative_action; - SET_FOREACH(id, c->syscall_filter, i) { - r = seccomp_rule_add(seccomp, action, PTR_TO_INT(id) - 1, 0); - if (r < 0) - goto finish; - } - - r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0); - if (r < 0) - goto finish; - - r = seccomp_load(seccomp); - -finish: - seccomp_release(seccomp); - return r; + return seccomp_load_syscall_filter_set_raw(default_action, c->syscall_filter, action); } -static int apply_address_families(const Unit* u, const ExecContext *c) { - scmp_filter_ctx seccomp; - Iterator i; - int r; - +static int apply_syscall_archs(const Unit *u, const ExecContext *c) { + assert(u); assert(c); - if (skip_seccomp_unavailable(u, "RestrictAddressFamilies=")) + if (set_isempty(c->syscall_archs)) return 0; - r = seccomp_init_conservative(&seccomp, SCMP_ACT_ALLOW); - if (r < 0) - return r; - - if (c->address_families_whitelist) { - int af, first = 0, last = 0; - void *afp; - - /* If this is a whitelist, we first block the address - * families that are out of range and then everything - * that is not in the set. First, we find the lowest - * and highest address family in the set. 
*/ - - SET_FOREACH(afp, c->address_families, i) { - af = PTR_TO_INT(afp); - - if (af <= 0 || af >= af_max()) - continue; - - if (first == 0 || af < first) - first = af; - - if (last == 0 || af > last) - last = af; - } - - assert((first == 0) == (last == 0)); - - if (first == 0) { - - /* No entries in the valid range, block everything */ - r = seccomp_rule_add( - seccomp, - SCMP_ACT_ERRNO(EPROTONOSUPPORT), - SCMP_SYS(socket), - 0); - if (r < 0) - goto finish; - - } else { + if (skip_seccomp_unavailable(u, "SystemCallArchitectures=")) + return 0; - /* Block everything below the first entry */ - r = seccomp_rule_add( - seccomp, - SCMP_ACT_ERRNO(EPROTONOSUPPORT), - SCMP_SYS(socket), - 1, - SCMP_A0(SCMP_CMP_LT, first)); - if (r < 0) - goto finish; - - /* Block everything above the last entry */ - r = seccomp_rule_add( - seccomp, - SCMP_ACT_ERRNO(EPROTONOSUPPORT), - SCMP_SYS(socket), - 1, - SCMP_A0(SCMP_CMP_GT, last)); - if (r < 0) - goto finish; - - /* Block everything between the first and last - * entry */ - for (af = 1; af < af_max(); af++) { - - if (set_contains(c->address_families, INT_TO_PTR(af))) - continue; + return seccomp_restrict_archs(c->syscall_archs); +} - r = seccomp_rule_add( - seccomp, - SCMP_ACT_ERRNO(EPROTONOSUPPORT), - SCMP_SYS(socket), - 1, - SCMP_A0(SCMP_CMP_EQ, af)); - if (r < 0) - goto finish; - } - } +static int apply_address_families(const Unit* u, const ExecContext *c) { + assert(u); + assert(c); - } else { - void *af; - - /* If this is a blacklist, then generate one rule for - * each address family that are then combined in OR - * checks. 
*/ - - SET_FOREACH(af, c->address_families, i) { - - r = seccomp_rule_add( - seccomp, - SCMP_ACT_ERRNO(EPROTONOSUPPORT), - SCMP_SYS(socket), - 1, - SCMP_A0(SCMP_CMP_EQ, PTR_TO_INT(af))); - if (r < 0) - goto finish; - } - } + if (!context_has_address_families(c)) + return 0; - r = seccomp_load(seccomp); + if (skip_seccomp_unavailable(u, "RestrictAddressFamilies=")) + return 0; -finish: - seccomp_release(seccomp); - return r; + return seccomp_restrict_address_families(c->address_families, c->address_families_whitelist); } static int apply_memory_deny_write_execute(const Unit* u, const ExecContext *c) { - scmp_filter_ctx seccomp; - int r; - + assert(u); assert(c); - if (skip_seccomp_unavailable(u, "MemoryDenyWriteExecute=")) + if (!c->memory_deny_write_execute) return 0; - r = seccomp_init_conservative(&seccomp, SCMP_ACT_ALLOW); - if (r < 0) - return r; - - r = seccomp_rule_add( - seccomp, - SCMP_ACT_ERRNO(EPERM), - SCMP_SYS(mmap), - 1, - SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC|PROT_WRITE, PROT_EXEC|PROT_WRITE)); - if (r < 0) - goto finish; - - r = seccomp_rule_add( - seccomp, - SCMP_ACT_ERRNO(EPERM), - SCMP_SYS(mprotect), - 1, - SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC, PROT_EXEC)); - if (r < 0) - goto finish; - - r = seccomp_rule_add( - seccomp, - SCMP_ACT_ERRNO(EPERM), - SCMP_SYS(shmat), - 1, - SCMP_A2(SCMP_CMP_MASKED_EQ, SHM_EXEC, SHM_EXEC)); - if (r < 0) - goto finish; - - r = seccomp_load(seccomp); + if (skip_seccomp_unavailable(u, "MemoryDenyWriteExecute=")) + return 0; -finish: - seccomp_release(seccomp); - return r; + return seccomp_memory_deny_write_execute(); } static int apply_restrict_realtime(const Unit* u, const ExecContext *c) { - static const int permitted_policies[] = { - SCHED_OTHER, - SCHED_BATCH, - SCHED_IDLE, - }; - - scmp_filter_ctx seccomp; - unsigned i; - int r, p, max_policy = 0; - + assert(u); assert(c); - if (skip_seccomp_unavailable(u, "RestrictRealtime=")) + if (!c->restrict_realtime) return 0; - r = seccomp_init_conservative(&seccomp, 
SCMP_ACT_ALLOW); - if (r < 0) - return r; - - /* Determine the highest policy constant we want to allow */ - for (i = 0; i < ELEMENTSOF(permitted_policies); i++) - if (permitted_policies[i] > max_policy) - max_policy = permitted_policies[i]; - - /* Go through all policies with lower values than that, and block them -- unless they appear in the - * whitelist. */ - for (p = 0; p < max_policy; p++) { - bool good = false; - - /* Check if this is in the whitelist. */ - for (i = 0; i < ELEMENTSOF(permitted_policies); i++) - if (permitted_policies[i] == p) { - good = true; - break; - } - - if (good) - continue; - - /* Deny this policy */ - r = seccomp_rule_add( - seccomp, - SCMP_ACT_ERRNO(EPERM), - SCMP_SYS(sched_setscheduler), - 1, - SCMP_A1(SCMP_CMP_EQ, p)); - if (r < 0) - goto finish; - } - - /* Blacklist all other policies, i.e. the ones with higher values. Note that all comparisons are unsigned here, - * hence no need no check for < 0 values. */ - r = seccomp_rule_add( - seccomp, - SCMP_ACT_ERRNO(EPERM), - SCMP_SYS(sched_setscheduler), - 1, - SCMP_A1(SCMP_CMP_GT, max_policy)); - if (r < 0) - goto finish; - - r = seccomp_load(seccomp); + if (skip_seccomp_unavailable(u, "RestrictRealtime=")) + return 0; -finish: - seccomp_release(seccomp); - return r; + return seccomp_restrict_realtime(); } static int apply_protect_sysctl(const Unit *u, const ExecContext *c) { - scmp_filter_ctx seccomp; - int r; - + assert(u); assert(c); /* Turn off the legacy sysctl() system call. Many distributions turn this off while building the kernel, but * let's protect even those systems where this is left on in the kernel. 
*/ - if (skip_seccomp_unavailable(u, "ProtectKernelTunables=")) + if (!c->protect_kernel_tunables) return 0; - r = seccomp_init_conservative(&seccomp, SCMP_ACT_ALLOW); - if (r < 0) - return r; - - r = seccomp_rule_add( - seccomp, - SCMP_ACT_ERRNO(EPERM), - SCMP_SYS(_sysctl), - 0); - if (r < 0) - goto finish; - - r = seccomp_load(seccomp); + if (skip_seccomp_unavailable(u, "ProtectKernelTunables=")) + return 0; -finish: - seccomp_release(seccomp); - return r; + return seccomp_protect_sysctl(); } static int apply_protect_kernel_modules(const Unit *u, const ExecContext *c) { + assert(u); assert(c); /* Turn off module syscalls on ProtectKernelModules=yes */ + if (!c->protect_kernel_modules) + return 0; + if (skip_seccomp_unavailable(u, "ProtectKernelModules=")) return 0; - return seccomp_load_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_MODULE, SCMP_ACT_ERRNO(EPERM)); + return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_MODULE, SCMP_ACT_ERRNO(EPERM)); } static int apply_private_devices(const Unit *u, const ExecContext *c) { + assert(u); assert(c); /* If PrivateDevices= is set, also turn off iopl and all @raw-io syscalls. 
*/ + if (!c->private_devices) + return 0; + if (skip_seccomp_unavailable(u, "PrivateDevices=")) return 0; - return seccomp_load_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_RAW_IO, SCMP_ACT_ERRNO(EPERM)); + return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_RAW_IO, SCMP_ACT_ERRNO(EPERM)); } static int apply_restrict_namespaces(Unit *u, const ExecContext *c) { + assert(u); assert(c); if (!exec_context_restrict_namespaces_set(c)) @@ -2310,41 +2132,6 @@ static int close_remaining_fds( return close_all_fds(dont_close, n_dont_close); } -static bool context_has_address_families(const ExecContext *c) { - assert(c); - - return c->address_families_whitelist || - !set_isempty(c->address_families); -} - -static bool context_has_syscall_filters(const ExecContext *c) { - assert(c); - - return c->syscall_whitelist || - !set_isempty(c->syscall_filter) || - !set_isempty(c->syscall_archs); -} - -static bool context_has_no_new_privileges(const ExecContext *c) { - assert(c); - - if (c->no_new_privileges) - return true; - - if (have_effective_cap(CAP_SYS_ADMIN)) /* if we are privileged, we don't need NNP */ - return false; - - /* We need NNP if we have any form of seccomp and are unprivileged */ - return context_has_address_families(c) || - c->memory_deny_write_execute || - c->restrict_realtime || - exec_context_restrict_namespaces_set(c) || - c->protect_kernel_tunables || - c->protect_kernel_modules || - c->private_devices || - context_has_syscall_filters(c); -} - static int send_user_lookup( Unit *unit, int user_lookup_fd, @@ -2942,31 +2729,25 @@ static int exec_child( } #ifdef HAVE_SECCOMP - if (context_has_address_families(context)) { - r = apply_address_families(unit, context); - if (r < 0) { - *exit_status = EXIT_ADDRESS_FAMILIES; - *error_message = strdup("Failed to restrict address families"); - return r; - } + r = apply_address_families(unit, context); + if (r < 0) { + *exit_status = EXIT_ADDRESS_FAMILIES; + 
*error_message = strdup("Failed to restrict address families"); + return r; } - if (context->memory_deny_write_execute) { - r = apply_memory_deny_write_execute(unit, context); - if (r < 0) { - *exit_status = EXIT_SECCOMP; - *error_message = strdup("Failed to disable writing to executable memory"); - return r; - } + r = apply_memory_deny_write_execute(unit, context); + if (r < 0) { + *exit_status = EXIT_SECCOMP; + *error_message = strdup("Failed to disable writing to executable memory"); + return r; } - if (context->restrict_realtime) { - r = apply_restrict_realtime(unit, context); - if (r < 0) { - *exit_status = EXIT_SECCOMP; - *error_message = strdup("Failed to apply realtime restrictions"); - return r; - } + r = apply_restrict_realtime(unit, context); + if (r < 0) { + *exit_status = EXIT_SECCOMP; + *error_message = strdup("Failed to apply realtime restrictions"); + return r; } r = apply_restrict_namespaces(unit, context); @@ -2976,42 +2757,41 @@ static int exec_child( return r; } - if (context->protect_kernel_tunables) { - r = apply_protect_sysctl(unit, context); - if (r < 0) { - *exit_status = EXIT_SECCOMP; - *error_message = strdup("Failed to apply sysctl restrictions"); - return r; - } + r = apply_protect_sysctl(unit, context); + if (r < 0) { + *exit_status = EXIT_SECCOMP; + *error_message = strdup("Failed to apply sysctl restrictions"); + return r; } - if (context->protect_kernel_modules) { - r = apply_protect_kernel_modules(unit, context); - if (r < 0) { - *exit_status = EXIT_SECCOMP; - *error_message = strdup("Failed to apply module loading restrictions"); - return r; - } + r = apply_protect_kernel_modules(unit, context); + if (r < 0) { + *exit_status = EXIT_SECCOMP; + *error_message = strdup("Failed to apply module loading restrictions"); + return r; } - if (context->private_devices) { - r = apply_private_devices(unit, context); - if (r < 0) { - *exit_status = EXIT_SECCOMP; - *error_message = strdup("Failed to set up private devices"); - return r; - } + r 
= apply_private_devices(unit, context); + if (r < 0) { + *exit_status = EXIT_SECCOMP; + *error_message = strdup("Failed to set up private devices"); + return r; + } + + r = apply_syscall_archs(unit, context); + if (r < 0) { + *exit_status = EXIT_SECCOMP; + *error_message = strdup("Failed to apply syscall architecture restrictions"); + return r; } /* This really should remain the last step before the execve(), to make sure our own code is unaffected * by the filter as little as possible. */ - if (context_has_syscall_filters(context)) { - r = apply_seccomp(unit, context); - if (r < 0) { - *exit_status = EXIT_SECCOMP; - *error_message = strdup("Failed to apply syscall filters"); - return r; - } + r = apply_syscall_filter(unit, context); + if (r < 0) { + *exit_status = EXIT_SECCOMP; + *error_message = strdup("Failed to apply syscall filters"); + return r; } #endif } diff --git a/src/core/main.c b/src/core/main.c index 56a81ab94a..ad2ce1330e 100644 --- a/src/core/main.c +++ b/src/core/main.c @@ -1231,44 +1231,16 @@ oom: static int enforce_syscall_archs(Set *archs) { #ifdef HAVE_SECCOMP - scmp_filter_ctx *seccomp; - Iterator i; - void *id; int r; if (!is_seccomp_available()) return 0; - seccomp = seccomp_init(SCMP_ACT_ALLOW); - if (!seccomp) - return log_oom(); - - SET_FOREACH(id, arg_syscall_archs, i) { - r = seccomp_arch_add(seccomp, PTR_TO_UINT32(id) - 1); - if (r == -EEXIST) - continue; - if (r < 0) { - log_error_errno(r, "Failed to add architecture to seccomp: %m"); - goto finish; - } - } - - r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0); - if (r < 0) { - log_error_errno(r, "Failed to unset NO_NEW_PRIVS: %m"); - goto finish; - } - - r = seccomp_load(seccomp); + r = seccomp_restrict_archs(arg_syscall_archs); if (r < 0) - log_error_errno(r, "Failed to add install architecture seccomp: %m"); - -finish: - seccomp_release(seccomp); - return r; -#else - return 0; + return log_error_errno(r, "Failed to enforce system call architecture restrication: %m"); #endif + 
return 0; } static int status_welcome(void) { diff --git a/src/nspawn/nspawn-seccomp.c b/src/nspawn/nspawn-seccomp.c index 03a397d30c..72ecc51b16 100644 --- a/src/nspawn/nspawn-seccomp.c +++ b/src/nspawn/nspawn-seccomp.c @@ -26,20 +26,21 @@ #include <seccomp.h> #endif +#include "alloc-util.h" #include "log.h" - +#include "nspawn-seccomp.h" #ifdef HAVE_SECCOMP #include "seccomp-util.h" #endif - -#include "nspawn-seccomp.h" +#include "string-util.h" #ifdef HAVE_SECCOMP -static int seccomp_add_default_syscall_filter(scmp_filter_ctx ctx, - uint64_t cap_list_retain) { - unsigned i; - int r; +static int seccomp_add_default_syscall_filter( + scmp_filter_ctx ctx, + uint32_t arch, + uint64_t cap_list_retain) { + static const struct { uint64_t capability; int syscall_num; @@ -111,23 +112,29 @@ static int seccomp_add_default_syscall_filter(scmp_filter_ctx ctx, { CAP_SYS_TIME, SCMP_SYS(settimeofday) }, { CAP_SYS_TIME, SCMP_SYS(stime) }, }; + unsigned i; + int r, c = 0; for (i = 0; i < ELEMENTSOF(blacklist); i++) { if (blacklist[i].capability != 0 && (cap_list_retain & (1ULL << blacklist[i].capability))) continue; - r = seccomp_rule_add(ctx, SCMP_ACT_ERRNO(EPERM), blacklist[i].syscall_num, 0); - if (r == -EFAULT) - continue; /* unknown syscall */ - if (r < 0) - return log_error_errno(r, "Failed to block syscall: %m"); + r = seccomp_rule_add_exact(ctx, SCMP_ACT_ERRNO(EPERM), blacklist[i].syscall_num, 0); + if (r < 0) { + /* If the system call is not known on this architecture, then that's fine, let's ignore it */ + _cleanup_free_ char *n = NULL; + + n = seccomp_syscall_resolve_num_arch(arch, blacklist[i].syscall_num); + log_debug_errno(r, "Failed to add rule for system call %s, ignoring: %m", strna(n)); + } else + c++; } - return 0; + return c; } int setup_seccomp(uint64_t cap_list_retain) { - scmp_filter_ctx seccomp; + uint32_t arch; int r; if (!is_seccomp_available()) { @@ -135,45 +142,51 @@ int setup_seccomp(uint64_t cap_list_retain) { return 0; } - r = 
seccomp_init_conservative(&seccomp, SCMP_ACT_ALLOW); - if (r < 0) - return log_error_errno(r, "Failed to allocate seccomp object: %m"); - - r = seccomp_add_default_syscall_filter(seccomp, cap_list_retain); - if (r < 0) - goto finish; - - /* - Audit is broken in containers, much of the userspace audit - hookup will fail if running inside a container. We don't - care and just turn off creation of audit sockets. - - This will make socket(AF_NETLINK, *, NETLINK_AUDIT) fail - with EAFNOSUPPORT which audit userspace uses as indication - that audit is disabled in the kernel. - */ - - r = seccomp_rule_add( - seccomp, - SCMP_ACT_ERRNO(EAFNOSUPPORT), - SCMP_SYS(socket), - 2, - SCMP_A0(SCMP_CMP_EQ, AF_NETLINK), - SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT)); - if (r < 0) { - log_error_errno(r, "Failed to add audit seccomp rule: %m"); - goto finish; - } + SECCOMP_FOREACH_LOCAL_ARCH(arch) { + _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL; + int n; + + log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch)); + + r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW); + if (r < 0) + return log_error_errno(r, "Failed to allocate seccomp object: %m"); + + n = seccomp_add_default_syscall_filter(seccomp, arch, cap_list_retain); + if (n < 0) + return n; + + /* + Audit is broken in containers, much of the userspace audit hookup will fail if running inside a + container. We don't care and just turn off creation of audit sockets. + + This will make socket(AF_NETLINK, *, NETLINK_AUDIT) fail with EAFNOSUPPORT which audit userspace uses + as indication that audit is disabled in the kernel. + */ + + r = seccomp_rule_add_exact( + seccomp, + SCMP_ACT_ERRNO(EAFNOSUPPORT), + SCMP_SYS(socket), + 2, + SCMP_A0(SCMP_CMP_EQ, AF_NETLINK), + SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT)); + if (r < 0) + log_debug_errno(r, "Failed to add audit seccomp rule, ignoring: %m"); + else + n++; + + if (n <= 0) /* no rule added? 
then skip this architecture */ + continue; - r = seccomp_load(seccomp); - if (r < 0) { - log_error_errno(r, "Failed to install seccomp audit filter: %m"); - goto finish; + r = seccomp_load(seccomp); + if (IN_SET(r, -EPERM, -EACCES)) + return log_error_errno(r, "Failed to install seccomp audit filter: %m"); + if (r < 0) + log_debug_errno(r, "Failed to install filter set for architecture %s, skipping: %m", seccomp_arch_to_string(arch)); } -finish: - seccomp_release(seccomp); - return r; + return 0; } #else diff --git a/src/shared/seccomp-util.c b/src/shared/seccomp-util.c index 66b72b2b27..2c73cb8fa4 100644 --- a/src/shared/seccomp-util.c +++ b/src/shared/seccomp-util.c @@ -18,17 +18,52 @@ ***/ #include <errno.h> +#include <linux/seccomp.h> #include <seccomp.h> #include <stddef.h> +#include <sys/mman.h> #include <sys/prctl.h> -#include <linux/seccomp.h> +#include <sys/shm.h> +#include "af-list.h" #include "alloc-util.h" #include "macro.h" #include "nsflags.h" #include "seccomp-util.h" #include "string-util.h" #include "util.h" +#include "errno-list.h" + +const uint32_t seccomp_local_archs[] = { + +#if defined(__i386__) || defined(__x86_64__) + SCMP_ARCH_X86, + SCMP_ARCH_X86_64, + SCMP_ARCH_X32, + +#elif defined(__arm__) || defined(__aarch64__) + SCMP_ARCH_ARM, + SCMP_ARCH_AARCH64, + +#elif defined(__mips__) || defined(__mips64__) + SCMP_ARCH_MIPS, + SCMP_ARCH_MIPS64, + SCMP_ARCH_MIPS64N32, + SCMP_ARCH_MIPSEL, + SCMP_ARCH_MIPSEL64, + SCMP_ARCH_MIPSEL64N32, + +#elif defined(__powerpc__) || defined(__powerpc64__) + SCMP_ARCH_PPC, + SCMP_ARCH_PPC64, + SCMP_ARCH_PPC64LE, + +#elif defined(__s390__) || defined(__s390x__) + SCMP_ARCH_S390, + SCMP_ARCH_S390X, +#endif + (uint32_t) -1 + }; const char* seccomp_arch_to_string(uint32_t c) { /* Maintain order used in <seccomp.h>. 
@@ -122,18 +157,37 @@ int seccomp_arch_from_string(const char *n, uint32_t *ret) { return 0; } -int seccomp_init_conservative(scmp_filter_ctx *ret, uint32_t default_action) { +int seccomp_init_for_arch(scmp_filter_ctx *ret, uint32_t arch, uint32_t default_action) { scmp_filter_ctx seccomp; int r; - /* Much like seccomp_init(), but tries to be a bit more conservative in its defaults: all secondary archs are - * added by default, and NNP is turned off. */ + /* Much like seccomp_init(), but initializes the filter for one specific architecture only, without affecting + * any others. Also, turns off the NNP fiddling. */ seccomp = seccomp_init(default_action); if (!seccomp) return -ENOMEM; - r = seccomp_add_secondary_archs(seccomp); + if (arch != SCMP_ARCH_NATIVE && + arch != seccomp_arch_native()) { + + r = seccomp_arch_add(seccomp, arch); + if (r < 0) + goto finish; + + r = seccomp_arch_remove(seccomp, seccomp_arch_native()); + if (r < 0) + goto finish; + + assert(seccomp_arch_exist(seccomp, arch) >= 0); + assert(seccomp_arch_exist(seccomp, SCMP_ARCH_NATIVE) == -EEXIST); + assert(seccomp_arch_exist(seccomp, seccomp_arch_native()) == -EEXIST); + } else { + assert(seccomp_arch_exist(seccomp, SCMP_ARCH_NATIVE) >= 0); + assert(seccomp_arch_exist(seccomp, seccomp_arch_native()) >= 0); + } + + r = seccomp_attr_set(seccomp, SCMP_FLTATR_ACT_BADARCH, SCMP_ACT_ALLOW); if (r < 0) goto finish; @@ -149,72 +203,23 @@ finish: return r; } -int seccomp_add_secondary_archs(scmp_filter_ctx ctx) { - - /* Add in all possible secondary archs we are aware of that - * this kernel might support. 
*/ - - static const int seccomp_arches[] = { -#if defined(__i386__) || defined(__x86_64__) - SCMP_ARCH_X86, - SCMP_ARCH_X86_64, - SCMP_ARCH_X32, - -#elif defined(__arm__) || defined(__aarch64__) - SCMP_ARCH_ARM, - SCMP_ARCH_AARCH64, - -#elif defined(__arm__) || defined(__aarch64__) - SCMP_ARCH_ARM, - SCMP_ARCH_AARCH64, - -#elif defined(__mips__) || defined(__mips64__) - SCMP_ARCH_MIPS, - SCMP_ARCH_MIPS64, - SCMP_ARCH_MIPS64N32, - SCMP_ARCH_MIPSEL, - SCMP_ARCH_MIPSEL64, - SCMP_ARCH_MIPSEL64N32, - -#elif defined(__powerpc__) || defined(__powerpc64__) - SCMP_ARCH_PPC, - SCMP_ARCH_PPC64, - SCMP_ARCH_PPC64LE, - -#elif defined(__s390__) || defined(__s390x__) - SCMP_ARCH_S390, - SCMP_ARCH_S390X, -#endif - }; - - unsigned i; - int r; - - for (i = 0; i < ELEMENTSOF(seccomp_arches); i++) { - r = seccomp_arch_add(ctx, seccomp_arches[i]); - if (r < 0 && r != -EEXIST) - return r; - } - - return 0; -} - static bool is_basic_seccomp_available(void) { - int r; - r = prctl(PR_GET_SECCOMP, 0, 0, 0, 0); - return r >= 0; + return prctl(PR_GET_SECCOMP, 0, 0, 0, 0) >= 0; } static bool is_seccomp_filter_available(void) { - int r; - r = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, NULL, 0, 0); - return r < 0 && errno == EFAULT; + return prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, NULL, 0, 0) < 0 && + errno == EFAULT; } bool is_seccomp_available(void) { static int cached_enabled = -1; + if (cached_enabled < 0) - cached_enabled = is_basic_seccomp_available() && is_seccomp_filter_available(); + cached_enabled = + is_basic_seccomp_available() && + is_seccomp_filter_available(); + return cached_enabled; } @@ -469,6 +474,7 @@ const SyscallFilterSet syscall_filter_sets[_SYSCALL_FILTER_SET_MAX] = { .value = "_sysctl\0" "afs_syscall\0" + "bdflush\0" "break\0" "create_module\0" "ftime\0" @@ -500,7 +506,6 @@ const SyscallFilterSet syscall_filter_sets[_SYSCALL_FILTER_SET_MAX] = { "@module\0" "@raw-io\0" "acct\0" - "bdflush\0" "bpf\0" "capset\0" "chown32\0" @@ -566,9 +571,17 @@ const SyscallFilterSet 
syscall_filter_sets[_SYSCALL_FILTER_SET_MAX] = { "s390_pci_mmio_write\0" #endif }, + [SYSCALL_FILTER_SET_REBOOT] = { + .name = "@reboot", + .help = "Reboot and reboot preparation/kexec", + .value = + "kexec\0" + "kexec_file_load\0" + "reboot\0" + }, [SYSCALL_FILTER_SET_RESOURCES] = { - /* Alter resource settings */ .name = "@resources", + .help = "Alter resource settings", .value = "sched_setparam\0" "sched_setscheduler\0" @@ -582,6 +595,13 @@ const SyscallFilterSet syscall_filter_sets[_SYSCALL_FILTER_SET_MAX] = { "sched_setattr\0" "prlimit64\0" }, + [SYSCALL_FILTER_SET_SWAP] = { + .name = "@swap", + .help = "Enable/disable swap devices", + .value = + "swapoff\0" + "swapon\0" + }, }; const SyscallFilterSet *syscall_filter_set_find(const char *name) { @@ -597,7 +617,12 @@ const SyscallFilterSet *syscall_filter_set_find(const char *name) { return NULL; } -int seccomp_add_syscall_filter_set(scmp_filter_ctx seccomp, const SyscallFilterSet *set, uint32_t action) { +static int seccomp_add_syscall_filter_set( + scmp_filter_ctx seccomp, + uint32_t default_action, + const SyscallFilterSet *set, + uint32_t action) { + const char *sys; int r; @@ -614,47 +639,102 @@ int seccomp_add_syscall_filter_set(scmp_filter_ctx seccomp, const SyscallFilterS if (!other) return -EINVAL; - r = seccomp_add_syscall_filter_set(seccomp, other, action); + r = seccomp_add_syscall_filter_set(seccomp, default_action, other, action); + if (r < 0) + return r; } else { id = seccomp_syscall_resolve_name(sys); if (id == __NR_SCMP_ERROR) - return -EINVAL; + return -EINVAL; /* Not known at all? 
Then that's a real error */ - r = seccomp_rule_add(seccomp, action, id, 0); + r = seccomp_rule_add_exact(seccomp, action, id, 0); + if (r < 0) + /* If the system call is not known on this architecture, then that's fine, let's ignore it */ + log_debug_errno(r, "Failed to add rule for system call %s, ignoring: %m", sys); } + } + + return 0; +} + +int seccomp_load_syscall_filter_set(uint32_t default_action, const SyscallFilterSet *set, uint32_t action) { + uint32_t arch; + int r; + + assert(set); + + /* The one-stop solution: allocate a seccomp object, add the specified filter to it, and apply it. Once for + * each local arch. */ + + SECCOMP_FOREACH_LOCAL_ARCH(arch) { + _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL; + + log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch)); + + r = seccomp_init_for_arch(&seccomp, arch, default_action); if (r < 0) return r; + + r = seccomp_add_syscall_filter_set(seccomp, default_action, set, action); + if (r < 0) { + log_debug_errno(r, "Failed to add filter set, ignoring: %m"); + continue; + } + + r = seccomp_load(seccomp); + if (IN_SET(r, -EPERM, -EACCES)) + return r; + if (r < 0) + log_debug_errno(r, "Failed to install filter set for architecture %s, skipping: %m", seccomp_arch_to_string(arch)); } return 0; } -int seccomp_load_filter_set(uint32_t default_action, const SyscallFilterSet *set, uint32_t action) { - scmp_filter_ctx seccomp; +int seccomp_load_syscall_filter_set_raw(uint32_t default_action, Set* set, uint32_t action) { + uint32_t arch; int r; - assert(set); + /* Similar to seccomp_load_syscall_filter_set(), but takes a raw Set* of syscalls, instead of a + * SyscallFilterSet* table. 
*/ - /* The one-stop solution: allocate a seccomp object, add a filter to it, and apply it */ + if (set_isempty(set) && default_action == SCMP_ACT_ALLOW) + return 0; - r = seccomp_init_conservative(&seccomp, default_action); - if (r < 0) - return r; + SECCOMP_FOREACH_LOCAL_ARCH(arch) { + _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL; + Iterator i; + void *id; - r = seccomp_add_syscall_filter_set(seccomp, set, action); - if (r < 0) - goto finish; + log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch)); - r = seccomp_load(seccomp); + r = seccomp_init_for_arch(&seccomp, arch, default_action); + if (r < 0) + return r; -finish: - seccomp_release(seccomp); - return r; + SET_FOREACH(id, set, i) { + r = seccomp_rule_add_exact(seccomp, action, PTR_TO_INT(id) - 1, 0); + if (r < 0) { + /* If the system call is not known on this architecture, then that's fine, let's ignore it */ + _cleanup_free_ char *n = NULL; + + n = seccomp_syscall_resolve_num_arch(arch, PTR_TO_INT(id) - 1); + log_debug_errno(r, "Failed to add rule for system call %s, ignoring: %m", strna(n)); + } + } + + r = seccomp_load(seccomp); + if (IN_SET(r, -EPERM, -EACCES)) + return r; + if (r < 0) + log_debug_errno(r, "Failed to install filter set for architecture %s, skipping: %m", seccomp_arch_to_string(arch)); + } + + return 0; } int seccomp_restrict_namespaces(unsigned long retain) { - scmp_filter_ctx seccomp; - unsigned i; + uint32_t arch; int r; if (log_get_max_level() >= LOG_DEBUG) { @@ -668,74 +748,420 @@ int seccomp_restrict_namespaces(unsigned long retain) { if ((retain & NAMESPACE_FLAGS_ALL) == NAMESPACE_FLAGS_ALL) return 0; - r = seccomp_init_conservative(&seccomp, SCMP_ACT_ALLOW); - if (r < 0) - return r; + SECCOMP_FOREACH_LOCAL_ARCH(arch) { + _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL; + unsigned i; + + log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch)); - if ((retain & NAMESPACE_FLAGS_ALL) == 0) - /* If every single kind of 
namespace shall be prohibited, then let's block the whole setns() syscall - * altogether. */ - r = seccomp_rule_add( + r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW); + if (r < 0) + return r; + + if ((retain & NAMESPACE_FLAGS_ALL) == 0) + /* If every single kind of namespace shall be prohibited, then let's block the whole setns() syscall + * altogether. */ + r = seccomp_rule_add_exact( + seccomp, + SCMP_ACT_ERRNO(EPERM), + SCMP_SYS(setns), + 0); + else + /* Otherwise, block only the invocations with the appropriate flags in the loop below, but also the + * special invocation with a zero flags argument, right here. */ + r = seccomp_rule_add_exact( + seccomp, + SCMP_ACT_ERRNO(EPERM), + SCMP_SYS(setns), + 1, + SCMP_A1(SCMP_CMP_EQ, 0)); + if (r < 0) { + log_debug_errno(r, "Failed to add setns() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch)); + continue; + } + + for (i = 0; namespace_flag_map[i].name; i++) { + unsigned long f; + + f = namespace_flag_map[i].flag; + if ((retain & f) == f) { + log_debug("Permitting %s.", namespace_flag_map[i].name); + continue; + } + + log_debug("Blocking %s.", namespace_flag_map[i].name); + + r = seccomp_rule_add_exact( + seccomp, + SCMP_ACT_ERRNO(EPERM), + SCMP_SYS(unshare), + 1, + SCMP_A0(SCMP_CMP_MASKED_EQ, f, f)); + if (r < 0) { + log_debug_errno(r, "Failed to add unshare() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch)); + break; + } + + r = seccomp_rule_add_exact( + seccomp, + SCMP_ACT_ERRNO(EPERM), + SCMP_SYS(clone), + 1, + SCMP_A0(SCMP_CMP_MASKED_EQ, f, f)); + if (r < 0) { + log_debug_errno(r, "Failed to add clone() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch)); + break; + } + + if ((retain & NAMESPACE_FLAGS_ALL) != 0) { + r = seccomp_rule_add_exact( + seccomp, + SCMP_ACT_ERRNO(EPERM), + SCMP_SYS(setns), + 1, + SCMP_A1(SCMP_CMP_MASKED_EQ, f, f)); + if (r < 0) { + log_debug_errno(r, "Failed to add setns() rule for architecture %s, skipping: %m", 
seccomp_arch_to_string(arch)); + break; + } + } + } + if (r < 0) + continue; + + r = seccomp_load(seccomp); + if (IN_SET(r, -EPERM, -EACCES)) + return r; + if (r < 0) + log_debug_errno(r, "Failed to install namespace restriction rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch)); + } + + return 0; +} + +int seccomp_protect_sysctl(void) { + uint32_t arch; + int r; + + SECCOMP_FOREACH_LOCAL_ARCH(arch) { + _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL; + + log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch)); + + r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW); + if (r < 0) + return r; + + r = seccomp_rule_add_exact( seccomp, SCMP_ACT_ERRNO(EPERM), - SCMP_SYS(setns), + SCMP_SYS(_sysctl), 0); - else - /* Otherwise, block only the invocations with the appropriate flags in the loop below, but also the - * special invocation with a zero flags argument, right here. */ - r = seccomp_rule_add( + if (r < 0) { + log_debug_errno(r, "Failed to add _sysctl() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch)); + continue; + } + + r = seccomp_load(seccomp); + if (IN_SET(r, -EPERM, -EACCES)) + return r; + if (r < 0) + log_debug_errno(r, "Failed to install sysctl protection rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch)); + } + + return 0; +} + +int seccomp_restrict_address_families(Set *address_families, bool whitelist) { + uint32_t arch; + int r; + + SECCOMP_FOREACH_LOCAL_ARCH(arch) { + _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL; + Iterator i; + + log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch)); + + r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW); + if (r < 0) + return r; + + if (whitelist) { + int af, first = 0, last = 0; + void *afp; + + /* If this is a whitelist, we first block the address families that are out of range and then + * everything that is not in the set. 
First, we find the lowest and highest address family in + * the set. */ + + SET_FOREACH(afp, address_families, i) { + af = PTR_TO_INT(afp); + + if (af <= 0 || af >= af_max()) + continue; + + if (first == 0 || af < first) + first = af; + + if (last == 0 || af > last) + last = af; + } + + assert((first == 0) == (last == 0)); + + if (first == 0) { + + /* No entries in the valid range, block everything */ + r = seccomp_rule_add_exact( + seccomp, + SCMP_ACT_ERRNO(EAFNOSUPPORT), + SCMP_SYS(socket), + 0); + if (r < 0) { + log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch)); + continue; + } + + } else { + + /* Block everything below the first entry */ + r = seccomp_rule_add_exact( + seccomp, + SCMP_ACT_ERRNO(EAFNOSUPPORT), + SCMP_SYS(socket), + 1, + SCMP_A0(SCMP_CMP_LT, first)); + if (r < 0) { + log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch)); + continue; + } + + /* Block everything above the last entry */ + r = seccomp_rule_add_exact( + seccomp, + SCMP_ACT_ERRNO(EAFNOSUPPORT), + SCMP_SYS(socket), + 1, + SCMP_A0(SCMP_CMP_GT, last)); + if (r < 0) { + log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch)); + continue; + } + + /* Block everything between the first and last entry */ + for (af = 1; af < af_max(); af++) { + + if (set_contains(address_families, INT_TO_PTR(af))) + continue; + + r = seccomp_rule_add_exact( + seccomp, + SCMP_ACT_ERRNO(EAFNOSUPPORT), + SCMP_SYS(socket), + 1, + SCMP_A0(SCMP_CMP_EQ, af)); + if (r < 0) + break; + } + + if (r < 0) { + log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch)); + continue; + } + } + + } else { + void *af; + + /* If this is a blacklist, then generate one rule for + * each address family that are then combined in OR + * checks. 
*/ + + SET_FOREACH(af, address_families, i) { + + r = seccomp_rule_add_exact( + seccomp, + SCMP_ACT_ERRNO(EAFNOSUPPORT), + SCMP_SYS(socket), + 1, + SCMP_A0(SCMP_CMP_EQ, PTR_TO_INT(af))); + if (r < 0) + break; + } + + if (r < 0) { + log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch)); + continue; + } + } + + r = seccomp_load(seccomp); + if (IN_SET(r, -EPERM, -EACCES)) + return r; + if (r < 0) + log_debug_errno(r, "Failed to install socket family rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch)); + } + + return 0; +} + +int seccomp_restrict_realtime(void) { + static const int permitted_policies[] = { + SCHED_OTHER, + SCHED_BATCH, + SCHED_IDLE, + }; + + int r, max_policy = 0; + uint32_t arch; + unsigned i; + + /* Determine the highest policy constant we want to allow */ + for (i = 0; i < ELEMENTSOF(permitted_policies); i++) + if (permitted_policies[i] > max_policy) + max_policy = permitted_policies[i]; + + SECCOMP_FOREACH_LOCAL_ARCH(arch) { + _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL; + int p; + + log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch)); + + r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW); + if (r < 0) + return r; + + /* Go through all policies with lower values than that, and block them -- unless they appear in the + * whitelist. */ + for (p = 0; p < max_policy; p++) { + bool good = false; + + /* Check if this is in the whitelist. */ + for (i = 0; i < ELEMENTSOF(permitted_policies); i++) + if (permitted_policies[i] == p) { + good = true; + break; + } + + if (good) + continue; + + /* Deny this policy */ + r = seccomp_rule_add_exact( + seccomp, + SCMP_ACT_ERRNO(EPERM), + SCMP_SYS(sched_setscheduler), + 1, + SCMP_A1(SCMP_CMP_EQ, p)); + if (r < 0) { + log_debug_errno(r, "Failed to add scheduler rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch)); + continue; + } + } + + /* Blacklist all other policies, i.e. 
the ones with higher values. Note that all comparisons are + * unsigned here, hence no need to check for < 0 values. */ + r = seccomp_rule_add_exact( seccomp, SCMP_ACT_ERRNO(EPERM), - SCMP_SYS(setns), + SCMP_SYS(sched_setscheduler), 1, - SCMP_A1(SCMP_CMP_EQ, 0)); - if (r < 0) - goto finish; + SCMP_A1(SCMP_CMP_GT, max_policy)); + if (r < 0) { + log_debug_errno(r, "Failed to add scheduler rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch)); + continue; + } + + r = seccomp_load(seccomp); + if (IN_SET(r, -EPERM, -EACCES)) + return r; + if (r < 0) + log_debug_errno(r, "Failed to install realtime protection rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch)); + } + + return 0; +} - for (i = 0; namespace_flag_map[i].name; i++) { - unsigned long f; +int seccomp_memory_deny_write_execute(void) { + uint32_t arch; + int r; + + SECCOMP_FOREACH_LOCAL_ARCH(arch) { + _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL; + + log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch)); - f = namespace_flag_map[i].flag; - if ((retain & f) == f) { - log_debug("Permitting %s.", namespace_flag_map[i].name); + r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW); + if (r < 0) + return r; + + r = seccomp_rule_add_exact( + seccomp, + SCMP_ACT_ERRNO(EPERM), + SCMP_SYS(mmap), + 1, + SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC|PROT_WRITE, PROT_EXEC|PROT_WRITE)); + if (r < 0) { + log_debug_errno(r, "Failed to add mmap() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch)); continue; } - log_debug("Blocking %s.", namespace_flag_map[i].name); - - r = seccomp_rule_add( + r = seccomp_rule_add_exact( seccomp, SCMP_ACT_ERRNO(EPERM), - SCMP_SYS(unshare), + SCMP_SYS(mprotect), 1, - SCMP_A0(SCMP_CMP_MASKED_EQ, f, f)); - if (r < 0) - goto finish; + SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC, PROT_EXEC)); + if (r < 0) { + log_debug_errno(r, "Failed to add mprotect() rule for architecture %s, skipping: %m", 
seccomp_arch_to_string(arch)); + continue; + } - r = seccomp_rule_add( + r = seccomp_rule_add_exact( seccomp, SCMP_ACT_ERRNO(EPERM), - SCMP_SYS(clone), + SCMP_SYS(shmat), 1, - SCMP_A0(SCMP_CMP_MASKED_EQ, f, f)); + SCMP_A2(SCMP_CMP_MASKED_EQ, SHM_EXEC, SHM_EXEC)); + if (r < 0) { + log_debug_errno(r, "Failed to add shmat() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch)); + continue; + } + + r = seccomp_load(seccomp); + if (IN_SET(r, -EPERM, -EACCES)) + return r; if (r < 0) - goto finish; + log_debug_errno(r, "Failed to install MemoryDenyWriteExecute= rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch)); + } - if ((retain & NAMESPACE_FLAGS_ALL) != 0) { - r = seccomp_rule_add( - seccomp, - SCMP_ACT_ERRNO(EPERM), - SCMP_SYS(setns), - 1, - SCMP_A1(SCMP_CMP_MASKED_EQ, f, f)); - if (r < 0) - goto finish; - } + return 0; +} + +int seccomp_restrict_archs(Set *archs) { + _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL; + Iterator i; + void *id; + int r; + + /* This installs a filter with no rules, but that restricts the system call architectures to the specified + * list. 
*/ + + seccomp = seccomp_init(SCMP_ACT_ALLOW); + if (!seccomp) + return -ENOMEM; + + SET_FOREACH(id, archs, i) { + r = seccomp_arch_add(seccomp, PTR_TO_UINT32(id) - 1); + if (r == -EEXIST) + continue; + if (r < 0) + return r; } - r = seccomp_load(seccomp); + r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0); + if (r < 0) + return r; -finish: - seccomp_release(seccomp); - return r; + return seccomp_load(seccomp); } diff --git a/src/shared/seccomp-util.h b/src/shared/seccomp-util.h index 01cf331b29..4438e87fa6 100644 --- a/src/shared/seccomp-util.h +++ b/src/shared/seccomp-util.h @@ -23,12 +23,12 @@ #include <stdbool.h> #include <stdint.h> +#include "set.h" + const char* seccomp_arch_to_string(uint32_t c); int seccomp_arch_from_string(const char *n, uint32_t *ret); -int seccomp_init_conservative(scmp_filter_ctx *ret, uint32_t default_action); - -int seccomp_add_secondary_archs(scmp_filter_ctx c); +int seccomp_init_for_arch(scmp_filter_ctx *ret, uint32_t arch, uint32_t default_action); bool is_seccomp_available(void); @@ -56,7 +56,9 @@ enum { SYSCALL_FILTER_SET_PRIVILEGED, SYSCALL_FILTER_SET_PROCESS, SYSCALL_FILTER_SET_RAW_IO, + SYSCALL_FILTER_SET_REBOOT, SYSCALL_FILTER_SET_RESOURCES, + SYSCALL_FILTER_SET_SWAP, _SYSCALL_FILTER_SET_MAX }; @@ -64,8 +66,21 @@ extern const SyscallFilterSet syscall_filter_sets[]; const SyscallFilterSet *syscall_filter_set_find(const char *name); -int seccomp_add_syscall_filter_set(scmp_filter_ctx seccomp, const SyscallFilterSet *set, uint32_t action); - -int seccomp_load_filter_set(uint32_t default_action, const SyscallFilterSet *set, uint32_t action); +int seccomp_load_syscall_filter_set(uint32_t default_action, const SyscallFilterSet *set, uint32_t action); +int seccomp_load_syscall_filter_set_raw(uint32_t default_action, Set* set, uint32_t action); +int seccomp_restrict_archs(Set *archs); int seccomp_restrict_namespaces(unsigned long retain); +int seccomp_protect_sysctl(void); +int seccomp_restrict_address_families(Set 
*address_families, bool whitelist); +int seccomp_restrict_realtime(void); +int seccomp_memory_deny_write_execute(void); + +extern const uint32_t seccomp_local_archs[]; + +#define SECCOMP_FOREACH_LOCAL_ARCH(arch) \ + for (unsigned _i = ({ (arch) = seccomp_local_archs[0]; 0; }); \ + seccomp_local_archs[_i] != (uint32_t) -1; \ + (arch) = seccomp_local_archs[++_i]) + +DEFINE_TRIVIAL_CLEANUP_FUNC(scmp_filter_ctx, seccomp_release); diff --git a/src/test/test-execute.c b/src/test/test-execute.c index 3254f0f231..bc9a2021f9 100644 --- a/src/test/test-execute.c +++ b/src/test/test-execute.c @@ -483,6 +483,7 @@ int main(int argc, char *argv[]) { }; int r; + log_set_max_level(LOG_DEBUG); log_parse_environment(); log_open(); diff --git a/src/test/test-seccomp.c b/src/test/test-seccomp.c index beb6a7f422..6f15879c45 100644 --- a/src/test/test-seccomp.c +++ b/src/test/test-seccomp.c @@ -17,10 +17,12 @@ along with systemd; If not, see <http://www.gnu.org/licenses/>. ***/ +#include <sched.h> #include <stdlib.h> #include <sys/eventfd.h> +#include <sys/mman.h> #include <unistd.h> -#include <sched.h> +#include <sys/poll.h> #include "alloc-util.h" #include "fd-util.h" @@ -30,8 +32,10 @@ #include "process-util.h" #include "raw-clone.h" #include "seccomp-util.h" +#include "set.h" #include "string-util.h" #include "util.h" +#include "virt.h" static void test_seccomp_arch_to_string(void) { uint32_t a, b; @@ -92,7 +96,6 @@ static void test_filter_sets(void) { if (!is_seccomp_available()) return; - if (geteuid() != 0) return; @@ -108,16 +111,16 @@ static void test_filter_sets(void) { int fd; if (i == SYSCALL_FILTER_SET_DEFAULT) /* if we look at the default set, whitelist instead of blacklist */ - r = seccomp_load_filter_set(SCMP_ACT_ERRNO(EPERM), syscall_filter_sets + i, SCMP_ACT_ALLOW); + r = seccomp_load_syscall_filter_set(SCMP_ACT_ERRNO(EUCLEAN), syscall_filter_sets + i, SCMP_ACT_ALLOW); else - r = seccomp_load_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + i, SCMP_ACT_ERRNO(EPERM)); + 
r = seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + i, SCMP_ACT_ERRNO(EUCLEAN)); if (r < 0) _exit(EXIT_FAILURE); /* Test the sycall filter with one random system call */ fd = eventfd(0, EFD_NONBLOCK|EFD_CLOEXEC); if (IN_SET(i, SYSCALL_FILTER_SET_IO_EVENT, SYSCALL_FILTER_SET_DEFAULT)) - assert_se(fd < 0 && errno == EPERM); + assert_se(fd < 0 && errno == EUCLEAN); else { assert_se(fd >= 0); safe_close(fd); @@ -132,8 +135,8 @@ static void test_filter_sets(void) { static void test_restrict_namespace(void) { _cleanup_free_ char *s = NULL; - pid_t pid; unsigned long ul; + pid_t pid; assert_se(namespace_flag_to_string(0) == NULL); assert_se(streq(namespace_flag_to_string(CLONE_NEWNS), "mnt")); @@ -157,7 +160,6 @@ static void test_restrict_namespace(void) { if (!is_seccomp_available()) return; - if (geteuid() != 0) return; @@ -216,6 +218,256 @@ static void test_restrict_namespace(void) { assert_se(wait_for_terminate_and_warn("nsseccomp", pid, true) == EXIT_SUCCESS); } +static void test_protect_sysctl(void) { + pid_t pid; + + if (!is_seccomp_available()) + return; + if (geteuid() != 0) + return; + + if (detect_container() > 0) /* in containers _sysctl() is likely missing anyway */ + return; + + pid = fork(); + assert_se(pid >= 0); + + if (pid == 0) { + assert_se(syscall(__NR__sysctl, NULL) < 0); + assert_se(errno == EFAULT); + + assert_se(seccomp_protect_sysctl() >= 0); + + assert_se(syscall(__NR__sysctl, 0, 0, 0) < 0); + assert_se(errno == EPERM); + + _exit(EXIT_SUCCESS); + } + + assert_se(wait_for_terminate_and_warn("sysctlseccomp", pid, true) == EXIT_SUCCESS); +} + +static void test_restrict_address_families(void) { + pid_t pid; + + if (!is_seccomp_available()) + return; + if (geteuid() != 0) + return; + + pid = fork(); + assert_se(pid >= 0); + + if (pid == 0) { + int fd; + Set *s; + + fd = socket(AF_INET, SOCK_DGRAM, 0); + assert_se(fd >= 0); + safe_close(fd); + + fd = socket(AF_UNIX, SOCK_DGRAM, 0); + assert_se(fd >= 0); + safe_close(fd); + + fd = 
socket(AF_NETLINK, SOCK_DGRAM, 0); + assert_se(fd >= 0); + safe_close(fd); + + assert_se(s = set_new(NULL)); + assert_se(set_put(s, INT_TO_PTR(AF_UNIX)) >= 0); + + assert_se(seccomp_restrict_address_families(s, false) >= 0); + + fd = socket(AF_INET, SOCK_DGRAM, 0); + assert_se(fd >= 0); + safe_close(fd); + + assert_se(socket(AF_UNIX, SOCK_DGRAM, 0) < 0); + assert_se(errno == EAFNOSUPPORT); + + fd = socket(AF_NETLINK, SOCK_DGRAM, 0); + assert_se(fd >= 0); + safe_close(fd); + + set_clear(s); + + assert_se(set_put(s, INT_TO_PTR(AF_INET)) >= 0); + + assert_se(seccomp_restrict_address_families(s, true) >= 0); + + fd = socket(AF_INET, SOCK_DGRAM, 0); + assert_se(fd >= 0); + safe_close(fd); + + assert_se(socket(AF_UNIX, SOCK_DGRAM, 0) < 0); + assert_se(errno == EAFNOSUPPORT); + + assert_se(socket(AF_NETLINK, SOCK_DGRAM, 0) < 0); + assert_se(errno == EAFNOSUPPORT); + + _exit(EXIT_SUCCESS); + } + + assert_se(wait_for_terminate_and_warn("socketseccomp", pid, true) == EXIT_SUCCESS); +} + +static void test_restrict_realtime(void) { + pid_t pid; + + if (!is_seccomp_available()) + return; + if (geteuid() != 0) + return; + + if (detect_container() > 0) /* in containers RT privs are likely missing anyway */ + return; + + pid = fork(); + assert_se(pid >= 0); + + if (pid == 0) { + assert_se(sched_setscheduler(0, SCHED_FIFO, &(struct sched_param) { .sched_priority = 1 }) >= 0); + assert_se(sched_setscheduler(0, SCHED_RR, &(struct sched_param) { .sched_priority = 1 }) >= 0); + assert_se(sched_setscheduler(0, SCHED_IDLE, &(struct sched_param) { .sched_priority = 0 }) >= 0); + assert_se(sched_setscheduler(0, SCHED_BATCH, &(struct sched_param) { .sched_priority = 0 }) >= 0); + assert_se(sched_setscheduler(0, SCHED_OTHER, &(struct sched_param) {}) >= 0); + + assert_se(seccomp_restrict_realtime() >= 0); + + assert_se(sched_setscheduler(0, SCHED_IDLE, &(struct sched_param) { .sched_priority = 0 }) >= 0); + assert_se(sched_setscheduler(0, SCHED_BATCH, &(struct sched_param) { .sched_priority 
= 0 }) >= 0); + assert_se(sched_setscheduler(0, SCHED_OTHER, &(struct sched_param) {}) >= 0); + + assert_se(sched_setscheduler(0, SCHED_FIFO, &(struct sched_param) { .sched_priority = 1 }) < 0); + assert_se(errno == EPERM); + assert_se(sched_setscheduler(0, SCHED_RR, &(struct sched_param) { .sched_priority = 1 }) < 0); + assert_se(errno == EPERM); + + _exit(EXIT_SUCCESS); + } + + assert_se(wait_for_terminate_and_warn("realtimeseccomp", pid, true) == EXIT_SUCCESS); +} + +static void test_memory_deny_write_execute(void) { + pid_t pid; + + if (!is_seccomp_available()) + return; + if (geteuid() != 0) + return; + + pid = fork(); + assert_se(pid >= 0); + + if (pid == 0) { + void *p; + + p = mmap(NULL, page_size(), PROT_WRITE|PROT_EXEC, MAP_PRIVATE|MAP_ANONYMOUS, -1,0); + assert_se(p != MAP_FAILED); + assert_se(munmap(p, page_size()) >= 0); + + seccomp_memory_deny_write_execute(); + + p = mmap(NULL, page_size(), PROT_WRITE|PROT_EXEC, MAP_PRIVATE|MAP_ANONYMOUS, -1,0); + assert_se(p == MAP_FAILED); + assert_se(errno == EPERM); + + p = mmap(NULL, page_size(), PROT_WRITE|PROT_READ, MAP_PRIVATE|MAP_ANONYMOUS, -1,0); + assert_se(p != MAP_FAILED); + assert_se(munmap(p, page_size()) >= 0); + + _exit(EXIT_SUCCESS); + } + + assert_se(wait_for_terminate_and_warn("memoryseccomp", pid, true) == EXIT_SUCCESS); +} + +static void test_restrict_archs(void) { + pid_t pid; + + if (!is_seccomp_available()) + return; + if (geteuid() != 0) + return; + + pid = fork(); + assert_se(pid >= 0); + + if (pid == 0) { + _cleanup_set_free_ Set *s = NULL; + + assert_se(access("/", F_OK) >= 0); + + assert_se(s = set_new(NULL)); + +#ifdef __x86_64__ + assert_se(set_put(s, UINT32_TO_PTR(SCMP_ARCH_X86+1)) >= 0); +#endif + assert_se(seccomp_restrict_archs(s) >= 0); + + assert_se(access("/", F_OK) >= 0); + assert_se(seccomp_restrict_archs(NULL) >= 0); + + assert_se(access("/", F_OK) >= 0); + + _exit(EXIT_SUCCESS); + } + + assert_se(wait_for_terminate_and_warn("archseccomp", pid, true) == EXIT_SUCCESS); +} + 
+static void test_load_syscall_filter_set_raw(void) { + pid_t pid; + + if (!is_seccomp_available()) + return; + if (geteuid() != 0) + return; + + pid = fork(); + assert_se(pid >= 0); + + if (pid == 0) { + _cleanup_set_free_ Set *s = NULL; + + assert_se(access("/", F_OK) >= 0); + assert_se(poll(NULL, 0, 0) == 0); + + assert_se(seccomp_load_syscall_filter_set_raw(SCMP_ACT_ALLOW, NULL, SCMP_ACT_KILL) >= 0); + assert_se(access("/", F_OK) >= 0); + assert_se(poll(NULL, 0, 0) == 0); + + assert_se(s = set_new(NULL)); + assert_se(set_put(s, UINT32_TO_PTR(__NR_access + 1)) >= 0); + + assert_se(seccomp_load_syscall_filter_set_raw(SCMP_ACT_ALLOW, s, SCMP_ACT_ERRNO(EUCLEAN)) >= 0); + + assert_se(access("/", F_OK) < 0); + assert_se(errno == EUCLEAN); + + assert_se(poll(NULL, 0, 0) == 0); + + s = set_free(s); + + assert_se(s = set_new(NULL)); + assert_se(set_put(s, UINT32_TO_PTR(__NR_poll + 1)) >= 0); + + assert_se(seccomp_load_syscall_filter_set_raw(SCMP_ACT_ALLOW, s, SCMP_ACT_ERRNO(EUNATCH)) >= 0); + + assert_se(access("/", F_OK) < 0); + assert_se(errno == EUCLEAN); + + assert_se(poll(NULL, 0, 0) < 0); + assert_se(errno == EUNATCH); + + _exit(EXIT_SUCCESS); + } + + assert_se(wait_for_terminate_and_warn("syscallrawseccomp", pid, true) == EXIT_SUCCESS); +} + int main(int argc, char *argv[]) { log_set_max_level(LOG_DEBUG); @@ -225,6 +477,12 @@ int main(int argc, char *argv[]) { test_syscall_filter_set_find(); test_filter_sets(); test_restrict_namespace(); + test_protect_sysctl(); + test_restrict_address_families(); + test_restrict_realtime(); + test_memory_deny_write_execute(); + test_restrict_archs(); + test_load_syscall_filter_set_raw(); return 0; } |