From add005357d535681c7075ced8eec2b6e61b43728 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Tue, 1 Nov 2016 20:25:19 -0600 Subject: core: add new RestrictNamespaces= unit file setting This new setting permits restricting whether namespaces may be created and managed by processes started by a unit. It installs a seccomp filter blocking certain invocations of unshare(), clone() and setns(). RestrictNamespaces=no is the default, and does not restrict namespaces in any way. RestrictNamespaces=yes takes away the ability to create or manage any kind of namspace. "RestrictNamespaces=mnt ipc" restricts the creation of namespaces so that only mount and IPC namespaces may be created/managed, but no other kind of namespaces. This setting should be improve security quite a bit as in particular user namespacing was a major source of CVEs in the kernel in the past, and is accessible to unprivileged processes. With this setting the entire attack surface may be removed for system services that do not make use of namespaces. --- src/shared/seccomp-util.c | 89 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 89 insertions(+) (limited to 'src/shared/seccomp-util.c') diff --git a/src/shared/seccomp-util.c b/src/shared/seccomp-util.c index fc1f6b68f2..4e4b2faca9 100644 --- a/src/shared/seccomp-util.c +++ b/src/shared/seccomp-util.c @@ -23,7 +23,9 @@ #include #include +#include "alloc-util.h" #include "macro.h" +#include "nsflags.h" #include "seccomp-util.h" #include "string-util.h" #include "util.h" @@ -576,5 +578,92 @@ int seccomp_load_filter_set(uint32_t default_action, const SyscallFilterSet *set finish: seccomp_release(seccomp); return r; +} + +int seccomp_restrict_namespaces(unsigned long retain) { + scmp_filter_ctx seccomp; + unsigned i; + int r; + + if (log_get_max_level() >= LOG_DEBUG) { + _cleanup_free_ char *s = NULL; + + (void) namespace_flag_to_string_many(retain, &s); + log_debug("Restricting namespace to: %s.", strna(s)); + } + + /* NOOP? */ + if ((retain & NAMESPACE_FLAGS_ALL) == NAMESPACE_FLAGS_ALL) + return 0; + + r = seccomp_init_conservative(&seccomp, SCMP_ACT_ALLOW); + if (r < 0) + return r; + + if ((retain & NAMESPACE_FLAGS_ALL) == 0) + /* If every single kind of namespace shall be prohibited, then let's block the whole setns() syscall + * altogether. */ + r = seccomp_rule_add( + seccomp, + SCMP_ACT_ERRNO(EPERM), + SCMP_SYS(setns), + 0); + else + /* Otherwise, block only the invocations with the appropriate flags in the loop below, but also the + * special invocation with a zero flags argument, right here. */ + r = seccomp_rule_add( + seccomp, + SCMP_ACT_ERRNO(EPERM), + SCMP_SYS(setns), + 1, + SCMP_A1(SCMP_CMP_EQ, 0)); + if (r < 0) + goto finish; + + for (i = 0; namespace_flag_map[i].name; i++) { + unsigned long f; + + f = namespace_flag_map[i].flag; + if ((retain & f) == f) { + log_debug("Permitting %s.", namespace_flag_map[i].name); + continue; + } + log_debug("Blocking %s.", namespace_flag_map[i].name); + + r = seccomp_rule_add( + seccomp, + SCMP_ACT_ERRNO(EPERM), + SCMP_SYS(unshare), + 1, + SCMP_A0(SCMP_CMP_MASKED_EQ, f, f)); + if (r < 0) + goto finish; + + r = seccomp_rule_add( + seccomp, + SCMP_ACT_ERRNO(EPERM), + SCMP_SYS(clone), + 1, + SCMP_A0(SCMP_CMP_MASKED_EQ, f, f)); + if (r < 0) + goto finish; + + if ((retain & NAMESPACE_FLAGS_ALL) != 0) { + r = seccomp_rule_add( + seccomp, + SCMP_ACT_ERRNO(EPERM), + SCMP_SYS(setns), + 1, + SCMP_A1(SCMP_CMP_MASKED_EQ, f, f)); + if (r < 0) + goto finish; + } + } + + r = seccomp_load(seccomp); + +finish: + seccomp_release(seccomp); + return r; } -- cgit v1.2.3-54-g00ecf