diff options
author | Lennart Poettering <lennart@poettering.net> | 2016-11-01 20:25:19 -0600 |
---|---|---|
committer | Lennart Poettering <lennart@poettering.net> | 2016-11-04 07:40:13 -0600 |
commit | add005357d535681c7075ced8eec2b6e61b43728 (patch) | |
tree | b780280f06df0b09c738173602cb90c599597996 /src/core/execute.c | |
parent | 9156493171cf2d78e1ac1a3746c385b0e281acf1 (diff) |
core: add new RestrictNamespaces= unit file setting
This new setting permits restricting whether namespaces may be created and
managed by processes started by a unit. It installs a seccomp filter blocking
certain invocations of unshare(), clone() and setns().
RestrictNamespaces=no is the default, and does not restrict namespaces in any
way. RestrictNamespaces=yes takes away the ability to create or manage any kind
of namspace. "RestrictNamespaces=mnt ipc" restricts the creation of namespaces
so that only mount and IPC namespaces may be created/managed, but no other
kind of namespaces.
This setting should be improve security quite a bit as in particular user
namespacing was a major source of CVEs in the kernel in the past, and is
accessible to unprivileged processes. With this setting the entire attack
surface may be removed for system services that do not make use of namespaces.
Diffstat (limited to 'src/core/execute.c')
-rw-r--r-- | src/core/execute.c | 30 |
1 files changed, 30 insertions, 0 deletions
diff --git a/src/core/execute.c b/src/core/execute.c index 5bb23e2e4a..b8a0246173 100644 --- a/src/core/execute.c +++ b/src/core/execute.c @@ -1534,6 +1534,18 @@ static int apply_private_devices(const Unit *u, const ExecContext *c) { return seccomp_load_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_RAW_IO, SCMP_ACT_ERRNO(EPERM)); } +static int apply_restrict_namespaces(Unit *u, const ExecContext *c) { + assert(c); + + if (!exec_context_restrict_namespaces_set(c)) + return 0; + + if (skip_seccomp_unavailable(u, "RestrictNamespaces=")) + return 0; + + return seccomp_restrict_namespaces(c->restrict_namespaces); +} + #endif static void do_idle_pipe_dance(int idle_pipe[4]) { @@ -2183,6 +2195,7 @@ static bool context_has_no_new_privileges(const ExecContext *c) { return context_has_address_families(c) || /* we need NNP if we have any form of seccomp and are unprivileged */ c->memory_deny_write_execute || c->restrict_realtime || + exec_context_restrict_namespaces_set(c) || c->protect_kernel_tunables || c->protect_kernel_modules || c->private_devices || @@ -2764,6 +2777,12 @@ static int exec_child( } } + r = apply_restrict_namespaces(unit, context); + if (r < 0) { + *exit_status = EXIT_SECCOMP; + return r; + } + if (context->protect_kernel_tunables) { r = apply_protect_sysctl(unit, context); if (r < 0) { @@ -2947,6 +2966,7 @@ void exec_context_init(ExecContext *c) { c->personality = PERSONALITY_INVALID; c->runtime_directory_mode = 0755; c->capability_bounding_set = CAP_ALL; + c->restrict_namespaces = NAMESPACE_FLAGS_ALL; } void exec_context_done(ExecContext *c) { @@ -3244,6 +3264,7 @@ static void strv_fprintf(FILE *f, char **l) { void exec_context_dump(ExecContext *c, FILE* f, const char *prefix) { char **e, **d; unsigned i; + int r; assert(c); assert(f); @@ -3524,6 +3545,15 @@ void exec_context_dump(ExecContext *c, FILE* f, const char *prefix) { fputc('\n', f); } + if (exec_context_restrict_namespaces_set(c)) { + _cleanup_free_ char *s = NULL; + + r = namespace_flag_to_string_many(c->restrict_namespaces, &s); + if (r >= 0) + fprintf(f, "%sRestrictNamespaces: %s\n", + prefix, s); + } + if (c->syscall_errno > 0) fprintf(f, "%sSystemCallErrorNumber: %s\n", |