summaryrefslogtreecommitdiff
path: root/src/shared
diff options
context:
space:
mode:
authorLennart Poettering <lennart@poettering.net>2016-11-01 20:25:19 -0600
committerLennart Poettering <lennart@poettering.net>2016-11-04 07:40:13 -0600
commitadd005357d535681c7075ced8eec2b6e61b43728 (patch)
treeb780280f06df0b09c738173602cb90c599597996 /src/shared
parent9156493171cf2d78e1ac1a3746c385b0e281acf1 (diff)
core: add new RestrictNamespaces= unit file setting
This new setting permits restricting whether namespaces may be created and managed by processes started by a unit. It installs a seccomp filter blocking certain invocations of unshare(), clone() and setns(). RestrictNamespaces=no is the default, and does not restrict namespaces in any way. RestrictNamespaces=yes takes away the ability to create or manage any kind of namspace. "RestrictNamespaces=mnt ipc" restricts the creation of namespaces so that only mount and IPC namespaces may be created/managed, but no other kind of namespaces. This setting should be improve security quite a bit as in particular user namespacing was a major source of CVEs in the kernel in the past, and is accessible to unprivileged processes. With this setting the entire attack surface may be removed for system services that do not make use of namespaces.
Diffstat (limited to 'src/shared')
-rw-r--r--src/shared/bus-unit-util.c25
-rw-r--r--src/shared/nsflags.c126
-rw-r--r--src/shared/nsflags.h49
-rw-r--r--src/shared/seccomp-util.c89
-rw-r--r--src/shared/seccomp-util.h2
5 files changed, 291 insertions, 0 deletions
diff --git a/src/shared/bus-unit-util.c b/src/shared/bus-unit-util.c
index f639e0e832..35e2c8f18e 100644
--- a/src/shared/bus-unit-util.c
+++ b/src/shared/bus-unit-util.c
@@ -27,6 +27,7 @@
#include "hashmap.h"
#include "list.h"
#include "locale-util.h"
+#include "nsflags.h"
#include "parse-util.h"
#include "path-util.h"
#include "process-util.h"
@@ -553,6 +554,30 @@ int bus_append_unit_property_assignment(sd_bus_message *m, const char *assignmen
r = sd_bus_message_close_container(m);
+ } else if (streq(field, "RestrictNamespaces")) {
+ bool invert = false;
+ uint64_t flags = 0;
+
+ if (eq[0] == '~') {
+ invert = true;
+ eq++;
+ }
+
+ r = parse_boolean(eq);
+ if (r > 0)
+ flags = 0;
+ else if (r == 0)
+ flags = NAMESPACE_FLAGS_ALL;
+ else {
+ r = namespace_flag_from_string_many(eq, &flags);
+ if (r < 0)
+ return log_error_errno(r, "Failed to parse %s value %s.", field, eq);
+ }
+
+ if (invert)
+ flags = (~flags) & NAMESPACE_FLAGS_ALL;
+
+ r = sd_bus_message_append(m, "v", "t", flags);
} else {
log_error("Unknown assignment %s.", assignment);
return -EINVAL;
diff --git a/src/shared/nsflags.c b/src/shared/nsflags.c
new file mode 100644
index 0000000000..8fcbe97ba7
--- /dev/null
+++ b/src/shared/nsflags.c
@@ -0,0 +1,126 @@
+/***
+ This file is part of systemd.
+
+ Copyright 2016 Lennart Poettering
+
+ systemd is free software; you can redistribute it and/or modify it
+ under the terms of the GNU Lesser General Public License as published by
+ the Free Software Foundation; either version 2.1 of the License, or
+ (at your option) any later version.
+
+ systemd is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public License
+ along with systemd; If not, see <http://www.gnu.org/licenses/>.
+***/
+
+#include <sched.h>
+
+#include "alloc-util.h"
+#include "extract-word.h"
+#include "nsflags.h"
+#include "seccomp-util.h"
+#include "string-util.h"
+
+const struct namespace_flag_map namespace_flag_map[] = {
+ { CLONE_NEWCGROUP, "cgroup" },
+ { CLONE_NEWIPC, "ipc" },
+ { CLONE_NEWNET, "net" },
+ /* So, the mount namespace flag is called CLONE_NEWNS for historical reasons. Let's expose it here under a more
+ * explanatory name: "mnt". This is in-line with how the kernel exposes namespaces in /proc/$PID/ns. */
+ { CLONE_NEWNS, "mnt" },
+ { CLONE_NEWPID, "pid" },
+ { CLONE_NEWUSER, "user" },
+ { CLONE_NEWUTS, "uts" },
+ {}
+};
+
+const char* namespace_flag_to_string(unsigned long flag) {
+ unsigned i;
+
+ flag &= NAMESPACE_FLAGS_ALL;
+
+ for (i = 0; namespace_flag_map[i].name; i++)
+ if (flag == namespace_flag_map[i].flag)
+ return namespace_flag_map[i].name;
+
+ return NULL; /* either unknown namespace flag, or a combination of many. This call supports neither. */
+}
+
+unsigned long namespace_flag_from_string(const char *name) {
+ unsigned i;
+
+ if (isempty(name))
+ return 0;
+
+ for (i = 0; namespace_flag_map[i].name; i++)
+ if (streq(name, namespace_flag_map[i].name))
+ return namespace_flag_map[i].flag;
+
+ return 0;
+}
+
+int namespace_flag_from_string_many(const char *name, unsigned long *ret) {
+ unsigned long flags = 0;
+ int r;
+
+ assert_se(ret);
+
+ if (!name) {
+ *ret = 0;
+ return 0;
+ }
+
+ for (;;) {
+ _cleanup_free_ char *word = NULL;
+ unsigned long f;
+
+ r = extract_first_word(&name, &word, NULL, 0);
+ if (r < 0)
+ return r;
+ if (r == 0)
+ break;
+
+ f = namespace_flag_from_string(word);
+ if (f == 0)
+ return -EINVAL;
+
+ flags |= f;
+ }
+
+ *ret = flags;
+ return 0;
+}
+
+int namespace_flag_to_string_many(unsigned long flags, char **ret) {
+ _cleanup_free_ char *s = NULL;
+ unsigned i;
+
+ for (i = 0; namespace_flag_map[i].name; i++) {
+ if ((flags & namespace_flag_map[i].flag) != namespace_flag_map[i].flag)
+ continue;
+
+ if (!s) {
+ s = strdup(namespace_flag_map[i].name);
+ if (!s)
+ return -ENOMEM;
+ } else {
+ if (!strextend(&s, " ", namespace_flag_map[i].name, NULL))
+ return -ENOMEM;
+ }
+ }
+
+ if (!s) {
+ s = strdup("");
+ if (!s)
+ return -ENOMEM;
+ }
+
+ *ret = s;
+ s = NULL;
+
+ return 0;
+}
diff --git a/src/shared/nsflags.h b/src/shared/nsflags.h
new file mode 100644
index 0000000000..152ab8b936
--- /dev/null
+++ b/src/shared/nsflags.h
@@ -0,0 +1,49 @@
+#pragma once
+
+/***
+ This file is part of systemd.
+
+ Copyright 2016 Lennart Poettering
+
+ systemd is free software; you can redistribute it and/or modify it
+ under the terms of the GNU Lesser General Public License as published by
+ the Free Software Foundation; either version 2.1 of the License, or
+ (at your option) any later version.
+
+ systemd is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public License
+ along with systemd; If not, see <http://www.gnu.org/licenses/>.
+***/
+
+#include <sched.h>
+
+#include "missing.h"
+
+/* The combination of all namespace flags defined by the kernel. The right type for this isn't clear. setns() and
+ * unshare() expect these flags to be passed as (signed) "int", while clone() wants them as "unsigned long". The latter
+ * is definitely more appropriate for a flags parameter, and also the larger type of the two, hence let's stick to that
+ * here. */
+#define NAMESPACE_FLAGS_ALL \
+ ((unsigned long) (CLONE_NEWCGROUP| \
+ CLONE_NEWIPC| \
+ CLONE_NEWNET| \
+ CLONE_NEWNS| \
+ CLONE_NEWPID| \
+ CLONE_NEWUSER| \
+ CLONE_NEWUTS))
+
+const char* namespace_flag_to_string(unsigned long flag);
+unsigned long namespace_flag_from_string(const char *name);
+int namespace_flag_from_string_many(const char *name, unsigned long *ret);
+int namespace_flag_to_string_many(unsigned long flags, char **ret);
+
+struct namespace_flag_map {
+ unsigned long flag;
+ const char *name;
+};
+
+extern const struct namespace_flag_map namespace_flag_map[];
diff --git a/src/shared/seccomp-util.c b/src/shared/seccomp-util.c
index fc1f6b68f2..4e4b2faca9 100644
--- a/src/shared/seccomp-util.c
+++ b/src/shared/seccomp-util.c
@@ -23,7 +23,9 @@
#include <sys/prctl.h>
#include <linux/seccomp.h>
+#include "alloc-util.h"
#include "macro.h"
+#include "nsflags.h"
#include "seccomp-util.h"
#include "string-util.h"
#include "util.h"
@@ -576,5 +578,92 @@ int seccomp_load_filter_set(uint32_t default_action, const SyscallFilterSet *set
finish:
seccomp_release(seccomp);
return r;
+}
+
+int seccomp_restrict_namespaces(unsigned long retain) {
+ scmp_filter_ctx seccomp;
+ unsigned i;
+ int r;
+
+ if (log_get_max_level() >= LOG_DEBUG) {
+ _cleanup_free_ char *s = NULL;
+
+ (void) namespace_flag_to_string_many(retain, &s);
+ log_debug("Restricting namespace to: %s.", strna(s));
+ }
+
+ /* NOOP? */
+ if ((retain & NAMESPACE_FLAGS_ALL) == NAMESPACE_FLAGS_ALL)
+ return 0;
+
+ r = seccomp_init_conservative(&seccomp, SCMP_ACT_ALLOW);
+ if (r < 0)
+ return r;
+
+ if ((retain & NAMESPACE_FLAGS_ALL) == 0)
+ /* If every single kind of namespace shall be prohibited, then let's block the whole setns() syscall
+ * altogether. */
+ r = seccomp_rule_add(
+ seccomp,
+ SCMP_ACT_ERRNO(EPERM),
+ SCMP_SYS(setns),
+ 0);
+ else
+ /* Otherwise, block only the invocations with the appropriate flags in the loop below, but also the
+ * special invocation with a zero flags argument, right here. */
+ r = seccomp_rule_add(
+ seccomp,
+ SCMP_ACT_ERRNO(EPERM),
+ SCMP_SYS(setns),
+ 1,
+ SCMP_A1(SCMP_CMP_EQ, 0));
+ if (r < 0)
+ goto finish;
+
+ for (i = 0; namespace_flag_map[i].name; i++) {
+ unsigned long f;
+
+ f = namespace_flag_map[i].flag;
+ if ((retain & f) == f) {
+ log_debug("Permitting %s.", namespace_flag_map[i].name);
+ continue;
+ }
+ log_debug("Blocking %s.", namespace_flag_map[i].name);
+
+ r = seccomp_rule_add(
+ seccomp,
+ SCMP_ACT_ERRNO(EPERM),
+ SCMP_SYS(unshare),
+ 1,
+ SCMP_A0(SCMP_CMP_MASKED_EQ, f, f));
+ if (r < 0)
+ goto finish;
+
+ r = seccomp_rule_add(
+ seccomp,
+ SCMP_ACT_ERRNO(EPERM),
+ SCMP_SYS(clone),
+ 1,
+ SCMP_A0(SCMP_CMP_MASKED_EQ, f, f));
+ if (r < 0)
+ goto finish;
+
+ if ((retain & NAMESPACE_FLAGS_ALL) != 0) {
+ r = seccomp_rule_add(
+ seccomp,
+ SCMP_ACT_ERRNO(EPERM),
+ SCMP_SYS(setns),
+ 1,
+ SCMP_A1(SCMP_CMP_MASKED_EQ, f, f));
+ if (r < 0)
+ goto finish;
+ }
+ }
+
+ r = seccomp_load(seccomp);
+
+finish:
+ seccomp_release(seccomp);
+ return r;
}
diff --git a/src/shared/seccomp-util.h b/src/shared/seccomp-util.h
index f0b9f455ab..438a6671bc 100644
--- a/src/shared/seccomp-util.h
+++ b/src/shared/seccomp-util.h
@@ -66,3 +66,5 @@ const SyscallFilterSet *syscall_filter_set_find(const char *name);
int seccomp_add_syscall_filter_set(scmp_filter_ctx seccomp, const SyscallFilterSet *set, uint32_t action);
int seccomp_load_filter_set(uint32_t default_action, const SyscallFilterSet *set, uint32_t action);
+
+int seccomp_restrict_namespaces(unsigned long retain);