summaryrefslogtreecommitdiff
path: root/src/nspawn
diff options
context:
space:
mode:
Diffstat (limited to 'src/nspawn')
-rw-r--r--src/nspawn/nspawn-patch-uid.c15
-rw-r--r--src/nspawn/nspawn-seccomp.c143
-rw-r--r--src/nspawn/nspawn-seccomp.h24
-rw-r--r--src/nspawn/nspawn.c112
4 files changed, 186 insertions, 108 deletions
diff --git a/src/nspawn/nspawn-patch-uid.c b/src/nspawn/nspawn-patch-uid.c
index c7382d412d..cc79597c95 100644
--- a/src/nspawn/nspawn-patch-uid.c
+++ b/src/nspawn/nspawn-patch-uid.c
@@ -280,7 +280,13 @@ static int patch_fd(int fd, const char *name, const struct stat *st, uid_t shift
return r > 0 || changed;
}
-static int is_procfs_sysfs_or_suchlike(int fd) {
+/*
+ * Check if the filesystem is fully compatible with user namespaces or
+ * UID/GID patching. Some filesystems in this list can be fully mounted inside
+ * user namespaces; however, their inodes may relate to host resources or be
+ * valid only in the global user namespace, therefore no patching should be applied.
+ */
+static int is_fs_fully_userns_compatible(int fd) {
struct statfs sfs;
assert(fd >= 0);
@@ -300,6 +306,9 @@ static int is_procfs_sysfs_or_suchlike(int fd) {
F_TYPE_EQUAL(sfs.f_type, PSTOREFS_MAGIC) ||
F_TYPE_EQUAL(sfs.f_type, SELINUX_MAGIC) ||
F_TYPE_EQUAL(sfs.f_type, SMACK_MAGIC) ||
+ F_TYPE_EQUAL(sfs.f_type, SECURITYFS_MAGIC) ||
+ F_TYPE_EQUAL(sfs.f_type, BPF_FS_MAGIC) ||
+ F_TYPE_EQUAL(sfs.f_type, TRACEFS_MAGIC) ||
F_TYPE_EQUAL(sfs.f_type, SYSFS_MAGIC);
}
@@ -311,8 +320,8 @@ static int recurse_fd(int fd, bool donate_fd, const struct stat *st, uid_t shift
/* We generally want to permit crossing of mount boundaries when patching the UIDs/GIDs. However, we
* probably shouldn't do this for /proc and /sys if that is already mounted into place. Hence, let's
- * stop the recursion when we hit a procfs or sysfs file system. */
- r = is_procfs_sysfs_or_suchlike(fd);
+ * stop the recursion when we hit procfs, sysfs or some other special file systems. */
+ r = is_fs_fully_userns_compatible(fd);
if (r < 0)
goto finish;
if (r > 0) {
diff --git a/src/nspawn/nspawn-seccomp.c b/src/nspawn/nspawn-seccomp.c
new file mode 100644
index 0000000000..2d145b68a7
--- /dev/null
+++ b/src/nspawn/nspawn-seccomp.c
@@ -0,0 +1,143 @@
+/***
+ This file is part of systemd.
+
+ Copyright 2016 Lennart Poettering
+
+ systemd is free software; you can redistribute it and/or modify it
+ under the terms of the GNU Lesser General Public License as published by
+ the Free Software Foundation; either version 2.1 of the License, or
+ (at your option) any later version.
+
+ systemd is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public License
+ along with systemd; If not, see <http://www.gnu.org/licenses/>.
+***/
+
+#include <errno.h>
+#include <linux/netlink.h>
+#include <sys/capability.h>
+#include <sys/types.h>
+
+#ifdef HAVE_SECCOMP
+#include <seccomp.h>
+#endif
+
+#include "log.h"
+
+#ifdef HAVE_SECCOMP
+#include "seccomp-util.h"
+#endif
+
+#include "nspawn-seccomp.h"
+
+#ifdef HAVE_SECCOMP
+/* Add a SCMP_ACT_ERRNO(EPERM) rule to ctx for every blacklisted syscall whose capability bit is clear in cap_list_retain. */
+static int seccomp_add_default_syscall_filter(scmp_filter_ctx ctx,
+                                              uint64_t cap_list_retain) {
+        unsigned i;
+        int r;
+        static const struct {
+                uint64_t capability; /* capability number (CAP_*), used as a bit index into cap_list_retain */
+                int syscall_num; /* syscall that gets blocked when that capability bit is not set */
+        } blacklist[] = {
+                { CAP_SYS_RAWIO, SCMP_SYS(iopl) },
+                { CAP_SYS_RAWIO, SCMP_SYS(ioperm) },
+                { CAP_SYS_BOOT, SCMP_SYS(kexec_load) },
+                { CAP_SYS_ADMIN, SCMP_SYS(swapon) },
+                { CAP_SYS_ADMIN, SCMP_SYS(swapoff) },
+                { CAP_SYS_ADMIN, SCMP_SYS(open_by_handle_at) },
+                { CAP_SYS_MODULE, SCMP_SYS(init_module) },
+                { CAP_SYS_MODULE, SCMP_SYS(finit_module) },
+                { CAP_SYS_MODULE, SCMP_SYS(delete_module) },
+                { CAP_SYSLOG, SCMP_SYS(syslog) },
+        };
+
+        for (i = 0; i < ELEMENTSOF(blacklist); i++) {
+                if (cap_list_retain & (1ULL << blacklist[i].capability)) /* bit set: leave this syscall unfiltered */
+                        continue;
+
+                r = seccomp_rule_add(ctx, SCMP_ACT_ERRNO(EPERM), blacklist[i].syscall_num, 0);
+                if (r == -EFAULT)
+                        continue; /* -EFAULT is treated as "unknown syscall": skip the entry */
+                if (r < 0) {
+                        log_error_errno(r, "Failed to block syscall: %m");
+                        return r; /* already logged here; the caller only needs to clean up */
+                }
+        }
+
+        return 0; /* every entry was either skipped or added */
+}
+/* Build the seccomp filter (capability-based blacklist plus the audit-socket rule) and load it; returns 0 on success, else the failing step's error. */
+int setup_seccomp(uint64_t cap_list_retain) {
+        scmp_filter_ctx seccomp;
+        int r;
+
+        seccomp = seccomp_init(SCMP_ACT_ALLOW); /* filter is created with SCMP_ACT_ALLOW; rules below only deny */
+        if (!seccomp)
+                return log_oom();
+
+        r = seccomp_add_secondary_archs(seccomp);
+        if (r < 0) {
+                log_error_errno(r, "Failed to add secondary archs to seccomp filter: %m");
+                goto finish;
+        }
+
+        r = seccomp_add_default_syscall_filter(seccomp, cap_list_retain); /* logs its own errors */
+        if (r < 0)
+                goto finish;
+
+        /*
+           Audit is broken in containers: much of the userspace audit
+           hookup will fail if running inside a container. We don't
+           care and just turn off creation of audit sockets.
+
+           This will make socket(AF_NETLINK, *, NETLINK_AUDIT) fail
+           with EAFNOSUPPORT, which audit userspace uses as an indication
+           that audit is disabled in the kernel.
+        */
+
+        r = seccomp_rule_add(
+                        seccomp,
+                        SCMP_ACT_ERRNO(EAFNOSUPPORT),
+                        SCMP_SYS(socket),
+                        2, /* number of argument comparisons that follow */
+                        SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
+                        SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
+        if (r < 0) {
+                log_error_errno(r, "Failed to add audit seccomp rule: %m");
+                goto finish;
+        }
+
+        r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0); /* set the NNP attribute to 0 (see the log message below) */
+        if (r < 0) {
+                log_error_errno(r, "Failed to unset NO_NEW_PRIVS: %m");
+                goto finish;
+        }
+
+        r = seccomp_load(seccomp);
+        if (r == -EINVAL) {
+                log_debug_errno(r, "Kernel is probably not configured with CONFIG_SECCOMP. Disabling seccomp audit filter: %m");
+                r = 0; /* not fatal: report success and carry on without a filter */
+                goto finish;
+        }
+        if (r < 0) {
+                log_error_errno(r, "Failed to install seccomp audit filter: %m");
+                goto finish;
+        }
+
+finish:
+        seccomp_release(seccomp); /* reached on every path once seccomp_init() has succeeded */
+        return r;
+}
+
+#else
+/* Variant compiled when HAVE_SECCOMP is not defined: ignores cap_list_retain and reports success. */
+int setup_seccomp(uint64_t cap_list_retain) {
+        return 0;
+}
+
+#endif
diff --git a/src/nspawn/nspawn-seccomp.h b/src/nspawn/nspawn-seccomp.h
new file mode 100644
index 0000000000..5bde16faf9
--- /dev/null
+++ b/src/nspawn/nspawn-seccomp.h
@@ -0,0 +1,24 @@
+#pragma once
+
+/***
+ This file is part of systemd.
+
+ Copyright 2016 Lennart Poettering
+
+ systemd is free software; you can redistribute it and/or modify it
+ under the terms of the GNU Lesser General Public License as published by
+ the Free Software Foundation; either version 2.1 of the License, or
+ (at your option) any later version.
+
+ systemd is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public License
+ along with systemd; If not, see <http://www.gnu.org/licenses/>.
+***/
+
+#include <sys/types.h> /* NOTE(review): uint64_t is declared by <stdint.h>; this header alone is not guaranteed to provide it - confirm */
+
+int setup_seccomp(uint64_t cap_list_retain); /* cap_list_retain: bitmask of (1ULL << CAP_*) bits; returns 0 on success, else the failing step's error */
diff --git a/src/nspawn/nspawn.c b/src/nspawn/nspawn.c
index ac11bcea5a..b421c182ce 100644
--- a/src/nspawn/nspawn.c
+++ b/src/nspawn/nspawn.c
@@ -26,9 +26,6 @@
#include <linux/loop.h>
#include <pwd.h>
#include <sched.h>
-#ifdef HAVE_SECCOMP
-#include <seccomp.h>
-#endif
#ifdef HAVE_SELINUX
#include <selinux/selinux.h>
#endif
@@ -82,15 +79,13 @@
#include "nspawn-settings.h"
#include "nspawn-setuid.h"
#include "nspawn-stub-pid1.h"
+#include "nspawn-seccomp.h"
#include "parse-util.h"
#include "path-util.h"
#include "process-util.h"
#include "ptyfwd.h"
#include "random-util.h"
#include "rm-rf.h"
-#ifdef HAVE_SECCOMP
-#include "seccomp-util.h"
-#endif
#include "selinux-util.h"
#include "signal-util.h"
#include "socket-util.h"
@@ -136,7 +131,7 @@ static StartMode arg_start_mode = START_PID1;
static bool arg_ephemeral = false;
static LinkJournal arg_link_journal = LINK_AUTO;
static bool arg_link_journal_try = false;
-static uint64_t arg_retain =
+static uint64_t arg_caps_retain =
(1ULL << CAP_CHOWN) |
(1ULL << CAP_DAC_OVERRIDE) |
(1ULL << CAP_DAC_READ_SEARCH) |
@@ -1075,7 +1070,7 @@ static int parse_argv(int argc, char *argv[]) {
if (mask_all_settings)
arg_settings_mask = _SETTINGS_MASK_ALL;
- arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
+ arg_caps_retain = (arg_caps_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
r = detect_unified_cgroup_hierarchy();
if (r < 0)
@@ -1632,7 +1627,7 @@ static int setup_journal(const char *directory) {
}
static int drop_capabilities(void) {
- return capability_bounding_set_drop(arg_retain, false);
+ return capability_bounding_set_drop(arg_caps_retain, false);
}
static int reset_audit_loginuid(void) {
@@ -1667,99 +1662,6 @@ static int reset_audit_loginuid(void) {
return 0;
}
-static int setup_seccomp(void) {
-
-#ifdef HAVE_SECCOMP
- static const struct {
- uint64_t capability;
- int syscall_num;
- } blacklist[] = {
- { CAP_SYS_RAWIO, SCMP_SYS(iopl) },
- { CAP_SYS_RAWIO, SCMP_SYS(ioperm) },
- { CAP_SYS_BOOT, SCMP_SYS(kexec_load) },
- { CAP_SYS_ADMIN, SCMP_SYS(swapon) },
- { CAP_SYS_ADMIN, SCMP_SYS(swapoff) },
- { CAP_SYS_ADMIN, SCMP_SYS(open_by_handle_at) },
- { CAP_SYS_MODULE, SCMP_SYS(init_module) },
- { CAP_SYS_MODULE, SCMP_SYS(finit_module) },
- { CAP_SYS_MODULE, SCMP_SYS(delete_module) },
- { CAP_SYSLOG, SCMP_SYS(syslog) },
- };
-
- scmp_filter_ctx seccomp;
- unsigned i;
- int r;
-
- seccomp = seccomp_init(SCMP_ACT_ALLOW);
- if (!seccomp)
- return log_oom();
-
- r = seccomp_add_secondary_archs(seccomp);
- if (r < 0) {
- log_error_errno(r, "Failed to add secondary archs to seccomp filter: %m");
- goto finish;
- }
-
- for (i = 0; i < ELEMENTSOF(blacklist); i++) {
- if (arg_retain & (1ULL << blacklist[i].capability))
- continue;
-
- r = seccomp_rule_add(seccomp, SCMP_ACT_ERRNO(EPERM), blacklist[i].syscall_num, 0);
- if (r == -EFAULT)
- continue; /* unknown syscall */
- if (r < 0) {
- log_error_errno(r, "Failed to block syscall: %m");
- goto finish;
- }
- }
-
- /*
- Audit is broken in containers, much of the userspace audit
- hookup will fail if running inside a container. We don't
- care and just turn off creation of audit sockets.
-
- This will make socket(AF_NETLINK, *, NETLINK_AUDIT) fail
- with EAFNOSUPPORT which audit userspace uses as indication
- that audit is disabled in the kernel.
- */
-
- r = seccomp_rule_add(
- seccomp,
- SCMP_ACT_ERRNO(EAFNOSUPPORT),
- SCMP_SYS(socket),
- 2,
- SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
- SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
- if (r < 0) {
- log_error_errno(r, "Failed to add audit seccomp rule: %m");
- goto finish;
- }
-
- r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
- if (r < 0) {
- log_error_errno(r, "Failed to unset NO_NEW_PRIVS: %m");
- goto finish;
- }
-
- r = seccomp_load(seccomp);
- if (r == -EINVAL) {
- log_debug_errno(r, "Kernel is probably not configured with CONFIG_SECCOMP. Disabling seccomp audit filter: %m");
- r = 0;
- goto finish;
- }
- if (r < 0) {
- log_error_errno(r, "Failed to install seccomp audit filter: %m");
- goto finish;
- }
-
-finish:
- seccomp_release(seccomp);
- return r;
-#else
- return 0;
-#endif
-
-}
static int setup_propagate(const char *root) {
const char *p, *q;
@@ -2988,7 +2890,7 @@ static int outer_child(
if (r < 0)
return r;
- r = setup_seccomp();
+ r = setup_seccomp(arg_caps_retain);
if (r < 0)
return r;
@@ -3272,9 +3174,9 @@ static int load_settings(void) {
if (settings->capability != 0)
log_warning("Ignoring Capability= setting, file %s is not trusted.", p);
} else
- arg_retain |= plus;
+ arg_caps_retain |= plus;
- arg_retain &= ~settings->drop_capability;
+ arg_caps_retain &= ~settings->drop_capability;
}
if ((arg_settings_mask & SETTING_KILL_SIGNAL) == 0 &&