summaryrefslogtreecommitdiff
path: root/src/core/execute.c
diff options
context:
space:
mode:
authorLennart Poettering <lennart@poettering.net>2016-12-27 15:28:25 +0100
committerZbigniew Jędrzejewski-Szmek <zbyszek@in.waw.pl>2017-01-17 22:14:27 -0500
commit469830d1426a91e0897c321fdc8ee428f0a750c1 (patch)
treeb77a2ba472790641c14e440f5732688b645c1dae /src/core/execute.c
parent802fa07a4ad5b29a798896f1566c5e2f85897767 (diff)
seccomp: rework seccomp code, to improve compat with some archs
This substantially reworks the seccomp code, to ensure better compatibility with some architectures, including i386. So far we relied on libseccomp's internal handling of the multiple syscall ABIs supported on Linux. This is problematic however, as it does not define clear semantics if an ABI is not able to support specific seccomp rules we install. This rework hence changes a couple of things: - We no longer use seccomp_rule_add(), but only seccomp_rule_add_exact(), and fail the installation of a filter if the architecture doesn't support it. - We no longer rely on adding multiple syscall architectures to a single filter, but instead install a separate filter for each syscall architecture supported. This way, we can install a strict filter for x86-64, while permitting a less strict filter for i386. - All high-level filter additions are now moved from execute.c to seccomp-util.c, so that we can test them independently of the service execution logic. - Tests have been added for all types of our seccomp filters. - SystemCallFilters= and SystemCallArchitectures= are now implemented in independent filters and installation logic, as they semantically are very much independent of each other. Fixes: #4575
Diffstat (limited to 'src/core/execute.c')
-rw-r--r--src/core/execute.c467
1 files changed, 123 insertions, 344 deletions
diff --git a/src/core/execute.c b/src/core/execute.c
index 4ff6f4ebd0..a77edbb162 100644
--- a/src/core/execute.c
+++ b/src/core/execute.c
@@ -1259,6 +1259,41 @@ static void rename_process_from_path(const char *path) {
rename_process(process_name);
}
+static bool context_has_address_families(const ExecContext *c) {
+ assert(c);
+
+ return c->address_families_whitelist ||
+ !set_isempty(c->address_families);
+}
+
+static bool context_has_syscall_filters(const ExecContext *c) {
+ assert(c);
+
+ return c->syscall_whitelist ||
+ !set_isempty(c->syscall_filter);
+}
+
+static bool context_has_no_new_privileges(const ExecContext *c) {
+ assert(c);
+
+ if (c->no_new_privileges)
+ return true;
+
+ if (have_effective_cap(CAP_SYS_ADMIN)) /* if we are privileged, we don't need NNP */
+ return false;
+
+ /* We need NNP if we have any form of seccomp and are unprivileged */
+ return context_has_address_families(c) ||
+ c->memory_deny_write_execute ||
+ c->restrict_realtime ||
+ exec_context_restrict_namespaces_set(c) ||
+ c->protect_kernel_tunables ||
+ c->protect_kernel_modules ||
+ c->private_devices ||
+ context_has_syscall_filters(c) ||
+ !set_isempty(c->syscall_archs);
+}
+
#ifdef HAVE_SECCOMP
static bool skip_seccomp_unavailable(const Unit* u, const char* msg) {
@@ -1272,344 +1307,131 @@ static bool skip_seccomp_unavailable(const Unit* u, const char* msg) {
return true;
}
-static int apply_seccomp(const Unit* u, const ExecContext *c) {
- uint32_t negative_action, action;
- scmp_filter_ctx seccomp;
- Iterator i;
- void *id;
- int r;
+static int apply_syscall_filter(const Unit* u, const ExecContext *c) {
+ uint32_t negative_action, default_action, action;
+ assert(u);
assert(c);
- if (skip_seccomp_unavailable(u, "syscall filtering"))
+ if (!context_has_syscall_filters(c))
return 0;
- negative_action = c->syscall_errno == 0 ? SCMP_ACT_KILL : SCMP_ACT_ERRNO(c->syscall_errno);
-
- seccomp = seccomp_init(c->syscall_whitelist ? negative_action : SCMP_ACT_ALLOW);
- if (!seccomp)
- return -ENOMEM;
-
- if (c->syscall_archs) {
+ if (skip_seccomp_unavailable(u, "SystemCallFilter="))
+ return 0;
- SET_FOREACH(id, c->syscall_archs, i) {
- r = seccomp_arch_add(seccomp, PTR_TO_UINT32(id) - 1);
- if (r == -EEXIST)
- continue;
- if (r < 0)
- goto finish;
- }
+ negative_action = c->syscall_errno == 0 ? SCMP_ACT_KILL : SCMP_ACT_ERRNO(c->syscall_errno);
+ if (c->syscall_whitelist) {
+ default_action = negative_action;
+ action = SCMP_ACT_ALLOW;
} else {
- r = seccomp_add_secondary_archs(seccomp);
- if (r < 0)
- goto finish;
+ default_action = SCMP_ACT_ALLOW;
+ action = negative_action;
}
- action = c->syscall_whitelist ? SCMP_ACT_ALLOW : negative_action;
- SET_FOREACH(id, c->syscall_filter, i) {
- r = seccomp_rule_add(seccomp, action, PTR_TO_INT(id) - 1, 0);
- if (r < 0)
- goto finish;
- }
-
- r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
- if (r < 0)
- goto finish;
-
- r = seccomp_load(seccomp);
-
-finish:
- seccomp_release(seccomp);
- return r;
+ return seccomp_load_syscall_filter_set_raw(default_action, c->syscall_filter, action);
}
-static int apply_address_families(const Unit* u, const ExecContext *c) {
- scmp_filter_ctx seccomp;
- Iterator i;
- int r;
-
+static int apply_syscall_archs(const Unit *u, const ExecContext *c) {
+ assert(u);
assert(c);
- if (skip_seccomp_unavailable(u, "RestrictAddressFamilies="))
+ if (set_isempty(c->syscall_archs))
return 0;
- r = seccomp_init_conservative(&seccomp, SCMP_ACT_ALLOW);
- if (r < 0)
- return r;
-
- if (c->address_families_whitelist) {
- int af, first = 0, last = 0;
- void *afp;
-
- /* If this is a whitelist, we first block the address
- * families that are out of range and then everything
- * that is not in the set. First, we find the lowest
- * and highest address family in the set. */
-
- SET_FOREACH(afp, c->address_families, i) {
- af = PTR_TO_INT(afp);
-
- if (af <= 0 || af >= af_max())
- continue;
-
- if (first == 0 || af < first)
- first = af;
-
- if (last == 0 || af > last)
- last = af;
- }
-
- assert((first == 0) == (last == 0));
-
- if (first == 0) {
-
- /* No entries in the valid range, block everything */
- r = seccomp_rule_add(
- seccomp,
- SCMP_ACT_ERRNO(EPROTONOSUPPORT),
- SCMP_SYS(socket),
- 0);
- if (r < 0)
- goto finish;
-
- } else {
+ if (skip_seccomp_unavailable(u, "SystemCallArchitectures="))
+ return 0;
- /* Block everything below the first entry */
- r = seccomp_rule_add(
- seccomp,
- SCMP_ACT_ERRNO(EPROTONOSUPPORT),
- SCMP_SYS(socket),
- 1,
- SCMP_A0(SCMP_CMP_LT, first));
- if (r < 0)
- goto finish;
-
- /* Block everything above the last entry */
- r = seccomp_rule_add(
- seccomp,
- SCMP_ACT_ERRNO(EPROTONOSUPPORT),
- SCMP_SYS(socket),
- 1,
- SCMP_A0(SCMP_CMP_GT, last));
- if (r < 0)
- goto finish;
-
- /* Block everything between the first and last
- * entry */
- for (af = 1; af < af_max(); af++) {
-
- if (set_contains(c->address_families, INT_TO_PTR(af)))
- continue;
+ return seccomp_restrict_archs(c->syscall_archs);
+}
- r = seccomp_rule_add(
- seccomp,
- SCMP_ACT_ERRNO(EPROTONOSUPPORT),
- SCMP_SYS(socket),
- 1,
- SCMP_A0(SCMP_CMP_EQ, af));
- if (r < 0)
- goto finish;
- }
- }
+static int apply_address_families(const Unit* u, const ExecContext *c) {
+ assert(u);
+ assert(c);
- } else {
- void *af;
-
- /* If this is a blacklist, then generate one rule for
- * each address family that are then combined in OR
- * checks. */
-
- SET_FOREACH(af, c->address_families, i) {
-
- r = seccomp_rule_add(
- seccomp,
- SCMP_ACT_ERRNO(EPROTONOSUPPORT),
- SCMP_SYS(socket),
- 1,
- SCMP_A0(SCMP_CMP_EQ, PTR_TO_INT(af)));
- if (r < 0)
- goto finish;
- }
- }
+ if (!context_has_address_families(c))
+ return 0;
- r = seccomp_load(seccomp);
+ if (skip_seccomp_unavailable(u, "RestrictAddressFamilies="))
+ return 0;
-finish:
- seccomp_release(seccomp);
- return r;
+ return seccomp_restrict_address_families(c->address_families, c->address_families_whitelist);
}
static int apply_memory_deny_write_execute(const Unit* u, const ExecContext *c) {
- scmp_filter_ctx seccomp;
- int r;
-
+ assert(u);
assert(c);
- if (skip_seccomp_unavailable(u, "MemoryDenyWriteExecute="))
+ if (!c->memory_deny_write_execute)
return 0;
- r = seccomp_init_conservative(&seccomp, SCMP_ACT_ALLOW);
- if (r < 0)
- return r;
-
- r = seccomp_rule_add(
- seccomp,
- SCMP_ACT_ERRNO(EPERM),
- SCMP_SYS(mmap),
- 1,
- SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC|PROT_WRITE, PROT_EXEC|PROT_WRITE));
- if (r < 0)
- goto finish;
-
- r = seccomp_rule_add(
- seccomp,
- SCMP_ACT_ERRNO(EPERM),
- SCMP_SYS(mprotect),
- 1,
- SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC, PROT_EXEC));
- if (r < 0)
- goto finish;
-
- r = seccomp_rule_add(
- seccomp,
- SCMP_ACT_ERRNO(EPERM),
- SCMP_SYS(shmat),
- 1,
- SCMP_A2(SCMP_CMP_MASKED_EQ, SHM_EXEC, SHM_EXEC));
- if (r < 0)
- goto finish;
-
- r = seccomp_load(seccomp);
+ if (skip_seccomp_unavailable(u, "MemoryDenyWriteExecute="))
+ return 0;
-finish:
- seccomp_release(seccomp);
- return r;
+ return seccomp_memory_deny_write_execute();
}
static int apply_restrict_realtime(const Unit* u, const ExecContext *c) {
- static const int permitted_policies[] = {
- SCHED_OTHER,
- SCHED_BATCH,
- SCHED_IDLE,
- };
-
- scmp_filter_ctx seccomp;
- unsigned i;
- int r, p, max_policy = 0;
-
+ assert(u);
assert(c);
- if (skip_seccomp_unavailable(u, "RestrictRealtime="))
+ if (!c->restrict_realtime)
return 0;
- r = seccomp_init_conservative(&seccomp, SCMP_ACT_ALLOW);
- if (r < 0)
- return r;
-
- /* Determine the highest policy constant we want to allow */
- for (i = 0; i < ELEMENTSOF(permitted_policies); i++)
- if (permitted_policies[i] > max_policy)
- max_policy = permitted_policies[i];
-
- /* Go through all policies with lower values than that, and block them -- unless they appear in the
- * whitelist. */
- for (p = 0; p < max_policy; p++) {
- bool good = false;
-
- /* Check if this is in the whitelist. */
- for (i = 0; i < ELEMENTSOF(permitted_policies); i++)
- if (permitted_policies[i] == p) {
- good = true;
- break;
- }
-
- if (good)
- continue;
-
- /* Deny this policy */
- r = seccomp_rule_add(
- seccomp,
- SCMP_ACT_ERRNO(EPERM),
- SCMP_SYS(sched_setscheduler),
- 1,
- SCMP_A1(SCMP_CMP_EQ, p));
- if (r < 0)
- goto finish;
- }
-
- /* Blacklist all other policies, i.e. the ones with higher values. Note that all comparisons are unsigned here,
- * hence no need no check for < 0 values. */
- r = seccomp_rule_add(
- seccomp,
- SCMP_ACT_ERRNO(EPERM),
- SCMP_SYS(sched_setscheduler),
- 1,
- SCMP_A1(SCMP_CMP_GT, max_policy));
- if (r < 0)
- goto finish;
-
- r = seccomp_load(seccomp);
+ if (skip_seccomp_unavailable(u, "RestrictRealtime="))
+ return 0;
-finish:
- seccomp_release(seccomp);
- return r;
+ return seccomp_restrict_realtime();
}
static int apply_protect_sysctl(const Unit *u, const ExecContext *c) {
- scmp_filter_ctx seccomp;
- int r;
-
+ assert(u);
assert(c);
/* Turn off the legacy sysctl() system call. Many distributions turn this off while building the kernel, but
* let's protect even those systems where this is left on in the kernel. */
- if (skip_seccomp_unavailable(u, "ProtectKernelTunables="))
+ if (!c->protect_kernel_tunables)
return 0;
- r = seccomp_init_conservative(&seccomp, SCMP_ACT_ALLOW);
- if (r < 0)
- return r;
-
- r = seccomp_rule_add(
- seccomp,
- SCMP_ACT_ERRNO(EPERM),
- SCMP_SYS(_sysctl),
- 0);
- if (r < 0)
- goto finish;
-
- r = seccomp_load(seccomp);
+ if (skip_seccomp_unavailable(u, "ProtectKernelTunables="))
+ return 0;
-finish:
- seccomp_release(seccomp);
- return r;
+ return seccomp_protect_sysctl();
}
static int apply_protect_kernel_modules(const Unit *u, const ExecContext *c) {
+ assert(u);
assert(c);
/* Turn off module syscalls on ProtectKernelModules=yes */
+ if (!c->protect_kernel_modules)
+ return 0;
+
if (skip_seccomp_unavailable(u, "ProtectKernelModules="))
return 0;
- return seccomp_load_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_MODULE, SCMP_ACT_ERRNO(EPERM));
+ return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_MODULE, SCMP_ACT_ERRNO(EPERM));
}
static int apply_private_devices(const Unit *u, const ExecContext *c) {
+ assert(u);
assert(c);
/* If PrivateDevices= is set, also turn off iopl and all @raw-io syscalls. */
+ if (!c->private_devices)
+ return 0;
+
if (skip_seccomp_unavailable(u, "PrivateDevices="))
return 0;
- return seccomp_load_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_RAW_IO, SCMP_ACT_ERRNO(EPERM));
+ return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_RAW_IO, SCMP_ACT_ERRNO(EPERM));
}
static int apply_restrict_namespaces(Unit *u, const ExecContext *c) {
+ assert(u);
assert(c);
if (!exec_context_restrict_namespaces_set(c))
@@ -2310,41 +2132,6 @@ static int close_remaining_fds(
return close_all_fds(dont_close, n_dont_close);
}
-static bool context_has_address_families(const ExecContext *c) {
- assert(c);
-
- return c->address_families_whitelist ||
- !set_isempty(c->address_families);
-}
-
-static bool context_has_syscall_filters(const ExecContext *c) {
- assert(c);
-
- return c->syscall_whitelist ||
- !set_isempty(c->syscall_filter) ||
- !set_isempty(c->syscall_archs);
-}
-
-static bool context_has_no_new_privileges(const ExecContext *c) {
- assert(c);
-
- if (c->no_new_privileges)
- return true;
-
- if (have_effective_cap(CAP_SYS_ADMIN)) /* if we are privileged, we don't need NNP */
- return false;
-
- /* We need NNP if we have any form of seccomp and are unprivileged */
- return context_has_address_families(c) ||
- c->memory_deny_write_execute ||
- c->restrict_realtime ||
- exec_context_restrict_namespaces_set(c) ||
- c->protect_kernel_tunables ||
- c->protect_kernel_modules ||
- c->private_devices ||
- context_has_syscall_filters(c);
-}
-
static int send_user_lookup(
Unit *unit,
int user_lookup_fd,
@@ -2904,28 +2691,22 @@ static int exec_child(
}
#ifdef HAVE_SECCOMP
- if (context_has_address_families(context)) {
- r = apply_address_families(unit, context);
- if (r < 0) {
- *exit_status = EXIT_ADDRESS_FAMILIES;
- return r;
- }
+ r = apply_address_families(unit, context);
+ if (r < 0) {
+ *exit_status = EXIT_ADDRESS_FAMILIES;
+ return r;
}
- if (context->memory_deny_write_execute) {
- r = apply_memory_deny_write_execute(unit, context);
- if (r < 0) {
- *exit_status = EXIT_SECCOMP;
- return r;
- }
+ r = apply_memory_deny_write_execute(unit, context);
+ if (r < 0) {
+ *exit_status = EXIT_SECCOMP;
+ return r;
}
- if (context->restrict_realtime) {
- r = apply_restrict_realtime(unit, context);
- if (r < 0) {
- *exit_status = EXIT_SECCOMP;
- return r;
- }
+ r = apply_restrict_realtime(unit, context);
+ if (r < 0) {
+ *exit_status = EXIT_SECCOMP;
+ return r;
}
r = apply_restrict_namespaces(unit, context);
@@ -2934,38 +2715,36 @@ static int exec_child(
return r;
}
- if (context->protect_kernel_tunables) {
- r = apply_protect_sysctl(unit, context);
- if (r < 0) {
- *exit_status = EXIT_SECCOMP;
- return r;
- }
+ r = apply_protect_sysctl(unit, context);
+ if (r < 0) {
+ *exit_status = EXIT_SECCOMP;
+ return r;
}
- if (context->protect_kernel_modules) {
- r = apply_protect_kernel_modules(unit, context);
- if (r < 0) {
- *exit_status = EXIT_SECCOMP;
- return r;
- }
+ r = apply_protect_kernel_modules(unit, context);
+ if (r < 0) {
+ *exit_status = EXIT_SECCOMP;
+ return r;
}
- if (context->private_devices) {
- r = apply_private_devices(unit, context);
- if (r < 0) {
- *exit_status = EXIT_SECCOMP;
- return r;
- }
+ r = apply_private_devices(unit, context);
+ if (r < 0) {
+ *exit_status = EXIT_SECCOMP;
+ return r;
+ }
+
+ r = apply_syscall_archs(unit, context);
+ if (r < 0) {
+ *exit_status = EXIT_SECCOMP;
+ return r;
}
/* This really should remain the last step before the execve(), to make sure our own code is unaffected
* by the filter as little as possible. */
- if (context_has_syscall_filters(context)) {
- r = apply_seccomp(unit, context);
- if (r < 0) {
- *exit_status = EXIT_SECCOMP;
- return r;
- }
+ r = apply_syscall_filter(unit, context);
+ if (r < 0) {
+ *exit_status = EXIT_SECCOMP;
+ return r;
}
#endif
}