summaryrefslogtreecommitdiff
path: root/src/shared
diff options
context:
space:
mode:
Diffstat (limited to 'src/shared')
-rw-r--r--src/shared/seccomp-util.c686
-rw-r--r--src/shared/seccomp-util.h27
2 files changed, 577 insertions, 136 deletions
diff --git a/src/shared/seccomp-util.c b/src/shared/seccomp-util.c
index 66b72b2b27..2c73cb8fa4 100644
--- a/src/shared/seccomp-util.c
+++ b/src/shared/seccomp-util.c
@@ -18,17 +18,52 @@
***/
#include <errno.h>
+#include <linux/seccomp.h>
#include <seccomp.h>
#include <stddef.h>
+#include <sys/mman.h>
#include <sys/prctl.h>
-#include <linux/seccomp.h>
+#include <sys/shm.h>
+#include "af-list.h"
#include "alloc-util.h"
#include "macro.h"
#include "nsflags.h"
#include "seccomp-util.h"
#include "string-util.h"
#include "util.h"
+#include "errno-list.h"
+
+const uint32_t seccomp_local_archs[] = {
+
+#if defined(__i386__) || defined(__x86_64__)
+ SCMP_ARCH_X86,
+ SCMP_ARCH_X86_64,
+ SCMP_ARCH_X32,
+
+#elif defined(__arm__) || defined(__aarch64__)
+ SCMP_ARCH_ARM,
+ SCMP_ARCH_AARCH64,
+
+#elif defined(__mips__) || defined(__mips64__)
+ SCMP_ARCH_MIPS,
+ SCMP_ARCH_MIPS64,
+ SCMP_ARCH_MIPS64N32,
+ SCMP_ARCH_MIPSEL,
+ SCMP_ARCH_MIPSEL64,
+ SCMP_ARCH_MIPSEL64N32,
+
+#elif defined(__powerpc__) || defined(__powerpc64__)
+ SCMP_ARCH_PPC,
+ SCMP_ARCH_PPC64,
+ SCMP_ARCH_PPC64LE,
+
+#elif defined(__s390__) || defined(__s390x__)
+ SCMP_ARCH_S390,
+ SCMP_ARCH_S390X,
+#endif
+ (uint32_t) -1
+ };
const char* seccomp_arch_to_string(uint32_t c) {
/* Maintain order used in <seccomp.h>.
@@ -122,18 +157,37 @@ int seccomp_arch_from_string(const char *n, uint32_t *ret) {
return 0;
}
-int seccomp_init_conservative(scmp_filter_ctx *ret, uint32_t default_action) {
+int seccomp_init_for_arch(scmp_filter_ctx *ret, uint32_t arch, uint32_t default_action) {
scmp_filter_ctx seccomp;
int r;
- /* Much like seccomp_init(), but tries to be a bit more conservative in its defaults: all secondary archs are
- * added by default, and NNP is turned off. */
+ /* Much like seccomp_init(), but initializes the filter for one specific architecture only, without affecting
+ * any others. Also, turns off the NNP fiddling. */
seccomp = seccomp_init(default_action);
if (!seccomp)
return -ENOMEM;
- r = seccomp_add_secondary_archs(seccomp);
+ if (arch != SCMP_ARCH_NATIVE &&
+ arch != seccomp_arch_native()) {
+
+ r = seccomp_arch_add(seccomp, arch);
+ if (r < 0)
+ goto finish;
+
+ r = seccomp_arch_remove(seccomp, seccomp_arch_native());
+ if (r < 0)
+ goto finish;
+
+ assert(seccomp_arch_exist(seccomp, arch) >= 0);
+ assert(seccomp_arch_exist(seccomp, SCMP_ARCH_NATIVE) == -EEXIST);
+ assert(seccomp_arch_exist(seccomp, seccomp_arch_native()) == -EEXIST);
+ } else {
+ assert(seccomp_arch_exist(seccomp, SCMP_ARCH_NATIVE) >= 0);
+ assert(seccomp_arch_exist(seccomp, seccomp_arch_native()) >= 0);
+ }
+
+ r = seccomp_attr_set(seccomp, SCMP_FLTATR_ACT_BADARCH, SCMP_ACT_ALLOW);
if (r < 0)
goto finish;
@@ -149,72 +203,23 @@ finish:
return r;
}
-int seccomp_add_secondary_archs(scmp_filter_ctx ctx) {
-
- /* Add in all possible secondary archs we are aware of that
- * this kernel might support. */
-
- static const int seccomp_arches[] = {
-#if defined(__i386__) || defined(__x86_64__)
- SCMP_ARCH_X86,
- SCMP_ARCH_X86_64,
- SCMP_ARCH_X32,
-
-#elif defined(__arm__) || defined(__aarch64__)
- SCMP_ARCH_ARM,
- SCMP_ARCH_AARCH64,
-
-#elif defined(__arm__) || defined(__aarch64__)
- SCMP_ARCH_ARM,
- SCMP_ARCH_AARCH64,
-
-#elif defined(__mips__) || defined(__mips64__)
- SCMP_ARCH_MIPS,
- SCMP_ARCH_MIPS64,
- SCMP_ARCH_MIPS64N32,
- SCMP_ARCH_MIPSEL,
- SCMP_ARCH_MIPSEL64,
- SCMP_ARCH_MIPSEL64N32,
-
-#elif defined(__powerpc__) || defined(__powerpc64__)
- SCMP_ARCH_PPC,
- SCMP_ARCH_PPC64,
- SCMP_ARCH_PPC64LE,
-
-#elif defined(__s390__) || defined(__s390x__)
- SCMP_ARCH_S390,
- SCMP_ARCH_S390X,
-#endif
- };
-
- unsigned i;
- int r;
-
- for (i = 0; i < ELEMENTSOF(seccomp_arches); i++) {
- r = seccomp_arch_add(ctx, seccomp_arches[i]);
- if (r < 0 && r != -EEXIST)
- return r;
- }
-
- return 0;
-}
-
static bool is_basic_seccomp_available(void) {
- int r;
- r = prctl(PR_GET_SECCOMP, 0, 0, 0, 0);
- return r >= 0;
+ return prctl(PR_GET_SECCOMP, 0, 0, 0, 0) >= 0;
}
static bool is_seccomp_filter_available(void) {
- int r;
- r = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, NULL, 0, 0);
- return r < 0 && errno == EFAULT;
+ return prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, NULL, 0, 0) < 0 &&
+ errno == EFAULT;
}
bool is_seccomp_available(void) {
static int cached_enabled = -1;
+
if (cached_enabled < 0)
- cached_enabled = is_basic_seccomp_available() && is_seccomp_filter_available();
+ cached_enabled =
+ is_basic_seccomp_available() &&
+ is_seccomp_filter_available();
+
return cached_enabled;
}
@@ -469,6 +474,7 @@ const SyscallFilterSet syscall_filter_sets[_SYSCALL_FILTER_SET_MAX] = {
.value =
"_sysctl\0"
"afs_syscall\0"
+ "bdflush\0"
"break\0"
"create_module\0"
"ftime\0"
@@ -500,7 +506,6 @@ const SyscallFilterSet syscall_filter_sets[_SYSCALL_FILTER_SET_MAX] = {
"@module\0"
"@raw-io\0"
"acct\0"
- "bdflush\0"
"bpf\0"
"capset\0"
"chown32\0"
@@ -566,9 +571,17 @@ const SyscallFilterSet syscall_filter_sets[_SYSCALL_FILTER_SET_MAX] = {
"s390_pci_mmio_write\0"
#endif
},
+ [SYSCALL_FILTER_SET_REBOOT] = {
+ .name = "@reboot",
+ .help = "Reboot and reboot preparation/kexec",
+ .value =
+ "kexec\0"
+ "kexec_file_load\0"
+ "reboot\0"
+ },
[SYSCALL_FILTER_SET_RESOURCES] = {
- /* Alter resource settings */
.name = "@resources",
+ .help = "Alter resource settings",
.value =
"sched_setparam\0"
"sched_setscheduler\0"
@@ -582,6 +595,13 @@ const SyscallFilterSet syscall_filter_sets[_SYSCALL_FILTER_SET_MAX] = {
"sched_setattr\0"
"prlimit64\0"
},
+ [SYSCALL_FILTER_SET_SWAP] = {
+ .name = "@swap",
+ .help = "Enable/disable swap devices",
+ .value =
+ "swapoff\0"
+ "swapon\0"
+ },
};
const SyscallFilterSet *syscall_filter_set_find(const char *name) {
@@ -597,7 +617,12 @@ const SyscallFilterSet *syscall_filter_set_find(const char *name) {
return NULL;
}
-int seccomp_add_syscall_filter_set(scmp_filter_ctx seccomp, const SyscallFilterSet *set, uint32_t action) {
+static int seccomp_add_syscall_filter_set(
+ scmp_filter_ctx seccomp,
+ uint32_t default_action,
+ const SyscallFilterSet *set,
+ uint32_t action) {
+
const char *sys;
int r;
@@ -614,47 +639,102 @@ int seccomp_add_syscall_filter_set(scmp_filter_ctx seccomp, const SyscallFilterS
if (!other)
return -EINVAL;
- r = seccomp_add_syscall_filter_set(seccomp, other, action);
+ r = seccomp_add_syscall_filter_set(seccomp, default_action, other, action);
+ if (r < 0)
+ return r;
} else {
id = seccomp_syscall_resolve_name(sys);
if (id == __NR_SCMP_ERROR)
- return -EINVAL;
+ return -EINVAL; /* Not known at all? Then that's a real error */
- r = seccomp_rule_add(seccomp, action, id, 0);
+ r = seccomp_rule_add_exact(seccomp, action, id, 0);
+ if (r < 0)
+ /* If the system call is not known on this architecture, then that's fine, let's ignore it */
+ log_debug_errno(r, "Failed to add rule for system call %s, ignoring: %m", sys);
}
+ }
+
+ return 0;
+}
+
+int seccomp_load_syscall_filter_set(uint32_t default_action, const SyscallFilterSet *set, uint32_t action) {
+ uint32_t arch;
+ int r;
+
+ assert(set);
+
+ /* The one-stop solution: allocate a seccomp object, add the specified filter to it, and apply it. Once for
+ * earch local arch. */
+
+ SECCOMP_FOREACH_LOCAL_ARCH(arch) {
+ _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
+
+ log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
+
+ r = seccomp_init_for_arch(&seccomp, arch, default_action);
if (r < 0)
return r;
+
+ r = seccomp_add_syscall_filter_set(seccomp, default_action, set, action);
+ if (r < 0) {
+ log_debug_errno(r, "Failed to add filter set, ignoring: %m");
+ continue;
+ }
+
+ r = seccomp_load(seccomp);
+ if (IN_SET(r, -EPERM, -EACCES))
+ return r;
+ if (r < 0)
+ log_debug_errno(r, "Failed to install filter set for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
}
return 0;
}
-int seccomp_load_filter_set(uint32_t default_action, const SyscallFilterSet *set, uint32_t action) {
- scmp_filter_ctx seccomp;
+int seccomp_load_syscall_filter_set_raw(uint32_t default_action, Set* set, uint32_t action) {
+ uint32_t arch;
int r;
- assert(set);
+ /* Similar to seccomp_load_syscall_filter_set(), but takes a raw Set* of syscalls, instead of a
+ * SyscallFilterSet* table. */
- /* The one-stop solution: allocate a seccomp object, add a filter to it, and apply it */
+ if (set_isempty(set) && default_action == SCMP_ACT_ALLOW)
+ return 0;
- r = seccomp_init_conservative(&seccomp, default_action);
- if (r < 0)
- return r;
+ SECCOMP_FOREACH_LOCAL_ARCH(arch) {
+ _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
+ Iterator i;
+ void *id;
- r = seccomp_add_syscall_filter_set(seccomp, set, action);
- if (r < 0)
- goto finish;
+ log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
- r = seccomp_load(seccomp);
+ r = seccomp_init_for_arch(&seccomp, arch, default_action);
+ if (r < 0)
+ return r;
-finish:
- seccomp_release(seccomp);
- return r;
+ SET_FOREACH(id, set, i) {
+ r = seccomp_rule_add_exact(seccomp, action, PTR_TO_INT(id) - 1, 0);
+ if (r < 0) {
+ /* If the system call is not known on this architecture, then that's fine, let's ignore it */
+ _cleanup_free_ char *n = NULL;
+
+ n = seccomp_syscall_resolve_num_arch(arch, PTR_TO_INT(id) - 1);
+ log_debug_errno(r, "Failed to add rule for system call %s, ignoring: %m", strna(n));
+ }
+ }
+
+ r = seccomp_load(seccomp);
+ if (IN_SET(r, -EPERM, -EACCES))
+ return r;
+ if (r < 0)
+ log_debug_errno(r, "Failed to install filter set for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
+ }
+
+ return 0;
}
int seccomp_restrict_namespaces(unsigned long retain) {
- scmp_filter_ctx seccomp;
- unsigned i;
+ uint32_t arch;
int r;
if (log_get_max_level() >= LOG_DEBUG) {
@@ -668,74 +748,420 @@ int seccomp_restrict_namespaces(unsigned long retain) {
if ((retain & NAMESPACE_FLAGS_ALL) == NAMESPACE_FLAGS_ALL)
return 0;
- r = seccomp_init_conservative(&seccomp, SCMP_ACT_ALLOW);
- if (r < 0)
- return r;
+ SECCOMP_FOREACH_LOCAL_ARCH(arch) {
+ _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
+ unsigned i;
+
+ log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
- if ((retain & NAMESPACE_FLAGS_ALL) == 0)
- /* If every single kind of namespace shall be prohibited, then let's block the whole setns() syscall
- * altogether. */
- r = seccomp_rule_add(
+ r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
+ if (r < 0)
+ return r;
+
+ if ((retain & NAMESPACE_FLAGS_ALL) == 0)
+ /* If every single kind of namespace shall be prohibited, then let's block the whole setns() syscall
+ * altogether. */
+ r = seccomp_rule_add_exact(
+ seccomp,
+ SCMP_ACT_ERRNO(EPERM),
+ SCMP_SYS(setns),
+ 0);
+ else
+ /* Otherwise, block only the invocations with the appropriate flags in the loop below, but also the
+ * special invocation with a zero flags argument, right here. */
+ r = seccomp_rule_add_exact(
+ seccomp,
+ SCMP_ACT_ERRNO(EPERM),
+ SCMP_SYS(setns),
+ 1,
+ SCMP_A1(SCMP_CMP_EQ, 0));
+ if (r < 0) {
+ log_debug_errno(r, "Failed to add setns() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
+ continue;
+ }
+
+ for (i = 0; namespace_flag_map[i].name; i++) {
+ unsigned long f;
+
+ f = namespace_flag_map[i].flag;
+ if ((retain & f) == f) {
+ log_debug("Permitting %s.", namespace_flag_map[i].name);
+ continue;
+ }
+
+ log_debug("Blocking %s.", namespace_flag_map[i].name);
+
+ r = seccomp_rule_add_exact(
+ seccomp,
+ SCMP_ACT_ERRNO(EPERM),
+ SCMP_SYS(unshare),
+ 1,
+ SCMP_A0(SCMP_CMP_MASKED_EQ, f, f));
+ if (r < 0) {
+ log_debug_errno(r, "Failed to add unshare() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
+ break;
+ }
+
+ r = seccomp_rule_add_exact(
+ seccomp,
+ SCMP_ACT_ERRNO(EPERM),
+ SCMP_SYS(clone),
+ 1,
+ SCMP_A0(SCMP_CMP_MASKED_EQ, f, f));
+ if (r < 0) {
+ log_debug_errno(r, "Failed to add clone() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
+ break;
+ }
+
+ if ((retain & NAMESPACE_FLAGS_ALL) != 0) {
+ r = seccomp_rule_add_exact(
+ seccomp,
+ SCMP_ACT_ERRNO(EPERM),
+ SCMP_SYS(setns),
+ 1,
+ SCMP_A1(SCMP_CMP_MASKED_EQ, f, f));
+ if (r < 0) {
+ log_debug_errno(r, "Failed to add setns() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
+ break;
+ }
+ }
+ }
+ if (r < 0)
+ continue;
+
+ r = seccomp_load(seccomp);
+ if (IN_SET(r, -EPERM, -EACCES))
+ return r;
+ if (r < 0)
+ log_debug_errno(r, "Failed to install namespace restriction rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
+ }
+
+ return 0;
+}
+
+int seccomp_protect_sysctl(void) {
+ uint32_t arch;
+ int r;
+
+ SECCOMP_FOREACH_LOCAL_ARCH(arch) {
+ _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
+
+ log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
+
+ r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
+ if (r < 0)
+ return r;
+
+ r = seccomp_rule_add_exact(
seccomp,
SCMP_ACT_ERRNO(EPERM),
- SCMP_SYS(setns),
+ SCMP_SYS(_sysctl),
0);
- else
- /* Otherwise, block only the invocations with the appropriate flags in the loop below, but also the
- * special invocation with a zero flags argument, right here. */
- r = seccomp_rule_add(
+ if (r < 0) {
+ log_debug_errno(r, "Failed to add _sysctl() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
+ continue;
+ }
+
+ r = seccomp_load(seccomp);
+ if (IN_SET(r, -EPERM, -EACCES))
+ return r;
+ if (r < 0)
+ log_debug_errno(r, "Failed to install sysctl protection rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
+ }
+
+ return 0;
+}
+
+int seccomp_restrict_address_families(Set *address_families, bool whitelist) {
+ uint32_t arch;
+ int r;
+
+ SECCOMP_FOREACH_LOCAL_ARCH(arch) {
+ _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
+ Iterator i;
+
+ log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
+
+ r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
+ if (r < 0)
+ return r;
+
+ if (whitelist) {
+ int af, first = 0, last = 0;
+ void *afp;
+
+ /* If this is a whitelist, we first block the address families that are out of range and then
+ * everything that is not in the set. First, we find the lowest and highest address family in
+ * the set. */
+
+ SET_FOREACH(afp, address_families, i) {
+ af = PTR_TO_INT(afp);
+
+ if (af <= 0 || af >= af_max())
+ continue;
+
+ if (first == 0 || af < first)
+ first = af;
+
+ if (last == 0 || af > last)
+ last = af;
+ }
+
+ assert((first == 0) == (last == 0));
+
+ if (first == 0) {
+
+ /* No entries in the valid range, block everything */
+ r = seccomp_rule_add_exact(
+ seccomp,
+ SCMP_ACT_ERRNO(EAFNOSUPPORT),
+ SCMP_SYS(socket),
+ 0);
+ if (r < 0) {
+ log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
+ continue;
+ }
+
+ } else {
+
+ /* Block everything below the first entry */
+ r = seccomp_rule_add_exact(
+ seccomp,
+ SCMP_ACT_ERRNO(EAFNOSUPPORT),
+ SCMP_SYS(socket),
+ 1,
+ SCMP_A0(SCMP_CMP_LT, first));
+ if (r < 0) {
+ log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
+ continue;
+ }
+
+ /* Block everything above the last entry */
+ r = seccomp_rule_add_exact(
+ seccomp,
+ SCMP_ACT_ERRNO(EAFNOSUPPORT),
+ SCMP_SYS(socket),
+ 1,
+ SCMP_A0(SCMP_CMP_GT, last));
+ if (r < 0) {
+ log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
+ continue;
+ }
+
+ /* Block everything between the first and last entry */
+ for (af = 1; af < af_max(); af++) {
+
+ if (set_contains(address_families, INT_TO_PTR(af)))
+ continue;
+
+ r = seccomp_rule_add_exact(
+ seccomp,
+ SCMP_ACT_ERRNO(EAFNOSUPPORT),
+ SCMP_SYS(socket),
+ 1,
+ SCMP_A0(SCMP_CMP_EQ, af));
+ if (r < 0)
+ break;
+ }
+
+ if (r < 0) {
+ log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
+ continue;
+ }
+ }
+
+ } else {
+ void *af;
+
+ /* If this is a blacklist, then generate one rule for
+ * each address family that are then combined in OR
+ * checks. */
+
+ SET_FOREACH(af, address_families, i) {
+
+ r = seccomp_rule_add_exact(
+ seccomp,
+ SCMP_ACT_ERRNO(EAFNOSUPPORT),
+ SCMP_SYS(socket),
+ 1,
+ SCMP_A0(SCMP_CMP_EQ, PTR_TO_INT(af)));
+ if (r < 0)
+ break;
+ }
+
+ if (r < 0) {
+ log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
+ continue;
+ }
+ }
+
+ r = seccomp_load(seccomp);
+ if (IN_SET(r, -EPERM, -EACCES))
+ return r;
+ if (r < 0)
+ log_debug_errno(r, "Failed to install socket family rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
+ }
+
+ return 0;
+}
+
+int seccomp_restrict_realtime(void) {
+ static const int permitted_policies[] = {
+ SCHED_OTHER,
+ SCHED_BATCH,
+ SCHED_IDLE,
+ };
+
+ int r, max_policy = 0;
+ uint32_t arch;
+ unsigned i;
+
+ /* Determine the highest policy constant we want to allow */
+ for (i = 0; i < ELEMENTSOF(permitted_policies); i++)
+ if (permitted_policies[i] > max_policy)
+ max_policy = permitted_policies[i];
+
+ SECCOMP_FOREACH_LOCAL_ARCH(arch) {
+ _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
+ int p;
+
+ log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
+
+ r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
+ if (r < 0)
+ return r;
+
+ /* Go through all policies with lower values than that, and block them -- unless they appear in the
+ * whitelist. */
+ for (p = 0; p < max_policy; p++) {
+ bool good = false;
+
+ /* Check if this is in the whitelist. */
+ for (i = 0; i < ELEMENTSOF(permitted_policies); i++)
+ if (permitted_policies[i] == p) {
+ good = true;
+ break;
+ }
+
+ if (good)
+ continue;
+
+ /* Deny this policy */
+ r = seccomp_rule_add_exact(
+ seccomp,
+ SCMP_ACT_ERRNO(EPERM),
+ SCMP_SYS(sched_setscheduler),
+ 1,
+ SCMP_A1(SCMP_CMP_EQ, p));
+ if (r < 0) {
+ log_debug_errno(r, "Failed to add scheduler rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
+ continue;
+ }
+ }
+
+ /* Blacklist all other policies, i.e. the ones with higher values. Note that all comparisons are
+ * unsigned here, hence no need no check for < 0 values. */
+ r = seccomp_rule_add_exact(
seccomp,
SCMP_ACT_ERRNO(EPERM),
- SCMP_SYS(setns),
+ SCMP_SYS(sched_setscheduler),
1,
- SCMP_A1(SCMP_CMP_EQ, 0));
- if (r < 0)
- goto finish;
+ SCMP_A1(SCMP_CMP_GT, max_policy));
+ if (r < 0) {
+ log_debug_errno(r, "Failed to add scheduler rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
+ continue;
+ }
+
+ r = seccomp_load(seccomp);
+ if (IN_SET(r, -EPERM, -EACCES))
+ return r;
+ if (r < 0)
+ log_debug_errno(r, "Failed to install realtime protection rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
+ }
+
+ return 0;
+}
- for (i = 0; namespace_flag_map[i].name; i++) {
- unsigned long f;
+int seccomp_memory_deny_write_execute(void) {
+ uint32_t arch;
+ int r;
+
+ SECCOMP_FOREACH_LOCAL_ARCH(arch) {
+ _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
+
+ log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
- f = namespace_flag_map[i].flag;
- if ((retain & f) == f) {
- log_debug("Permitting %s.", namespace_flag_map[i].name);
+ r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
+ if (r < 0)
+ return r;
+
+ r = seccomp_rule_add_exact(
+ seccomp,
+ SCMP_ACT_ERRNO(EPERM),
+ SCMP_SYS(mmap),
+ 1,
+ SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC|PROT_WRITE, PROT_EXEC|PROT_WRITE));
+ if (r < 0) {
+ log_debug_errno(r, "Failed to add mmap() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
continue;
}
- log_debug("Blocking %s.", namespace_flag_map[i].name);
-
- r = seccomp_rule_add(
+ r = seccomp_rule_add_exact(
seccomp,
SCMP_ACT_ERRNO(EPERM),
- SCMP_SYS(unshare),
+ SCMP_SYS(mprotect),
1,
- SCMP_A0(SCMP_CMP_MASKED_EQ, f, f));
- if (r < 0)
- goto finish;
+ SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC, PROT_EXEC));
+ if (r < 0) {
+ log_debug_errno(r, "Failed to add mprotect() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
+ continue;
+ }
- r = seccomp_rule_add(
+ r = seccomp_rule_add_exact(
seccomp,
SCMP_ACT_ERRNO(EPERM),
- SCMP_SYS(clone),
+ SCMP_SYS(shmat),
1,
- SCMP_A0(SCMP_CMP_MASKED_EQ, f, f));
+ SCMP_A2(SCMP_CMP_MASKED_EQ, SHM_EXEC, SHM_EXEC));
+ if (r < 0) {
+ log_debug_errno(r, "Failed to add shmat() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
+ continue;
+ }
+
+ r = seccomp_load(seccomp);
+ if (IN_SET(r, -EPERM, -EACCES))
+ return r;
if (r < 0)
- goto finish;
+ log_debug_errno(r, "Failed to install MemoryDenyWriteExecute= rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
+ }
- if ((retain & NAMESPACE_FLAGS_ALL) != 0) {
- r = seccomp_rule_add(
- seccomp,
- SCMP_ACT_ERRNO(EPERM),
- SCMP_SYS(setns),
- 1,
- SCMP_A1(SCMP_CMP_MASKED_EQ, f, f));
- if (r < 0)
- goto finish;
- }
+ return 0;
+}
+
+int seccomp_restrict_archs(Set *archs) {
+ _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
+ Iterator i;
+ void *id;
+ int r;
+
+ /* This installs a filter with no rules, but that restricts the system call architectures to the specified
+ * list. */
+
+ seccomp = seccomp_init(SCMP_ACT_ALLOW);
+ if (!seccomp)
+ return -ENOMEM;
+
+ SET_FOREACH(id, archs, i) {
+ r = seccomp_arch_add(seccomp, PTR_TO_UINT32(id) - 1);
+ if (r == -EEXIST)
+ continue;
+ if (r < 0)
+ return r;
}
- r = seccomp_load(seccomp);
+ r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
+ if (r < 0)
+ return r;
-finish:
- seccomp_release(seccomp);
- return r;
+ return seccomp_load(seccomp);
}
diff --git a/src/shared/seccomp-util.h b/src/shared/seccomp-util.h
index 01cf331b29..4438e87fa6 100644
--- a/src/shared/seccomp-util.h
+++ b/src/shared/seccomp-util.h
@@ -23,12 +23,12 @@
#include <stdbool.h>
#include <stdint.h>
+#include "set.h"
+
const char* seccomp_arch_to_string(uint32_t c);
int seccomp_arch_from_string(const char *n, uint32_t *ret);
-int seccomp_init_conservative(scmp_filter_ctx *ret, uint32_t default_action);
-
-int seccomp_add_secondary_archs(scmp_filter_ctx c);
+int seccomp_init_for_arch(scmp_filter_ctx *ret, uint32_t arch, uint32_t default_action);
bool is_seccomp_available(void);
@@ -56,7 +56,9 @@ enum {
SYSCALL_FILTER_SET_PRIVILEGED,
SYSCALL_FILTER_SET_PROCESS,
SYSCALL_FILTER_SET_RAW_IO,
+ SYSCALL_FILTER_SET_REBOOT,
SYSCALL_FILTER_SET_RESOURCES,
+ SYSCALL_FILTER_SET_SWAP,
_SYSCALL_FILTER_SET_MAX
};
@@ -64,8 +66,21 @@ extern const SyscallFilterSet syscall_filter_sets[];
const SyscallFilterSet *syscall_filter_set_find(const char *name);
-int seccomp_add_syscall_filter_set(scmp_filter_ctx seccomp, const SyscallFilterSet *set, uint32_t action);
-
-int seccomp_load_filter_set(uint32_t default_action, const SyscallFilterSet *set, uint32_t action);
+int seccomp_load_syscall_filter_set(uint32_t default_action, const SyscallFilterSet *set, uint32_t action);
+int seccomp_load_syscall_filter_set_raw(uint32_t default_action, Set* set, uint32_t action);
+int seccomp_restrict_archs(Set *archs);
int seccomp_restrict_namespaces(unsigned long retain);
+int seccomp_protect_sysctl(void);
+int seccomp_restrict_address_families(Set *address_families, bool whitelist);
+int seccomp_restrict_realtime(void);
+int seccomp_memory_deny_write_execute(void);
+
+extern const uint32_t seccomp_local_archs[];
+
+#define SECCOMP_FOREACH_LOCAL_ARCH(arch) \
+ for (unsigned _i = ({ (arch) = seccomp_local_archs[0]; 0; }); \
+ seccomp_local_archs[_i] != (uint32_t) -1; \
+ (arch) = seccomp_local_archs[++_i])
+
+DEFINE_TRIVIAL_CLEANUP_FUNC(scmp_filter_ctx, seccomp_release);