From c79aff9a82abf361aea47b5c745ed9729c5f0212 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Tue, 25 Oct 2016 15:38:36 +0200 Subject: seccomp: add clock query and sleeping syscalls to "@default" group Timing and sleep are so basic operations, it makes very little sense to ever block them, hence don't. --- src/shared/seccomp-util.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'src') diff --git a/src/shared/seccomp-util.c b/src/shared/seccomp-util.c index 1cbbb9d757..ad5782fb29 100644 --- a/src/shared/seccomp-util.c +++ b/src/shared/seccomp-util.c @@ -253,15 +253,22 @@ const SyscallFilterSet syscall_filter_sets[_SYSCALL_FILTER_SET_MAX] = { "sys_debug_setcontext\0" }, [SYSCALL_FILTER_SET_DEFAULT] = { - /* Default list */ + /* Default list: the most basic of operations */ .name = "@default", .value = + "clock_getres\0" + "clock_gettime\0" + "clock_nanosleep\0" "execve\0" "exit\0" "exit_group\0" "getrlimit\0" /* make sure processes can query stack size and such */ + "gettimeofday\0" + "nanosleep\0" + "pause\0" "rt_sigreturn\0" "sigreturn\0" + "time\0" }, [SYSCALL_FILTER_SET_IO_EVENT] = { /* Event loop use */ -- cgit v1.2.3-54-g00ecf From a8c157ff3081ee963adb0d046015abf9a271fa67 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Tue, 25 Oct 2016 15:42:10 +0200 Subject: seccomp: drop execve() from @process list The system call is already part in @default hence implicitly allowed anyway. Also, if it is actually blocked then systemd couldn't execute the service in question anymore, since the application of seccomp is immediately followed by it. --- man/systemd.exec.xml | 2 +- src/shared/seccomp-util.c | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) (limited to 'src') diff --git a/man/systemd.exec.xml b/man/systemd.exec.xml index e7d8bb23a4..d45e5362dc 100644 --- a/man/systemd.exec.xml +++ b/man/systemd.exec.xml @@ -1347,7 +1347,7 @@ @process - Process control, execution, namespaces (execve2, kill2, namespaces7, … + Process control, execution, namespaces (clone2, kill2, namespaces7, … @raw-io diff --git a/src/shared/seccomp-util.c b/src/shared/seccomp-util.c index ad5782fb29..70723e9e4e 100644 --- a/src/shared/seccomp-util.c +++ b/src/shared/seccomp-util.c @@ -443,7 +443,6 @@ const SyscallFilterSet syscall_filter_sets[_SYSCALL_FILTER_SET_MAX] = { .value = "arch_prctl\0" "clone\0" - "execve\0" "execveat\0" "fork\0" "kill\0" -- cgit v1.2.3-54-g00ecf From cd5bfd7e60c08cfad41bcf881f550c424b2f3e44 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Tue, 25 Oct 2016 15:43:31 +0200 Subject: seccomp: include pipes and memfd in @ipc These system calls clearly fall in the @ipc category, hence should be listed there, simply to avoid confusion and surprise by the user. --- man/systemd.exec.xml | 2 +- src/shared/seccomp-util.c | 5 ++++- 2 files changed, 5 insertions(+), 2 deletions(-) (limited to 'src') diff --git a/man/systemd.exec.xml b/man/systemd.exec.xml index d45e5362dc..466511aaf3 100644 --- a/man/systemd.exec.xml +++ b/man/systemd.exec.xml @@ -1319,7 +1319,7 @@ @ipc - SysV IPC, POSIX Message Queues or other IPC (mq_overview7, svipc7) + Pipes, SysV IPC, POSIX Message Queues and other IPC (mq_overview7, svipc7) @keyring diff --git a/src/shared/seccomp-util.c b/src/shared/seccomp-util.c index 70723e9e4e..e0a61aa358 100644 --- a/src/shared/seccomp-util.c +++ b/src/shared/seccomp-util.c @@ -290,9 +290,10 @@ const SyscallFilterSet syscall_filter_sets[_SYSCALL_FILTER_SET_MAX] = { "select\0" }, [SYSCALL_FILTER_SET_IPC] = { - /* Message queues, SYSV IPC or other IPC: unusual */ + /* Message queues, SYSV IPC or other IPC */ .name = "@ipc", .value = "ipc\0" + "memfd_create\0" "mq_getsetattr\0" "mq_notify\0" "mq_open\0" @@ -303,6 +304,8 @@ const SyscallFilterSet syscall_filter_sets[_SYSCALL_FILTER_SET_MAX] = { "msgget\0" "msgrcv\0" "msgsnd\0" + "pipe2\0" + "pipe\0" "process_vm_readv\0" "process_vm_writev\0" "semctl\0" -- cgit v1.2.3-54-g00ecf From 133ddbbeae74fc06173633605b3e612e934bc2dd Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Wed, 2 Nov 2016 08:46:18 -0600 Subject: seccomp: add two new syscall groups @resources contains various syscalls that alter resource limits and memory and scheduling parameters of processes. As such they are good candidates to block for most services. @basic-io contains a number of basic syscalls for I/O, similar to the list seccomp v1 permitted but slightly more complete. It should be useful for building basic whitelisting for minimal sandboxes --- man/systemd.exec.xml | 8 ++++++++ src/shared/seccomp-util.c | 34 ++++++++++++++++++++++++++++++++++ src/shared/seccomp-util.h | 2 ++ 3 files changed, 44 insertions(+) (limited to 'src') diff --git a/man/systemd.exec.xml b/man/systemd.exec.xml index 3b80bcccd0..7daa3ae78e 100644 --- a/man/systemd.exec.xml +++ b/man/systemd.exec.xml @@ -1301,6 +1301,10 @@ + + @basic-io + System calls for basic I/O: reading, writing, seeking, file descriptor duplication and closing (read2, write2, and related calls) + @clock System calls for changing the system clock (adjtimex2, settimeofday2, and related calls) @@ -1353,6 +1357,10 @@ @raw-io Raw I/O port access (ioperm2, iopl2, pciconfig_read(), …) + + @resources + System calls for changing resource limits, memory and scheduling parameters (setrlimit2, setpriority2, …) + diff --git a/src/shared/seccomp-util.c b/src/shared/seccomp-util.c index e0a61aa358..c9b24f1065 100644 --- a/src/shared/seccomp-util.c +++ b/src/shared/seccomp-util.c @@ -217,6 +217,24 @@ bool is_seccomp_available(void) { } const SyscallFilterSet syscall_filter_sets[_SYSCALL_FILTER_SET_MAX] = { + [SYSCALL_FILTER_SET_BASIC_IO] = { + /* Basic IO */ + .name = "@basic-io", + .value = + "close\0" + "dup2\0" + "dup3\0" + "dup\0" + "lseek\0" + "pread64\0" + "preadv\0" + "pwrite64\0" + "pwritev\0" + "read\0" + "readv\0" + "write\0" + "writev\0" + }, [SYSCALL_FILTER_SET_CLOCK] = { /* Clock */ .name = "@clock", @@ -472,6 +490,22 @@ const SyscallFilterSet syscall_filter_sets[_SYSCALL_FILTER_SET_MAX] = { "s390_pci_mmio_write\0" #endif }, + [SYSCALL_FILTER_SET_RESOURCES] = { + /* Alter resource settings */ + .name = "@resources", + .value = + "sched_setparam\0" + "sched_setscheduler\0" + "sched_setaffinity\0" + "setpriority\0" + "setrlimit\0" + "set_mempolicy\0" + "migrate_pages\0" + "move_pages\0" + "mbind\0" + "sched_setattr\0" + "prlimit64\0" + }, }; const SyscallFilterSet *syscall_filter_set_find(const char *name) { diff --git a/src/shared/seccomp-util.h b/src/shared/seccomp-util.h index 8050fc6fbf..8e209efef2 100644 --- a/src/shared/seccomp-util.h +++ b/src/shared/seccomp-util.h @@ -38,6 +38,7 @@ typedef struct SyscallFilterSet { } SyscallFilterSet; enum { + SYSCALL_FILTER_SET_BASIC_IO, SYSCALL_FILTER_SET_CLOCK, SYSCALL_FILTER_SET_CPU_EMULATION, SYSCALL_FILTER_SET_DEBUG, @@ -52,6 +53,7 @@ enum { SYSCALL_FILTER_SET_PRIVILEGED, SYSCALL_FILTER_SET_PROCESS, SYSCALL_FILTER_SET_RAW_IO, + SYSCALL_FILTER_SET_RESOURCES, _SYSCALL_FILTER_SET_MAX }; -- cgit v1.2.3-54-g00ecf From 5cd9cd3537d1afca85877103615e61e6c03e7079 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Tue, 25 Oct 2016 15:52:54 +0200 Subject: execute: apply seccomp filters after changing selinux/aa/smack contexts Seccomp is generally an unprivileged operation, changing security contexts is most likely associated with some form of policy. Moreover, while seccomp may influence our own flow of code quite a bit (much more than the security context change) make sure to apply the seccomp filters immediately before executing the binary to invoke. This also moves enforcement of NNP after the security context change, so that NNP cannot affect it anymore. (However, the security policy now has to permit the NNP change). This change has a good chance of breaking current SELinux/AA/SMACK setups, because the policy might not expect this change of behaviour. However, it's technically the better choice I think and should hence be applied. Fixes: #3993 --- src/core/execute.c | 70 ++++++++++++++++++++++++++++++------------------------ 1 file changed, 39 insertions(+), 31 deletions(-) (limited to 'src') diff --git a/src/core/execute.c b/src/core/execute.c index ae9df41b99..c73e2e7220 100644 --- a/src/core/execute.c +++ b/src/core/execute.c @@ -2538,12 +2538,6 @@ static int exec_child( (void) umask(context->umask); if ((params->flags & EXEC_APPLY_PERMISSIONS) && !command->privileged) { - r = setup_smack(context, command); - if (r < 0) { - *exit_status = EXIT_SMACK_PROCESS_LABEL; - return r; - } - if (context->pam_name && username) { r = setup_pam(context->pam_name, username, uid, gid, context->tty_path, &accum_env, fds, n_fds); if (r < 0) { @@ -2693,6 +2687,41 @@ static int exec_child( } } + /* Apply the MAC contexts late, but before seccomp syscall filtering, as those should really be last to + * influence our own codepaths as little as possible. Moreover, applying MAC contexts usually requires + * syscalls that are subject to seccomp filtering, hence should probably be applied before the syscalls + * are restricted. */ + +#ifdef HAVE_SELINUX + if (mac_selinux_use()) { + char *exec_context = mac_selinux_context_net ?: context->selinux_context; + + if (exec_context) { + r = setexeccon(exec_context); + if (r < 0) { + *exit_status = EXIT_SELINUX_CONTEXT; + return r; + } + } + } +#endif + + r = setup_smack(context, command); + if (r < 0) { + *exit_status = EXIT_SMACK_PROCESS_LABEL; + return r; + } + +#ifdef HAVE_APPARMOR + if (context->apparmor_profile && mac_apparmor_use()) { + r = aa_change_onexec(context->apparmor_profile); + if (r < 0 && !context->apparmor_profile_ignore) { + *exit_status = EXIT_APPARMOR_PROFILE; + return -errno; + } + } +#endif + /* PR_GET_SECUREBITS is not privileged, while * PR_SET_SECUREBITS is. So to suppress * potential EPERMs we'll try not to call @@ -2758,6 +2787,8 @@ static int exec_child( } } + /* This really should remain the last step before the execve(), to make sure our own code is unaffected + * by the filter as little as possible. */ if (context_has_syscall_filters(context)) { r = apply_seccomp(unit, context); if (r < 0) { @@ -2766,30 +2797,6 @@ static int exec_child( } } #endif - -#ifdef HAVE_SELINUX - if (mac_selinux_use()) { - char *exec_context = mac_selinux_context_net ?: context->selinux_context; - - if (exec_context) { - r = setexeccon(exec_context); - if (r < 0) { - *exit_status = EXIT_SELINUX_CONTEXT; - return r; - } - } - } -#endif - -#ifdef HAVE_APPARMOR - if (context->apparmor_profile && mac_apparmor_use()) { - r = aa_change_onexec(context->apparmor_profile); - if (r < 0 && !context->apparmor_profile_ignore) { - *exit_status = EXIT_APPARMOR_PROFILE; - return -errno; - } - } -#endif } final_argv = replace_env_argv(argv, accum_env); @@ -3611,7 +3618,8 @@ char *exec_command_line(char **argv) { STRV_FOREACH(a, argv) k += strlen(*a)+3; - if (!(n = new(char, k))) + n = new(char, k); + if (!n) return NULL; p = n; -- cgit v1.2.3-54-g00ecf From 999a6c5d9c8bc9536f98ce26dde1de5adaabe29b Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Tue, 1 Nov 2016 20:30:11 -0600 Subject: tests: make sure tests pass when invoked in "sudo" This is a follow-up for 6309e51ea32d64524431ee65c49eecd44390da8f and makes sure we compare test results with the right user identifier. --- src/test/test-unit-file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'src') diff --git a/src/test/test-unit-file.c b/src/test/test-unit-file.c index 7ef087a2e3..12f48bf435 100644 --- a/src/test/test-unit-file.c +++ b/src/test/test-unit-file.c @@ -589,7 +589,7 @@ static void test_install_printf(void) { assert_se(specifier_machine_id('m', NULL, NULL, &mid) >= 0 && mid); assert_se(specifier_boot_id('b', NULL, NULL, &bid) >= 0 && bid); assert_se((host = gethostname_malloc())); - assert_se((user = getusername_malloc())); + assert_se((user = uid_to_name(getuid()))); assert_se(asprintf(&uid, UID_FMT, getuid()) >= 0); #define expect(src, pattern, result) \ -- cgit v1.2.3-54-g00ecf