From 0e3f29f03ffa9f65aa314fc7301f9ce2e9ee7203 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Fri, 10 Jun 2016 17:30:35 +0200 Subject: update TODO --- TODO | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/TODO b/TODO index aeed0c84d2..929fb96491 100644 --- a/TODO +++ b/TODO @@ -47,6 +47,10 @@ Features: * RestrictNamespaces= or so in services (taking away the ability to create namespaces, with setns, unshare, clone) +* RestrictRealtime= which takes aware ability to create realtime processes + +* nspawn: make /proc/sys/net writable? + * make sure the ratelimit object can deal with USEC_INFINITY as way to turn off things * journalctl: make sure -f ends when the container indicated by -M terminates -- cgit v1.2.3-54-g00ecf From 50b52222f2d54a3c4d81e0e5987a0400cbcefb53 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Fri, 10 Jun 2016 17:31:06 +0200 Subject: nspawn: order caps to retain alphabetically --- src/nspawn/nspawn.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/nspawn/nspawn.c b/src/nspawn/nspawn.c index ea24de7608..73c56d7310 100644 --- a/src/nspawn/nspawn.c +++ b/src/nspawn/nspawn.c @@ -137,6 +137,8 @@ static bool arg_ephemeral = false; static LinkJournal arg_link_journal = LINK_AUTO; static bool arg_link_journal_try = false; static uint64_t arg_caps_retain = + (1ULL << CAP_AUDIT_CONTROL) | + (1ULL << CAP_AUDIT_WRITE) | (1ULL << CAP_CHOWN) | (1ULL << CAP_DAC_OVERRIDE) | (1ULL << CAP_DAC_READ_SEARCH) | @@ -146,23 +148,21 @@ static uint64_t arg_caps_retain = (1ULL << CAP_KILL) | (1ULL << CAP_LEASE) | (1ULL << CAP_LINUX_IMMUTABLE) | + (1ULL << CAP_MKNOD) | (1ULL << CAP_NET_BIND_SERVICE) | (1ULL << CAP_NET_BROADCAST) | (1ULL << CAP_NET_RAW) | - (1ULL << CAP_SETGID) | (1ULL << CAP_SETFCAP) | + (1ULL << CAP_SETGID) | (1ULL << CAP_SETPCAP) | (1ULL << CAP_SETUID) | (1ULL << CAP_SYS_ADMIN) | + (1ULL << CAP_SYS_BOOT) | (1ULL << CAP_SYS_CHROOT) | (1ULL << CAP_SYS_NICE) | (1ULL << CAP_SYS_PTRACE) | - (1ULL << CAP_SYS_TTY_CONFIG) | (1ULL << CAP_SYS_RESOURCE) | - (1ULL << CAP_SYS_BOOT) | - (1ULL << CAP_AUDIT_WRITE) | - (1ULL << CAP_AUDIT_CONTROL) | - (1ULL << CAP_MKNOD); + (1ULL << CAP_SYS_TTY_CONFIG); static CustomMount *arg_custom_mounts = NULL; static unsigned arg_n_custom_mounts = 0; static char **arg_setenv = NULL; -- cgit v1.2.3-54-g00ecf From 1f9ac68b5bc671f1f8b0a32084810d39394208a6 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Fri, 10 Jun 2016 17:43:38 +0200 Subject: core: improve seccomp syscall grouping a bit This adds three new seccomp syscall groups: @keyring for kernel keyring access, @cpu-emulation for CPU emulation features, for exampe vm86() for dosemu and suchlike, and @debug for ptrace() and related calls. Also, the @clock group is updated with more syscalls that alter the system clock. capset() is added to @privileged, and pciconfig_iobase() is added to @raw-io. Finally, @obsolete is a cleaned up. A number of syscalls that never existed on Linux and have no number assigned on any architecture are removed, as they only exist in the man pages and other operating sytems, but not in code at all. create_module() is moved from @module to @obsolete, as it is an obsolete system call. mem_getpolicy() is removed from the @obsolete list, as it is not obsolete, but simply a NUMA API. --- man/systemd.exec.xml | 38 ++++++++++++++++++-------------- src/shared/seccomp-util.c | 55 +++++++++++++++++++++++++++++++---------------- 2 files changed, 58 insertions(+), 35 deletions(-) diff --git a/man/systemd.exec.xml b/man/systemd.exec.xml index 1c3256a662..a39e800854 100644 --- a/man/systemd.exec.xml +++ b/man/systemd.exec.xml @@ -1218,49 +1218,55 @@ @clock - System calls for changing the system clock (adjtimex(), - settimeofday()) + System calls for changing the system clock (adjtimex2, settimeofday2, and related calls) + + + @cpu-emulation + System calls for CPU emulation functionality (vm862 and related calls) + + + @debug + Debugging, performance monitoring and tracing functionality (ptrace2, perf_event_open2 and related calls) @io-event - Event loop use (poll(), select(), - epoll7, - eventfd()...) + Event loop system calls (poll2, select2, epoll7, eventfd2 and related calls) @ipc - SysV IPC, POSIX Message Queues or other IPC (mq_overview7, - svipc7) + SysV IPC, POSIX Message Queues or other IPC (mq_overview7, svipc7) + + + @keyring + Kernel keyring access (keyctl2 and related calls) @module - Kernel module control (create_module(), init_module()...) + Kernel module control (init_module2, delete_module2 and related calls) @mount - File system mounting and unmounting (chroot(), mount()...) + File system mounting and unmounting (mount2, chroot2, and related calls) @network-io - Socket I/O (including local AF_UNIX): - socket7, - unix7 + Socket I/O (including local AF_UNIX): socket7, unix7 @obsolete - Unusual, obsolete or unimplemented (fattach(), gtty(), vm86()...) + Unusual, obsolete or unimplemented (create_module2, gtty2, …) @privileged - All system calls which need superuser capabilities (capabilities7) + All system calls which need super-user capabilities (capabilities7) @process - Process control, execution, namespaces (execve(), kill(), namespaces7...) + Process control, execution, namespaces (execve2, kill2, namespaces7, … @raw-io - Raw I/O ports (ioperm(), iopl(), pciconfig_read()...) + Raw I/O port access (ioperm2, iopl2, pciconfig_read(), … diff --git a/src/shared/seccomp-util.c b/src/shared/seccomp-util.c index 30d22d2242..8656d112b8 100644 --- a/src/shared/seccomp-util.c +++ b/src/shared/seccomp-util.c @@ -95,7 +95,31 @@ const SystemCallFilterSet syscall_filter_sets[] = { .set_name = "@clock", .value = "adjtimex\0" + "clock_adjtime\0" + "clock_settime\0" "settimeofday\0" + "stime\0" + }, { + /* CPU emulation calls */ + .set_name = "@cpu-emulation", + .value = + "modify_ldt\0" + "subpage_prot\0" + "switch_endian\0" + "vm86\0" + "vm86old\0" + }, { + /* Debugging/Performance Monitoring/Tracing */ + .set_name = "@debug", + .value = + "lookup_dcookie\0" + "perf_event_open\0" + "process_vm_readv\0" + "process_vm_writev\0" + "ptrace\0" + "rtas\0" + "s390_runtime_instr\0" + "sys_debug_setcontext\0" }, { /* Default list */ .set_name = "@default", @@ -147,11 +171,17 @@ const SystemCallFilterSet syscall_filter_sets[] = { "shmctl\0" "shmdt\0" "shmget\0" + }, { + /* Keyring */ + .set_name = "@keyring", + .value = + "add_key\0" + "keyctl\0" + "request_key\0" }, { /* Kernel module control */ .set_name = "@module", .value = - "create_module\0" "delete_module\0" "finit_module\0" "init_module\0" @@ -197,40 +227,26 @@ const SystemCallFilterSet syscall_filter_sets[] = { "_sysctl\0" "afs_syscall\0" "break\0" - "fattach\0" - "fdetach\0" + "create_module\0" "ftime\0" "get_kernel_syms\0" - "get_mempolicy\0" - "getmsg\0" "getpmsg\0" "gtty\0" - "isastream\0" "lock\0" - "madvise1\0" - "modify_ldt\0" "mpx\0" - "pciconfig_iobase\0" - "perf_event_open\0" "prof\0" "profil\0" - "putmsg\0" "putpmsg\0" "query_module\0" - "rtas\0" - "s390_runtime_instr\0" "security\0" "sgetmask\0" "ssetmask\0" "stty\0" - "subpage_prot\0" - "switch_endian\0" - "sys_debug_setcontext\0" + "sysfs\0" "tuxcall\0" "ulimit\0" "uselib\0" - "vm86\0" - "vm86old\0" + "ustat\0" "vserver\0" }, { /* Nice grab-bag of all system calls which need superuser capabilities */ @@ -242,6 +258,7 @@ const SystemCallFilterSet syscall_filter_sets[] = { "acct\0" "bdflush\0" "bpf\0" + "capset\0" "chown32\0" "chown\0" "chroot\0" @@ -268,7 +285,6 @@ const SystemCallFilterSet syscall_filter_sets[] = { "setreuid\0" "setuid32\0" "setuid\0" - "stime\0" "swapoff\0" "swapon\0" "sysctl\0" @@ -295,6 +311,7 @@ const SystemCallFilterSet syscall_filter_sets[] = { .value = "ioperm\0" "iopl\0" + "pciconfig_iobase\0" "pciconfig_read\0" "pciconfig_write\0" "s390_pci_mmio_read\0" -- cgit v1.2.3-54-g00ecf From 4e069746fe0de1f60bd1b75c113b0f40ffe86736 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Fri, 10 Jun 2016 18:00:12 +0200 Subject: units: tighten system call filters a bit Take away kernel keyring access, CPU emulation system calls and various debug system calls from the various daemons we have. --- units/systemd-hostnamed.service.in | 2 +- units/systemd-importd.service.in | 2 +- units/systemd-journald.service.in | 2 +- units/systemd-localed.service.in | 2 +- units/systemd-logind.service.in | 2 +- units/systemd-machined.service.in | 2 +- units/systemd-networkd.service.m4.in | 2 +- units/systemd-resolved.service.m4.in | 2 +- units/systemd-timedated.service.in | 2 +- units/systemd-timesyncd.service.in | 2 +- 10 files changed, 10 insertions(+), 10 deletions(-) diff --git a/units/systemd-hostnamed.service.in b/units/systemd-hostnamed.service.in index d8f18bed53..0b03a589ea 100644 --- a/units/systemd-hostnamed.service.in +++ b/units/systemd-hostnamed.service.in @@ -21,4 +21,4 @@ PrivateNetwork=yes ProtectSystem=yes ProtectHome=yes MemoryDenyWriteExecute=yes -SystemCallFilter=~@clock @module @mount @obsolete @raw-io ptrace +SystemCallFilter=~@clock @cpu-emulation @debug @keyring @module @mount @obsolete @raw-io diff --git a/units/systemd-importd.service.in b/units/systemd-importd.service.in index a3d1a1519b..0f5489e7e3 100644 --- a/units/systemd-importd.service.in +++ b/units/systemd-importd.service.in @@ -18,4 +18,4 @@ NoNewPrivileges=yes WatchdogSec=3min KillMode=mixed MemoryDenyWriteExecute=yes -SystemCallFilter=~@clock @module @mount @obsolete @raw-io ptrace +SystemCallFilter=~@clock @cpu-emulation @debug @keyring @module @mount @obsolete @raw-io diff --git a/units/systemd-journald.service.in b/units/systemd-journald.service.in index 58808d4f8c..08ace8ae44 100644 --- a/units/systemd-journald.service.in +++ b/units/systemd-journald.service.in @@ -25,7 +25,7 @@ CapabilityBoundingSet=CAP_SYS_ADMIN CAP_DAC_OVERRIDE CAP_SYS_PTRACE CAP_SYSLOG C WatchdogSec=3min FileDescriptorStoreMax=1024 MemoryDenyWriteExecute=yes -SystemCallFilter=~@clock @module @mount @obsolete @raw-io ptrace +SystemCallFilter=~@clock @cpu-emulation @debug @keyring @module @mount @obsolete @raw-io # Increase the default a bit in order to allow many simultaneous # services being run since we keep one fd open per service. Also, when diff --git a/units/systemd-localed.service.in b/units/systemd-localed.service.in index 5efa677548..1f3151c2b5 100644 --- a/units/systemd-localed.service.in +++ b/units/systemd-localed.service.in @@ -21,4 +21,4 @@ PrivateNetwork=yes ProtectSystem=yes ProtectHome=yes MemoryDenyWriteExecute=yes -SystemCallFilter=~@clock @module @mount @obsolete @privileged @raw-io ptrace +SystemCallFilter=~@clock @cpu-emulation @debug @keyring @module @mount @obsolete @raw-io diff --git a/units/systemd-logind.service.in b/units/systemd-logind.service.in index a9598760e2..bee08d011f 100644 --- a/units/systemd-logind.service.in +++ b/units/systemd-logind.service.in @@ -26,7 +26,7 @@ BusName=org.freedesktop.login1 CapabilityBoundingSet=CAP_SYS_ADMIN CAP_MAC_ADMIN CAP_AUDIT_CONTROL CAP_CHOWN CAP_KILL CAP_DAC_READ_SEARCH CAP_DAC_OVERRIDE CAP_FOWNER CAP_SYS_TTY_CONFIG WatchdogSec=3min MemoryDenyWriteExecute=yes -SystemCallFilter=~@clock @module @mount @obsolete @raw-io ptrace +SystemCallFilter=~@clock @cpu-emulation @debug @keyring @module @obsolete @raw-io # Increase the default a bit in order to allow many simultaneous # logins since we keep one fd open per session. diff --git a/units/systemd-machined.service.in b/units/systemd-machined.service.in index 82dca05338..cd4a097f5a 100644 --- a/units/systemd-machined.service.in +++ b/units/systemd-machined.service.in @@ -18,7 +18,7 @@ BusName=org.freedesktop.machine1 CapabilityBoundingSet=CAP_KILL CAP_SYS_PTRACE CAP_SYS_ADMIN CAP_SETGID CAP_SYS_CHROOT CAP_DAC_READ_SEARCH CAP_DAC_OVERRIDE CAP_CHOWN CAP_FOWNER CAP_FSETID CAP_MKNOD WatchdogSec=3min MemoryDenyWriteExecute=yes -SystemCallFilter=~@clock @module @mount @obsolete @raw-io ptrace +SystemCallFilter=~@clock @cpu-emulation @debug @keyring @module @mount @obsolete @raw-io # Note that machined cannot be placed in a mount namespace, since it # needs access to the host's mount namespace in order to implement the diff --git a/units/systemd-networkd.service.m4.in b/units/systemd-networkd.service.m4.in index 3feb2b84f5..38d967d2d1 100644 --- a/units/systemd-networkd.service.m4.in +++ b/units/systemd-networkd.service.m4.in @@ -32,7 +32,7 @@ ProtectSystem=full ProtectHome=yes WatchdogSec=3min MemoryDenyWriteExecute=yes -SystemCallFilter=~@clock @module @mount @obsolete @raw-io ptrace +SystemCallFilter=~@clock @cpu-emulation @debug @keyring @module @mount @obsolete @raw-io [Install] WantedBy=multi-user.target diff --git a/units/systemd-resolved.service.m4.in b/units/systemd-resolved.service.m4.in index 4a94f747e2..a9cc3988ed 100644 --- a/units/systemd-resolved.service.m4.in +++ b/units/systemd-resolved.service.m4.in @@ -28,7 +28,7 @@ ProtectSystem=full ProtectHome=yes WatchdogSec=3min MemoryDenyWriteExecute=yes -SystemCallFilter=~@clock @module @mount @obsolete @raw-io ptrace +SystemCallFilter=~@clock @cpu-emulation @debug @keyring @module @mount @obsolete @raw-io [Install] WantedBy=multi-user.target diff --git a/units/systemd-timedated.service.in b/units/systemd-timedated.service.in index 1bdbe65aad..bc1795d747 100644 --- a/units/systemd-timedated.service.in +++ b/units/systemd-timedated.service.in @@ -19,4 +19,4 @@ PrivateTmp=yes ProtectSystem=yes ProtectHome=yes MemoryDenyWriteExecute=yes -SystemCallFilter=~@module @mount @obsolete @raw-io ptrace +SystemCallFilter=~@cpu-emulation @debug @keyring @module @mount @obsolete @raw-io diff --git a/units/systemd-timesyncd.service.in b/units/systemd-timesyncd.service.in index 8c86021f5e..df1e339196 100644 --- a/units/systemd-timesyncd.service.in +++ b/units/systemd-timesyncd.service.in @@ -29,7 +29,7 @@ ProtectSystem=full ProtectHome=yes WatchdogSec=3min MemoryDenyWriteExecute=yes -SystemCallFilter=~@module @mount @obsolete @raw-io ptrace +SystemCallFilter=~@cpu-emulation @debug @keyring @module @mount @obsolete @raw-io [Install] WantedBy=sysinit.target -- cgit v1.2.3-54-g00ecf From 54a17e01de048a2275f8861b211f10d11e56407d Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Fri, 10 Jun 2016 18:04:02 +0200 Subject: nspawn: lock down system call filter a bit Let's block access to the kernel keyring and a number of obsolete system calls. Also, update list of syscalls that may alter the system clock, and do raw IO access. Filter ptrace() if CAP_SYS_PTRACE is not passed to the container and acct() if CAP_SYS_PACCT is not passed. This also changes things so that kexec(), some profiling calls, the swap calls and quotactl() is never available to containers, not even if CAP_SYS_ADMIN is passed. After all we currently permit CAP_SYS_ADMIN to containers by default, but these calls should not be available, even then. --- src/nspawn/nspawn-seccomp.c | 78 ++++++++++++++++++++++++++++++++++++++------- 1 file changed, 67 insertions(+), 11 deletions(-) diff --git a/src/nspawn/nspawn-seccomp.c b/src/nspawn/nspawn-seccomp.c index 2d145b68a7..54db1b47f8 100644 --- a/src/nspawn/nspawn-seccomp.c +++ b/src/nspawn/nspawn-seccomp.c @@ -44,20 +44,76 @@ static int seccomp_add_default_syscall_filter(scmp_filter_ctx ctx, uint64_t capability; int syscall_num; } blacklist[] = { - { CAP_SYS_RAWIO, SCMP_SYS(iopl) }, - { CAP_SYS_RAWIO, SCMP_SYS(ioperm) }, - { CAP_SYS_BOOT, SCMP_SYS(kexec_load) }, - { CAP_SYS_ADMIN, SCMP_SYS(swapon) }, - { CAP_SYS_ADMIN, SCMP_SYS(swapoff) }, - { CAP_SYS_ADMIN, SCMP_SYS(open_by_handle_at) }, - { CAP_SYS_MODULE, SCMP_SYS(init_module) }, - { CAP_SYS_MODULE, SCMP_SYS(finit_module) }, - { CAP_SYS_MODULE, SCMP_SYS(delete_module) }, - { CAP_SYSLOG, SCMP_SYS(syslog) }, + { 0, SCMP_SYS(_sysctl) }, /* obsolete syscall */ + { 0, SCMP_SYS(add_key) }, /* keyring is not namespaced */ + { 0, SCMP_SYS(afs_syscall) }, /* obsolete syscall */ + { 0, SCMP_SYS(bdflush) }, +#ifdef __NR_bpf + { 0, SCMP_SYS(bpf) }, +#endif + { 0, SCMP_SYS(break) }, /* obsolete syscall */ + { 0, SCMP_SYS(create_module) }, /* obsolete syscall */ + { 0, SCMP_SYS(ftime) }, /* obsolete syscall */ + { 0, SCMP_SYS(get_kernel_syms) }, /* obsolete syscall */ + { 0, SCMP_SYS(getpmsg) }, /* obsolete syscall */ + { 0, SCMP_SYS(gtty) }, /* obsolete syscall */ +#ifdef __NR_kexec_file_load + { 0, SCMP_SYS(kexec_file_load) }, +#endif + { 0, SCMP_SYS(kexec_load) }, + { 0, SCMP_SYS(keyctl) }, /* keyring is not namespaced */ + { 0, SCMP_SYS(lock) }, /* obsolete syscall */ + { 0, SCMP_SYS(lookup_dcookie) }, + { 0, SCMP_SYS(mpx) }, /* obsolete syscall */ + { 0, SCMP_SYS(nfsservctl) }, /* obsolete syscall */ + { 0, SCMP_SYS(open_by_handle_at) }, + { 0, SCMP_SYS(perf_event_open) }, + { 0, SCMP_SYS(prof) }, /* obsolete syscall */ + { 0, SCMP_SYS(profil) }, /* obsolete syscall */ + { 0, SCMP_SYS(putpmsg) }, /* obsolete syscall */ + { 0, SCMP_SYS(query_module) }, /* obsolete syscall */ + { 0, SCMP_SYS(quotactl) }, + { 0, SCMP_SYS(request_key) }, /* keyring is not namespaced */ + { 0, SCMP_SYS(security) }, /* obsolete syscall */ + { 0, SCMP_SYS(sgetmask) }, /* obsolete syscall */ + { 0, SCMP_SYS(ssetmask) }, /* obsolete syscall */ + { 0, SCMP_SYS(stty) }, /* obsolete syscall */ + { 0, SCMP_SYS(swapoff) }, + { 0, SCMP_SYS(swapon) }, + { 0, SCMP_SYS(sysfs) }, /* obsolete syscall */ + { 0, SCMP_SYS(tuxcall) }, /* obsolete syscall */ + { 0, SCMP_SYS(ulimit) }, /* obsolete syscall */ + { 0, SCMP_SYS(uselib) }, /* obsolete syscall */ + { 0, SCMP_SYS(ustat) }, /* obsolete syscall */ + { 0, SCMP_SYS(vserver) }, /* obsolete syscall */ + { CAP_SYSLOG, SCMP_SYS(syslog) }, + { CAP_SYS_MODULE, SCMP_SYS(delete_module) }, + { CAP_SYS_MODULE, SCMP_SYS(finit_module) }, + { CAP_SYS_MODULE, SCMP_SYS(init_module) }, + { CAP_SYS_PACCT, SCMP_SYS(acct) }, + { CAP_SYS_PTRACE, SCMP_SYS(process_vm_readv) }, + { CAP_SYS_PTRACE, SCMP_SYS(process_vm_writev) }, + { CAP_SYS_PTRACE, SCMP_SYS(ptrace) }, + { CAP_SYS_RAWIO, SCMP_SYS(ioperm) }, + { CAP_SYS_RAWIO, SCMP_SYS(iopl) }, + { CAP_SYS_RAWIO, SCMP_SYS(pciconfig_iobase) }, + { CAP_SYS_RAWIO, SCMP_SYS(pciconfig_read) }, + { CAP_SYS_RAWIO, SCMP_SYS(pciconfig_write) }, +#ifdef __NR_s390_pci_mmio_read + { CAP_SYS_RAWIO, SCMP_SYS(s390_pci_mmio_read) }, +#endif +#ifdef __NR_s390_pci_mmio_write + { CAP_SYS_RAWIO, SCMP_SYS(s390_pci_mmio_write) }, +#endif + { CAP_SYS_TIME, SCMP_SYS(adjtimex) }, + { CAP_SYS_TIME, SCMP_SYS(clock_adjtime) }, + { CAP_SYS_TIME, SCMP_SYS(clock_settime) }, + { CAP_SYS_TIME, SCMP_SYS(settimeofday) }, + { CAP_SYS_TIME, SCMP_SYS(stime) }, }; for (i = 0; i < ELEMENTSOF(blacklist); i++) { - if (cap_list_retain & (1ULL << blacklist[i].capability)) + if (blacklist[i].capability != 0 && (cap_list_retain & (1ULL << blacklist[i].capability))) continue; r = seccomp_rule_add(ctx, SCMP_ACT_ERRNO(EPERM), blacklist[i].syscall_num, 0); -- cgit v1.2.3-54-g00ecf