From c92e8afebd6126b4d679ee1a2dc2a5b74a8b49c7 Mon Sep 17 00:00:00 2001 From: Djalal Harouni Date: Tue, 15 Nov 2016 10:15:27 +0100 Subject: core: improve the logic that implies no new privileges The no_new_privileged_set variable is not used any more since commit 9b232d3241fcfbf60af that fixed another thing. So remove it. Also no need to check if we are under user manager, remove that part too. --- src/core/execute.c | 3 ++- src/core/execute.h | 1 - src/core/load-fragment.c | 1 - src/core/unit.c | 8 -------- 4 files changed, 2 insertions(+), 11 deletions(-) diff --git a/src/core/execute.c b/src/core/execute.c index f666f7c6ce..04c4e511f4 100644 --- a/src/core/execute.c +++ b/src/core/execute.c @@ -2201,7 +2201,8 @@ static bool context_has_no_new_privileges(const ExecContext *c) { if (have_effective_cap(CAP_SYS_ADMIN)) /* if we are privileged, we don't need NNP */ return false; - return context_has_address_families(c) || /* we need NNP if we have any form of seccomp and are unprivileged */ + /* We need NNP if we have any form of seccomp and are unprivileged */ + return context_has_address_families(c) || c->memory_deny_write_execute || c->restrict_realtime || exec_context_restrict_namespaces_set(c) || diff --git a/src/core/execute.h b/src/core/execute.h index 56f880cffe..e52640ee91 100644 --- a/src/core/execute.h +++ b/src/core/execute.h @@ -216,7 +216,6 @@ struct ExecContext { bool nice_set:1; bool ioprio_set:1; bool cpu_sched_set:1; - bool no_new_privileges_set:1; }; static inline bool exec_context_restrict_namespaces_set(const ExecContext *c) { diff --git a/src/core/load-fragment.c b/src/core/load-fragment.c index 52079980d8..970eed27c1 100644 --- a/src/core/load-fragment.c +++ b/src/core/load-fragment.c @@ -3896,7 +3896,6 @@ int config_parse_no_new_privileges( } c->no_new_privileges = k; - c->no_new_privileges_set = true; return 0; } diff --git a/src/core/unit.c b/src/core/unit.c index bba0f5d357..da9bb58a52 100644 --- a/src/core/unit.c +++ b/src/core/unit.c @@ -3429,14 +3429,6 @@ int unit_patch_contexts(Unit *u) { ec->working_directory_missing_ok = true; } - if (MANAGER_IS_USER(u->manager) && - (ec->syscall_whitelist || - !set_isempty(ec->syscall_filter) || - !set_isempty(ec->syscall_archs) || - ec->address_families_whitelist || - !set_isempty(ec->address_families))) - ec->no_new_privileges = true; - if (ec->private_devices) ec->capability_bounding_set &= ~((UINT64_C(1) << CAP_MKNOD) | (UINT64_C(1) << CAP_SYS_RAWIO)); -- cgit v1.2.3-54-g00ecf From a7db8614f390615c3dea8d73adf9a6a2cff88c07 Mon Sep 17 00:00:00 2001 From: Djalal Harouni Date: Mon, 14 Nov 2016 10:02:00 +0100 Subject: doc: note when no new privileges is implied --- man/systemd.exec.xml | 36 ++++++++++++++++++++++++++++-------- 1 file changed, 28 insertions(+), 8 deletions(-) diff --git a/man/systemd.exec.xml b/man/systemd.exec.xml index 3b39a9c912..669b726920 100644 --- a/man/systemd.exec.xml +++ b/man/systemd.exec.xml @@ -999,7 +999,11 @@ using mmap2 of /dev/zero instead of using MAP_ANON. This setting is implied if DynamicUser= is set. For this setting the same restrictions regarding mount propagation and - privileges apply as for ReadOnlyPaths= and related calls, see above. + privileges apply as for ReadOnlyPaths= and related calls, see above. + If turned on and if running in user mode, or in system mode, but without the CAP_SYS_ADMIN + capability (e.g. setting User=), NoNewPrivileges=yes + is implied. + @@ -1090,9 +1094,11 @@ mechanism. Almost no services need to write to these at runtime; it is hence recommended to turn this on for most services. For this setting the same restrictions regarding mount propagation and privileges apply as for ReadOnlyPaths= and related calls, see above. Defaults to off. - Note that this option does not prevent kernel tuning through IPC interfaces and external programs. However - InaccessiblePaths= can be used to make some IPC file system objects - inaccessible. + If turned on and if running in user mode, or in system mode, but without the CAP_SYS_ADMIN + capability (e.g. setting User=), NoNewPrivileges=yes + is implied. Note that this option does not prevent kernel tuning through IPC interfaces + and external programs. However InaccessiblePaths= can be used to + make some IPC file system objects inaccessible. @@ -1237,7 +1243,7 @@ Takes a boolean argument. If true, ensures that the service process and all its children can never gain new privileges through execve() (e.g. via setuid or setgid bits, or filesystem capabilities). This is the simplest and most effective way to ensure that a process and its children can never - elevate privileges again. Defaults to false, but in the user manager instance certain settings force + elevate privileges again. Defaults to false, but certain settings force NoNewPrivileges=yes, ignoring the value of this setting. This is the case when SystemCallFilter=, SystemCallArchitectures=, RestrictAddressFamilies=, RestrictNamespaces=, @@ -1482,7 +1488,11 @@ setns2 system calls, taking the specified flags parameters into account. Note that — if this option is used — in addition to restricting creation and switching of the specified types of namespaces (or all of them, if true) access to the - setns() system call with a zero flags parameter is prohibited. + setns() system call with a zero flags parameter is prohibited. + If running in user mode, or in system mode, but without the CAP_SYS_ADMIN + capability (e.g. setting User=), NoNewPrivileges=yes + is implied. + @@ -1502,7 +1512,11 @@ both privileged and unprivileged. To disable module auto-load feature please see sysctl.d5 kernel.modules_disabled mechanism and - /proc/sys/kernel/modules_disabled documentation. + /proc/sys/kernel/modules_disabled documentation. + If turned on and if running in user mode, or in system mode, but without the CAP_SYS_ADMIN + capability (e.g. setting User=), NoNewPrivileges=yes + is implied. + @@ -1563,6 +1577,9 @@ that generate program code dynamically at runtime, such as JIT execution engines, or programs compiled making use of the code "trampoline" feature of various C compilers. This option improves service security, as it makes harder for software exploits to change running code dynamically. + If running in user mode, or in system mode, but without the CAP_SYS_ADMIN + capability (e.g. setting User=), NoNewPrivileges=yes + is implied. @@ -1573,7 +1590,10 @@ the unit are refused. This restricts access to realtime task scheduling policies such as SCHED_FIFO, SCHED_RR or SCHED_DEADLINE. See sched7 for details about - these scheduling policies. Realtime scheduling policies may be used to monopolize CPU time for longer periods + these scheduling policies. If running in user mode, or in system mode, but + without the CAP_SYS_ADMIN capability + (e.g. setting User=), NoNewPrivileges=yes + is implied. Realtime scheduling policies may be used to monopolize CPU time for longer periods of time, and may hence be used to lock up or otherwise trigger Denial-of-Service situations on the system. It is hence recommended to restrict access to realtime scheduling to the few programs that actually require them. Defaults to off. -- cgit v1.2.3-54-g00ecf From 6a8c2d591577fc7768fcfb216de1ca6e1a3b590f Mon Sep 17 00:00:00 2001 From: Djalal Harouni Date: Mon, 14 Nov 2016 08:24:41 +0100 Subject: core: property is RestrictNamespaces with s --- src/core/dbus-execute.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/core/dbus-execute.c b/src/core/dbus-execute.c index d7bb0496a0..23c1b44573 100644 --- a/src/core/dbus-execute.c +++ b/src/core/dbus-execute.c @@ -781,7 +781,7 @@ const sd_bus_vtable bus_exec_vtable[] = { SD_BUS_PROPERTY("RuntimeDirectory", "as", NULL, offsetof(ExecContext, runtime_directory), SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("MemoryDenyWriteExecute", "b", bus_property_get_bool, offsetof(ExecContext, memory_deny_write_execute), SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("RestrictRealtime", "b", bus_property_get_bool, offsetof(ExecContext, restrict_realtime), SD_BUS_VTABLE_PROPERTY_CONST), - SD_BUS_PROPERTY("RestrictNamespace", "t", bus_property_get_ulong, offsetof(ExecContext, restrict_namespaces), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("RestrictNamespaces", "t", bus_property_get_ulong, offsetof(ExecContext, restrict_namespaces), SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_VTABLE_END }; -- cgit v1.2.3-54-g00ecf From 85265556807397546a4742609b5168d19aa0df96 Mon Sep 17 00:00:00 2001 From: Djalal Harouni Date: Mon, 14 Nov 2016 08:32:06 +0100 Subject: doc: move ProtectKernelModules= documentation near ProtectKernelTunalbes= --- man/systemd.exec.xml | 48 ++++++++++++++++++++++++------------------------ 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/man/systemd.exec.xml b/man/systemd.exec.xml index 669b726920..f85dbb4cda 100644 --- a/man/systemd.exec.xml +++ b/man/systemd.exec.xml @@ -1101,6 +1101,30 @@ make some IPC file system objects inaccessible. + + ProtectKernelModules= + + Takes a boolean argument. If true, explicit module loading will + be denied. This allows to turn off module load and unload operations on modular + kernels. It is recommended to turn this on for most services that do not need special + file systems or extra kernel modules to work. Default to off. Enabling this option + removes CAP_SYS_MODULE from the capability bounding set for + the unit, and installs a system call filter to block module system calls, + also /usr/lib/modules is made inaccessible. For this + setting the same restrictions regarding mount propagation and privileges + apply as for ReadOnlyPaths= and related calls, see above. + Note that limited automatic module loading due to user configuration or kernel + mapping tables might still happen as side effect of requested user operations, + both privileged and unprivileged. To disable module auto-load feature please see + sysctl.d5 + kernel.modules_disabled mechanism and + /proc/sys/kernel/modules_disabled documentation. + If turned on and if running in user mode, or in system mode, but without the CAP_SYS_ADMIN + capability (e.g. setting User=), NoNewPrivileges=yes + is implied. + + + ProtectControlGroups= @@ -1495,30 +1519,6 @@ - - ProtectKernelModules= - - Takes a boolean argument. If true, explicit module loading will - be denied. This allows to turn off module load and unload operations on modular - kernels. It is recommended to turn this on for most services that do not need special - file systems or extra kernel modules to work. Default to off. Enabling this option - removes CAP_SYS_MODULE from the capability bounding set for - the unit, and installs a system call filter to block module system calls, - also /usr/lib/modules is made inaccessible. For this - setting the same restrictions regarding mount propagation and privileges - apply as for ReadOnlyPaths= and related calls, see above. - Note that limited automatic module loading due to user configuration or kernel - mapping tables might still happen as side effect of requested user operations, - both privileged and unprivileged. To disable module auto-load feature please see - sysctl.d5 - kernel.modules_disabled mechanism and - /proc/sys/kernel/modules_disabled documentation. - If turned on and if running in user mode, or in system mode, but without the CAP_SYS_ADMIN - capability (e.g. setting User=), NoNewPrivileges=yes - is implied. - - - Personality= -- cgit v1.2.3-54-g00ecf From d6299d613f916e1d63ef81d6d277d4e98b8e8194 Mon Sep 17 00:00:00 2001 From: Djalal Harouni Date: Mon, 14 Nov 2016 09:12:21 +0100 Subject: core:gperf: pass the exec_context struct directly to parse restrict namespaces The RestrictNamespaces= takes yes, no or a list of namespaces types, therefor config_parse_restrict_namespaces() is a bit complex and it operates on the ExecContext, fix this by passing the offset of ExecContext directly otherwise restricting namespaces won't work. --- src/core/load-fragment-gperf.gperf.m4 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/core/load-fragment-gperf.gperf.m4 b/src/core/load-fragment-gperf.gperf.m4 index cb2f384f47..f4ef5a0140 100644 --- a/src/core/load-fragment-gperf.gperf.m4 +++ b/src/core/load-fragment-gperf.gperf.m4 @@ -57,7 +57,7 @@ m4_ifdef(`HAVE_SECCOMP', $1.SystemCallArchitectures, config_parse_syscall_archs, 0, offsetof($1, exec_context.syscall_archs) $1.SystemCallErrorNumber, config_parse_syscall_errno, 0, offsetof($1, exec_context) $1.MemoryDenyWriteExecute, config_parse_bool, 0, offsetof($1, exec_context.memory_deny_write_execute) -$1.RestrictNamespaces, config_parse_restrict_namespaces, 0, offsetof($1, exec_context.restrict_namespaces) +$1.RestrictNamespaces, config_parse_restrict_namespaces, 0, offsetof($1, exec_context) $1.RestrictRealtime, config_parse_bool, 0, offsetof($1, exec_context.restrict_realtime) $1.RestrictAddressFamilies, config_parse_address_families, 0, offsetof($1, exec_context)', `$1.SystemCallFilter, config_parse_warn_compat, DISABLED_CONFIGURATION, 0 -- cgit v1.2.3-54-g00ecf From 97e60383c0648e961c317188003130639c1de8d6 Mon Sep 17 00:00:00 2001 From: Djalal Harouni Date: Tue, 15 Nov 2016 15:50:19 +0100 Subject: test: add tests for RestrictNamespaces= --- Makefile.am | 4 ++++ src/test/test-execute.c | 13 +++++++++++++ .../exec-restrict-namespaces-mnt-blacklist.service | 7 +++++++ test/test-execute/exec-restrict-namespaces-mnt.service | 7 +++++++ test/test-execute/exec-restrict-namespaces-no.service | 7 +++++++ test/test-execute/exec-restrict-namespaces-yes.service | 7 +++++++ 6 files changed, 45 insertions(+) create mode 100644 test/test-execute/exec-restrict-namespaces-mnt-blacklist.service create mode 100644 test/test-execute/exec-restrict-namespaces-mnt.service create mode 100644 test/test-execute/exec-restrict-namespaces-no.service create mode 100644 test/test-execute/exec-restrict-namespaces-yes.service diff --git a/Makefile.am b/Makefile.am index 6173e7a40f..47c2ec8a8d 100644 --- a/Makefile.am +++ b/Makefile.am @@ -1684,6 +1684,10 @@ EXTRA_DIST += \ test/test-execute/exec-runtimedirectory-mode.service \ test/test-execute/exec-runtimedirectory-owner.service \ test/test-execute/exec-runtimedirectory-owner-nfsnobody.service \ + test/test-execute/exec-restrict-namespaces-no.service \ + test/test-execute/exec-restrict-namespaces-yes.service \ + test/test-execute/exec-restrict-namespaces-mnt.service \ + test/test-execute/exec-restrict-namespaces-mnt-blacklist.service \ test/bus-policy/hello.conf \ test/bus-policy/methods.conf \ test/bus-policy/ownerships.conf \ diff --git a/src/test/test-execute.c b/src/test/test-execute.c index 6029853e3e..b2ea358b8c 100644 --- a/src/test/test-execute.c +++ b/src/test/test-execute.c @@ -219,6 +219,18 @@ static void test_exec_systemcallerrornumber(Manager *m) { #endif } +static void test_exec_restrict_namespaces(Manager *m) { +#ifdef HAVE_SECCOMP + if (!is_seccomp_available()) + return; + + test(m, "exec-restrict-namespaces-no.service", 0, CLD_EXITED); + test(m, "exec-restrict-namespaces-yes.service", 1, CLD_EXITED); + test(m, "exec-restrict-namespaces-mnt.service", 0, CLD_EXITED); + test(m, "exec-restrict-namespaces-mnt-blacklist.service", 1, CLD_EXITED); +#endif +} + static void test_exec_systemcall_system_mode_with_user(Manager *m) { #ifdef HAVE_SECCOMP if (!is_seccomp_available()) @@ -435,6 +447,7 @@ int main(int argc, char *argv[]) { test_exec_privatenetwork, test_exec_systemcallfilter, test_exec_systemcallerrornumber, + test_exec_restrict_namespaces, test_exec_user, test_exec_group, test_exec_supplementary_groups, diff --git a/test/test-execute/exec-restrict-namespaces-mnt-blacklist.service b/test/test-execute/exec-restrict-namespaces-mnt-blacklist.service new file mode 100644 index 0000000000..ab909cbd94 --- /dev/null +++ b/test/test-execute/exec-restrict-namespaces-mnt-blacklist.service @@ -0,0 +1,7 @@ +[Unit] +Description=Test RestrictNamespaces=~mnt + +[Service] +RestrictNamespaces=~mnt +ExecStart=/bin/sh -x -c 'unshare -m' +Type=oneshot diff --git a/test/test-execute/exec-restrict-namespaces-mnt.service b/test/test-execute/exec-restrict-namespaces-mnt.service new file mode 100644 index 0000000000..1aeed72717 --- /dev/null +++ b/test/test-execute/exec-restrict-namespaces-mnt.service @@ -0,0 +1,7 @@ +[Unit] +Description=Test RestrictNamespaces=mnt + +[Service] +RestrictNamespaces=mnt +ExecStart=/bin/sh -x -c 'unshare -m' +Type=oneshot diff --git a/test/test-execute/exec-restrict-namespaces-no.service b/test/test-execute/exec-restrict-namespaces-no.service new file mode 100644 index 0000000000..33500302d2 --- /dev/null +++ b/test/test-execute/exec-restrict-namespaces-no.service @@ -0,0 +1,7 @@ +[Unit] +Description=Test RestrictNamespaces=no + +[Service] +RestrictNamespaces=no +ExecStart=/bin/sh -x -c 'unshare -m -u -i -n -p -f' +Type=oneshot diff --git a/test/test-execute/exec-restrict-namespaces-yes.service b/test/test-execute/exec-restrict-namespaces-yes.service new file mode 100644 index 0000000000..3fe70e2bea --- /dev/null +++ b/test/test-execute/exec-restrict-namespaces-yes.service @@ -0,0 +1,7 @@ +[Unit] +Description=Test RestrictNamespaces=yes + +[Service] +RestrictNamespaces=yes +ExecStart=/bin/sh -x -c 'unshare -m' +Type=oneshot -- cgit v1.2.3-54-g00ecf From 73186d534b1d4a8c217cf102ffd837d8e61a7e42 Mon Sep 17 00:00:00 2001 From: Djalal Harouni Date: Tue, 15 Nov 2016 15:15:37 +0100 Subject: bus-util: print RestrictNamespaces= as a string Allow all callers that want to print RestrictNamespaces= returned from D-Bus as a string instead of a u64 value. --- src/shared/bus-util.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/src/shared/bus-util.c b/src/shared/bus-util.c index 3b8768b9a7..e7b1b1cb20 100644 --- a/src/shared/bus-util.c +++ b/src/shared/bus-util.c @@ -43,6 +43,7 @@ #include "escape.h" #include "fd-util.h" #include "missing.h" +#include "nsflags.h" #include "parse-util.h" #include "proc-cmdline.h" #include "rlimit-util.h" @@ -769,6 +770,23 @@ int bus_print_property(const char *name, sd_bus_message *property, bool value, b char timespan[FORMAT_TIMESPAN_MAX]; print_property(name, "%s", format_timespan(timespan, sizeof(timespan), u, 0)); + } else if (streq(name, "RestrictNamespaces")) { + _cleanup_free_ char *s = NULL; + const char *result = NULL; + + if ((u & NAMESPACE_FLAGS_ALL) == 0) + result = "yes"; + else if ((u & NAMESPACE_FLAGS_ALL) == NAMESPACE_FLAGS_ALL) + result = "no"; + else { + r = namespace_flag_to_string_many(u, &s); + if (r < 0) + return r; + + result = s; + } + + print_property(name, "%s", result); } else print_property(name, "%"PRIu64, u); -- cgit v1.2.3-54-g00ecf