diff options
-rw-r--r-- | Makefile.am | 4 | ||||
-rw-r--r-- | man/systemd.exec.xml | 74 | ||||
-rw-r--r-- | src/core/dbus-execute.c | 2 | ||||
-rw-r--r-- | src/core/execute.c | 3 | ||||
-rw-r--r-- | src/core/execute.h | 1 | ||||
-rw-r--r-- | src/core/load-fragment-gperf.gperf.m4 | 2 | ||||
-rw-r--r-- | src/core/load-fragment.c | 1 | ||||
-rw-r--r-- | src/core/unit.c | 8 | ||||
-rw-r--r-- | src/shared/bus-util.c | 18 | ||||
-rw-r--r-- | src/test/test-execute.c | 13 | ||||
-rw-r--r-- | test/test-execute/exec-restrict-namespaces-mnt-blacklist.service | 7 | ||||
-rw-r--r-- | test/test-execute/exec-restrict-namespaces-mnt.service | 7 | ||||
-rw-r--r-- | test/test-execute/exec-restrict-namespaces-no.service | 7 | ||||
-rw-r--r-- | test/test-execute/exec-restrict-namespaces-yes.service | 7 |
14 files changed, 114 insertions, 40 deletions
diff --git a/Makefile.am b/Makefile.am index 6173e7a40f..47c2ec8a8d 100644 --- a/Makefile.am +++ b/Makefile.am @@ -1684,6 +1684,10 @@ EXTRA_DIST += \ test/test-execute/exec-runtimedirectory-mode.service \ test/test-execute/exec-runtimedirectory-owner.service \ test/test-execute/exec-runtimedirectory-owner-nfsnobody.service \ + test/test-execute/exec-restrict-namespaces-no.service \ + test/test-execute/exec-restrict-namespaces-yes.service \ + test/test-execute/exec-restrict-namespaces-mnt.service \ + test/test-execute/exec-restrict-namespaces-mnt-blacklist.service \ test/bus-policy/hello.conf \ test/bus-policy/methods.conf \ test/bus-policy/ownerships.conf \ diff --git a/man/systemd.exec.xml b/man/systemd.exec.xml index 3b39a9c912..f85dbb4cda 100644 --- a/man/systemd.exec.xml +++ b/man/systemd.exec.xml @@ -999,7 +999,11 @@ using <citerefentry><refentrytitle>mmap</refentrytitle><manvolnum>2</manvolnum></citerefentry> of <filename>/dev/zero</filename> instead of using <constant>MAP_ANON</constant>. This setting is implied if <varname>DynamicUser=</varname> is set. For this setting the same restrictions regarding mount propagation and - privileges apply as for <varname>ReadOnlyPaths=</varname> and related calls, see above.</para></listitem> + privileges apply as for <varname>ReadOnlyPaths=</varname> and related calls, see above. + If turned on and if running in user mode, or in system mode, but without the <constant>CAP_SYS_ADMIN</constant> + capability (e.g. setting <varname>User=</varname>), <varname>NoNewPrivileges=yes</varname> + is implied. + </para></listitem> </varlistentry> <varlistentry> @@ -1090,9 +1094,35 @@ mechanism. Almost no services need to write to these at runtime; it is hence recommended to turn this on for most services. For this setting the same restrictions regarding mount propagation and privileges apply as for <varname>ReadOnlyPaths=</varname> and related calls, see above. Defaults to off. 
- Note that this option does not prevent kernel tuning through IPC interfaces and external programs. However - <varname>InaccessiblePaths=</varname> can be used to make some IPC file system objects - inaccessible.</para></listitem> + If turned on and if running in user mode, or in system mode, but without the <constant>CAP_SYS_ADMIN</constant> + capability (e.g. setting <varname>User=</varname>), <varname>NoNewPrivileges=yes</varname> + is implied. Note that this option does not prevent kernel tuning through IPC interfaces + and external programs. However <varname>InaccessiblePaths=</varname> can be used to + make some IPC file system objects inaccessible.</para></listitem> + </varlistentry> + + <varlistentry> + <term><varname>ProtectKernelModules=</varname></term> + + <listitem><para>Takes a boolean argument. If true, explicit module loading will + be denied. This allows turning off module load and unload operations on modular + kernels. It is recommended to turn this on for most services that do not need special + file systems or extra kernel modules to work. Defaults to off. Enabling this option + removes <constant>CAP_SYS_MODULE</constant> from the capability bounding set for + the unit, and installs a system call filter to block module system calls, + also <filename>/usr/lib/modules</filename> is made inaccessible. For this + setting the same restrictions regarding mount propagation and privileges + apply as for <varname>ReadOnlyPaths=</varname> and related calls, see above. + Note that limited automatic module loading due to user configuration or kernel + mapping tables might still happen as a side effect of requested user operations, + both privileged and unprivileged. To disable the module auto-load feature please see + <citerefentry><refentrytitle>sysctl.d</refentrytitle><manvolnum>5</manvolnum></citerefentry> + <constant>kernel.modules_disabled</constant> mechanism and + <filename>/proc/sys/kernel/modules_disabled</filename> documentation. 
+ If turned on and if running in user mode, or in system mode, but without the <constant>CAP_SYS_ADMIN</constant> + capability (e.g. setting <varname>User=</varname>), <varname>NoNewPrivileges=yes</varname> + is implied. + </para></listitem> </varlistentry> <varlistentry> @@ -1237,7 +1267,7 @@ <listitem><para>Takes a boolean argument. If true, ensures that the service process and all its children can never gain new privileges through <function>execve()</function> (e.g. via setuid or setgid bits, or filesystem capabilities). This is the simplest and most effective way to ensure that a process and its children can never - elevate privileges again. Defaults to false, but in the user manager instance certain settings force + elevate privileges again. Defaults to false, but certain settings force <varname>NoNewPrivileges=yes</varname>, ignoring the value of this setting. This is the case when <varname>SystemCallFilter=</varname>, <varname>SystemCallArchitectures=</varname>, <varname>RestrictAddressFamilies=</varname>, <varname>RestrictNamespaces=</varname>, @@ -1482,27 +1512,11 @@ <citerefentry><refentrytitle>setns</refentrytitle><manvolnum>2</manvolnum></citerefentry> system calls, taking the specified flags parameters into account. Note that — if this option is used — in addition to restricting creation and switching of the specified types of namespaces (or all of them, if true) access to the - <function>setns()</function> system call with a zero flags parameter is prohibited.</para></listitem> - </varlistentry> - - <varlistentry> - <term><varname>ProtectKernelModules=</varname></term> - - <listitem><para>Takes a boolean argument. If true, explicit module loading will - be denied. This allows to turn off module load and unload operations on modular - kernels. It is recommended to turn this on for most services that do not need special - file systems or extra kernel modules to work. Default to off. 
Enabling this option - removes <constant>CAP_SYS_MODULE</constant> from the capability bounding set for - the unit, and installs a system call filter to block module system calls, - also <filename>/usr/lib/modules</filename> is made inaccessible. For this - setting the same restrictions regarding mount propagation and privileges - apply as for <varname>ReadOnlyPaths=</varname> and related calls, see above. - Note that limited automatic module loading due to user configuration or kernel - mapping tables might still happen as side effect of requested user operations, - both privileged and unprivileged. To disable module auto-load feature please see - <citerefentry><refentrytitle>sysctl.d</refentrytitle><manvolnum>5</manvolnum></citerefentry> - <constant>kernel.modules_disabled</constant> mechanism and - <filename>/proc/sys/kernel/modules_disabled</filename> documentation.</para></listitem> + <function>setns()</function> system call with a zero flags parameter is prohibited. + If running in user mode, or in system mode, but without the <constant>CAP_SYS_ADMIN</constant> + capability (e.g. setting <varname>User=</varname>), <varname>NoNewPrivileges=yes</varname> + is implied. + </para></listitem> </varlistentry> <varlistentry> @@ -1563,6 +1577,9 @@ that generate program code dynamically at runtime, such as JIT execution engines, or programs compiled making use of the code "trampoline" feature of various C compilers. This option improves service security, as it makes harder for software exploits to change running code dynamically. + If running in user mode, or in system mode, but without the <constant>CAP_SYS_ADMIN</constant> + capability (e.g. setting <varname>User=</varname>), <varname>NoNewPrivileges=yes</varname> + is implied. </para></listitem> </varlistentry> @@ -1573,7 +1590,10 @@ the unit are refused. 
This restricts access to realtime task scheduling policies such as <constant>SCHED_FIFO</constant>, <constant>SCHED_RR</constant> or <constant>SCHED_DEADLINE</constant>. See <citerefentry project='man-pages'><refentrytitle>sched</refentrytitle><manvolnum>7</manvolnum></citerefentry> for details about - these scheduling policies. Realtime scheduling policies may be used to monopolize CPU time for longer periods + these scheduling policies. If running in user mode, or in system mode, but + without the <constant>CAP_SYS_ADMIN</constant> capability + (e.g. setting <varname>User=</varname>), <varname>NoNewPrivileges=yes</varname> + is implied. Realtime scheduling policies may be used to monopolize CPU time for longer periods of time, and may hence be used to lock up or otherwise trigger Denial-of-Service situations on the system. It is hence recommended to restrict access to realtime scheduling to the few programs that actually require them. Defaults to off.</para></listitem> diff --git a/src/core/dbus-execute.c b/src/core/dbus-execute.c index d7bb0496a0..23c1b44573 100644 --- a/src/core/dbus-execute.c +++ b/src/core/dbus-execute.c @@ -781,7 +781,7 @@ const sd_bus_vtable bus_exec_vtable[] = { SD_BUS_PROPERTY("RuntimeDirectory", "as", NULL, offsetof(ExecContext, runtime_directory), SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("MemoryDenyWriteExecute", "b", bus_property_get_bool, offsetof(ExecContext, memory_deny_write_execute), SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("RestrictRealtime", "b", bus_property_get_bool, offsetof(ExecContext, restrict_realtime), SD_BUS_VTABLE_PROPERTY_CONST), - SD_BUS_PROPERTY("RestrictNamespace", "t", bus_property_get_ulong, offsetof(ExecContext, restrict_namespaces), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("RestrictNamespaces", "t", bus_property_get_ulong, offsetof(ExecContext, restrict_namespaces), SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_VTABLE_END }; diff --git a/src/core/execute.c b/src/core/execute.c index 
f666f7c6ce..04c4e511f4 100644 --- a/src/core/execute.c +++ b/src/core/execute.c @@ -2201,7 +2201,8 @@ static bool context_has_no_new_privileges(const ExecContext *c) { if (have_effective_cap(CAP_SYS_ADMIN)) /* if we are privileged, we don't need NNP */ return false; - return context_has_address_families(c) || /* we need NNP if we have any form of seccomp and are unprivileged */ + /* We need NNP if we have any form of seccomp and are unprivileged */ + return context_has_address_families(c) || c->memory_deny_write_execute || c->restrict_realtime || exec_context_restrict_namespaces_set(c) || diff --git a/src/core/execute.h b/src/core/execute.h index 56f880cffe..e52640ee91 100644 --- a/src/core/execute.h +++ b/src/core/execute.h @@ -216,7 +216,6 @@ struct ExecContext { bool nice_set:1; bool ioprio_set:1; bool cpu_sched_set:1; - bool no_new_privileges_set:1; }; static inline bool exec_context_restrict_namespaces_set(const ExecContext *c) { diff --git a/src/core/load-fragment-gperf.gperf.m4 b/src/core/load-fragment-gperf.gperf.m4 index cb2f384f47..f4ef5a0140 100644 --- a/src/core/load-fragment-gperf.gperf.m4 +++ b/src/core/load-fragment-gperf.gperf.m4 @@ -57,7 +57,7 @@ m4_ifdef(`HAVE_SECCOMP', $1.SystemCallArchitectures, config_parse_syscall_archs, 0, offsetof($1, exec_context.syscall_archs) $1.SystemCallErrorNumber, config_parse_syscall_errno, 0, offsetof($1, exec_context) $1.MemoryDenyWriteExecute, config_parse_bool, 0, offsetof($1, exec_context.memory_deny_write_execute) -$1.RestrictNamespaces, config_parse_restrict_namespaces, 0, offsetof($1, exec_context.restrict_namespaces) +$1.RestrictNamespaces, config_parse_restrict_namespaces, 0, offsetof($1, exec_context) $1.RestrictRealtime, config_parse_bool, 0, offsetof($1, exec_context.restrict_realtime) $1.RestrictAddressFamilies, config_parse_address_families, 0, offsetof($1, exec_context)', `$1.SystemCallFilter, config_parse_warn_compat, DISABLED_CONFIGURATION, 0 diff --git a/src/core/load-fragment.c 
b/src/core/load-fragment.c index 52079980d8..970eed27c1 100644 --- a/src/core/load-fragment.c +++ b/src/core/load-fragment.c @@ -3896,7 +3896,6 @@ int config_parse_no_new_privileges( } c->no_new_privileges = k; - c->no_new_privileges_set = true; return 0; } diff --git a/src/core/unit.c b/src/core/unit.c index bba0f5d357..da9bb58a52 100644 --- a/src/core/unit.c +++ b/src/core/unit.c @@ -3429,14 +3429,6 @@ int unit_patch_contexts(Unit *u) { ec->working_directory_missing_ok = true; } - if (MANAGER_IS_USER(u->manager) && - (ec->syscall_whitelist || - !set_isempty(ec->syscall_filter) || - !set_isempty(ec->syscall_archs) || - ec->address_families_whitelist || - !set_isempty(ec->address_families))) - ec->no_new_privileges = true; - if (ec->private_devices) ec->capability_bounding_set &= ~((UINT64_C(1) << CAP_MKNOD) | (UINT64_C(1) << CAP_SYS_RAWIO)); diff --git a/src/shared/bus-util.c b/src/shared/bus-util.c index 3b8768b9a7..e7b1b1cb20 100644 --- a/src/shared/bus-util.c +++ b/src/shared/bus-util.c @@ -43,6 +43,7 @@ #include "escape.h" #include "fd-util.h" #include "missing.h" +#include "nsflags.h" #include "parse-util.h" #include "proc-cmdline.h" #include "rlimit-util.h" @@ -769,6 +770,23 @@ int bus_print_property(const char *name, sd_bus_message *property, bool value, b char timespan[FORMAT_TIMESPAN_MAX]; print_property(name, "%s", format_timespan(timespan, sizeof(timespan), u, 0)); + } else if (streq(name, "RestrictNamespaces")) { + _cleanup_free_ char *s = NULL; + const char *result = NULL; + + if ((u & NAMESPACE_FLAGS_ALL) == 0) + result = "yes"; + else if ((u & NAMESPACE_FLAGS_ALL) == NAMESPACE_FLAGS_ALL) + result = "no"; + else { + r = namespace_flag_to_string_many(u, &s); + if (r < 0) + return r; + + result = s; + } + + print_property(name, "%s", result); } else print_property(name, "%"PRIu64, u); diff --git a/src/test/test-execute.c b/src/test/test-execute.c index 6029853e3e..b2ea358b8c 100644 --- a/src/test/test-execute.c +++ b/src/test/test-execute.c @@ -219,6 
+219,18 @@ static void test_exec_systemcallerrornumber(Manager *m) { #endif } +static void test_exec_restrict_namespaces(Manager *m) { +#ifdef HAVE_SECCOMP + if (!is_seccomp_available()) + return; + + test(m, "exec-restrict-namespaces-no.service", 0, CLD_EXITED); + test(m, "exec-restrict-namespaces-yes.service", 1, CLD_EXITED); + test(m, "exec-restrict-namespaces-mnt.service", 0, CLD_EXITED); + test(m, "exec-restrict-namespaces-mnt-blacklist.service", 1, CLD_EXITED); +#endif +} + static void test_exec_systemcall_system_mode_with_user(Manager *m) { #ifdef HAVE_SECCOMP if (!is_seccomp_available()) @@ -435,6 +447,7 @@ int main(int argc, char *argv[]) { test_exec_privatenetwork, test_exec_systemcallfilter, test_exec_systemcallerrornumber, + test_exec_restrict_namespaces, test_exec_user, test_exec_group, test_exec_supplementary_groups, diff --git a/test/test-execute/exec-restrict-namespaces-mnt-blacklist.service b/test/test-execute/exec-restrict-namespaces-mnt-blacklist.service new file mode 100644 index 0000000000..ab909cbd94 --- /dev/null +++ b/test/test-execute/exec-restrict-namespaces-mnt-blacklist.service @@ -0,0 +1,7 @@ +[Unit] +Description=Test RestrictNamespaces=~mnt + +[Service] +RestrictNamespaces=~mnt +ExecStart=/bin/sh -x -c 'unshare -m' +Type=oneshot diff --git a/test/test-execute/exec-restrict-namespaces-mnt.service b/test/test-execute/exec-restrict-namespaces-mnt.service new file mode 100644 index 0000000000..1aeed72717 --- /dev/null +++ b/test/test-execute/exec-restrict-namespaces-mnt.service @@ -0,0 +1,7 @@ +[Unit] +Description=Test RestrictNamespaces=mnt + +[Service] +RestrictNamespaces=mnt +ExecStart=/bin/sh -x -c 'unshare -m' +Type=oneshot diff --git a/test/test-execute/exec-restrict-namespaces-no.service b/test/test-execute/exec-restrict-namespaces-no.service new file mode 100644 index 0000000000..33500302d2 --- /dev/null +++ b/test/test-execute/exec-restrict-namespaces-no.service @@ -0,0 +1,7 @@ +[Unit] +Description=Test RestrictNamespaces=no + 
+[Service] +RestrictNamespaces=no +ExecStart=/bin/sh -x -c 'unshare -m -u -i -n -p -f' +Type=oneshot diff --git a/test/test-execute/exec-restrict-namespaces-yes.service b/test/test-execute/exec-restrict-namespaces-yes.service new file mode 100644 index 0000000000..3fe70e2bea --- /dev/null +++ b/test/test-execute/exec-restrict-namespaces-yes.service @@ -0,0 +1,7 @@ +[Unit] +Description=Test RestrictNamespaces=yes + +[Service] +RestrictNamespaces=yes +ExecStart=/bin/sh -x -c 'unshare -m' +Type=oneshot |