summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r-- Makefile.am 4
-rw-r--r-- man/systemd.exec.xml 74
-rw-r--r-- src/core/dbus-execute.c 2
-rw-r--r-- src/core/execute.c 3
-rw-r--r-- src/core/execute.h 1
-rw-r--r-- src/core/load-fragment-gperf.gperf.m4 2
-rw-r--r-- src/core/load-fragment.c 1
-rw-r--r-- src/core/unit.c 8
-rw-r--r-- src/shared/bus-util.c 18
-rw-r--r-- src/test/test-execute.c 13
-rw-r--r-- test/test-execute/exec-restrict-namespaces-mnt-blacklist.service 7
-rw-r--r-- test/test-execute/exec-restrict-namespaces-mnt.service 7
-rw-r--r-- test/test-execute/exec-restrict-namespaces-no.service 7
-rw-r--r-- test/test-execute/exec-restrict-namespaces-yes.service 7
14 files changed, 114 insertions, 40 deletions
diff --git a/Makefile.am b/Makefile.am
index 6173e7a40f..47c2ec8a8d 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -1684,6 +1684,10 @@ EXTRA_DIST += \
test/test-execute/exec-runtimedirectory-mode.service \
test/test-execute/exec-runtimedirectory-owner.service \
test/test-execute/exec-runtimedirectory-owner-nfsnobody.service \
+ test/test-execute/exec-restrict-namespaces-no.service \
+ test/test-execute/exec-restrict-namespaces-yes.service \
+ test/test-execute/exec-restrict-namespaces-mnt.service \
+ test/test-execute/exec-restrict-namespaces-mnt-blacklist.service \
test/bus-policy/hello.conf \
test/bus-policy/methods.conf \
test/bus-policy/ownerships.conf \
diff --git a/man/systemd.exec.xml b/man/systemd.exec.xml
index 3b39a9c912..f85dbb4cda 100644
--- a/man/systemd.exec.xml
+++ b/man/systemd.exec.xml
@@ -999,7 +999,11 @@
using <citerefentry><refentrytitle>mmap</refentrytitle><manvolnum>2</manvolnum></citerefentry> of
<filename>/dev/zero</filename> instead of using <constant>MAP_ANON</constant>. This setting is implied if
<varname>DynamicUser=</varname> is set. For this setting the same restrictions regarding mount propagation and
- privileges apply as for <varname>ReadOnlyPaths=</varname> and related calls, see above.</para></listitem>
+ privileges apply as for <varname>ReadOnlyPaths=</varname> and related calls, see above.
+ If turned on and if running in user mode, or in system mode, but without the <constant>CAP_SYS_ADMIN</constant>
+ capability (e.g. setting <varname>User=</varname>), <varname>NoNewPrivileges=yes</varname>
+ is implied.
+ </para></listitem>
</varlistentry>
<varlistentry>
@@ -1090,9 +1094,35 @@
mechanism. Almost no services need to write to these at runtime; it is hence recommended to turn this on for
most services. For this setting the same restrictions regarding mount propagation and privileges apply as for
<varname>ReadOnlyPaths=</varname> and related calls, see above. Defaults to off.
- Note that this option does not prevent kernel tuning through IPC interfaces and external programs. However
- <varname>InaccessiblePaths=</varname> can be used to make some IPC file system objects
- inaccessible.</para></listitem>
+ If turned on and if running in user mode, or in system mode, but without the <constant>CAP_SYS_ADMIN</constant>
+ capability (e.g. setting <varname>User=</varname>), <varname>NoNewPrivileges=yes</varname>
+ is implied. Note that this option does not prevent kernel tuning through IPC interfaces
+ and external programs. However <varname>InaccessiblePaths=</varname> can be used to
+ make some IPC file system objects inaccessible.</para></listitem>
+ </varlistentry>
+
+ <varlistentry>
+ <term><varname>ProtectKernelModules=</varname></term>
+
+ <listitem><para>Takes a boolean argument. If true, explicit module loading will
+ be denied. This allows turning off module load and unload operations on modular
+ kernels. It is recommended to turn this on for most services that do not need special
+ file systems or extra kernel modules to work. Defaults to off. Enabling this option
+ removes <constant>CAP_SYS_MODULE</constant> from the capability bounding set for
+ the unit and installs a system call filter to block module system calls;
+ <filename>/usr/lib/modules</filename> is also made inaccessible. For this
+ setting the same restrictions regarding mount propagation and privileges
+ apply as for <varname>ReadOnlyPaths=</varname> and related calls, see above.
+ Note that limited automatic module loading due to user configuration or kernel
+ mapping tables might still happen as a side effect of requested user operations,
+ both privileged and unprivileged. To disable the module auto-load feature, please see the
+ <citerefentry><refentrytitle>sysctl.d</refentrytitle><manvolnum>5</manvolnum></citerefentry>
+ <constant>kernel.modules_disabled</constant> mechanism and the
+ <filename>/proc/sys/kernel/modules_disabled</filename> documentation.
+ If turned on and if running in user mode, or in system mode, but without the <constant>CAP_SYS_ADMIN</constant>
+ capability (e.g. setting <varname>User=</varname>), <varname>NoNewPrivileges=yes</varname>
+ is implied.
+ </para></listitem>
</varlistentry>
<varlistentry>
@@ -1237,7 +1267,7 @@
<listitem><para>Takes a boolean argument. If true, ensures that the service process and all its children can
never gain new privileges through <function>execve()</function> (e.g. via setuid or setgid bits, or filesystem
capabilities). This is the simplest and most effective way to ensure that a process and its children can never
- elevate privileges again. Defaults to false, but in the user manager instance certain settings force
+ elevate privileges again. Defaults to false, but certain settings force
<varname>NoNewPrivileges=yes</varname>, ignoring the value of this setting. This is the case when
<varname>SystemCallFilter=</varname>, <varname>SystemCallArchitectures=</varname>,
<varname>RestrictAddressFamilies=</varname>, <varname>RestrictNamespaces=</varname>,
@@ -1482,27 +1512,11 @@
<citerefentry><refentrytitle>setns</refentrytitle><manvolnum>2</manvolnum></citerefentry> system calls, taking
the specified flags parameters into account. Note that — if this option is used — in addition to restricting
creation and switching of the specified types of namespaces (or all of them, if true) access to the
- <function>setns()</function> system call with a zero flags parameter is prohibited.</para></listitem>
- </varlistentry>
-
- <varlistentry>
- <term><varname>ProtectKernelModules=</varname></term>
-
- <listitem><para>Takes a boolean argument. If true, explicit module loading will
- be denied. This allows to turn off module load and unload operations on modular
- kernels. It is recommended to turn this on for most services that do not need special
- file systems or extra kernel modules to work. Default to off. Enabling this option
- removes <constant>CAP_SYS_MODULE</constant> from the capability bounding set for
- the unit, and installs a system call filter to block module system calls,
- also <filename>/usr/lib/modules</filename> is made inaccessible. For this
- setting the same restrictions regarding mount propagation and privileges
- apply as for <varname>ReadOnlyPaths=</varname> and related calls, see above.
- Note that limited automatic module loading due to user configuration or kernel
- mapping tables might still happen as side effect of requested user operations,
- both privileged and unprivileged. To disable module auto-load feature please see
- <citerefentry><refentrytitle>sysctl.d</refentrytitle><manvolnum>5</manvolnum></citerefentry>
- <constant>kernel.modules_disabled</constant> mechanism and
- <filename>/proc/sys/kernel/modules_disabled</filename> documentation.</para></listitem>
+ <function>setns()</function> system call with a zero flags parameter is prohibited.
+ If running in user mode, or in system mode, but without the <constant>CAP_SYS_ADMIN</constant>
+ capability (e.g. setting <varname>User=</varname>), <varname>NoNewPrivileges=yes</varname>
+ is implied.
+ </para></listitem>
</varlistentry>
<varlistentry>
@@ -1563,6 +1577,9 @@
that generate program code dynamically at runtime, such as JIT execution engines, or programs compiled making
use of the code "trampoline" feature of various C compilers. This option improves service security, as it makes
harder for software exploits to change running code dynamically.
+ If running in user mode, or in system mode, but without the <constant>CAP_SYS_ADMIN</constant>
+ capability (e.g. setting <varname>User=</varname>), <varname>NoNewPrivileges=yes</varname>
+ is implied.
</para></listitem>
</varlistentry>
@@ -1573,7 +1590,10 @@
the unit are refused. This restricts access to realtime task scheduling policies such as
<constant>SCHED_FIFO</constant>, <constant>SCHED_RR</constant> or <constant>SCHED_DEADLINE</constant>. See
<citerefentry project='man-pages'><refentrytitle>sched</refentrytitle><manvolnum>7</manvolnum></citerefentry> for details about
- these scheduling policies. Realtime scheduling policies may be used to monopolize CPU time for longer periods
+ these scheduling policies. If running in user mode, or in system mode, but
+ without the <constant>CAP_SYS_ADMIN</constant> capability
+ (e.g. setting <varname>User=</varname>), <varname>NoNewPrivileges=yes</varname>
+ is implied. Realtime scheduling policies may be used to monopolize CPU time for longer periods
of time, and may hence be used to lock up or otherwise trigger Denial-of-Service situations on the system. It
is hence recommended to restrict access to realtime scheduling to the few programs that actually require
them. Defaults to off.</para></listitem>
diff --git a/src/core/dbus-execute.c b/src/core/dbus-execute.c
index d7bb0496a0..23c1b44573 100644
--- a/src/core/dbus-execute.c
+++ b/src/core/dbus-execute.c
@@ -781,7 +781,7 @@ const sd_bus_vtable bus_exec_vtable[] = {
SD_BUS_PROPERTY("RuntimeDirectory", "as", NULL, offsetof(ExecContext, runtime_directory), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("MemoryDenyWriteExecute", "b", bus_property_get_bool, offsetof(ExecContext, memory_deny_write_execute), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("RestrictRealtime", "b", bus_property_get_bool, offsetof(ExecContext, restrict_realtime), SD_BUS_VTABLE_PROPERTY_CONST),
- SD_BUS_PROPERTY("RestrictNamespace", "t", bus_property_get_ulong, offsetof(ExecContext, restrict_namespaces), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("RestrictNamespaces", "t", bus_property_get_ulong, offsetof(ExecContext, restrict_namespaces), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_VTABLE_END
};
diff --git a/src/core/execute.c b/src/core/execute.c
index f666f7c6ce..04c4e511f4 100644
--- a/src/core/execute.c
+++ b/src/core/execute.c
@@ -2201,7 +2201,8 @@ static bool context_has_no_new_privileges(const ExecContext *c) {
if (have_effective_cap(CAP_SYS_ADMIN)) /* if we are privileged, we don't need NNP */
return false;
- return context_has_address_families(c) || /* we need NNP if we have any form of seccomp and are unprivileged */
+ /* We need NNP if we have any form of seccomp and are unprivileged */
+ return context_has_address_families(c) ||
c->memory_deny_write_execute ||
c->restrict_realtime ||
exec_context_restrict_namespaces_set(c) ||
diff --git a/src/core/execute.h b/src/core/execute.h
index 56f880cffe..e52640ee91 100644
--- a/src/core/execute.h
+++ b/src/core/execute.h
@@ -216,7 +216,6 @@ struct ExecContext {
bool nice_set:1;
bool ioprio_set:1;
bool cpu_sched_set:1;
- bool no_new_privileges_set:1;
};
static inline bool exec_context_restrict_namespaces_set(const ExecContext *c) {
diff --git a/src/core/load-fragment-gperf.gperf.m4 b/src/core/load-fragment-gperf.gperf.m4
index cb2f384f47..f4ef5a0140 100644
--- a/src/core/load-fragment-gperf.gperf.m4
+++ b/src/core/load-fragment-gperf.gperf.m4
@@ -57,7 +57,7 @@ m4_ifdef(`HAVE_SECCOMP',
$1.SystemCallArchitectures, config_parse_syscall_archs, 0, offsetof($1, exec_context.syscall_archs)
$1.SystemCallErrorNumber, config_parse_syscall_errno, 0, offsetof($1, exec_context)
$1.MemoryDenyWriteExecute, config_parse_bool, 0, offsetof($1, exec_context.memory_deny_write_execute)
-$1.RestrictNamespaces, config_parse_restrict_namespaces, 0, offsetof($1, exec_context.restrict_namespaces)
+$1.RestrictNamespaces, config_parse_restrict_namespaces, 0, offsetof($1, exec_context)
$1.RestrictRealtime, config_parse_bool, 0, offsetof($1, exec_context.restrict_realtime)
$1.RestrictAddressFamilies, config_parse_address_families, 0, offsetof($1, exec_context)',
`$1.SystemCallFilter, config_parse_warn_compat, DISABLED_CONFIGURATION, 0
diff --git a/src/core/load-fragment.c b/src/core/load-fragment.c
index 52079980d8..970eed27c1 100644
--- a/src/core/load-fragment.c
+++ b/src/core/load-fragment.c
@@ -3896,7 +3896,6 @@ int config_parse_no_new_privileges(
}
c->no_new_privileges = k;
- c->no_new_privileges_set = true;
return 0;
}
diff --git a/src/core/unit.c b/src/core/unit.c
index bba0f5d357..da9bb58a52 100644
--- a/src/core/unit.c
+++ b/src/core/unit.c
@@ -3429,14 +3429,6 @@ int unit_patch_contexts(Unit *u) {
ec->working_directory_missing_ok = true;
}
- if (MANAGER_IS_USER(u->manager) &&
- (ec->syscall_whitelist ||
- !set_isempty(ec->syscall_filter) ||
- !set_isempty(ec->syscall_archs) ||
- ec->address_families_whitelist ||
- !set_isempty(ec->address_families)))
- ec->no_new_privileges = true;
-
if (ec->private_devices)
ec->capability_bounding_set &= ~((UINT64_C(1) << CAP_MKNOD) | (UINT64_C(1) << CAP_SYS_RAWIO));
diff --git a/src/shared/bus-util.c b/src/shared/bus-util.c
index 3b8768b9a7..e7b1b1cb20 100644
--- a/src/shared/bus-util.c
+++ b/src/shared/bus-util.c
@@ -43,6 +43,7 @@
#include "escape.h"
#include "fd-util.h"
#include "missing.h"
+#include "nsflags.h"
#include "parse-util.h"
#include "proc-cmdline.h"
#include "rlimit-util.h"
@@ -769,6 +770,23 @@ int bus_print_property(const char *name, sd_bus_message *property, bool value, b
char timespan[FORMAT_TIMESPAN_MAX];
print_property(name, "%s", format_timespan(timespan, sizeof(timespan), u, 0));
+ } else if (streq(name, "RestrictNamespaces")) {
+ _cleanup_free_ char *s = NULL;
+ const char *result = NULL;
+
+ if ((u & NAMESPACE_FLAGS_ALL) == 0)
+ result = "yes";
+ else if ((u & NAMESPACE_FLAGS_ALL) == NAMESPACE_FLAGS_ALL)
+ result = "no";
+ else {
+ r = namespace_flag_to_string_many(u, &s);
+ if (r < 0)
+ return r;
+
+ result = s;
+ }
+
+ print_property(name, "%s", result);
} else
print_property(name, "%"PRIu64, u);
diff --git a/src/test/test-execute.c b/src/test/test-execute.c
index 6029853e3e..b2ea358b8c 100644
--- a/src/test/test-execute.c
+++ b/src/test/test-execute.c
@@ -219,6 +219,18 @@ static void test_exec_systemcallerrornumber(Manager *m) {
#endif
}
+static void test_exec_restrict_namespaces(Manager *m) {
+#ifdef HAVE_SECCOMP
+ if (!is_seccomp_available())
+ return;
+
+ test(m, "exec-restrict-namespaces-no.service", 0, CLD_EXITED);
+ test(m, "exec-restrict-namespaces-yes.service", 1, CLD_EXITED);
+ test(m, "exec-restrict-namespaces-mnt.service", 0, CLD_EXITED);
+ test(m, "exec-restrict-namespaces-mnt-blacklist.service", 1, CLD_EXITED);
+#endif
+}
+
static void test_exec_systemcall_system_mode_with_user(Manager *m) {
#ifdef HAVE_SECCOMP
if (!is_seccomp_available())
@@ -435,6 +447,7 @@ int main(int argc, char *argv[]) {
test_exec_privatenetwork,
test_exec_systemcallfilter,
test_exec_systemcallerrornumber,
+ test_exec_restrict_namespaces,
test_exec_user,
test_exec_group,
test_exec_supplementary_groups,
diff --git a/test/test-execute/exec-restrict-namespaces-mnt-blacklist.service b/test/test-execute/exec-restrict-namespaces-mnt-blacklist.service
new file mode 100644
index 0000000000..ab909cbd94
--- /dev/null
+++ b/test/test-execute/exec-restrict-namespaces-mnt-blacklist.service
@@ -0,0 +1,7 @@
+[Unit]
+Description=Test RestrictNamespaces=~mnt
+
+[Service]
+RestrictNamespaces=~mnt
+ExecStart=/bin/sh -x -c 'unshare -m'
+Type=oneshot
diff --git a/test/test-execute/exec-restrict-namespaces-mnt.service b/test/test-execute/exec-restrict-namespaces-mnt.service
new file mode 100644
index 0000000000..1aeed72717
--- /dev/null
+++ b/test/test-execute/exec-restrict-namespaces-mnt.service
@@ -0,0 +1,7 @@
+[Unit]
+Description=Test RestrictNamespaces=mnt
+
+[Service]
+RestrictNamespaces=mnt
+ExecStart=/bin/sh -x -c 'unshare -m'
+Type=oneshot
diff --git a/test/test-execute/exec-restrict-namespaces-no.service b/test/test-execute/exec-restrict-namespaces-no.service
new file mode 100644
index 0000000000..33500302d2
--- /dev/null
+++ b/test/test-execute/exec-restrict-namespaces-no.service
@@ -0,0 +1,7 @@
+[Unit]
+Description=Test RestrictNamespaces=no
+
+[Service]
+RestrictNamespaces=no
+ExecStart=/bin/sh -x -c 'unshare -m -u -i -n -p -f'
+Type=oneshot
diff --git a/test/test-execute/exec-restrict-namespaces-yes.service b/test/test-execute/exec-restrict-namespaces-yes.service
new file mode 100644
index 0000000000..3fe70e2bea
--- /dev/null
+++ b/test/test-execute/exec-restrict-namespaces-yes.service
@@ -0,0 +1,7 @@
+[Unit]
+Description=Test RestrictNamespaces=yes
+
+[Service]
+RestrictNamespaces=yes
+ExecStart=/bin/sh -x -c 'unshare -m'
+Type=oneshot