From 479050b36302a360048c2af5e79683d14ad56fb3 Mon Sep 17 00:00:00 2001
From: Lennart Poettering <lennart@poettering.net>
Date: Fri, 12 Feb 2016 23:29:57 +0100
Subject: core: drop Capabilities= setting

The setting is hardly useful (since its effect is generally reduced to zero due
to file system caps), and with the advent of ambient caps an actually useful
replacement exists, hence let's get rid of this.

I am pretty sure this was unused and our man page already recommended against
its use, hence this should be a safe thing to remove.
---
 man/systemd.exec.xml | 55 ++++++++++++++--------------------------------------
 1 file changed, 15 insertions(+), 40 deletions(-)

(limited to 'man/systemd.exec.xml')
diff --git a/man/systemd.exec.xml b/man/systemd.exec.xml
index f0f77c5091..008565c14b 100644
--- a/man/systemd.exec.xml
+++ b/man/systemd.exec.xml
@@ -778,32 +778,21 @@
       <varlistentry>
         <term><varname>CapabilityBoundingSet=</varname></term>
 
-        <listitem><para>Controls which capabilities to include in the
-        capability bounding set for the executed process. See
-        <citerefentry project='man-pages'><refentrytitle>capabilities</refentrytitle><manvolnum>7</manvolnum></citerefentry>
-        for details. Takes a whitespace-separated list of capability
-        names as read by
-        <citerefentry project='mankier'><refentrytitle>cap_from_name</refentrytitle><manvolnum>3</manvolnum></citerefentry>,
-        e.g. <constant>CAP_SYS_ADMIN</constant>,
-        <constant>CAP_DAC_OVERRIDE</constant>,
-        <constant>CAP_SYS_PTRACE</constant>. Capabilities listed will
-        be included in the bounding set, all others are removed. If
-        the list of capabilities is prefixed with
-        <literal>~</literal>, all but the listed capabilities will be
-        included, the effect of the assignment inverted. Note that
-        this option also affects the respective capabilities in the
-        effective, permitted and inheritable capability sets, on top
-        of what <varname>Capabilities=</varname> does. If this option
-        is not used, the capability bounding set is not modified on
-        process execution, hence no limits on the capabilities of the
-        process are enforced. This option may appear more than once, in
-        which case the bounding sets are merged. If the empty string
-        is assigned to this option, the bounding set is reset to the
-        empty capability set, and all prior settings have no effect.
-        If set to <literal>~</literal> (without any further argument),
-        the bounding set is reset to the full set of available
-        capabilities, also undoing any previous
-        settings.</para></listitem>
+        <listitem><para>Controls which capabilities to include in the capability bounding set for the executed
+        process. See <citerefentry
+        project='man-pages'><refentrytitle>capabilities</refentrytitle><manvolnum>7</manvolnum></citerefentry> for
+        details. Takes a whitespace-separated list of capability names as read by <citerefentry
+        project='mankier'><refentrytitle>cap_from_name</refentrytitle><manvolnum>3</manvolnum></citerefentry>,
+        e.g. <constant>CAP_SYS_ADMIN</constant>, <constant>CAP_DAC_OVERRIDE</constant>,
+        <constant>CAP_SYS_PTRACE</constant>. Capabilities listed will be included in the bounding set, all others are
+        removed. If the list of capabilities is prefixed with <literal>~</literal>, all but the listed capabilities
+        will be included, the effect of the assignment inverted. Note that this option also affects the respective
+        capabilities in the effective, permitted and inheritable capability sets. If this option is not used, the
+        capability bounding set is not modified on process execution, hence no limits on the capabilities of the
+        process are enforced. This option may appear more than once, in which case the bounding sets are merged. If the
+        empty string is assigned to this option, the bounding set is reset to the empty capability set, and all prior
+        settings have no effect.  If set to <literal>~</literal> (without any further argument), the bounding set is
+        reset to the full set of available capabilities, also undoing any previous settings.</para></listitem>
       </varlistentry>
 
       <varlistentry>
@@ -853,20 +842,6 @@
         for details.</para></listitem>
       </varlistentry>
 
-      <varlistentry>
-        <term><varname>Capabilities=</varname></term>
-        <listitem><para>Controls the
-        <citerefentry project='man-pages'><refentrytitle>capabilities</refentrytitle><manvolnum>7</manvolnum></citerefentry>
-        set for the executed process. Take a capability string
-        describing the effective, permitted and inherited capability
-        sets as documented in
-        <citerefentry project='mankier'><refentrytitle>cap_from_text</refentrytitle><manvolnum>3</manvolnum></citerefentry>.
-        Note that these capability sets are usually influenced (and
-        filtered) by the capabilities attached to the executed file.
-        Due to that <varname>CapabilityBoundingSet=</varname> is
-        probably a much more useful setting.</para></listitem>
-      </varlistentry>
-
       <varlistentry>
         <term><varname>ReadWriteDirectories=</varname></term>
         <term><varname>ReadOnlyDirectories=</varname></term>
-- 
cgit v1.2.3-54-g00ecf


From 7882632d5a6e9a26e19a51e533d07a2f942cf62c Mon Sep 17 00:00:00 2001
From: Lennart Poettering <lennart@poettering.net>
Date: Mon, 22 Feb 2016 16:01:53 +0100
Subject: man: extend the Personality= documentation

Among other fixes, add information about more architectures that are supported
these days.
---
 man/systemd.exec.xml | 19 +++++++++++--------
 1 file changed, 11 insertions(+), 8 deletions(-)

(limited to 'man/systemd.exec.xml')

diff --git a/man/systemd.exec.xml b/man/systemd.exec.xml
index 008565c14b..c1f47e84e6 100644
--- a/man/systemd.exec.xml
+++ b/man/systemd.exec.xml
@@ -1262,14 +1262,17 @@
       <varlistentry>
         <term><varname>Personality=</varname></term>
 
-        <listitem><para>Controls which kernel architecture
-        <citerefentry project='man-pages'><refentrytitle>uname</refentrytitle><manvolnum>2</manvolnum></citerefentry>
-        shall report, when invoked by unit processes. Takes one of
-        <constant>x86</constant> and <constant>x86-64</constant>. This
-        is useful when running 32-bit services on a 64-bit host
-        system. If not specified, the personality is left unmodified
-        and thus reflects the personality of the host system's
-        kernel.</para></listitem>
+        <listitem><para>Controls which kernel architecture <citerefentry
+        project='man-pages'><refentrytitle>uname</refentrytitle><manvolnum>2</manvolnum></citerefentry> shall report,
+        when invoked by unit processes. Takes one of the architecture identifiers <constant>x86</constant>,
+        <constant>x86-64</constant>, <constant>ppc</constant>, <constant>ppc-le</constant>, <constant>ppc64</constant>,
+        <constant>ppc64-le</constant>, <constant>s390</constant> or <constant>s390x</constant>. Which personality
+        architectures are supported depends on the system architecture. Usually the 64-bit versions of the various
+        system architectures support their immediate 32-bit personality architecture counterpart, but no others. For
+        example, <constant>x86-64</constant> systems support the <constant>x86-64</constant> and
+        <constant>x86</constant> personalities but no others. The personality feature is useful when running 32-bit
+        services on a 64-bit host system. If not specified, the personality is left unmodified and thus reflects the
+        personality of the host system's kernel.</para></listitem>
       </varlistentry>
 
       <varlistentry>
-- 
cgit v1.2.3-54-g00ecf


From 19c0b0b9a5039b842cf9e6c3e7ece75fb8725602 Mon Sep 17 00:00:00 2001
From: Ronny Chevalier <chevalier.ronny@gmail.com>
Date: Sat, 30 Jan 2016 17:26:39 +0100
Subject: core: set NoNewPrivileges for seccomp if we don't have CAP_SYS_ADMIN

The manpage of seccomp specifies that using seccomp with
SECCOMP_SET_MODE_FILTER will return EACCES if the caller does not have
CAP_SYS_ADMIN set, or if the no_new_privileges bit is not set. Hence,
without NoNewPrivileges set, it is impossible to use a SystemCall*
directive with a User directive set in system mode.

Now, NoNewPrivileges is set if we are in user mode, or if we are in
system mode and we don't have CAP_SYS_ADMIN, and SystemCall*
directives are used.
---
 Makefile.am                                        |  1 +
 man/systemd.exec.xml                               | 16 ++++--
 src/core/execute.c                                 | 16 +++---
 src/test/test-execute.c                            | 57 ++++++++++++++++------
 .../exec-systemcallfilter-system-user.service      | 11 +++++
 5 files changed, 74 insertions(+), 27 deletions(-)
 create mode 100644 test/test-execute/exec-systemcallfilter-system-user.service

(limited to 'man/systemd.exec.xml')

diff --git a/Makefile.am b/Makefile.am
index 7bd98dddf6..02557ef46a 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -1556,6 +1556,7 @@ EXTRA_DIST += \
 	test/test-execute/exec-systemcallfilter-failing.service \
 	test/test-execute/exec-systemcallfilter-not-failing2.service \
 	test/test-execute/exec-systemcallfilter-not-failing.service \
+	test/test-execute/exec-systemcallfilter-system-user.service \
 	test/test-execute/exec-user.service \
 	test/test-execute/exec-workingdirectory.service \
 	test/test-execute/exec-umask-0177.service \
diff --git a/man/systemd.exec.xml b/man/systemd.exec.xml
index c1f47e84e6..3e1a2cb224 100644
--- a/man/systemd.exec.xml
+++ b/man/systemd.exec.xml
@@ -1155,7 +1155,9 @@
         first character of the list is <literal>~</literal>, the
         effect is inverted: only the listed system calls will result
         in immediate process termination (blacklisting). If running in
-        user mode and this option is used,
+        user mode, or in system mode, but without the
+        <constant>CAP_SYS_ADMIN</constant> capability (e.g. setting
+        <varname>User=nobody</varname>),
         <varname>NoNewPrivileges=yes</varname> is implied. This
         feature makes use of the Secure Computing Mode 2 interfaces of
         the kernel ('seccomp filtering') and is useful for enforcing a
@@ -1214,8 +1216,10 @@
         systems. The special <constant>native</constant> identifier
         implicitly maps to the native architecture of the system (or
         more strictly: to the architecture the system manager is
-        compiled for). If running in user mode and this option is
-        used, <varname>NoNewPrivileges=yes</varname> is implied. Note
+        compiled for). If running in user mode, or in system mode,
+        but without the <constant>CAP_SYS_ADMIN</constant>
+        capability (e.g. setting <varname>User=nobody</varname>),
+        <varname>NoNewPrivileges=yes</varname> is implied. Note
         that setting this option to a non-empty list implies that
         <constant>native</constant> is included too. By default, this
         option is set to the empty list, i.e. no architecture system
@@ -1244,8 +1248,10 @@
         <function>socketpair()</function> (which creates connected
         AF_UNIX sockets only) are unaffected. Note that this option
         has no effect on 32-bit x86 and is ignored (but works
-        correctly on x86-64). If running in user mode and this option
-        is used, <varname>NoNewPrivileges=yes</varname> is implied. By
+        correctly on x86-64). If running in user mode, or in system
+        mode, but without the <constant>CAP_SYS_ADMIN</constant>
+        capability (e.g. setting <varname>User=nobody</varname>),
+        <varname>NoNewPrivileges=yes</varname> is implied. By
         default, no restriction applies, all address families are
         accessible to processes. If assigned the empty string, any
         previous list changes are undone.</para>
diff --git a/src/core/execute.c b/src/core/execute.c
index 8ede9e9afb..0c311ec330 100644
--- a/src/core/execute.c
+++ b/src/core/execute.c
@@ -24,6 +24,7 @@
 #include <poll.h>
 #include <signal.h>
 #include <string.h>
+#include <sys/capability.h>
 #include <sys/personality.h>
 #include <sys/prctl.h>
 #include <sys/socket.h>
@@ -1824,6 +1825,11 @@ static int exec_child(
 
         if (params->apply_permissions) {
 
+                bool use_address_families = context->address_families_whitelist ||
+                        !set_isempty(context->address_families);
+                bool use_syscall_filter = context->syscall_whitelist ||
+                        !set_isempty(context->syscall_filter) ||
+                        !set_isempty(context->syscall_archs);
                 int secure_bits = context->secure_bits;
 
                 for (i = 0; i < _RLIMIT_MAX; i++) {
@@ -1890,15 +1896,15 @@ static int exec_child(
                                 return -errno;
                         }
 
-                if (context->no_new_privileges)
+                if (context->no_new_privileges ||
+                    (!have_effective_cap(CAP_SYS_ADMIN) && (use_address_families || use_syscall_filter)))
                         if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0) {
                                 *exit_status = EXIT_NO_NEW_PRIVILEGES;
                                 return -errno;
                         }
 
 #ifdef HAVE_SECCOMP
-                if (context->address_families_whitelist ||
-                    !set_isempty(context->address_families)) {
+                if (use_address_families) {
                         r = apply_address_families(context);
                         if (r < 0) {
                                 *exit_status = EXIT_ADDRESS_FAMILIES;
@@ -1906,9 +1912,7 @@ static int exec_child(
                         }
                 }
 
-                if (context->syscall_whitelist ||
-                    !set_isempty(context->syscall_filter) ||
-                    !set_isempty(context->syscall_archs)) {
+                if (use_syscall_filter) {
                         r = apply_seccomp(context);
                         if (r < 0) {
                                 *exit_status = EXIT_SECCOMP;
diff --git a/src/test/test-execute.c b/src/test/test-execute.c
index 0d2e4bfc15..5645f5c086 100644
--- a/src/test/test-execute.c
+++ b/src/test/test-execute.c
@@ -130,6 +130,15 @@ static void test_exec_systemcallerrornumber(Manager *m) {
 #endif
 }
 
+static void test_exec_systemcall_system_mode_with_user(Manager *m) {
+#ifdef HAVE_SECCOMP
+        if (getpwnam("nobody"))
+                test(m, "exec-systemcallfilter-system-user.service", 0, CLD_EXITED);
+        else
+                log_error_errno(errno, "Skipping test_exec_systemcall_system_mode_with_user, could not find nobody user: %m");
+#endif
+}
+
 static void test_exec_user(Manager *m) {
         if (getpwnam("nobody"))
                 test(m, "exec-user.service", 0, CLD_EXITED);
@@ -267,8 +276,31 @@ static void test_exec_spec_interpolation(Manager *m) {
         test(m, "exec-spec-interpolation.service", 0, CLD_EXITED);
 }
 
+static int run_tests(ManagerRunningAs running_as, test_function_t *tests) {
+        test_function_t *test = NULL;
+        Manager *m = NULL;
+        int r;
+
+        assert_se(tests);
+
+        r = manager_new(running_as, true, &m);
+        if (MANAGER_SKIP_TEST(r)) {
+                printf("Skipping test: manager_new: %s\n", strerror(-r));
+                return EXIT_TEST_SKIP;
+        }
+        assert_se(r >= 0);
+        assert_se(manager_startup(m, NULL, NULL) >= 0);
+
+        for (test = tests; test && *test; test++)
+                (*test)(m);
+
+        manager_free(m);
+
+        return 0;
+}
+
 int main(int argc, char *argv[]) {
-        test_function_t tests[] = {
+        test_function_t user_tests[] = {
                 test_exec_workingdirectory,
                 test_exec_personality,
                 test_exec_ignoresigpipe,
@@ -291,8 +323,10 @@ int main(int argc, char *argv[]) {
                 test_exec_spec_interpolation,
                 NULL,
         };
-        test_function_t *test = NULL;
-        Manager *m = NULL;
+        test_function_t system_tests[] = {
+                test_exec_systemcall_system_mode_with_user,
+                NULL,
+        };
         int r;
 
         log_parse_environment();
@@ -317,18 +351,9 @@ int main(int argc, char *argv[]) {
         assert_se(unsetenv("VAR2") == 0);
         assert_se(unsetenv("VAR3") == 0);
 
-        r = manager_new(MANAGER_USER, true, &m);
-        if (MANAGER_SKIP_TEST(r)) {
-                printf("Skipping test: manager_new: %s\n", strerror(-r));
-                return EXIT_TEST_SKIP;
-        }
-        assert_se(r >= 0);
-        assert_se(manager_startup(m, NULL, NULL) >= 0);
-
-        for (test = tests; test && *test; test++)
-                (*test)(m);
+        r = run_tests(MANAGER_USER, user_tests);
+        if (r != 0)
+                return r;
 
-        manager_free(m);
-
-        return 0;
+        return run_tests(MANAGER_SYSTEM, system_tests);
 }
diff --git a/test/test-execute/exec-systemcallfilter-system-user.service b/test/test-execute/exec-systemcallfilter-system-user.service
new file mode 100644
index 0000000000..462f94133d
--- /dev/null
+++ b/test/test-execute/exec-systemcallfilter-system-user.service
@@ -0,0 +1,11 @@
+[Unit]
+Description=Test for SystemCallFilter in system mode with User set
+
+[Service]
+ExecStart=/bin/echo "Foo bar"
+Type=oneshot
+User=nobody
+SystemCallFilter=~read write open execve ioperm
+SystemCallFilter=ioctl
+SystemCallFilter=read write open execve
+SystemCallFilter=~ioperm
-- 
cgit v1.2.3-54-g00ecf


From b50a16af8e3c353703d55f117077fcf60b8081e8 Mon Sep 17 00:00:00 2001
From: Nicolas Braud-Santoni <nicolas@braud-santoni.eu>
Date: Sun, 17 Apr 2016 14:22:17 +0200
Subject: man: systemd.exec: Clarify InaccessibleDirectories (#3048) (#3048)

---
 man/systemd.exec.xml | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

(limited to 'man/systemd.exec.xml')

diff --git a/man/systemd.exec.xml b/man/systemd.exec.xml
index 3e1a2cb224..4ed62dbada 100644
--- a/man/systemd.exec.xml
+++ b/man/systemd.exec.xml
@@ -859,9 +859,12 @@
         reading only, writing will be refused even if the usual file
         access controls would permit this. Directories listed in
         <varname>InaccessibleDirectories=</varname> will be made
-        inaccessible for processes inside the namespace. Note that
-        restricting access with these options does not extend to
-        submounts of a directory that are created later on. These
+        inaccessible for processes inside the namespace, and may not
+        contain any other mountpoints, including those specified by
+        <varname>ReadWriteDirectories=</varname> or
+        <varname>ReadOnlyDirectories=</varname>.
+        Note that restricting access with these options does not extend
+        to submounts of a directory that are created later on. These
         options may be specified more than once, in which case all
         directories listed will have limited access from within the
         namespace. If the empty string is assigned to this option, the
-- 
cgit v1.2.3-54-g00ecf


From 28c75e250155a6895a28e181aa7b84330864a28f Mon Sep 17 00:00:00 2001
From: Lennart Poettering <lennart@poettering.net>
Date: Tue, 26 Apr 2016 11:57:54 +0200
Subject: man: elaborate on the automatic systemd-journald.socket service
 dependencies

Fixes: #1603
---
 man/systemd.exec.xml | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

(limited to 'man/systemd.exec.xml')

diff --git a/man/systemd.exec.xml b/man/systemd.exec.xml
index 4ed62dbada..fea42ebd31 100644
--- a/man/systemd.exec.xml
+++ b/man/systemd.exec.xml
@@ -94,11 +94,10 @@
     required to access <filename>/tmp</filename> and
     <filename>/var/tmp</filename>.</para>
 
-    <para>Units whose output standard output or error output is
-    connected to any other sink but <option>null</option>,
-    <option>tty</option> and <option>socket</option> automatically
-    acquire dependencies of type <varname>After=</varname> on
-    <filename>journald.socket</filename>.</para>
+    <para>Units whose output standard output or error output is connected to <option>journal</option>,
+    <option>syslog</option> or <option>kmsg</option> (or their combinations with console output, see below)
+    automatically acquire dependencies of type <varname>After=</varname> on
+    <filename>systemd-journald.socket</filename>.</para>
   </refsect1>
 
   <refsect1>
@@ -470,6 +469,10 @@
         similar to the same option of
         <varname>StandardInput=</varname>.</para>
 
+        <para>If the standard output (or error output, see below) of a unit is connected with the journal, syslog or
+        the kernel log buffer the unit will implicitly gain a dependency of type <varname>After=</varname> on
+        <filename>systemd-journald.socket</filename> (also see the automatic dependencies section above).</para>
+
         <para>This setting defaults to the value set with
         <option>DefaultStandardOutput=</option> in
         <citerefentry><refentrytitle>systemd-system.conf</refentrytitle><manvolnum>5</manvolnum></citerefentry>,
-- 
cgit v1.2.3-54-g00ecf


From dfe85b38d2d15c3e30357a84c781a1d78f176ff8 Mon Sep 17 00:00:00 2001
From: Lennart Poettering <lennart@poettering.net>
Date: Tue, 26 Apr 2016 15:08:06 +0200
Subject: man: minor wording fixes

As suggested in:

https://github.com/systemd/systemd/pull/3124#discussion_r61068789
---
 man/systemd.exec.xml | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

(limited to 'man/systemd.exec.xml')

diff --git a/man/systemd.exec.xml b/man/systemd.exec.xml
index fea42ebd31..2d0fb63f1d 100644
--- a/man/systemd.exec.xml
+++ b/man/systemd.exec.xml
@@ -94,10 +94,9 @@
     required to access <filename>/tmp</filename> and
     <filename>/var/tmp</filename>.</para>
 
-    <para>Units whose output standard output or error output is connected to <option>journal</option>,
-    <option>syslog</option> or <option>kmsg</option> (or their combinations with console output, see below)
-    automatically acquire dependencies of type <varname>After=</varname> on
-    <filename>systemd-journald.socket</filename>.</para>
+    <para>Units whose standard output or error output is connected to <option>journal</option>, <option>syslog</option>
+    or <option>kmsg</option> (or their combinations with console output, see below) automatically acquire dependencies
+    of type <varname>After=</varname> on <filename>systemd-journald.socket</filename>.</para>
   </refsect1>
 
   <refsect1>
@@ -469,8 +468,8 @@
         similar to the same option of
         <varname>StandardInput=</varname>.</para>
 
-        <para>If the standard output (or error output, see below) of a unit is connected with the journal, syslog or
-        the kernel log buffer the unit will implicitly gain a dependency of type <varname>After=</varname> on
+        <para>If the standard output (or error output, see below) of a unit is connected to the journal, syslog or the
+        kernel log buffer, the unit will implicitly gain a dependency of type <varname>After=</varname> on
         <filename>systemd-journald.socket</filename> (also see the automatic dependencies section above).</para>
 
         <para>This setting defaults to the value set with
-- 
cgit v1.2.3-54-g00ecf


From 29857001854a02c292f1f3b324e7a66831e859c8 Mon Sep 17 00:00:00 2001
From: Lennart Poettering <lennart@poettering.net>
Date: Thu, 28 Apr 2016 21:00:28 +0200
Subject: core: make parsing of RLIMIT_NICE aware of actual nice levels

---
 man/systemd.exec.xml        | 38 +++++++++++++++------------------
 src/basic/rlimit-util.c     | 52 ++++++++++++++++++++++++++++++++++++++++++++-
 src/test/test-rlimit-util.c | 12 +++++++++++
 3 files changed, 80 insertions(+), 22 deletions(-)

(limited to 'man/systemd.exec.xml')

diff --git a/man/systemd.exec.xml b/man/systemd.exec.xml
index 2d0fb63f1d..2a93760428 100644
--- a/man/systemd.exec.xml
+++ b/man/systemd.exec.xml
@@ -629,27 +629,23 @@
         <term><varname>LimitNICE=</varname></term>
         <term><varname>LimitRTPRIO=</varname></term>
         <term><varname>LimitRTTIME=</varname></term>
-        <listitem><para>These settings set both soft and hard limits
-        of various resources for executed processes. See
-        <citerefentry><refentrytitle>setrlimit</refentrytitle><manvolnum>2</manvolnum></citerefentry>
-        for details. The resource limit is possible to specify in two formats,
-        <option>value</option> to set soft and hard limits to the same value,
-        or <option>soft:hard</option> to set both limits individually (e.g. LimitAS=4G:16G).
-        Use the string <varname>infinity</varname> to
-        configure no limit on a specific resource. The multiplicative
-        suffixes K (=1024), M (=1024*1024) and so on for G, T, P and E
-        may be used for resource limits measured in bytes
-        (e.g. LimitAS=16G). For the limits referring to time values,
-        the usual time units ms, s, min, h and so on may be used (see
-        <citerefentry><refentrytitle>systemd.time</refentrytitle><manvolnum>7</manvolnum></citerefentry>
-        for details). Note that if no time unit is specified for
-        <varname>LimitCPU=</varname> the default unit of seconds is
-        implied, while for <varname>LimitRTTIME=</varname> the default
-        unit of microseconds is implied. Also, note that the effective
-        granularity of the limits might influence their
-        enforcement. For example, time limits specified for
-        <varname>LimitCPU=</varname> will be rounded up implicitly to
-        multiples of 1s.</para>
+        <listitem><para>Set soft and hard limits on various resources for executed processes. See
+        <citerefentry><refentrytitle>setrlimit</refentrytitle><manvolnum>2</manvolnum></citerefentry> for details on
+        the resource limit concept. Resource limits may be specified in two formats: either as a single value to set a
+        specific soft and hard limit to the same value, or as a colon-separated pair <option>soft:hard</option> to set
+        both limits individually (e.g. <literal>LimitAS=4G:16G</literal>).  Use the string <varname>infinity</varname>
+        to configure no limit on a specific resource. The multiplicative suffixes K, M, G, T, P and E (to the base
+        1024) may be used for resource limits measured in bytes (e.g. LimitAS=16G). For the limits referring to time
+        values, the usual time units ms, s, min, h and so on may be used (see
+        <citerefentry><refentrytitle>systemd.time</refentrytitle><manvolnum>7</manvolnum></citerefentry> for
+        details). Note that if no time unit is specified for <varname>LimitCPU=</varname> the default unit of seconds
+        is implied, while for <varname>LimitRTTIME=</varname> the default unit of microseconds is implied. Also, note
+        that the effective granularity of the limits might influence their enforcement. For example, time limits
+        specified for <varname>LimitCPU=</varname> will be rounded up implicitly to multiples of 1s. For
+        <varname>LimitNICE=</varname> the value may be specified in two syntaxes: if prefixed with <literal>+</literal>
+        or <literal>-</literal>, the value is understood as a regular Linux nice value in the range -20..19. If not
+        prefixed like this, the value is understood as a raw resource limit parameter in the range 0..40 (with 0 being
+        equivalent to 1).</para>
 
         <para>Note that most process resource limits configured with
         these options are per-process, and processes may fork in order
diff --git a/src/basic/rlimit-util.c b/src/basic/rlimit-util.c
index 7540b43215..ee063720ed 100644
--- a/src/basic/rlimit-util.c
+++ b/src/basic/rlimit-util.c
@@ -153,6 +153,56 @@ static int rlimit_parse_usec(const char *val, rlim_t *ret) {
         return 0;
 }
 
+static int rlimit_parse_nice(const char *val, rlim_t *ret) {
+        uint64_t rl;
+        int r;
+
+        /* So, Linux is weird. The range for RLIMIT_NICE is 40..1, mapping to the nice levels -20..19. However, the
+         * RLIMIT_NICE limit defaults to 0 by the kernel, i.e. a value that maps to nice level 20, which of course is
+         * bogus and does not exist. In order to permit parsing the RLIMIT_NICE of 0 here we hence implement a slight
+         * asymmetry: when parsing as positive nice level we permit 0..19. When parsing as negative nice level, we
+         * permit -20..0. But when parsing as raw resource limit value then we also allow the special value 0.
+         *
+         * Yeah, Linux is quality engineering sometimes... */
+
+        if (val[0] == '+') {
+
+                /* Prefixed with "+": Parse as positive user-friendly nice value */
+                r = safe_atou64(val + 1, &rl);
+                if (r < 0)
+                        return r;
+
+                if (rl >= PRIO_MAX)
+                        return -ERANGE;
+
+                rl = 20 - rl;
+
+        } else if (val[0] == '-') {
+
+                /* Prefixed with "-": Parse as negative user-friendly nice value */
+                r = safe_atou64(val + 1, &rl);
+                if (r < 0)
+                        return r;
+
+                if (rl > (uint64_t) (-PRIO_MIN))
+                        return -ERANGE;
+
+                rl = 20 + rl;
+        } else {
+
+                /* Not prefixed: parse as raw resource limit value */
+                r = safe_atou64(val, &rl);
+                if (r < 0)
+                        return r;
+
+                if (rl > (uint64_t) (20 - PRIO_MIN))
+                        return -ERANGE;
+        }
+
+        *ret = (rlim_t) rl;
+        return 0;
+}
+
 static int (*const rlimit_parse_table[_RLIMIT_MAX])(const char *val, rlim_t *ret) = {
         [RLIMIT_CPU] = rlimit_parse_sec,
         [RLIMIT_FSIZE] = rlimit_parse_size,
@@ -167,7 +217,7 @@ static int (*const rlimit_parse_table[_RLIMIT_MAX])(const char *val, rlim_t *ret
         [RLIMIT_LOCKS] = rlimit_parse_u64,
         [RLIMIT_SIGPENDING] = rlimit_parse_u64,
         [RLIMIT_MSGQUEUE] = rlimit_parse_size,
-        [RLIMIT_NICE] = rlimit_parse_u64,
+        [RLIMIT_NICE] = rlimit_parse_nice,
         [RLIMIT_RTPRIO] = rlimit_parse_u64,
         [RLIMIT_RTTIME] = rlimit_parse_usec,
 };
diff --git a/src/test/test-rlimit-util.c b/src/test/test-rlimit-util.c
index d9ac9368cd..62afd2de5e 100644
--- a/src/test/test-rlimit-util.c
+++ b/src/test/test-rlimit-util.c
@@ -99,6 +99,18 @@ int main(int argc, char *argv[]) {
         test_rlimit_parse_format(RLIMIT_NOFILE, "", 0, 0, -EINVAL, NULL);
         test_rlimit_parse_format(RLIMIT_NOFILE, "5:4", 0, 0, -EILSEQ, NULL);
         test_rlimit_parse_format(RLIMIT_NOFILE, "5:4:3", 0, 0, -EINVAL, NULL);
+        test_rlimit_parse_format(RLIMIT_NICE, "20", 20, 20, 0, "20");
+        test_rlimit_parse_format(RLIMIT_NICE, "40", 40, 40, 0, "40");
+        test_rlimit_parse_format(RLIMIT_NICE, "41", 41, 41, -ERANGE, "41");
+        test_rlimit_parse_format(RLIMIT_NICE, "0", 0, 0, 0, "0");
+        test_rlimit_parse_format(RLIMIT_NICE, "-7", 27, 27, 0, "27");
+        test_rlimit_parse_format(RLIMIT_NICE, "-20", 40, 40, 0, "40");
+        test_rlimit_parse_format(RLIMIT_NICE, "-21", 41, 41, -ERANGE, "41");
+        test_rlimit_parse_format(RLIMIT_NICE, "-0", 20, 20, 0, "20");
+        test_rlimit_parse_format(RLIMIT_NICE, "+7", 13, 13, 0, "13");
+        test_rlimit_parse_format(RLIMIT_NICE, "+19", 1, 1, 0, "1");
+        test_rlimit_parse_format(RLIMIT_NICE, "+20", 0, 0, -ERANGE, "0");
+        test_rlimit_parse_format(RLIMIT_NICE, "+0", 20, 20, 0, "20");
 
         return 0;
 }
-- 
cgit v1.2.3-54-g00ecf


From 737ba3c82c71c15de498f63527d264dc996ffa11 Mon Sep 17 00:00:00 2001
From: topimiettinen <topimiettinen@users.noreply.github.com>
Date: Mon, 16 May 2016 02:34:05 +0000
Subject: namespace: Make private /dev noexec and readonly (#3263)

Private /dev will not be managed by udev or others, so we can make it
noexec and readonly after we have made all device nodes. As /dev/shm
needs to be writable, we can't use bind_remount_recursive().
---
 man/systemd.exec.xml |  5 ++++-
 src/core/namespace.c | 10 +++++++---
 2 files changed, 11 insertions(+), 4 deletions(-)

(limited to 'man/systemd.exec.xml')

diff --git a/man/systemd.exec.xml b/man/systemd.exec.xml
index 2a93760428..3cf6de8256 100644
--- a/man/systemd.exec.xml
+++ b/man/systemd.exec.xml
@@ -933,7 +933,10 @@
         (propagation in the opposite direction continues to work).
         This means that this setting may not be used for services
         which shall be able to install mount points in the main mount
-        namespace.</para></listitem>
+        namespace. The /dev namespace will be mounted read-only and 'noexec'.
+        The latter may break old programs which try to set up executable
+        memory by using <citerefentry><refentrytitle>mmap</refentrytitle><manvolnum>2</manvolnum></citerefentry>
+        of <filename>/dev/zero</filename> instead of using <constant>MAP_ANON</constant>.</para></listitem>
       </varlistentry>
 
       <varlistentry>
diff --git a/src/core/namespace.c b/src/core/namespace.c
index ef85bfec23..203d122810 100644
--- a/src/core/namespace.c
+++ b/src/core/namespace.c
@@ -44,6 +44,8 @@
 #include "user-util.h"
 #include "util.h"
 
+#define DEV_MOUNT_OPTIONS (MS_NOSUID|MS_STRICTATIME|MS_NOEXEC)
+
 typedef enum MountMode {
         /* This is ordered by priority! */
         INACCESSIBLE,
@@ -153,7 +155,7 @@ static int mount_dev(BindMount *m) {
 
         dev = strjoina(temporary_mount, "/dev");
         (void) mkdir(dev, 0755);
-        if (mount("tmpfs", dev, "tmpfs", MS_NOSUID|MS_STRICTATIME, "mode=755") < 0) {
+        if (mount("tmpfs", dev, "tmpfs", DEV_MOUNT_OPTIONS, "mode=755") < 0) {
                 r = -errno;
                 goto fail;
         }
@@ -330,9 +332,11 @@ static int make_read_only(BindMount *m) {
 
         if (IN_SET(m->mode, INACCESSIBLE, READONLY))
                 r = bind_remount_recursive(m->path, true);
-        else if (IN_SET(m->mode, READWRITE, PRIVATE_TMP, PRIVATE_VAR_TMP, PRIVATE_DEV))
+        else if (IN_SET(m->mode, READWRITE, PRIVATE_TMP, PRIVATE_VAR_TMP, PRIVATE_DEV)) {
                 r = bind_remount_recursive(m->path, false);
-        else
+                if (r == 0 && m->mode == PRIVATE_DEV) /* can be readonly but the submounts can't*/
+                        r = mount(NULL, m->path, NULL, MS_REMOUNT|DEV_MOUNT_OPTIONS|MS_RDONLY, NULL);
+        } else
                 r = 0;
 
         if (m->ignore && r == -ENOENT)
-- 
cgit v1.2.3-54-g00ecf