From 59eeb84ba65483c5543d1bc840c2ac75642ef638 Mon Sep 17 00:00:00 2001
From: Lennart Poettering <lennart@poettering.net>
Date: Mon, 22 Aug 2016 18:43:59 +0200
Subject: core: add two new service settings ProtectKernelTunables= and
 ProtectControlGroups=

If enabled, these will block write access to /sys, /proc/sys and
/sys/fs/cgroup.
---
 man/systemd.exec.xml | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

(limited to 'man')
diff --git a/man/systemd.exec.xml b/man/systemd.exec.xml
index bcedebd5bb..07128b489e 100644
--- a/man/systemd.exec.xml
+++ b/man/systemd.exec.xml
@@ -1059,6 +1059,26 @@
         Defaults to off.</para></listitem>
       </varlistentry>
 
+      <varlistentry>
+        <term><varname>ProtectKernelTunables=</varname></term>
+
+        <listitem><para>Takes a boolean argument. If true, kernel variables accessible through
+        <filename>/proc/sys</filename> and <filename>/sys</filename> will be made read-only to all processes of the
+        unit. Usually, tunable kernel variables should only be written at boot-time, with the
+        <citerefentry><refentrytitle>sysctl.d</refentrytitle><manvolnum>5</manvolnum></citerefentry> mechanism. Almost
+        no services need to write to these at runtime; it is hence recommended to turn this on for most
+        services. Defaults to off.</para></listitem>
+      </varlistentry>
+
+      <varlistentry>
+        <term><varname>ProtectControlGroups=</varname></term>
+
+        <listitem><para>Takes a boolean argument. If true, the Linux Control Groups ("cgroups") hierarchies accessible
+        through <filename>/sys/fs/cgroup</filename> will be made read-only to all processes of the unit. Except for
+        container managers no services should require write access to the control groups hierarchies; it is hence
+        recommended to turn this on for most services. Defaults to off.</para></listitem>
+      </varlistentry>
+
       <varlistentry>
         <term><varname>MountFlags=</varname></term>
 
-- 
cgit v1.2.3-54-g00ecf


From 3f815163ff8fdcdbd329680580df36f94e15325d Mon Sep 17 00:00:00 2001
From: Lennart Poettering <lennart@poettering.net>
Date: Thu, 25 Aug 2016 15:57:21 +0200
Subject: core: introduce ProtectSystem=strict

Let's tighten our sandbox a bit more: with this change ProtectSystem= gains a
new setting "strict". If set, the entire directory tree of the system is
mounted read-only, but the API file systems /proc, /dev, /sys are excluded
(they may be managed with PrivateDevices= and ProtectKernelTunables=). Also,
/home and /root are excluded as those are left for ProtectHome= to manage.

In this mode, all "real" file systems (i.e. non-API file systems) are mounted
read-only, and specific directories may only be excluded via
ReadWriteDirectories=, thus implementing an effective whitelist instead of
blacklist of writable directories.

While we are at it, also add /efi to the list of paths always affected by
ProtectSystem=. This is a follow-up for
b52a109ad38cd37b660ccd5394ff5c171a5e5355 which added /efi as alternative for
/boot. Our namespacing logic should respect that too.
---
 man/systemd.exec.xml | 33 ++++++++++++++++---------------
 src/core/namespace.c | 56 +++++++++++++++++++++++++++++++++++++++++++---------
 src/core/namespace.h |  1 +
 3 files changed, 65 insertions(+), 25 deletions(-)

(limited to 'man')

diff --git a/man/systemd.exec.xml b/man/systemd.exec.xml
index 07128b489e..1b672fe0c9 100644
--- a/man/systemd.exec.xml
+++ b/man/systemd.exec.xml
@@ -1020,22 +1020,23 @@
       <varlistentry>
         <term><varname>ProtectSystem=</varname></term>
 
-        <listitem><para>Takes a boolean argument or
-        <literal>full</literal>. If true, mounts the
-        <filename>/usr</filename> and <filename>/boot</filename>
-        directories read-only for processes invoked by this unit. If
-        set to <literal>full</literal>, the <filename>/etc</filename>
-        directory is mounted read-only, too. This setting ensures that
-        any modification of the vendor-supplied operating system (and
-        optionally its configuration) is prohibited for the service.
-        It is recommended to enable this setting for all long-running
-        services, unless they are involved with system updates or need
-        to modify the operating system in other ways. Note however
-        that processes retaining the CAP_SYS_ADMIN capability can undo
-        the effect of this setting. This setting is hence particularly
-        useful for daemons which have this capability removed, for
-        example with <varname>CapabilityBoundingSet=</varname>.
-        Defaults to off.</para></listitem>
+        <listitem><para>Takes a boolean argument or the special values <literal>full</literal> or
+        <literal>strict</literal>. If true, mounts the <filename>/usr</filename> and <filename>/boot</filename>
+        directories read-only for processes invoked by this unit. If set to <literal>full</literal>, the
+        <filename>/etc</filename> directory is mounted read-only, too. If set to <literal>strict</literal> the entire
+        file system hierarchy is mounted read-only, except for the API file system subtrees <filename>/dev</filename>,
+        <filename>/proc</filename> and <filename>/sys</filename> (protect these directories using
+        <varname>PrivateDevices=</varname>, <varname>ProtectKernelTunables=</varname>,
+        <varname>ProtectControlGroups=</varname>). This setting ensures that any modification of the vendor-supplied
+        operating system (and optionally its configuration, and local mounts) is prohibited for the service.  It is
+        recommended to enable this setting for all long-running services, unless they are involved with system updates
+        or need to modify the operating system in other ways. If this option is used,
+        <varname>ReadWritePaths=</varname> may be used to exclude specific directories from being made read-only. Note
+        that processes retaining the <constant>CAP_SYS_ADMIN</constant> capability (and with no system call filter that
+        prohibits mount-related system calls applied) can undo the effect of this setting. This setting is hence
+        particularly useful for daemons which either have the <literal>@mount</literal> set filtered using
+        <varname>SystemCallFilter=</varname>, or have the <constant>CAP_SYS_ADMIN</constant> capability removed, for
+        example with <varname>CapabilityBoundingSet=</varname>.  Defaults to off.</para></listitem>
       </varlistentry>
 
       <varlistentry>
diff --git a/src/core/namespace.c b/src/core/namespace.c
index e08d7459c5..498cd139bf 100644
--- a/src/core/namespace.c
+++ b/src/core/namespace.c
@@ -472,9 +472,11 @@ int setup_namespace(
                 private_dev +
                 (protect_sysctl ? 3 : 0) +
                 (protect_cgroups != protect_sysctl) +
-                (protect_home != PROTECT_HOME_NO ? 3 : 0) +
-                (protect_system != PROTECT_SYSTEM_NO ? 2 : 0) +
-                (protect_system == PROTECT_SYSTEM_FULL ? 1 : 0);
+                (protect_home != PROTECT_HOME_NO || protect_system == PROTECT_SYSTEM_STRICT ? 3 : 0) +
+                (protect_system == PROTECT_SYSTEM_STRICT ?
+                 (2 + !private_dev + !protect_sysctl) :
+                 ((protect_system != PROTECT_SYSTEM_NO ? 3 : 0) +
+                  (protect_system == PROTECT_SYSTEM_FULL ? 1 : 0)));
 
         if (n > 0) {
                 m = mounts = (BindMount *) alloca0(n * sizeof(BindMount));
@@ -529,9 +531,13 @@ int setup_namespace(
                         m++;
                 }
 
-                if (protect_home != PROTECT_HOME_NO) {
+                if (protect_home != PROTECT_HOME_NO || protect_system == PROTECT_SYSTEM_STRICT) {
                         const char *home_dir, *run_user_dir, *root_dir;
 
+                        /* If protection of $HOME and $XDG_RUNTIME_DIR is requested, then go for it. If we are in
+                         * strict system protection mode, then also add entries for these directories, but mark them
+                         * writable. This is because we want ProtectHome= and ProtectSystem= to be fully orthogonal. */
+
                         home_dir = prefix_roota(root_directory, "/home");
                         home_dir = strjoina("-", home_dir);
                         run_user_dir = prefix_roota(root_directory, "/run/user");
@@ -540,22 +546,53 @@ int setup_namespace(
                         root_dir = strjoina("-", root_dir);
 
                         r = append_mounts(&m, STRV_MAKE(home_dir, run_user_dir, root_dir),
-                                protect_home == PROTECT_HOME_READ_ONLY ? READONLY : INACCESSIBLE);
+                                protect_home == PROTECT_HOME_READ_ONLY ? READONLY :
+                                protect_home == PROTECT_HOME_YES ? INACCESSIBLE : READWRITE);
                         if (r < 0)
                                 return r;
                 }
 
-                if (protect_system != PROTECT_SYSTEM_NO) {
-                        const char *usr_dir, *boot_dir, *etc_dir;
+                if (protect_system == PROTECT_SYSTEM_STRICT) {
+                        /* In strict mode, we mount everything read-only, except for /proc, /dev, /sys which are the
+                         * kernel API VFS, which are left writable, but PrivateDevices= + ProtectKernelTunables=
+                         * protect those, and these options should be fully orthogonal. (And of course /home and
+                         * friends are also left writable, as ProtectHome= shall manage those, orthogonally, see
+                         * above). */
+
+                        m->path = prefix_roota(root_directory, "/");
+                        m->mode = READONLY;
+                        m++;
+
+                        m->path = prefix_roota(root_directory, "/proc");
+                        m->mode = READWRITE;
+                        m++;
+
+                        if (!private_dev) {
+                                m->path = prefix_roota(root_directory, "/dev");
+                                m->mode = READWRITE;
+                                m++;
+                        }
+                        if (!protect_sysctl) {
+                                m->path = prefix_roota(root_directory, "/sys");
+                                m->mode = READWRITE;
+                                m++;
+                        }
+
+                } else if (protect_system != PROTECT_SYSTEM_NO) {
+                        const char *usr_dir, *boot_dir, *efi_dir, *etc_dir;
+
+                        /* In any other mode we simply mark the relevant three directories read-only. */
 
                         usr_dir = prefix_roota(root_directory, "/usr");
                         boot_dir = prefix_roota(root_directory, "/boot");
                         boot_dir = strjoina("-", boot_dir);
+                        efi_dir = prefix_roota(root_directory, "/efi");
+                        efi_dir = strjoina("-", efi_dir);
                         etc_dir = prefix_roota(root_directory, "/etc");
 
                         r = append_mounts(&m, protect_system == PROTECT_SYSTEM_FULL
-                                ? STRV_MAKE(usr_dir, boot_dir, etc_dir)
-                                : STRV_MAKE(usr_dir, boot_dir), READONLY);
+                                          ? STRV_MAKE(usr_dir, boot_dir, efi_dir, etc_dir)
+                                          : STRV_MAKE(usr_dir, boot_dir, efi_dir), READONLY);
                         if (r < 0)
                                 return r;
                 }
@@ -780,6 +817,7 @@ static const char *const protect_system_table[_PROTECT_SYSTEM_MAX] = {
         [PROTECT_SYSTEM_NO] = "no",
         [PROTECT_SYSTEM_YES] = "yes",
         [PROTECT_SYSTEM_FULL] = "full",
+        [PROTECT_SYSTEM_STRICT] = "strict",
 };
 
 DEFINE_STRING_TABLE_LOOKUP(protect_system, ProtectSystem);
diff --git a/src/core/namespace.h b/src/core/namespace.h
index 3845336287..6505bcc499 100644
--- a/src/core/namespace.h
+++ b/src/core/namespace.h
@@ -35,6 +35,7 @@ typedef enum ProtectSystem {
         PROTECT_SYSTEM_NO,
         PROTECT_SYSTEM_YES,
         PROTECT_SYSTEM_FULL,
+        PROTECT_SYSTEM_STRICT,
         _PROTECT_SYSTEM_MAX,
         _PROTECT_SYSTEM_INVALID = -1
 } ProtectSystem;
-- 
cgit v1.2.3-54-g00ecf


From 63bb64a056113d4be5fefb16604accf08c8c204a Mon Sep 17 00:00:00 2001
From: Lennart Poettering <lennart@poettering.net>
Date: Thu, 25 Aug 2016 16:12:46 +0200
Subject: core: imply ProtectHome=read-only and ProtectSystem=strict if
 DynamicUser=1

Let's make sure that services that use DynamicUser=1 cannot leave files in the
file system should the system accidentally have a world-writable directory
somewhere.

This effectively ensures that directories need to be whitelisted rather than
blacklisted for access when DynamicUser=1 is set.
---
 man/systemd.exec.xml | 12 ++++++++----
 src/core/unit.c      |  6 ++++++
 2 files changed, 14 insertions(+), 4 deletions(-)

(limited to 'man')

diff --git a/man/systemd.exec.xml b/man/systemd.exec.xml
index 1b672fe0c9..e4d9c0ef1b 100644
--- a/man/systemd.exec.xml
+++ b/man/systemd.exec.xml
@@ -160,14 +160,18 @@
         use. However, UID/GIDs are recycled after a unit is terminated. Care should be taken that any processes running
         as part of a unit for which dynamic users/groups are enabled do not leave files or directories owned by these
         users/groups around, as a different unit might get the same UID/GID assigned later on, and thus gain access to
-        these files or directories. If <varname>DynamicUser=</varname> is enabled, <varname>RemoveIPC=</varname> and
+        these files or directories. If <varname>DynamicUser=</varname> is enabled, <varname>RemoveIPC=</varname>,
         <varname>PrivateTmp=</varname> are implied. This ensures that the lifetime of IPC objects and temporary files
         created by the executed processes is bound to the runtime of the service, and hence the lifetime of the dynamic
         user/group. Since <filename>/tmp</filename> and <filename>/var/tmp</filename> are usually the only
         world-writable directories on a system this ensures that a unit making use of dynamic user/group allocation
-        cannot leave files around after unit termination. Use <varname>RuntimeDirectory=</varname> (see below) in order
-        to assign a writable runtime directory to a service, owned by the dynamic user/group and removed automatically
-        when the unit is terminated. Defaults to off.</para></listitem>
+        cannot leave files around after unit termination. Moreover <varname>ProtectSystem=strict</varname> and
+        <varname>ProtectHome=read-only</varname> are implied, thus prohibiting the service to write to arbitrary file
+        system locations. In order to allow the service to write to certain directories, they have to be whitelisted
+        using <varname>ReadWritePaths=</varname>, but care must be taken so that UID/GID recycling doesn't
+        create security issues involving files created by the service. Use <varname>RuntimeDirectory=</varname> (see
+        below) in order to assign a writable runtime directory to a service, owned by the dynamic user/group and
+        removed automatically when the unit is terminated. Defaults to off.</para></listitem>
       </varlistentry>
 
       <varlistentry>
diff --git a/src/core/unit.c b/src/core/unit.c
index de22f657c6..5d284a359d 100644
--- a/src/core/unit.c
+++ b/src/core/unit.c
@@ -3377,8 +3377,14 @@ int unit_patch_contexts(Unit *u) {
                                         return -ENOMEM;
                         }
 
+                        /* If the dynamic user option is on, let's make sure that the unit can't leave its UID/GID
+                         * around in the file system or on IPC objects. Hence enforce a strict sandbox. */
+
                         ec->private_tmp = true;
                         ec->remove_ipc = true;
+                        ec->protect_system = PROTECT_SYSTEM_STRICT;
+                        if (ec->protect_home == PROTECT_HOME_NO)
+                                ec->protect_home = PROTECT_HOME_READ_ONLY;
                 }
         }
 
-- 
cgit v1.2.3-54-g00ecf


From b2656f1b1ca94fc8b6a0eb44986df78d23ff7950 Mon Sep 17 00:00:00 2001
From: Lennart Poettering <lennart@poettering.net>
Date: Fri, 26 Aug 2016 12:22:23 +0200
Subject: man: in user-facing documentation don't reference C function names

Let's drop the reference to the cap_from_name() function in the documentation
for the capabilities setting, as it is hardly helpful. Our readers are not
necessarily C hackers knowing the semantics of cap_from_name(). Moreover, the
strings we accept are just the plain capability names as listed in
capabilities(7) hence there's really no point in confusing the user with
anything else.
---
 man/systemd.exec.xml | 64 +++++++++++++++++++++-------------------------------
 1 file changed, 26 insertions(+), 38 deletions(-)

(limited to 'man')

diff --git a/man/systemd.exec.xml b/man/systemd.exec.xml
index e4d9c0ef1b..67182f17dc 100644
--- a/man/systemd.exec.xml
+++ b/man/systemd.exec.xml
@@ -821,49 +821,37 @@
         <listitem><para>Controls which capabilities to include in the capability bounding set for the executed
         process. See <citerefentry
         project='man-pages'><refentrytitle>capabilities</refentrytitle><manvolnum>7</manvolnum></citerefentry> for
-        details. Takes a whitespace-separated list of capability names as read by <citerefentry
-        project='mankier'><refentrytitle>cap_from_name</refentrytitle><manvolnum>3</manvolnum></citerefentry>,
-        e.g. <constant>CAP_SYS_ADMIN</constant>, <constant>CAP_DAC_OVERRIDE</constant>,
-        <constant>CAP_SYS_PTRACE</constant>. Capabilities listed will be included in the bounding set, all others are
-        removed. If the list of capabilities is prefixed with <literal>~</literal>, all but the listed capabilities
-        will be included, the effect of the assignment inverted. Note that this option also affects the respective
-        capabilities in the effective, permitted and inheritable capability sets. If this option is not used, the
-        capability bounding set is not modified on process execution, hence no limits on the capabilities of the
-        process are enforced. This option may appear more than once, in which case the bounding sets are merged. If the
-        empty string is assigned to this option, the bounding set is reset to the empty capability set, and all prior
-        settings have no effect.  If set to <literal>~</literal> (without any further argument), the bounding set is
-        reset to the full set of available capabilities, also undoing any previous settings. This does not affect
-        commands prefixed with <literal>+</literal>.</para></listitem>
+        details. Takes a whitespace-separated list of capability names, e.g. <constant>CAP_SYS_ADMIN</constant>,
+        <constant>CAP_DAC_OVERRIDE</constant>, <constant>CAP_SYS_PTRACE</constant>. Capabilities listed will be
+        included in the bounding set, all others are removed. If the list of capabilities is prefixed with
+        <literal>~</literal>, all but the listed capabilities will be included, the effect of the assignment
+        inverted. Note that this option also affects the respective capabilities in the effective, permitted and
+        inheritable capability sets. If this option is not used, the capability bounding set is not modified on process
+        execution, hence no limits on the capabilities of the process are enforced. This option may appear more than
+        once, in which case the bounding sets are merged. If the empty string is assigned to this option, the bounding
+        set is reset to the empty capability set, and all prior settings have no effect.  If set to
+        <literal>~</literal> (without any further argument), the bounding set is reset to the full set of available
+        capabilities, also undoing any previous settings. This does not affect commands prefixed with
+        <literal>+</literal>.</para></listitem>
       </varlistentry>
 
       <varlistentry>
         <term><varname>AmbientCapabilities=</varname></term>
 
-        <listitem><para>Controls which capabilities to include in the
-        ambient capability set for the executed process. Takes a
-        whitespace-separated list of capability names as read by
-        <citerefentry project='mankier'><refentrytitle>cap_from_name</refentrytitle><manvolnum>3</manvolnum></citerefentry>,
-        e.g. <constant>CAP_SYS_ADMIN</constant>,
-        <constant>CAP_DAC_OVERRIDE</constant>,
-        <constant>CAP_SYS_PTRACE</constant>. This option may appear more than
-        once in which case the ambient capability sets are merged.
-        If the list of capabilities is prefixed with <literal>~</literal>, all
-        but the listed capabilities will be included, the effect of the
-        assignment inverted. If the empty string is
-        assigned to this option, the ambient capability set is reset to
-        the empty capability set, and all prior settings have no effect.
-        If set to <literal>~</literal> (without any further argument), the
-        ambient capability set is reset to the full set of available
-        capabilities, also undoing any previous settings. Note that adding
-        capabilities to ambient capability set adds them to the process's
-        inherited capability set.
-        </para><para>
-        Ambient capability sets are useful if you want to execute a process
-        as a non-privileged user but still want to give it some capabilities.
-        Note that in this case option <constant>keep-caps</constant> is
-        automatically added to <varname>SecureBits=</varname> to retain the
-        capabilities over the user change. <varname>AmbientCapabilities=</varname> does not affect
-        commands prefixed with <literal>+</literal>.</para></listitem>
+        <listitem><para>Controls which capabilities to include in the ambient capability set for the executed
+        process. Takes a whitespace-separated list of capability names, e.g. <constant>CAP_SYS_ADMIN</constant>,
+        <constant>CAP_DAC_OVERRIDE</constant>, <constant>CAP_SYS_PTRACE</constant>. This option may appear more than
+        once in which case the ambient capability sets are merged.  If the list of capabilities is prefixed with
+        <literal>~</literal>, all but the listed capabilities will be included, the effect of the assignment
+        inverted. If the empty string is assigned to this option, the ambient capability set is reset to the empty
+        capability set, and all prior settings have no effect.  If set to <literal>~</literal> (without any further
+        argument), the ambient capability set is reset to the full set of available capabilities, also undoing any
+        previous settings. Note that adding capabilities to ambient capability set adds them to the process's inherited
+        capability set.  </para><para> Ambient capability sets are useful if you want to execute a process as a
+        non-privileged user but still want to give it some capabilities.  Note that in this case option
+        <constant>keep-caps</constant> is automatically added to <varname>SecureBits=</varname> to retain the
+        capabilities over the user change. <varname>AmbientCapabilities=</varname> does not affect commands prefixed
+        with <literal>+</literal>.</para></listitem>
       </varlistentry>
 
       <varlistentry>
-- 
cgit v1.2.3-54-g00ecf


From effbd6d2eadb61bd236d118afc7901940c4c6b37 Mon Sep 17 00:00:00 2001
From: Lennart Poettering <lennart@poettering.net>
Date: Fri, 26 Aug 2016 12:24:37 +0200
Subject: man: rework documentation for ReadOnlyPaths= and related settings

This reworks the documentation for ReadOnlyPaths=, ReadWritePaths=,
InaccessiblePaths=. It no longer claims that we'd follow symlinks relative to
the host file system. (Which wasn't true actually, as we didn't follow symlinks
at all in the most recent releases, and we now do follow them, but relative to
RootDirectory=).

This also replaces all references to the fact that all fs namespacing options
can be undone with enough privileges and disable propagation by a single one in
the documentation of ReadOnlyPaths= and friends, and then directs the reader to
this in all other places.

Moreover a hint is added to the documentation of SystemCallFilter=, suggesting
usage of ~@mount in case any of the fs namespacing related options are used.
---
 man/systemd.exec.xml | 214 ++++++++++++++++++++++-----------------------------
 1 file changed, 92 insertions(+), 122 deletions(-)

(limited to 'man')

diff --git a/man/systemd.exec.xml b/man/systemd.exec.xml
index 67182f17dc..84f81fe38e 100644
--- a/man/systemd.exec.xml
+++ b/man/systemd.exec.xml
@@ -877,48 +877,34 @@
         <term><varname>ReadOnlyPaths=</varname></term>
         <term><varname>InaccessiblePaths=</varname></term>
 
-        <listitem><para>Sets up a new file system namespace for
-        executed processes. These options may be used to limit access
-        a process might have to the main file system hierarchy. Each
-        setting takes a space-separated list of paths relative to
-        the host's root directory (i.e. the system running the service manager).
-        Note that if entries contain symlinks, they are resolved from the host's root directory as well.
-        Entries (files or directories) listed in
-        <varname>ReadWritePaths=</varname> are accessible from
-        within the namespace with the same access rights as from
-        outside. Entries listed in
-        <varname>ReadOnlyPaths=</varname> are accessible for
-        reading only, writing will be refused even if the usual file
-        access controls would permit this. Entries listed in
-        <varname>InaccessiblePaths=</varname> will be made
-        inaccessible for processes inside the namespace, and may not
-        countain any other mountpoints, including those specified by
-        <varname>ReadWritePaths=</varname> or
-        <varname>ReadOnlyPaths=</varname>.
-        Note that restricting access with these options does not extend
-        to submounts of a directory that are created later on.
-        Non-directory paths can be specified as well. These
-        options may be specified more than once, in which case all
-        paths listed will have limited access from within the
-        namespace. If the empty string is assigned to this option, the
-        specific list is reset, and all prior assignments have no
-        effect.</para>
-        <para>Paths in
-        <varname>ReadOnlyPaths=</varname>
-        and
-        <varname>InaccessiblePaths=</varname>
-        may be prefixed with
-        <literal>-</literal>, in which case
-        they will be ignored when they do not
-        exist. Note that using this
-        setting will disconnect propagation of
-        mounts from the service to the host
-        (propagation in the opposite direction
-        continues to work). This means that
-        this setting may not be used for
-        services which shall be able to
-        install mount points in the main mount
-        namespace.</para></listitem>
+        <listitem><para>Sets up a new file system namespace for executed processes. These options may be used to limit
+        access a process might have to the file system hierarchy. Each setting takes a space-separated list of paths
+        relative to the host's root directory (i.e. the system running the service manager).  Note that if paths
+        contain symlinks, they are resolved relative to the root directory set with
+        <varname>RootDirectory=</varname>.</para>
+
+        <para>Paths listed in <varname>ReadWritePaths=</varname> are accessible from within the namespace with the same
+        access modes as from outside of it. Paths listed in <varname>ReadOnlyPaths=</varname> are accessible for
+        reading only, writing will be refused even if the usual file access controls would permit this. Nest
+        <varname>ReadWritePaths=</varname> inside of <varname>ReadOnlyPaths=</varname> in order to provide writable
+        subdirectories within read-only directories. Use <varname>ReadWritePaths=</varname> in order to whitelist
+        specific paths for write access if <varname>ProtectSystem=strict</varname> is used. Paths listed in
+        <varname>InaccessiblePaths=</varname> will be made inaccessible for processes inside the namespace (along with
+        everything below them in the file system hierarchy).</para>
+
+        <para>Note that restricting access with these options does not extend to submounts of a directory that are
+        created later on.  Non-directory paths may be specified as well. These options may be specified more than once,
+        in which case all paths listed will have limited access from within the namespace. If the empty string is
+        assigned to this option, the specific list is reset, and all prior assignments have no effect.</para>
+
+        <para>Paths in <varname>ReadOnlyPaths=</varname> and <varname>InaccessiblePaths=</varname> may be prefixed with
+        <literal>-</literal>, in which case they will be ignored when they do not exist. Note that using this setting
+        will disconnect propagation of mounts from the service to the host (propagation in the opposite direction
+        continues to work). This means that this setting may not be used for services which shall be able to install
+        mount points in the main mount namespace. Note that the effect of these settings may be undone by privileged
+        processes. In order to set up an effective sandboxed environment for a unit it is thus recommended to combine
+        these settings with either <varname>CapabilityBoundingSet=~CAP_SYS_ADMIN</varname> or
+        <varname>SystemCallFilter=~@mount</varname>.</para></listitem>
       </varlistentry>
 
       <varlistentry>
@@ -933,37 +919,30 @@
         private <filename>/tmp</filename> and <filename>/var/tmp</filename> namespace by using the
         <varname>JoinsNamespaceOf=</varname> directive, see
         <citerefentry><refentrytitle>systemd.unit</refentrytitle><manvolnum>5</manvolnum></citerefentry> for
-        details. Note that using this setting will disconnect propagation of mounts from the service to the host
-        (propagation in the opposite direction continues to work).  This means that this setting may not be used for
-        services which shall be able to install mount points in the main mount namespace. This setting is implied if
-        <varname>DynamicUser=</varname> is set.</para></listitem>
+        details. This setting is implied if <varname>DynamicUser=</varname> is set. For this setting the same
+        restrictions regarding mount propagation and privileges apply as for <varname>ReadOnlyPaths=</varname> and
+        related calls, see above.</para></listitem>
+
       </varlistentry>
 
       <varlistentry>
         <term><varname>PrivateDevices=</varname></term>
 
-        <listitem><para>Takes a boolean argument. If true, sets up a
-        new /dev namespace for the executed processes and only adds
-        API pseudo devices such as <filename>/dev/null</filename>,
-        <filename>/dev/zero</filename> or
-        <filename>/dev/random</filename> (as well as the pseudo TTY
-        subsystem) to it, but no physical devices such as
-        <filename>/dev/sda</filename>. This is useful to securely turn
-        off physical device access by the executed process. Defaults
-        to false. Enabling this option will also remove
-        <constant>CAP_MKNOD</constant> from the capability bounding
-        set for the unit (see above), and set
-        <varname>DevicePolicy=closed</varname> (see
+        <listitem><para>Takes a boolean argument. If true, sets up a new /dev namespace for the executed processes and
+        only adds API pseudo devices such as <filename>/dev/null</filename>, <filename>/dev/zero</filename> or
+        <filename>/dev/random</filename> (as well as the pseudo TTY subsystem) to it, but no physical devices such as
+        <filename>/dev/sda</filename>. This is useful to securely turn off physical device access by the executed
+        process. Defaults to false. Enabling this option will also remove <constant>CAP_MKNOD</constant> from the
+        capability bounding set for the unit (see above), and set <varname>DevicePolicy=closed</varname> (see
         <citerefentry><refentrytitle>systemd.resource-control</refentrytitle><manvolnum>5</manvolnum></citerefentry>
-        for details). Note that using this setting will disconnect
-        propagation of mounts from the service to the host
-        (propagation in the opposite direction continues to work).
-        This means that this setting may not be used for services
-        which shall be able to install mount points in the main mount
-        namespace. The /dev namespace will be mounted read-only and 'noexec'.
-        The latter may break old programs which try to set up executable
-        memory by using <citerefentry><refentrytitle>mmap</refentrytitle><manvolnum>2</manvolnum></citerefentry>
-        of <filename>/dev/zero</filename> instead of using <constant>MAP_ANON</constant>.</para></listitem>
+        for details). Note that using this setting will disconnect propagation of mounts from the service to the host
+        (propagation in the opposite direction continues to work).  This means that this setting may not be used for
+        services which shall be able to install mount points in the main mount namespace. The /dev namespace will be
+        mounted read-only and 'noexec'.  The latter may break old programs which try to set up executable memory by
+        using <citerefentry><refentrytitle>mmap</refentrytitle><manvolnum>2</manvolnum></citerefentry> of
+        <filename>/dev/zero</filename> instead of using <constant>MAP_ANON</constant>. This setting is implied if
+        <varname>DynamicUser=</varname> is set. For this setting the same restrictions regarding mount propagation and
+        privileges apply as for <varname>ReadOnlyPaths=</varname> and related calls, see above.</para></listitem>
       </varlistentry>
 
       <varlistentry>
@@ -1023,33 +1002,23 @@
         operating system (and optionally its configuration, and local mounts) is prohibited for the service.  It is
         recommended to enable this setting for all long-running services, unless they are involved with system updates
         or need to modify the operating system in other ways. If this option is used,
-        <varname>ReadWritePaths=</varname> may be used to exclude specific directories from being made read-only. Note
-        that processes retaining the <constant>CAP_SYS_ADMIN</constant> capability (and with no system call filter that
-        prohibits mount-related system calls applied) can undo the effect of this setting. This setting is hence
-        particularly useful for daemons which have this either the <literal>@mount</literal> set filtered using
-        <varname>SystemCallFilter=</varname>, or have the <constant>CAP_SYS_ADMIN</constant> capability removed, for
-        example with <varname>CapabilityBoundingSet=</varname>.  Defaults to off.</para></listitem>
+        <varname>ReadWritePaths=</varname> may be used to exclude specific directories from being made read-only. This
+        setting is implied if <varname>DynamicUser=</varname> is set. For this setting the same restrictions regarding
+        mount propagation and privileges apply as for <varname>ReadOnlyPaths=</varname> and related calls, see
+        above. Defaults to off.</para></listitem>
       </varlistentry>
 
       <varlistentry>
         <term><varname>ProtectHome=</varname></term>
 
-        <listitem><para>Takes a boolean argument or
-        <literal>read-only</literal>. If true, the directories
-        <filename>/home</filename>, <filename>/root</filename> and
-        <filename>/run/user</filename>
-        are made inaccessible and empty for processes invoked by this
-        unit. If set to <literal>read-only</literal>, the three
-        directories are made read-only instead. It is recommended to
-        enable this setting for all long-running services (in
-        particular network-facing ones), to ensure they cannot get
-        access to private user data, unless the services actually
-        require access to the user's private data. Note however that
-        processes retaining the CAP_SYS_ADMIN capability can undo the
-        effect of this setting. This setting is hence particularly
-        useful for daemons which have this capability removed, for
-        example with <varname>CapabilityBoundingSet=</varname>.
-        Defaults to off.</para></listitem>
+        <listitem><para>Takes a boolean argument or <literal>read-only</literal>. If true, the directories
+        <filename>/home</filename>, <filename>/root</filename> and <filename>/run/user</filename> are made inaccessible
+        and empty for processes invoked by this unit. If set to <literal>read-only</literal>, the three directories are
+        made read-only instead. It is recommended to enable this setting for all long-running services (in particular
+        network-facing ones), to ensure they cannot get access to private user data, unless the services actually
+        require access to the user's private data. This setting is implied if <varname>DynamicUser=</varname> is
+        set. For this setting the same restrictions regarding mount propagation and privileges apply as for
+        <varname>ReadOnlyPaths=</varname> and related calls, see above.</para></listitem>
       </varlistentry>
 
       <varlistentry>
@@ -1059,48 +1028,41 @@
         <filename>/proc/sys</filename> and <filename>/sys</filename> will be made read-only to all processes of the
         unit. Usually, tunable kernel variables should only be written at boot-time, with the
         <citerefentry><refentrytitle>sysctl.d</refentrytitle><manvolnum>5</manvolnum></citerefentry> mechanism. Almost
-        no services need to write to these at runtime; it is hence recommended to turn this on for most
-        services. Defaults to off.</para></listitem>
+        no services need to write to these at runtime; it is hence recommended to turn this on for most services. For
+        this setting the same restrictions regarding mount propagation and privileges apply as for
+        <varname>ReadOnlyPaths=</varname> and related calls, see above. Defaults to off.</para></listitem>
       </varlistentry>
 
       <varlistentry>
         <term><varname>ProtectControlGroups=</varname></term>
 
-        <listitem><para>Takes a boolean argument. If true, the Linux Control Groups ("cgroups") hierarchies accessible
-        through <filename>/sys/fs/cgroup</filename> will be made read-only to all processes of the unit. Except for
-        container managers no services should require write access to the control groups hierarchies; it is hence
-        recommended to turn this on for most services. Defaults to off.</para></listitem>
+        <listitem><para>Takes a boolean argument. If true, the Linux Control Groups (<citerefentry
+        project='man-pages'><refentrytitle>cgroups</refentrytitle><manvolnum>7</manvolnum></citerefentry>) hierarchies
+        accessible through <filename>/sys/fs/cgroup</filename> will be made read-only to all processes of the
+        unit. Except for container managers no services should require write access to the control groups hierarchies;
+        it is hence recommended to turn this on for most services. For this setting the same restrictions regarding
+        mount propagation and privileges apply as for <varname>ReadOnlyPaths=</varname> and related calls, see
+        above. Defaults to off.</para></listitem>
       </varlistentry>
 
       <varlistentry>
         <term><varname>MountFlags=</varname></term>
 
-        <listitem><para>Takes a mount propagation flag:
-        <option>shared</option>, <option>slave</option> or
-        <option>private</option>, which control whether mounts in the
-        file system namespace set up for this unit's processes will
-        receive or propagate mounts or unmounts. See
-        <citerefentry project='man-pages'><refentrytitle>mount</refentrytitle><manvolnum>2</manvolnum></citerefentry>
-        for details. Defaults to <option>shared</option>. Use
-        <option>shared</option> to ensure that mounts and unmounts are
-        propagated from the host to the container and vice versa. Use
-        <option>slave</option> to run processes so that none of their
-        mounts and unmounts will propagate to the host. Use
-        <option>private</option> to also ensure that no mounts and
-        unmounts from the host will propagate into the unit processes'
-        namespace. Note that <option>slave</option> means that file
-        systems mounted on the host might stay mounted continuously in
-        the unit's namespace, and thus keep the device busy. Note that
-        the file system namespace related options
-        (<varname>PrivateTmp=</varname>,
-        <varname>PrivateDevices=</varname>,
-        <varname>ProtectSystem=</varname>,
-        <varname>ProtectHome=</varname>,
-        <varname>ReadOnlyPaths=</varname>,
-        <varname>InaccessiblePaths=</varname> and
-        <varname>ReadWritePaths=</varname>) require that mount
-        and unmount propagation from the unit's file system namespace
-        is disabled, and hence downgrade <option>shared</option> to
+        <listitem><para>Takes a mount propagation flag: <option>shared</option>, <option>slave</option> or
+        <option>private</option>, which control whether mounts in the file system namespace set up for this unit's
+        processes will receive or propagate mounts or unmounts. See <citerefentry
+        project='man-pages'><refentrytitle>mount</refentrytitle><manvolnum>2</manvolnum></citerefentry> for
+        details. Defaults to <option>shared</option>. Use <option>shared</option> to ensure that mounts and unmounts
+        are propagated from the host to the container and vice versa. Use <option>slave</option> to run processes so
+        that none of their mounts and unmounts will propagate to the host. Use <option>private</option> to also ensure
+        that no mounts and unmounts from the host will propagate into the unit processes' namespace. Note that
+        <option>slave</option> means that file systems mounted on the host might stay mounted continuously in the
+        unit's namespace, and thus keep the device busy. Note that the file system namespace related options
+        (<varname>PrivateTmp=</varname>, <varname>PrivateDevices=</varname>, <varname>ProtectSystem=</varname>,
+        <varname>ProtectHome=</varname>, <varname>ProtectKernelTunables=</varname>,
+        <varname>ProtectControlGroups=</varname>, <varname>ReadOnlyPaths=</varname>,
+        <varname>InaccessiblePaths=</varname> and <varname>ReadWritePaths=</varname>) require that mount and unmount
+        propagation from the unit's file system namespace is disabled, and hence downgrade <option>shared</option> to
         <option>slave</option>. </para></listitem>
       </varlistentry>
 
@@ -1335,7 +1297,15 @@
         </table>
 
         Note, that as new system calls are added to the kernel, additional system calls might be added to the groups
-        above, so the contents of the sets may change between systemd versions.</para></listitem>
+        above, so the contents of the sets may change between systemd versions.</para>
+
+        <para>It is recommended to combine the file system namespacing related options with
+        <varname>SystemCallFilter=~@mount</varname>, in order to prohibit the unit's processes from undoing the
+        mappings. Specifically these are the options <varname>PrivateTmp=</varname>,
+        <varname>PrivateDevices=</varname>, <varname>ProtectSystem=</varname>, <varname>ProtectHome=</varname>,
+        <varname>ProtectKernelTunables=</varname>, <varname>ProtectControlGroups=</varname>,
+        <varname>ReadOnlyPaths=</varname>, <varname>InaccessiblePaths=</varname> and
+        <varname>ReadWritePaths=</varname>.</para></listitem>
       </varlistentry>
 
       <varlistentry>
-- 
cgit v1.2.3-54-g00ecf


From 81c8aceed4a0cabd605788e46a266cc4cefdc16a Mon Sep 17 00:00:00 2001
From: Lennart Poettering <lennart@poettering.net>
Date: Fri, 26 Aug 2016 12:29:28 +0200
Subject: man: the exit code/signal is stored in $EXIT_CODE, not $EXIT_STATUS

---
 man/systemd.exec.xml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'man')

diff --git a/man/systemd.exec.xml b/man/systemd.exec.xml
index 84f81fe38e..6811e7cc53 100644
--- a/man/systemd.exec.xml
+++ b/man/systemd.exec.xml
@@ -1612,8 +1612,8 @@
         <varname>ExecStop=</varname> and <varname>ExecStopPost=</varname> processes, and encodes the service
         "result". Currently, the following values are defined: <literal>timeout</literal> (in case of an operation
         timeout), <literal>exit-code</literal> (if a service process exited with a non-zero exit code; see
-        <varname>$EXIT_STATUS</varname> below for the actual exit status returned), <literal>signal</literal> (if a
-        service process was terminated abnormally by a signal; see <varname>$EXIT_STATUS</varname> below for the actual
+        <varname>$EXIT_CODE</varname> below for the actual exit code returned), <literal>signal</literal> (if a
+        service process was terminated abnormally by a signal; see <varname>$EXIT_CODE</varname> below for the actual
         signal used for the termination), <literal>core-dump</literal> (if a service process terminated abnormally and
         dumped core), <literal>watchdog</literal> (if the watchdog keep-alive ping was enabled for the service but it
         missed the deadline), or <literal>resources</literal> (a catch-all condition in case a system operation
-- 
cgit v1.2.3-54-g00ecf


From 6757c06a1a8dd3755338ca76e598e0d81dc164f2 Mon Sep 17 00:00:00 2001
From: Lennart Poettering <lennart@poettering.net>
Date: Fri, 26 Aug 2016 12:29:52 +0200
Subject: man: shorten the exit status table a bit

Let's merge a couple of columns, to make the table a bit shorter. This
effectively just drops whitespace, not contents, but makes the currently
humungous table much much more compact.
---
 man/systemd.exec.xml | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

(limited to 'man')

diff --git a/man/systemd.exec.xml b/man/systemd.exec.xml
index 6811e7cc53..403aa471c8 100644
--- a/man/systemd.exec.xml
+++ b/man/systemd.exec.xml
@@ -1658,32 +1658,32 @@
               <row>
                 <entry morerows="1" valign="top"><literal>timeout</literal></entry>
                 <entry valign="top"><literal>killed</literal></entry>
-                <entry><literal>TERM</literal><sbr/><literal>KILL</literal></entry>
+                <entry><literal>TERM</literal>, <literal>KILL</literal></entry>
               </row>
 
               <row>
                 <entry valign="top"><literal>exited</literal></entry>
-                <entry><literal>0</literal><sbr/><literal>1</literal><sbr/><literal>2</literal><sbr/><literal
-                >3</literal><sbr/>…<sbr/><literal>255</literal></entry>
+                <entry><literal>0</literal>, <literal>1</literal>, <literal>2</literal>, <literal
+                >3</literal>, …, <literal>255</literal></entry>
               </row>
 
               <row>
                 <entry valign="top"><literal>exit-code</literal></entry>
                 <entry valign="top"><literal>exited</literal></entry>
-                <entry><literal>0</literal><sbr/><literal>1</literal><sbr/><literal>2</literal><sbr/><literal
-                >3</literal><sbr/>…<sbr/><literal>255</literal></entry>
+                <entry><literal>0</literal>, <literal>1</literal>, <literal>2</literal>, <literal
+                >3</literal>, …, <literal>255</literal></entry>
               </row>
 
               <row>
                 <entry valign="top"><literal>signal</literal></entry>
                 <entry valign="top"><literal>killed</literal></entry>
-                <entry><literal>HUP</literal><sbr/><literal>INT</literal><sbr/><literal>KILL</literal><sbr/>…</entry>
+                <entry><literal>HUP</literal>, <literal>INT</literal>, <literal>KILL</literal>, …</entry>
               </row>
 
               <row>
                 <entry valign="top"><literal>core-dump</literal></entry>
                 <entry valign="top"><literal>dumped</literal></entry>
-                <entry><literal>ABRT</literal><sbr/><literal>SEGV</literal><sbr/><literal>QUIT</literal><sbr/>…</entry>
+                <entry><literal>ABRT</literal>, <literal>SEGV</literal>, <literal>QUIT</literal>, …</entry>
               </row>
 
               <row>
@@ -1693,12 +1693,12 @@
               </row>
               <row>
                 <entry><literal>killed</literal></entry>
-                <entry><literal>TERM</literal><sbr/><literal>KILL</literal></entry>
+                <entry><literal>TERM</literal>, <literal>KILL</literal></entry>
               </row>
               <row>
                 <entry><literal>exited</literal></entry>
-                <entry><literal>0</literal><sbr/><literal>1</literal><sbr/><literal>2</literal><sbr/><literal
-                >3</literal><sbr/>…<sbr/><literal>255</literal></entry>
+                <entry><literal>0</literal>, <literal>1</literal>, <literal>2</literal>, <literal
+                >3</literal>, …, <literal>255</literal></entry>
               </row>
 
               <row>
-- 
cgit v1.2.3-54-g00ecf


From e778185bb55320e8242b57c19079377fe33e01bc Mon Sep 17 00:00:00 2001
From: Djalal Harouni <tixxdz@opendz.org>
Date: Mon, 19 Sep 2016 21:46:17 +0200
Subject: doc: documentation fixes for ReadWritePaths= and
 ProtectKernelTunables=

Documentation fixes for ReadWritePaths= and ProtectKernelTunables=
as reported by Evgeny Vereshchagin.
---
 man/systemd.exec.xml | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

(limited to 'man')

diff --git a/man/systemd.exec.xml b/man/systemd.exec.xml
index 403aa471c8..79ceee3ec0 100644
--- a/man/systemd.exec.xml
+++ b/man/systemd.exec.xml
@@ -897,14 +897,14 @@
         in which case all paths listed will have limited access from within the namespace. If the empty string is
         assigned to this option, the specific list is reset, and all prior assignments have no effect.</para>
 
-        <para>Paths in <varname>ReadOnlyPaths=</varname> and <varname>InaccessiblePaths=</varname> may be prefixed with
-        <literal>-</literal>, in which case they will be ignored when they do not exist. Note that using this setting
-        will disconnect propagation of mounts from the service to the host (propagation in the opposite direction
-        continues to work). This means that this setting may not be used for services which shall be able to install
-        mount points in the main mount namespace. Note that the effect of these settings may be undone by privileged
-        processes. In order to set up an effective sandboxed environment for a unit it is thus recommended to combine
-        these settings with either <varname>CapabilityBoundingSet=~CAP_SYS_ADMIN</varname> or
-        <varname>SystemCallFilter=~@mount</varname>.</para></listitem>
+        <para>Paths in <varname>ReadWritePaths=</varname>, <varname>ReadOnlyPaths=</varname> and
+        <varname>InaccessiblePaths=</varname> may be prefixed with <literal>-</literal>, in which case they will be ignored
+        when they do not exist. Note that using this setting will disconnect propagation of mounts from the service to
+        the host (propagation in the opposite direction continues to work). This means that this setting may not be used
+        for services which shall be able to install mount points in the main mount namespace. Note that the effect of
+        these settings may be undone by privileged processes. In order to set up an effective sandboxed environment for
+        a unit it is thus recommended to combine these settings with either
+        <varname>CapabilityBoundingSet=~CAP_SYS_ADMIN</varname> or <varname>SystemCallFilter=~@mount</varname>.</para></listitem>
       </varlistentry>
 
       <varlistentry>
@@ -1025,11 +1025,11 @@
         <term><varname>ProtectKernelTunables=</varname></term>
 
         <listitem><para>Takes a boolean argument. If true, kernel variables accessible through
-        <filename>/proc/sys</filename> and <filename>/sys</filename> will be made read-only to all processes of the
-        unit. Usually, tunable kernel variables should only be written at boot-time, with the
-        <citerefentry><refentrytitle>sysctl.d</refentrytitle><manvolnum>5</manvolnum></citerefentry> mechanism. Almost
-        no services need to write to these at runtime; it is hence recommended to turn this on for most services. For
-        this setting the same restrictions regarding mount propagation and privileges apply as for
+        <filename>/proc/sys</filename>, <filename>/sys</filename> and <filename>/proc/sysrq-trigger</filename> will be
+        made read-only to all processes of the unit. Usually, tunable kernel variables should only be written at
+        boot-time, with the <citerefentry><refentrytitle>sysctl.d</refentrytitle><manvolnum>5</manvolnum></citerefentry>
+        mechanism. Almost no services need to write to these at runtime; it is hence recommended to turn this on for
+        most services. For this setting the same restrictions regarding mount propagation and privileges apply as for
         <varname>ReadOnlyPaths=</varname> and related calls, see above. Defaults to off.</para></listitem>
       </varlistentry>
 
-- 
cgit v1.2.3-54-g00ecf


From 9221aec8d09f3b55a08fcbe8012e48129474ab54 Mon Sep 17 00:00:00 2001
From: Djalal Harouni <tixxdz@opendz.org>
Date: Mon, 19 Sep 2016 21:46:17 +0200
Subject: doc: explicitly document that /dev/mem and /dev/port are blocked by
 PrivateDevices=true

---
 man/systemd.exec.xml | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'man')

diff --git a/man/systemd.exec.xml b/man/systemd.exec.xml
index 79ceee3ec0..a3a431c82b 100644
--- a/man/systemd.exec.xml
+++ b/man/systemd.exec.xml
@@ -931,9 +931,10 @@
         <listitem><para>Takes a boolean argument. If true, sets up a new /dev namespace for the executed processes and
         only adds API pseudo devices such as <filename>/dev/null</filename>, <filename>/dev/zero</filename> or
         <filename>/dev/random</filename> (as well as the pseudo TTY subsystem) to it, but no physical devices such as
-        <filename>/dev/sda</filename>. This is useful to securely turn off physical device access by the executed
-        process. Defaults to false. Enabling this option will also remove <constant>CAP_MKNOD</constant> from the
-        capability bounding set for the unit (see above), and set <varname>DevicePolicy=closed</varname> (see
+        <filename>/dev/sda</filename>, system memory <filename>/dev/mem</filename>, system ports
+        <filename>/dev/port</filename> and others. This is useful to securely turn off physical device access by the
+        executed process. Defaults to false. Enabling this option will also remove <constant>CAP_MKNOD</constant> from
+        the capability bounding set for the unit (see above), and set <varname>DevicePolicy=closed</varname> (see
         <citerefentry><refentrytitle>systemd.resource-control</refentrytitle><manvolnum>5</manvolnum></citerefentry>
         for details). Note that using this setting will disconnect propagation of mounts from the service to the host
         (propagation in the opposite direction continues to work).  This means that this setting may not be used for
-- 
cgit v1.2.3-54-g00ecf


From 49accde7bd915944d99c947dca0cf26ae0f24165 Mon Sep 17 00:00:00 2001
From: Djalal Harouni <tixxdz@opendz.org>
Date: Sun, 25 Sep 2016 11:30:11 +0200
Subject: core:sandbox: add more /proc/* entries to ProtectKernelTunables=

Make ALSA entries, latency interface, mtrr, apm/acpi, suspend interface,
filesystems configuration and IRQ tuning readonly.

Most of these interfaces nowadays should be in /sys but they are still
available through /proc, so just protect them. This patch does not touch
/proc/net/...
---
 man/systemd.exec.xml |  6 ++++--
 src/core/namespace.c | 11 +++++++++++
 2 files changed, 15 insertions(+), 2 deletions(-)

(limited to 'man')

diff --git a/man/systemd.exec.xml b/man/systemd.exec.xml
index a3a431c82b..f19e7f6ee9 100644
--- a/man/systemd.exec.xml
+++ b/man/systemd.exec.xml
@@ -1026,8 +1026,10 @@
         <term><varname>ProtectKernelTunables=</varname></term>
 
         <listitem><para>Takes a boolean argument. If true, kernel variables accessible through
-        <filename>/proc/sys</filename>, <filename>/sys</filename> and <filename>/proc/sysrq-trigger</filename> will be
-        made read-only to all processes of the unit. Usually, tunable kernel variables should only be written at
+        <filename>/proc/sys</filename>, <filename>/sys</filename>, <filename>/proc/sysrq-trigger</filename>,
+        <filename>/proc/latency_stats</filename>, <filename>/proc/acpi</filename>,
+        <filename>/proc/timer_stats</filename>, <filename>/proc/fs</filename> and <filename>/proc/irq</filename> will
+        be made read-only to all processes of the unit. Usually, tunable kernel variables should only be written at
         boot-time, with the <citerefentry><refentrytitle>sysctl.d</refentrytitle><manvolnum>5</manvolnum></citerefentry>
         mechanism. Almost no services need to write to these at runtime; it is hence recommended to turn this on for
         most services. For this setting the same restrictions regarding mount propagation and privileges apply as for
diff --git a/src/core/namespace.c b/src/core/namespace.c
index 8aa8b83c88..3234fab4bc 100644
--- a/src/core/namespace.c
+++ b/src/core/namespace.c
@@ -74,7 +74,18 @@ typedef struct TargetMount {
 static const TargetMount protect_kernel_tunables_table[] = {
         { "/proc/sys",                  READONLY,       false },
         { "/proc/sysrq-trigger",        READONLY,       true  },
+        { "/proc/latency_stats",        READONLY,       true  },
+        { "/proc/mtrr",                 READONLY,       true  },
+        { "/proc/apm",                  READONLY,       true  },
+        { "/proc/acpi",                 READONLY,       true  },
+        { "/proc/timer_stats",          READONLY,       true  },
+        { "/proc/asound",               READONLY,       true  },
+        { "/proc/bus",                  READONLY,       true  },
+        { "/proc/fs",                   READONLY,       true  },
+        { "/proc/irq",                  READONLY,       true  },
         { "/sys",                       READONLY,       false },
+        { "/sys/kernel/debug",          READONLY,       true  },
+        { "/sys/kernel/tracing",        READONLY,       true  },
         { "/sys/fs/cgroup",             READWRITE,      false }, /* READONLY is set by ProtectControlGroups= option */
 };
 
-- 
cgit v1.2.3-54-g00ecf


From 8f81a5f61bcf745bae3acad599d7a9da686643e3 Mon Sep 17 00:00:00 2001
From: Djalal Harouni <tixxdz@opendz.org>
Date: Sun, 25 Sep 2016 12:52:27 +0200
Subject: core: Use @raw-io syscall group to filter I/O syscalls when
 PrivateDevices= is set

Instead of having a local syscall list, use the @raw-io group which
contains the same set of syscalls to filter.
---
 man/systemd.exec.xml |  6 ++++--
 src/core/execute.c   | 55 +++++++++++++++++++++++++++++++++-------------------
 2 files changed, 39 insertions(+), 22 deletions(-)

(limited to 'man')

diff --git a/man/systemd.exec.xml b/man/systemd.exec.xml
index f19e7f6ee9..f70e5c36d4 100644
--- a/man/systemd.exec.xml
+++ b/man/systemd.exec.xml
@@ -933,8 +933,10 @@
         <filename>/dev/random</filename> (as well as the pseudo TTY subsystem) to it, but no physical devices such as
         <filename>/dev/sda</filename>, system memory <filename>/dev/mem</filename>, system ports
         <filename>/dev/port</filename> and others. This is useful to securely turn off physical device access by the
-        executed process. Defaults to false. Enabling this option will also remove <constant>CAP_MKNOD</constant> from
-        the capability bounding set for the unit (see above), and set <varname>DevicePolicy=closed</varname> (see
+        executed process. Defaults to false. Enabling this option will install a system call filter to block low-level
+        I/O system calls that are grouped in the <varname>@raw-io</varname> set, remove
+        <constant>CAP_MKNOD</constant> from the capability bounding set for the unit (see above), and set
+        <varname>DevicePolicy=closed</varname> (see
         <citerefentry><refentrytitle>systemd.resource-control</refentrytitle><manvolnum>5</manvolnum></citerefentry>
         for details). Note that using this setting will disconnect propagation of mounts from the service to the host
         (propagation in the opposite direction continues to work).  This means that this setting may not be used for
diff --git a/src/core/execute.c b/src/core/execute.c
index 0488ba2ca9..3da7ef3be6 100644
--- a/src/core/execute.c
+++ b/src/core/execute.c
@@ -1429,28 +1429,15 @@ finish:
 }
 
 static int apply_private_devices(Unit *u, const ExecContext *c) {
-
-        static const int device_syscalls[] = {
-                SCMP_SYS(ioperm),
-                SCMP_SYS(iopl),
-                SCMP_SYS(pciconfig_iobase),
-                SCMP_SYS(pciconfig_read),
-                SCMP_SYS(pciconfig_write),
-#ifdef __NR_s390_pci_mmio_read
-                SCMP_SYS(s390_pci_mmio_read),
-#endif
-#ifdef __NR_s390_pci_mmio_write
-                SCMP_SYS(s390_pci_mmio_write),
-#endif
-        };
-
+        const SystemCallFilterSet *set;
         scmp_filter_ctx *seccomp;
-        unsigned i;
+        const char *sys;
+        bool syscalls_found = false;
         int r;
 
         assert(c);
 
-        /* If PrivateDevices= is set, also turn off iopl and friends. */
+        /* If PrivateDevices= is set, also turn off all @raw-io syscalls (iopl and friends). */
 
         if (skip_seccomp_unavailable(u, "PrivateDevices="))
                 return 0;
@@ -1463,12 +1450,40 @@ static int apply_private_devices(Unit *u, const ExecContext *c) {
         if (r < 0)
                 goto finish;
 
-        for (i = 0; i < ELEMENTSOF(device_syscalls); i++) {
+        for (set = syscall_filter_sets; set->set_name; set++)
+                if (streq(set->set_name, "@raw-io")) {
+                        syscalls_found = true;
+                        break;
+                }
+
+        /* We should never fail here: @raw-io is expected to be present in syscall_filter_sets */
+        if (!syscalls_found) {
+                r = -EOPNOTSUPP;
+                goto finish;
+        }
+
+        NULSTR_FOREACH(sys, set->value) {
+                int id;
+                bool add = true;
+
+#ifndef __NR_s390_pci_mmio_read
+                if (streq(sys, "s390_pci_mmio_read"))
+                        add = false;
+#endif
+#ifndef __NR_s390_pci_mmio_write
+                if (streq(sys, "s390_pci_mmio_write"))
+                        add = false;
+#endif
+
+                if (!add)
+                        continue;
+
+                id = seccomp_syscall_resolve_name(sys);
+
                 r = seccomp_rule_add(
                                 seccomp,
                                 SCMP_ACT_ERRNO(EPERM),
-                                device_syscalls[i],
-                                0);
+                                id, 0);
                 if (r < 0)
                         goto finish;
         }
-- 
cgit v1.2.3-54-g00ecf