From 72246c2a654ead7f7ee6e7799161e2e46dc0b84b Mon Sep 17 00:00:00 2001
From: Lennart Poettering <lennart@poettering.net>
Date: Mon, 22 Aug 2016 19:01:14 +0200
Subject: core: enforce seccomp for secondary archs too, for all rules

Let's make sure that all our rules apply to all archs the local kernel
supports.
---
 src/core/execute.c | 8 ++++++++
 1 file changed, 8 insertions(+)
diff --git a/src/core/execute.c b/src/core/execute.c
index 2026137721..ee734e8445 100644
--- a/src/core/execute.c
+++ b/src/core/execute.c
@@ -1273,6 +1273,10 @@ static int apply_memory_deny_write_execute(const Unit* u, const ExecContext *c)
         if (!seccomp)
                 return -ENOMEM;
 
+        r = seccomp_add_secondary_archs(seccomp);
+        if (r < 0)
+                goto finish;
+
         r = seccomp_rule_add(
                         seccomp,
                         SCMP_ACT_ERRNO(EPERM),
@@ -1322,6 +1326,10 @@ static int apply_restrict_realtime(const Unit* u, const ExecContext *c) {
         if (!seccomp)
                 return -ENOMEM;
 
+        r = seccomp_add_secondary_archs(seccomp);
+        if (r < 0)
+                goto finish;
+
         /* Determine the highest policy constant we want to allow */
         for (i = 0; i < ELEMENTSOF(permitted_policies); i++)
                 if (permitted_policies[i] > max_policy)
-- 
cgit v1.2.3-54-g00ecf


From 59eeb84ba65483c5543d1bc840c2ac75642ef638 Mon Sep 17 00:00:00 2001
From: Lennart Poettering <lennart@poettering.net>
Date: Mon, 22 Aug 2016 18:43:59 +0200
Subject: core: add two new service settings ProtectKernelTunables= and
 ProtectControlGroups=

If enabled, these will block write access to /sys, /proc/sys and
/sys/fs/cgroup.
---
 man/systemd.exec.xml                  |  20 +++++++
 src/core/dbus-execute.c               |   9 ++-
 src/core/execute.c                    | 100 ++++++++++++++++++++++++++++++----
 src/core/execute.h                    |   2 +
 src/core/load-fragment-gperf.gperf.m4 |   2 +
 src/core/namespace.c                  |  36 ++++++++++--
 src/core/namespace.h                  |   2 +
 src/shared/bus-unit-util.c            |   2 +-
 src/test/test-ns.c                    |   2 +
 9 files changed, 159 insertions(+), 16 deletions(-)

diff --git a/man/systemd.exec.xml b/man/systemd.exec.xml
index bcedebd5bb..07128b489e 100644
--- a/man/systemd.exec.xml
+++ b/man/systemd.exec.xml
@@ -1059,6 +1059,26 @@
         Defaults to off.</para></listitem>
       </varlistentry>
 
+      <varlistentry>
+        <term><varname>ProtectKernelTunables=</varname></term>
+
+        <listitem><para>Takes a boolean argument. If true, kernel variables accessible through
+        <filename>/proc/sys</filename> and <filename>/sys</filename> will be made read-only to all processes of the
+        unit. Usually, tunable kernel variables should only be written at boot-time, with the
+        <citerefentry><refentrytitle>sysctl.d</refentrytitle><manvolnum>5</manvolnum></citerefentry> mechanism. Almost
+        no services need to write to these at runtime; it is hence recommended to turn this on for most
+        services. Defaults to off.</para></listitem>
+      </varlistentry>
+
+      <varlistentry>
+        <term><varname>ProtectControlGroups=</varname></term>
+
+        <listitem><para>Takes a boolean argument. If true, the Linux Control Groups ("cgroups") hierarchies accessible
+        through <filename>/sys/fs/cgroup</filename> will be made read-only to all processes of the unit. Except for
+        container managers, no services should require write access to the control groups hierarchies; it is hence
+        recommended to turn this on for most services. Defaults to off.</para></listitem>
+      </varlistentry>
+
       <varlistentry>
         <term><varname>MountFlags=</varname></term>
 
diff --git a/src/core/dbus-execute.c b/src/core/dbus-execute.c
index 7e33a2d201..eec4500c8c 100644
--- a/src/core/dbus-execute.c
+++ b/src/core/dbus-execute.c
@@ -707,6 +707,8 @@ const sd_bus_vtable bus_exec_vtable[] = {
         SD_BUS_PROPERTY("MountFlags", "t", bus_property_get_ulong, offsetof(ExecContext, mount_flags), SD_BUS_VTABLE_PROPERTY_CONST),
         SD_BUS_PROPERTY("PrivateTmp", "b", bus_property_get_bool, offsetof(ExecContext, private_tmp), SD_BUS_VTABLE_PROPERTY_CONST),
         SD_BUS_PROPERTY("PrivateDevices", "b", bus_property_get_bool, offsetof(ExecContext, private_devices), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("ProtectKernelTunables", "b", bus_property_get_bool, offsetof(ExecContext, protect_kernel_tunables), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("ProtectControlGroups", "b", bus_property_get_bool, offsetof(ExecContext, protect_control_groups), SD_BUS_VTABLE_PROPERTY_CONST),
         SD_BUS_PROPERTY("PrivateNetwork", "b", bus_property_get_bool, offsetof(ExecContext, private_network), SD_BUS_VTABLE_PROPERTY_CONST),
         SD_BUS_PROPERTY("PrivateUsers", "b", bus_property_get_bool, offsetof(ExecContext, private_users), SD_BUS_VTABLE_PROPERTY_CONST),
         SD_BUS_PROPERTY("ProtectHome", "s", bus_property_get_protect_home, offsetof(ExecContext, protect_home), SD_BUS_VTABLE_PROPERTY_CONST),
@@ -1072,7 +1074,8 @@ int bus_exec_context_set_transient_property(
                               "IgnoreSIGPIPE", "TTYVHangup", "TTYReset",
                               "PrivateTmp", "PrivateDevices", "PrivateNetwork", "PrivateUsers",
                               "NoNewPrivileges", "SyslogLevelPrefix", "MemoryDenyWriteExecute",
-                              "RestrictRealtime", "DynamicUser", "RemoveIPC")) {
+                              "RestrictRealtime", "DynamicUser", "RemoveIPC", "ProtectKernelTunables",
+                              "ProtectControlGroups")) {
                 int b;
 
                 r = sd_bus_message_read(message, "b", &b);
@@ -1106,6 +1109,10 @@ int bus_exec_context_set_transient_property(
                                 c->dynamic_user = b;
                         else if (streq(name, "RemoveIPC"))
                                 c->remove_ipc = b;
+                        else if (streq(name, "ProtectKernelTunables"))
+                                c->protect_kernel_tunables = b;
+                        else if (streq(name, "ProtectControlGroups"))
+                                c->protect_control_groups = b;
 
                         unit_write_drop_in_private_format(u, mode, name, "%s=%s", name, yes_no(b));
                 }
diff --git a/src/core/execute.c b/src/core/execute.c
index ee734e8445..609b69a859 100644
--- a/src/core/execute.c
+++ b/src/core/execute.c
@@ -1383,6 +1383,45 @@ finish:
         return r;
 }
 
+static int apply_protect_sysctl(Unit *u, const ExecContext *c) {
+        scmp_filter_ctx *seccomp;
+        int r;
+
+        assert(c);
+
+        /* Turn off the legacy sysctl() system call. Many distributions turn this off while building the kernel, but
+         * let's protect even those systems where this is left on in the kernel. */
+
+        if (skip_seccomp_unavailable(u, "ProtectKernelTunables="))
+                return 0;
+
+        seccomp = seccomp_init(SCMP_ACT_ALLOW);
+        if (!seccomp)
+                return -ENOMEM;
+
+        r = seccomp_add_secondary_archs(seccomp);
+        if (r < 0)
+                goto finish;
+
+        r = seccomp_rule_add(
+                        seccomp,
+                        SCMP_ACT_ERRNO(EPERM),
+                        SCMP_SYS(_sysctl),
+                        0);
+        if (r < 0)
+                goto finish;
+
+        r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
+        if (r < 0)
+                goto finish;
+
+        r = seccomp_load(seccomp);
+
+finish:
+        seccomp_release(seccomp);
+        return r;
+}
+
 #endif
 
 static void do_idle_pipe_dance(int idle_pipe[4]) {
@@ -1589,7 +1628,9 @@ static bool exec_needs_mount_namespace(
 
         if (context->private_devices ||
             context->protect_system != PROTECT_SYSTEM_NO ||
-            context->protect_home != PROTECT_HOME_NO)
+            context->protect_home != PROTECT_HOME_NO ||
+            context->protect_kernel_tunables ||
+            context->protect_control_groups)
                 return true;
 
         return false;
@@ -1804,6 +1845,37 @@ static int close_remaining_fds(
         return close_all_fds(dont_close, n_dont_close);
 }
 
+static bool context_has_address_families(const ExecContext *c) {
+        assert(c);
+
+        return c->address_families_whitelist ||
+                !set_isempty(c->address_families);
+}
+
+static bool context_has_syscall_filters(const ExecContext *c) {
+        assert(c);
+
+        return c->syscall_whitelist ||
+                !set_isempty(c->syscall_filter) ||
+                !set_isempty(c->syscall_archs);
+}
+
+static bool context_has_no_new_privileges(const ExecContext *c) {
+        assert(c);
+
+        if (c->no_new_privileges)
+                return true;
+
+        if (have_effective_cap(CAP_SYS_ADMIN)) /* if we are privileged, we don't need NNP */
+                return false;
+
+        return context_has_address_families(c) || /* we need NNP if we have any form of seccomp and are unprivileged */
+                c->memory_deny_write_execute ||
+                c->restrict_realtime ||
+                c->protect_kernel_tunables ||
+                context_has_syscall_filters(c);
+}
+
 static int send_user_lookup(
                 Unit *unit,
                 int user_lookup_fd,
@@ -2255,6 +2327,8 @@ static int exec_child(
                                 tmp,
                                 var,
                                 context->private_devices,
+                                context->protect_kernel_tunables,
+                                context->protect_control_groups,
                                 context->protect_home,
                                 context->protect_system,
                                 context->mount_flags);
@@ -2343,11 +2417,6 @@ static int exec_child(
 
         if ((params->flags & EXEC_APPLY_PERMISSIONS) && !command->privileged) {
 
-                bool use_address_families = context->address_families_whitelist ||
-                        !set_isempty(context->address_families);
-                bool use_syscall_filter = context->syscall_whitelist ||
-                        !set_isempty(context->syscall_filter) ||
-                        !set_isempty(context->syscall_archs);
                 int secure_bits = context->secure_bits;
 
                 for (i = 0; i < _RLIMIT_MAX; i++) {
@@ -2424,15 +2493,14 @@ static int exec_child(
                                 return -errno;
                         }
 
-                if (context->no_new_privileges ||
-                    (!have_effective_cap(CAP_SYS_ADMIN) && (use_address_families || context->memory_deny_write_execute || context->restrict_realtime || use_syscall_filter)))
+                if (context_has_no_new_privileges(context))
                         if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0) {
                                 *exit_status = EXIT_NO_NEW_PRIVILEGES;
                                 return -errno;
                         }
 
 #ifdef HAVE_SECCOMP
-                if (use_address_families) {
+                if (context_has_address_families(context)) {
                         r = apply_address_families(unit, context);
                         if (r < 0) {
                                 *exit_status = EXIT_ADDRESS_FAMILIES;
@@ -2456,7 +2524,15 @@ static int exec_child(
                         }
                 }
 
-                if (use_syscall_filter) {
+                if (context->protect_kernel_tunables) {
+                        r = apply_protect_sysctl(unit, context);
+                        if (r < 0) {
+                                *exit_status = EXIT_SECCOMP;
+                                return r;
+                        }
+                }
+
+                if (context_has_syscall_filters(context)) {
                         r = apply_seccomp(unit, context);
                         if (r < 0) {
                                 *exit_status = EXIT_SECCOMP;
@@ -2888,6 +2964,8 @@ void exec_context_dump(ExecContext *c, FILE* f, const char *prefix) {
                 "%sNonBlocking: %s\n"
                 "%sPrivateTmp: %s\n"
                 "%sPrivateDevices: %s\n"
+                "%sProtectKernelTunables: %s\n"
+                "%sProtectControlGroups: %s\n"
                 "%sPrivateNetwork: %s\n"
                 "%sPrivateUsers: %s\n"
                 "%sProtectHome: %s\n"
@@ -2901,6 +2979,8 @@ void exec_context_dump(ExecContext *c, FILE* f, const char *prefix) {
                 prefix, yes_no(c->non_blocking),
                 prefix, yes_no(c->private_tmp),
                 prefix, yes_no(c->private_devices),
+                prefix, yes_no(c->protect_kernel_tunables),
+                prefix, yes_no(c->protect_control_groups),
                 prefix, yes_no(c->private_network),
                 prefix, yes_no(c->private_users),
                 prefix, protect_home_to_string(c->protect_home),
diff --git a/src/core/execute.h b/src/core/execute.h
index 6082c42aba..449180c903 100644
--- a/src/core/execute.h
+++ b/src/core/execute.h
@@ -174,6 +174,8 @@ struct ExecContext {
         bool private_users;
         ProtectSystem protect_system;
         ProtectHome protect_home;
+        bool protect_kernel_tunables;
+        bool protect_control_groups;
 
         bool no_new_privileges;
 
diff --git a/src/core/load-fragment-gperf.gperf.m4 b/src/core/load-fragment-gperf.gperf.m4
index 2e6c965aec..c49c1d6732 100644
--- a/src/core/load-fragment-gperf.gperf.m4
+++ b/src/core/load-fragment-gperf.gperf.m4
@@ -89,6 +89,8 @@ $1.ReadOnlyPaths,                config_parse_namespace_path_strv,   0,
 $1.InaccessiblePaths,            config_parse_namespace_path_strv,   0,                             offsetof($1, exec_context.inaccessible_paths)
 $1.PrivateTmp,                   config_parse_bool,                  0,                             offsetof($1, exec_context.private_tmp)
 $1.PrivateDevices,               config_parse_bool,                  0,                             offsetof($1, exec_context.private_devices)
+$1.ProtectKernelTunables,        config_parse_bool,                  0,                             offsetof($1, exec_context.protect_kernel_tunables)
+$1.ProtectControlGroups,         config_parse_bool,                  0,                             offsetof($1, exec_context.protect_control_groups)
 $1.PrivateNetwork,               config_parse_bool,                  0,                             offsetof($1, exec_context.private_network)
 $1.PrivateUsers,                 config_parse_bool,                  0,                             offsetof($1, exec_context.private_users)
 $1.ProtectSystem,                config_parse_protect_system,        0,                             offsetof($1, exec_context)
diff --git a/src/core/namespace.c b/src/core/namespace.c
index 52a2505d94..f2768aeb28 100644
--- a/src/core/namespace.c
+++ b/src/core/namespace.c
@@ -53,7 +53,7 @@ typedef enum MountMode {
         PRIVATE_TMP,
         PRIVATE_VAR_TMP,
         PRIVATE_DEV,
-        READWRITE
+        READWRITE,
 } MountMode;
 
 typedef struct BindMount {
@@ -366,6 +366,8 @@ int setup_namespace(
                 const char* tmp_dir,
                 const char* var_tmp_dir,
                 bool private_dev,
+                bool protect_sysctl,
+                bool protect_cgroups,
                 ProtectHome protect_home,
                 ProtectSystem protect_system,
                 unsigned long mount_flags) {
@@ -385,6 +387,8 @@ int setup_namespace(
                 strv_length(read_only_paths) +
                 strv_length(inaccessible_paths) +
                 private_dev +
+                (protect_sysctl ? 3 : 0) +
+                (protect_cgroups != protect_sysctl) +
                 (protect_home != PROTECT_HOME_NO ? 3 : 0) +
                 (protect_system != PROTECT_SYSTEM_NO ? 2 : 0) +
                 (protect_system == PROTECT_SYSTEM_FULL ? 1 : 0);
@@ -421,6 +425,27 @@ int setup_namespace(
                         m++;
                 }
 
+                if (protect_sysctl) {
+                        m->path = prefix_roota(root_directory, "/proc/sys");
+                        m->mode = READONLY;
+                        m++;
+
+                        m->path = prefix_roota(root_directory, "/proc/sysrq-trigger");
+                        m->mode = READONLY;
+                        m->ignore = true; /* Not always compiled into the kernel */
+                        m++;
+
+                        m->path = prefix_roota(root_directory, "/sys");
+                        m->mode = READONLY;
+                        m++;
+                }
+
+                if (protect_cgroups != protect_sysctl) {
+                        m->path = prefix_roota(root_directory, "/sys/fs/cgroup");
+                        m->mode = protect_cgroups ? READONLY : READWRITE;
+                        m++;
+                }
+
                 if (protect_home != PROTECT_HOME_NO) {
                         const char *home_dir, *run_user_dir, *root_dir;
 
@@ -505,9 +530,12 @@ int setup_namespace(
 
 fail:
         if (n > 0) {
-                for (m = mounts; m < mounts + n; ++m)
-                        if (m->done)
-                                (void) umount2(m->path, MNT_DETACH);
+                for (m = mounts; m < mounts + n; ++m) {
+                        if (!m->done)
+                                continue;
+
+                        (void) umount2(m->path, MNT_DETACH);
+                }
         }
 
         return r;
diff --git a/src/core/namespace.h b/src/core/namespace.h
index 1aedf5f208..3845336287 100644
--- a/src/core/namespace.h
+++ b/src/core/namespace.h
@@ -46,6 +46,8 @@ int setup_namespace(const char *chroot,
                     const char *tmp_dir,
                     const char *var_tmp_dir,
                     bool private_dev,
+                    bool protect_sysctl,
+                    bool protect_cgroups,
                     ProtectHome protect_home,
                     ProtectSystem protect_system,
                     unsigned long mount_flags);
diff --git a/src/shared/bus-unit-util.c b/src/shared/bus-unit-util.c
index feb4a06737..c6bd2f145c 100644
--- a/src/shared/bus-unit-util.c
+++ b/src/shared/bus-unit-util.c
@@ -204,7 +204,7 @@ int bus_append_unit_property_assignment(sd_bus_message *m, const char *assignmen
                               "IgnoreSIGPIPE", "TTYVHangup", "TTYReset", "RemainAfterExit",
                               "PrivateTmp", "PrivateDevices", "PrivateNetwork", "PrivateUsers", "NoNewPrivileges",
                               "SyslogLevelPrefix", "Delegate", "RemainAfterElapse", "MemoryDenyWriteExecute",
-                              "RestrictRealtime", "DynamicUser", "RemoveIPC")) {
+                              "RestrictRealtime", "DynamicUser", "RemoveIPC", "ProtectKernelTunables", "ProtectControlGroups")) {
 
                 r = parse_boolean(eq);
                 if (r < 0)
diff --git a/src/test/test-ns.c b/src/test/test-ns.c
index 9248f2987c..05f243c75c 100644
--- a/src/test/test-ns.c
+++ b/src/test/test-ns.c
@@ -69,6 +69,8 @@ int main(int argc, char *argv[]) {
                             tmp_dir,
                             var_tmp_dir,
                             true,
+                            true,
+                            true,
                             PROTECT_HOME_NO,
                             PROTECT_SYSTEM_NO,
                             0);
-- 
cgit v1.2.3-54-g00ecf


From fe3c2583bee339b6744872dc1897e6486d5bd7e0 Mon Sep 17 00:00:00 2001
From: Lennart Poettering <lennart@poettering.net>
Date: Wed, 24 Aug 2016 23:17:42 +0200
Subject: namespace: make sure InaccessibleDirectories= masks all mounts
 further down

If a dir is marked to be inaccessible then everything below it should be masked
by it.
---
 src/core/namespace.c | 44 ++++++++++++++++++++++++++++++++++++++++----
 src/test/test-ns.c   |  4 +++-
 2 files changed, 43 insertions(+), 5 deletions(-)

diff --git a/src/core/namespace.c b/src/core/namespace.c
index f2768aeb28..102fe576f3 100644
--- a/src/core/namespace.c
+++ b/src/core/namespace.c
@@ -116,16 +116,47 @@ static void drop_duplicates(BindMount *m, unsigned *n) {
         assert(m);
         assert(n);
 
+        /* Drops duplicate entries. Expects that the array is properly ordered already. */
+
         for (f = m, t = m, previous = NULL; f < m+*n; f++) {
 
-                /* The first one wins */
-                if (previous && path_equal(f->path, previous->path))
+                /* The first one wins (which is the one with the more restrictive mode), see mount_path_compare()
+                 * above. */
+                if (previous && path_equal(f->path, previous->path)) {
+                        log_debug("%s is duplicate.", f->path);
                         continue;
+                }
 
                 *t = *f;
-
                 previous = t;
+                t++;
+        }
+
+        *n = t - m;
+}
+
+static void drop_inaccessible(BindMount *m, unsigned *n) {
+        BindMount *f, *t;
+        const char *clear = NULL;
+
+        assert(m);
+        assert(n);
+
+        /* Drops all entries obstructed by another entry further up the tree. Expects that the array is properly
+         * ordered already. */
+
+        for (f = m, t = m; f < m+*n; f++) {
+
+                /* If we found a path set for INACCESSIBLE earlier, and this entry has it as prefix we should drop
+                 * it, as inaccessible paths really should drop the entire subtree. */
+                if (clear && path_startswith(f->path, clear)) {
+                        log_debug("%s is masked by %s.", f->path, clear);
+                        continue;
+                }
 
+                clear = f->mode == INACCESSIBLE ? f->path : NULL;
+
+                *t = *f;
                 t++;
         }
 
@@ -282,6 +313,8 @@ static int apply_mount(
 
         assert(m);
 
+        log_debug("Applying namespace mount on %s", m->path);
+
         switch (m->mode) {
 
         case INACCESSIBLE:
@@ -289,7 +322,7 @@ static int apply_mount(
                 /* First, get rid of everything that is below if there
                  * is anything... Then, overmount it with an
                  * inaccessible path. */
-                umount_recursive(m->path, 0);
+                (void) umount_recursive(m->path, 0);
 
                 if (lstat(m->path, &target) < 0) {
                         if (m->ignore && errno == ENOENT)
@@ -303,6 +336,7 @@ static int apply_mount(
                         return -ELOOP;
                 }
                 break;
+
         case READONLY:
         case READWRITE:
                 /* Nothing to mount here, we just later toggle the
@@ -480,7 +514,9 @@ int setup_namespace(
                 assert(mounts + n == m);
 
                 qsort(mounts, n, sizeof(BindMount), mount_path_compare);
+
                 drop_duplicates(mounts, &n);
+                drop_inaccessible(mounts, &n);
         }
 
         if (n > 0 || root_directory) {
diff --git a/src/test/test-ns.c b/src/test/test-ns.c
index 05f243c75c..03a24620af 100644
--- a/src/test/test-ns.c
+++ b/src/test/test-ns.c
@@ -26,6 +26,7 @@
 int main(int argc, char *argv[]) {
         const char * const writable[] = {
                 "/home",
+                "/home/lennart/projects/foobar", /* this should be masked automatically */
                 NULL
         };
 
@@ -42,11 +43,12 @@ int main(int argc, char *argv[]) {
         };
         char *root_directory;
         char *projects_directory;
-
         int r;
         char tmp_dir[] = "/tmp/systemd-private-XXXXXX",
              var_tmp_dir[] = "/var/tmp/systemd-private-XXXXXX";
 
+        log_set_max_level(LOG_DEBUG);
+
         assert_se(mkdtemp(tmp_dir));
         assert_se(mkdtemp(var_tmp_dir));
 
-- 
cgit v1.2.3-54-g00ecf


From 07689d5d2c07ee434437de5e39bf0abaa772818b Mon Sep 17 00:00:00 2001
From: Lennart Poettering <lennart@poettering.net>
Date: Thu, 25 Aug 2016 10:12:57 +0200
Subject: execute: split out creation of runtime dirs into its own function

---
 src/core/execute.c | 57 +++++++++++++++++++++++++++++++++---------------------
 1 file changed, 35 insertions(+), 22 deletions(-)

diff --git a/src/core/execute.c b/src/core/execute.c
index 609b69a859..3877293b4f 100644
--- a/src/core/execute.c
+++ b/src/core/execute.c
@@ -1789,6 +1789,37 @@ static int setup_private_users(uid_t uid, gid_t gid) {
         return 0;
 }
 
+static int setup_runtime_directory(
+                const ExecContext *context,
+                const ExecParameters *params,
+                uid_t uid,
+                gid_t gid) {
+
+        char **rt;
+        int r;
+
+        assert(context);
+        assert(params);
+
+        STRV_FOREACH(rt, context->runtime_directory) {
+                _cleanup_free_ char *p;
+
+                p = strjoin(params->runtime_prefix, "/", *rt, NULL);
+                if (!p)
+                        return -ENOMEM;
+
+                r = mkdir_p_label(p, context->runtime_directory_mode);
+                if (r < 0)
+                        return r;
+
+                r = chmod_and_chown(p, context->runtime_directory_mode, uid, gid);
+                if (r < 0)
+                        return r;
+        }
+
+        return 0;
+}
+
 static void append_socket_pair(int *array, unsigned *n, int pair[2]) {
         assert(array);
         assert(n);
@@ -2188,28 +2219,10 @@ static int exec_child(
         }
 
         if (!strv_isempty(context->runtime_directory) && params->runtime_prefix) {
-                char **rt;
-
-                STRV_FOREACH(rt, context->runtime_directory) {
-                        _cleanup_free_ char *p;
-
-                        p = strjoin(params->runtime_prefix, "/", *rt, NULL);
-                        if (!p) {
-                                *exit_status = EXIT_RUNTIME_DIRECTORY;
-                                return -ENOMEM;
-                        }
-
-                        r = mkdir_p_label(p, context->runtime_directory_mode);
-                        if (r < 0) {
-                                *exit_status = EXIT_RUNTIME_DIRECTORY;
-                                return r;
-                        }
-
-                        r = chmod_and_chown(p, context->runtime_directory_mode, uid, gid);
-                        if (r < 0) {
-                                *exit_status = EXIT_RUNTIME_DIRECTORY;
-                                return r;
-                        }
+                r = setup_runtime_directory(context, params, uid, gid);
+                if (r < 0) {
+                        *exit_status = EXIT_RUNTIME_DIRECTORY;
+                        return r;
                 }
         }
 
-- 
cgit v1.2.3-54-g00ecf


From be39ccf3a0d4d15324af1de4d8552a1d65f40808 Mon Sep 17 00:00:00 2001
From: Lennart Poettering <lennart@poettering.net>
Date: Thu, 25 Aug 2016 10:24:10 +0200
Subject: execute: move suppression of HOME=/ and SHELL=/bin/nologin into
 user-util.c

This adds a new call get_user_creds_clean(), which is just like
get_user_creds() but returns NULL in the home/shell parameters if they contain
no useful information. This code previously lived in execute.c, but by
generalizing this we can reuse it in run.c.
---
 src/basic/user-util.c | 32 +++++++++++++++++++++++++++++++-
 src/basic/user-util.h |  1 +
 src/core/execute.c    | 14 +++-----------
 src/run/run.c         | 18 +++++++++++-------
 4 files changed, 46 insertions(+), 19 deletions(-)

diff --git a/src/basic/user-util.c b/src/basic/user-util.c
index 122d9a0c7c..0522bce1d1 100644
--- a/src/basic/user-util.c
+++ b/src/basic/user-util.c
@@ -31,14 +31,15 @@
 #include <unistd.h>
 #include <utmp.h>
 
-#include "missing.h"
 #include "alloc-util.h"
 #include "fd-util.h"
 #include "formats-util.h"
 #include "macro.h"
+#include "missing.h"
 #include "parse-util.h"
 #include "path-util.h"
 #include "string-util.h"
+#include "strv.h"
 #include "user-util.h"
 #include "utf8.h"
 
@@ -175,6 +176,35 @@ int get_user_creds(
         return 0;
 }
 
+int get_user_creds_clean(
+                const char **username,
+                uid_t *uid, gid_t *gid,
+                const char **home,
+                const char **shell) {
+
+        int r;
+
+        /* Like get_user_creds(), but resets home/shell to NULL if they don't contain anything relevant. */
+
+        r = get_user_creds(username, uid, gid, home, shell);
+        if (r < 0)
+                return r;
+
+        if (shell &&
+            (isempty(*shell) || PATH_IN_SET(*shell,
+                                            "/bin/nologin",
+                                            "/sbin/nologin",
+                                            "/usr/bin/nologin",
+                                            "/usr/sbin/nologin")))
+                *shell = NULL;
+
+        if (home &&
+            (isempty(*home) || path_equal(*home, "/")))
+                *home = NULL;
+
+        return 0;
+}
+
 int get_group_creds(const char **groupname, gid_t *gid) {
         struct group *g;
         gid_t id;
diff --git a/src/basic/user-util.h b/src/basic/user-util.h
index f569363811..6c61f63cae 100644
--- a/src/basic/user-util.h
+++ b/src/basic/user-util.h
@@ -40,6 +40,7 @@ char* getlogname_malloc(void);
 char* getusername_malloc(void);
 
 int get_user_creds(const char **username, uid_t *uid, gid_t *gid, const char **home, const char **shell);
+int get_user_creds_clean(const char **username, uid_t *uid, gid_t *gid, const char **home, const char **shell);
 int get_group_creds(const char **groupname, gid_t *gid);
 
 char* uid_to_name(uid_t uid);
diff --git a/src/core/execute.c b/src/core/execute.c
index 3877293b4f..c7a3ea39e7 100644
--- a/src/core/execute.c
+++ b/src/core/execute.c
@@ -2051,22 +2051,14 @@ static int exec_child(
         } else {
                 if (context->user) {
                         username = context->user;
-                        r = get_user_creds(&username, &uid, &gid, &home, &shell);
+                        r = get_user_creds_clean(&username, &uid, &gid, &home, &shell);
                         if (r < 0) {
                                 *exit_status = EXIT_USER;
                                 return r;
                         }
 
-                        /* Don't set $HOME or $SHELL if they are are not particularly enlightening anyway. */
-                        if (isempty(home) || path_equal(home, "/"))
-                                home = NULL;
-
-                        if (isempty(shell) || PATH_IN_SET(shell,
-                                                          "/bin/nologin",
-                                                          "/sbin/nologin",
-                                                          "/usr/bin/nologin",
-                                                          "/usr/sbin/nologin"))
-                                shell = NULL;
+                        /* Note that we don't set $HOME or $SHELL if they are not particularly enlightening anyway
+                         * (i.e. are "/" or "/bin/nologin"). */
                 }
 
                 if (context->group) {
diff --git a/src/run/run.c b/src/run/run.c
index 2dd229868c..81b53fdfab 100644
--- a/src/run/run.c
+++ b/src/run/run.c
@@ -1168,17 +1168,21 @@ static int start_transient_scope(
                 uid_t uid;
                 gid_t gid;
 
-                r = get_user_creds(&arg_exec_user, &uid, &gid, &home, &shell);
+                r = get_user_creds_clean(&arg_exec_user, &uid, &gid, &home, &shell);
                 if (r < 0)
                         return log_error_errno(r, "Failed to resolve user %s: %m", arg_exec_user);
 
-                r = strv_extendf(&user_env, "HOME=%s", home);
-                if (r < 0)
-                        return log_oom();
+                if (home) {
+                        r = strv_extendf(&user_env, "HOME=%s", home);
+                        if (r < 0)
+                                return log_oom();
+                }
 
-                r = strv_extendf(&user_env, "SHELL=%s", shell);
-                if (r < 0)
-                        return log_oom();
+                if (shell) {
+                        r = strv_extendf(&user_env, "SHELL=%s", shell);
+                        if (r < 0)
+                                return log_oom();
+                }
 
                 r = strv_extendf(&user_env, "USER=%s", arg_exec_user);
                 if (r < 0)
-- 
cgit v1.2.3-54-g00ecf


From 3fbe8dbe41ad662d7cae0525f6fd62a66d2c5ec5 Mon Sep 17 00:00:00 2001
From: Lennart Poettering <lennart@poettering.net>
Date: Thu, 25 Aug 2016 10:42:38 +0200
Subject: execute: if RuntimeDirectory= is set, it should be writable

Implicitly make all dirs set with RuntimeDirectory= writable, as the concept
otherwise makes no sense.
---
 src/core/execute.c | 48 ++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 46 insertions(+), 2 deletions(-)

diff --git a/src/core/execute.c b/src/core/execute.c
index c7a3ea39e7..20e74ec8a6 100644
--- a/src/core/execute.c
+++ b/src/core/execute.c
@@ -1820,6 +1820,44 @@ static int setup_runtime_directory(
         return 0;
 }
 
+static int compile_read_write_paths(
+                const ExecContext *context,
+                const ExecParameters *params,
+                char ***ret) {
+
+        _cleanup_strv_free_ char **l = NULL;
+        char **rt;
+
+        /* Compile the list of writable paths. This is the combination of the explicitly configured paths, plus all
+         * runtime directories. */
+
+        if (strv_isempty(context->read_write_paths) &&
+            strv_isempty(context->runtime_directory)) {
+                *ret = NULL; /* NOP if neither is set */
+                return 0;
+        }
+
+        l = strv_copy(context->read_write_paths);
+        if (!l)
+                return -ENOMEM;
+
+        STRV_FOREACH(rt, context->runtime_directory) {
+                char *s;
+
+                s = strjoin(params->runtime_prefix, "/", *rt, NULL);
+                if (!s)
+                        return -ENOMEM;
+
+                if (strv_consume(&l, s) < 0)
+                        return -ENOMEM;
+        }
+
+        *ret = l;
+        l = NULL;
+
+        return 0;
+}
+
 static void append_socket_pair(int *array, unsigned *n, int pair[2]) {
         assert(array);
         assert(n);
@@ -2307,8 +2345,8 @@ static int exec_child(
         }
 
         needs_mount_namespace = exec_needs_mount_namespace(context, params, runtime);
-
         if (needs_mount_namespace) {
+                _cleanup_free_ char **rw = NULL;
                 char *tmp = NULL, *var = NULL;
 
                 /* The runtime struct only contains the parent
@@ -2324,9 +2362,15 @@ static int exec_child(
                                 var = strjoina(runtime->var_tmp_dir, "/tmp");
                 }
 
+                r = compile_read_write_paths(context, params, &rw);
+                if (r < 0) {
+                        *exit_status = EXIT_NAMESPACE;
+                        return r;
+                }
+
                 r = setup_namespace(
                                 (params->flags & EXEC_APPLY_CHROOT) ? context->root_directory : NULL,
-                                context->read_write_paths,
+                                rw,
                                 context->read_only_paths,
                                 context->inaccessible_paths,
                                 tmp,
-- 
cgit v1.2.3-54-g00ecf


From 6ee1a919cf9013a695da2a01ae67327b996a6ef6 Mon Sep 17 00:00:00 2001
From: Lennart Poettering <lennart@poettering.net>
Date: Thu, 25 Aug 2016 10:44:09 +0200
Subject: namespace: simplify mount_path_compare() a bit

---
 src/core/namespace.c | 20 +++++++++-----------
 1 file changed, 9 insertions(+), 11 deletions(-)

diff --git a/src/core/namespace.c b/src/core/namespace.c
index 102fe576f3..74201caa10 100644
--- a/src/core/namespace.c
+++ b/src/core/namespace.c
@@ -93,21 +93,19 @@ static int mount_path_compare(const void *a, const void *b) {
         const BindMount *p = a, *q = b;
         int d;
 
+        /* If the paths are not equal, then order prefixes first */
         d = path_compare(p->path, q->path);
+        if (d != 0)
+                return d;
 
-        if (d == 0) {
-                /* If the paths are equal, check the mode */
-                if (p->mode < q->mode)
-                        return -1;
-
-                if (p->mode > q->mode)
-                        return 1;
+        /* If the paths are equal, check the mode */
+        if (p->mode < q->mode)
+                return -1;
 
-                return 0;
-        }
+        if (p->mode > q->mode)
+                return 1;
 
-        /* If the paths are not equal, then order prefixes first */
-        return d;
+        return 0;
 }
 
 static void drop_duplicates(BindMount *m, unsigned *n) {
-- 
cgit v1.2.3-54-g00ecf


From 7648a565d14dfb5516d93bacf0d87de2de5b5d91 Mon Sep 17 00:00:00 2001
From: Lennart Poettering <lennart@poettering.net>
Date: Thu, 25 Aug 2016 11:29:32 +0200
Subject: namespace: when enforcing fs namespace restrictions suppress
 redundant mounts

If /foo is marked to be read-only, and /foo/bar too, then the latter may be
suppressed as it has no effect.
---
 src/core/namespace.c | 39 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 39 insertions(+)

diff --git a/src/core/namespace.c b/src/core/namespace.c
index 74201caa10..72f850b2f2 100644
--- a/src/core/namespace.c
+++ b/src/core/namespace.c
@@ -161,6 +161,44 @@ static void drop_inaccessible(BindMount *m, unsigned *n) {
         *n = t - m;
 }
 
+static void drop_nop(BindMount *m, unsigned *n) {
+        BindMount *f, *t;
+
+        assert(m);
+        assert(n);
+
+        /* Drops all entries which have an immediate parent that has the same type, as they are redundant. Assumes the
+         * list is ordered by prefixes. */
+
+        for (f = m, t = m; f < m+*n; f++) {
+
+                /* Only suppress such subtrees for READONLY and READWRITE entries */
+                if (IN_SET(f->mode, READONLY, READWRITE)) {
+                        BindMount *p;
+                        bool found = false;
+
+                        /* Now let's find the first parent of the entry we are looking at. */
+                        for (p = t-1; p >= m; p--) {
+                                if (path_startswith(f->path, p->path)) {
+                                        found = true;
+                                        break;
+                                }
+                        }
+
+                        /* We found it, let's see if it's the same mode, if so, we can drop this entry */
+                        if (found && p->mode == f->mode) {
+                                log_debug("%s is redundant by %s", f->path, p->path);
+                                continue;
+                        }
+                }
+
+                *t = *f;
+                t++;
+        }
+
+        *n = t - m;
+}
+
 static int mount_dev(BindMount *m) {
         static const char devnodes[] =
                 "/dev/null\0"
@@ -515,6 +553,7 @@ int setup_namespace(
 
                 drop_duplicates(mounts, &n);
                 drop_inaccessible(mounts, &n);
+                drop_nop(mounts, &n);
         }
 
         if (n > 0 || root_directory) {
-- 
cgit v1.2.3-54-g00ecf


From 6b7c9f8bce4679c89f3b89cacfd4932c0aeadad4 Mon Sep 17 00:00:00 2001
From: Lennart Poettering <lennart@poettering.net>
Date: Sun, 25 Sep 2016 10:40:51 +0200
Subject: namespace: rework how ReadWritePaths= is applied

Previously, if ReadWritePaths= was nested inside a ReadOnlyPaths=
specification, then we'd first recursively apply the ReadOnlyPaths= paths, and
make everything below read-only, only in order to then flip the read-only bit
again for the subdirs listed in ReadWritePaths= below it.

This is not only ugly (as for the dirs in question we first turn on the RO bit,
only to turn it off again immediately after), but also problematic in
containers, where a container manager might have marked a set of dirs read-only
and this code will undo this if ReadWritePaths= is set for any of them.

With this patch behaviour in this regard is altered: ReadOnlyPaths= will not be
applied to the children listed in ReadWritePaths= in the first place, so that
we do not need to turn off the RO bit for those after all.

This means that ReadWritePaths=/ReadOnlyPaths= may only be used to turn on the
RO bit, but never to turn it off again. Or to say this differently: if some
dirs are marked read-only via some external tool, then ReadWritePaths= will not
undo it.

This is not only the safer option, but also more in-line with what the man page
currently claims:

        "Entries (files or directories) listed in ReadWritePaths= are
        accessible from within the namespace with the same access rights as
        from outside."

To implement this change bind_remount_recursive() gained a new "blacklist"
string list parameter, which when passed may contain subdirs that shall be
excluded from the read-only mounting.

A number of functions are updated to add more debug logging to make this more
digestible.
---
 src/basic/mount-util.c    | 71 ++++++++++++++++++++++++++++++++---------------
 src/basic/mount-util.h    |  2 +-
 src/core/namespace.c      | 66 ++++++++++++++++++++++++++++---------------
 src/nspawn/nspawn-mount.c |  6 ++--
 src/nspawn/nspawn.c       |  2 +-
 5 files changed, 96 insertions(+), 51 deletions(-)

diff --git a/src/basic/mount-util.c b/src/basic/mount-util.c
index bfa04394fe..b9affb4e70 100644
--- a/src/basic/mount-util.c
+++ b/src/basic/mount-util.c
@@ -36,6 +36,7 @@
 #include "set.h"
 #include "stdio-util.h"
 #include "string-util.h"
+#include "strv.h"
 
 static int fd_fdinfo_mnt_id(int fd, const char *filename, int flags, int *mnt_id) {
         char path[strlen("/proc/self/fdinfo/") + DECIMAL_STR_MAX(int)];
@@ -287,10 +288,12 @@ int umount_recursive(const char *prefix, int flags) {
                                 continue;
 
                         if (umount2(p, flags) < 0) {
-                                r = -errno;
+                                r = log_debug_errno(errno, "Failed to umount %s: %m", p);
                                 continue;
                         }
 
+                        log_debug("Successfully unmounted %s", p);
+
                         again = true;
                         n++;
 
@@ -311,24 +314,21 @@ static int get_mount_flags(const char *path, unsigned long *flags) {
         return 0;
 }
 
-int bind_remount_recursive(const char *prefix, bool ro) {
+int bind_remount_recursive(const char *prefix, bool ro, char **blacklist) {
         _cleanup_set_free_free_ Set *done = NULL;
         _cleanup_free_ char *cleaned = NULL;
         int r;
 
-        /* Recursively remount a directory (and all its submounts)
-         * read-only or read-write. If the directory is already
-         * mounted, we reuse the mount and simply mark it
-         * MS_BIND|MS_RDONLY (or remove the MS_RDONLY for read-write
-         * operation). If it isn't we first make it one. Afterwards we
-         * apply MS_BIND|MS_RDONLY (or remove MS_RDONLY) to all
-         * submounts we can access, too. When mounts are stacked on
-         * the same mount point we only care for each individual
-         * "top-level" mount on each point, as we cannot
-         * influence/access the underlying mounts anyway. We do not
-         * have any effect on future submounts that might get
-         * propagated, they migt be writable. This includes future
-         * submounts that have been triggered via autofs. */
+        /* Recursively remount a directory (and all its submounts) read-only or read-write. If the directory is already
+         * mounted, we reuse the mount and simply mark it MS_BIND|MS_RDONLY (or remove the MS_RDONLY for read-write
+         * operation). If it isn't we first make it one. Afterwards we apply MS_BIND|MS_RDONLY (or remove MS_RDONLY) to
+         * all submounts we can access, too. When mounts are stacked on the same mount point we only care for each
+         * individual "top-level" mount on each point, as we cannot influence/access the underlying mounts anyway. We
+         * do not have any effect on future submounts that might get propagated, they might be writable. This includes
+         * future submounts that have been triggered via autofs.
+         *
+         * If the "blacklist" parameter is specified it may contain a list of subtrees to exclude from the
+         * remount operation. Note that we'll ignore the blacklist for the top-level path. */
 
         cleaned = strdup(prefix);
         if (!cleaned)
@@ -385,6 +385,33 @@ int bind_remount_recursive(const char *prefix, bool ro) {
                         if (r < 0)
                                 return r;
 
+                        if (!path_startswith(p, cleaned))
+                                continue;
+
+                        /* Ignore this mount if it is blacklisted, but only if it isn't the top-level mount we shall
+                         * operate on. */
+                        if (!path_equal(cleaned, p)) {
+                                bool blacklisted = false;
+                                char **i;
+
+                                STRV_FOREACH(i, blacklist) {
+
+                                        if (path_equal(*i, cleaned))
+                                                continue;
+
+                                        if (!path_startswith(*i, cleaned))
+                                                continue;
+
+                                        if (path_startswith(p, *i)) {
+                                                blacklisted = true;
+                                                log_debug("Not remounting %s, because blacklisted by %s, called for %s", p, *i, cleaned);
+                                                break;
+                                        }
+                                }
+                                if (blacklisted)
+                                        continue;
+                        }
+
                         /* Let's ignore autofs mounts.  If they aren't
                          * triggered yet, we want to avoid triggering
                          * them, as we don't make any guarantees for
@@ -396,12 +423,9 @@ int bind_remount_recursive(const char *prefix, bool ro) {
                                 continue;
                         }
 
-                        if (path_startswith(p, cleaned) &&
-                            !set_contains(done, p)) {
-
+                        if (!set_contains(done, p)) {
                                 r = set_consume(todo, p);
                                 p = NULL;
-
                                 if (r == -EEXIST)
                                         continue;
                                 if (r < 0)
@@ -418,8 +442,7 @@ int bind_remount_recursive(const char *prefix, bool ro) {
 
                 if (!set_contains(done, cleaned) &&
                     !set_contains(todo, cleaned)) {
-                        /* The prefix directory itself is not yet a
-                         * mount, make it one. */
+                        /* The prefix directory itself is not yet a mount, make it one. */
                         if (mount(cleaned, cleaned, NULL, MS_BIND|MS_REC, NULL) < 0)
                                 return -errno;
 
@@ -430,6 +453,8 @@ int bind_remount_recursive(const char *prefix, bool ro) {
                         if (mount(NULL, prefix, NULL, orig_flags|MS_BIND|MS_REMOUNT|(ro ? MS_RDONLY : 0), NULL) < 0)
                                 return -errno;
 
+                        log_debug("Made top-level directory %s a mount point.", prefix);
+
                         x = strdup(cleaned);
                         if (!x)
                                 return -ENOMEM;
@@ -447,8 +472,7 @@ int bind_remount_recursive(const char *prefix, bool ro) {
                         if (r < 0)
                                 return r;
 
-                        /* Deal with mount points that are obstructed by a
-                         * later mount */
+                        /* Deal with mount points that are obstructed by a later mount */
                         r = path_is_mount_point(x, 0);
                         if (r == -ENOENT || r == 0)
                                 continue;
@@ -463,6 +487,7 @@ int bind_remount_recursive(const char *prefix, bool ro) {
                         if (mount(NULL, x, NULL, orig_flags|MS_BIND|MS_REMOUNT|(ro ? MS_RDONLY : 0), NULL) < 0)
                                 return -errno;
 
+                        log_debug("Remounted %s read-only.", x);
                 }
         }
 }
diff --git a/src/basic/mount-util.h b/src/basic/mount-util.h
index f46989ebb3..74730de663 100644
--- a/src/basic/mount-util.h
+++ b/src/basic/mount-util.h
@@ -35,7 +35,7 @@ int path_is_mount_point(const char *path, int flags);
 int repeat_unmount(const char *path, int flags);
 
 int umount_recursive(const char *target, int flags);
-int bind_remount_recursive(const char *prefix, bool ro);
+int bind_remount_recursive(const char *prefix, bool ro, char **blacklist);
 
 int mount_move_root(const char *path);
 
diff --git a/src/core/namespace.c b/src/core/namespace.c
index 72f850b2f2..b0dab9459e 100644
--- a/src/core/namespace.c
+++ b/src/core/namespace.c
@@ -375,9 +375,19 @@ static int apply_mount(
 
         case READONLY:
         case READWRITE:
-                /* Nothing to mount here, we just later toggle the
-                 * MS_RDONLY bit for the mount point */
-                return 0;
+
+                r = path_is_mount_point(m->path, 0);
+                if (r < 0) {
+                        if (m->ignore && errno == ENOENT)
+                                return 0;
+                        return log_debug_errno(r, "Failed to determine whether %s is already a mount point: %m", m->path);
+                }
+                if (r > 0) /* Nothing to do here, it is already a mount. We just later toggle the MS_RDONLY bit for the mount point if needed. */
+                        return 0;
+
+                /* This isn't a mount point yet, let's make it one. */
+                what = m->path;
+                break;
 
         case PRIVATE_TMP:
                 what = tmp_dir;
@@ -396,31 +406,33 @@ static int apply_mount(
 
         assert(what);
 
-        r = mount(what, m->path, NULL, MS_BIND|MS_REC, NULL);
-        if (r >= 0) {
-                log_debug("Successfully mounted %s to %s", what, m->path);
-                return r;
-        } else {
+        if (mount(what, m->path, NULL, MS_BIND|MS_REC, NULL) < 0) {
                 if (m->ignore && errno == ENOENT)
                         return 0;
+
                 return log_debug_errno(errno, "Failed to mount %s to %s: %m", what, m->path);
         }
+
+        log_debug("Successfully mounted %s to %s", what, m->path);
+        return 0;
 }
 
-static int make_read_only(BindMount *m) {
-        int r;
+static int make_read_only(BindMount *m, char **blacklist) {
+        int r = 0;
 
         assert(m);
 
         if (IN_SET(m->mode, INACCESSIBLE, READONLY))
-                r = bind_remount_recursive(m->path, true);
-        else if (IN_SET(m->mode, READWRITE, PRIVATE_TMP, PRIVATE_VAR_TMP, PRIVATE_DEV)) {
-                r = bind_remount_recursive(m->path, false);
-                if (r == 0 && m->mode == PRIVATE_DEV) /* can be readonly but the submounts can't*/
-                        if (mount(NULL, m->path, NULL, MS_REMOUNT|DEV_MOUNT_OPTIONS|MS_RDONLY, NULL) < 0)
-                                r = -errno;
+                r = bind_remount_recursive(m->path, true, blacklist);
+        else if (m->mode == PRIVATE_DEV) { /* Can be readonly but the submounts can't*/
+                if (mount(NULL, m->path, NULL, MS_REMOUNT|DEV_MOUNT_OPTIONS|MS_RDONLY, NULL) < 0)
+                        r = -errno;
         } else
-                r = 0;
+                return 0;
+
+        /* Note that we only turn on the MS_RDONLY flag here, we never turn it off. Something that was marked read-only
+         * already stays this way. This improves compatibility with container managers, where we won't attempt to undo
+         * read-only mounts already applied. */
 
         if (m->ignore && r == -ENOENT)
                 return 0;
@@ -570,14 +582,25 @@ int setup_namespace(
         }
 
         if (n > 0) {
+                char **blacklist;
+                unsigned j;
+
+                /* First round, add in all special mounts we need */
                 for (m = mounts; m < mounts + n; ++m) {
                         r = apply_mount(m, tmp_dir, var_tmp_dir);
                         if (r < 0)
                                 goto fail;
                 }
 
+                /* Create a blacklist we can pass to bind_remount_recursive() */
+                blacklist = newa(char*, n+1);
+                for (j = 0; j < n; j++)
+                        blacklist[j] = (char*) mounts[j].path;
+                blacklist[j] = NULL;
+
+                /* Second round, flip the ro bits if necessary. */
                 for (m = mounts; m < mounts + n; ++m) {
-                        r = make_read_only(m);
+                        r = make_read_only(m, blacklist);
                         if (r < 0)
                                 goto fail;
                 }
@@ -586,9 +609,7 @@ int setup_namespace(
         if (root_directory) {
                 /* MS_MOVE does not work on MS_SHARED so the remount MS_SHARED will be done later */
                 r = mount_move_root(root_directory);
-
-                /* at this point, we cannot rollback */
-                if (r < 0)
+                if (r < 0) /* at this point, we cannot rollback */
                         return r;
         }
 
@@ -596,8 +617,7 @@ int setup_namespace(
          * reestablish propagation from our side to the host, since
          * what's disconnected is disconnected. */
         if (mount(NULL, "/", NULL, mount_flags | MS_REC, NULL) < 0)
-                /* at this point, we cannot rollback */
-                return -errno;
+                return -errno; /* at this point, we cannot rollback */
 
         return 0;
 
diff --git a/src/nspawn/nspawn-mount.c b/src/nspawn/nspawn-mount.c
index 295b75341f..8457357003 100644
--- a/src/nspawn/nspawn-mount.c
+++ b/src/nspawn/nspawn-mount.c
@@ -476,7 +476,7 @@ static int mount_bind(const char *dest, CustomMount *m) {
                 return log_error_errno(errno, "mount(%s) failed: %m", where);
 
         if (m->read_only) {
-                r = bind_remount_recursive(where, true);
+                r = bind_remount_recursive(where, true, NULL);
                 if (r < 0)
                         return log_error_errno(r, "Read-only bind mount failed: %m");
         }
@@ -990,7 +990,7 @@ int setup_volatile_state(
         /* --volatile=state means we simply overmount /var
            with a tmpfs, and the rest read-only. */
 
-        r = bind_remount_recursive(directory, true);
+        r = bind_remount_recursive(directory, true, NULL);
         if (r < 0)
                 return log_error_errno(r, "Failed to remount %s read-only: %m", directory);
 
@@ -1065,7 +1065,7 @@ int setup_volatile(
 
         bind_mounted = true;
 
-        r = bind_remount_recursive(t, true);
+        r = bind_remount_recursive(t, true, NULL);
         if (r < 0) {
                 log_error_errno(r, "Failed to remount %s read-only: %m", t);
                 goto fail;
diff --git a/src/nspawn/nspawn.c b/src/nspawn/nspawn.c
index 0d61d34ebf..1f3e1f2dac 100644
--- a/src/nspawn/nspawn.c
+++ b/src/nspawn/nspawn.c
@@ -3019,7 +3019,7 @@ static int outer_child(
                 return r;
 
         if (arg_read_only) {
-                r = bind_remount_recursive(directory, true);
+                r = bind_remount_recursive(directory, true, NULL);
                 if (r < 0)
                         return log_error_errno(r, "Failed to make tree read-only: %m");
         }
-- 
cgit v1.2.3-54-g00ecf


From 160cfdbed3eb23b6bc3c17613685b756f23be4a1 Mon Sep 17 00:00:00 2001
From: Lennart Poettering <lennart@poettering.net>
Date: Thu, 25 Aug 2016 15:51:37 +0200
Subject: namespace: add some debug logging when enforcing InaccessiblePaths=

---
 src/core/namespace.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/src/core/namespace.c b/src/core/namespace.c
index b0dab9459e..e08d7459c5 100644
--- a/src/core/namespace.c
+++ b/src/core/namespace.c
@@ -345,7 +345,6 @@ static int apply_mount(
 
         const char *what;
         int r;
-        struct stat target;
 
         assert(m);
 
@@ -353,7 +352,8 @@ static int apply_mount(
 
         switch (m->mode) {
 
-        case INACCESSIBLE:
+        case INACCESSIBLE: {
+                struct stat target;
 
                 /* First, get rid of everything that is below if there
                  * is anything... Then, overmount it with an
@@ -363,7 +363,7 @@ static int apply_mount(
                 if (lstat(m->path, &target) < 0) {
                         if (m->ignore && errno == ENOENT)
                                 return 0;
-                        return -errno;
+                        return log_debug_errno(errno, "Failed to lstat() %s to determine what to mount over it: %m", m->path);
                 }
 
                 what = mode_to_inaccessible_node(target.st_mode);
@@ -372,6 +372,7 @@ static int apply_mount(
                         return -ELOOP;
                 }
                 break;
+        }
 
         case READONLY:
         case READWRITE:
-- 
cgit v1.2.3-54-g00ecf


From 3f815163ff8fdcdbd329680580df36f94e15325d Mon Sep 17 00:00:00 2001
From: Lennart Poettering <lennart@poettering.net>
Date: Thu, 25 Aug 2016 15:57:21 +0200
Subject: core: introduce ProtectSystem=strict

Let's tighten our sandbox a bit more: with this change ProtectSystem= gains a
new setting "strict". If set, the entire directory tree of the system is
mounted read-only, but the API file systems /proc, /dev, /sys are excluded
(they may be managed with PrivateDevices= and ProtectKernelTunables=). Also,
/home and /root are excluded as those are left for ProtectHome= to manage.

In this mode, all "real" file systems (i.e. non-API file systems) are mounted
read-only, and specific directories may only be excluded via
ReadWritePaths=, thus implementing an effective whitelist instead of
blacklist of writable directories.

While we are at it, also add /efi to the list of paths always affected by
ProtectSystem=. This is a follow-up for
b52a109ad38cd37b660ccd5394ff5c171a5e5355 which added /efi as alternative for
/boot. Our namespacing logic should respect that too.
---
 man/systemd.exec.xml | 33 ++++++++++++++++---------------
 src/core/namespace.c | 56 +++++++++++++++++++++++++++++++++++++++++++---------
 src/core/namespace.h |  1 +
 3 files changed, 65 insertions(+), 25 deletions(-)

diff --git a/man/systemd.exec.xml b/man/systemd.exec.xml
index 07128b489e..1b672fe0c9 100644
--- a/man/systemd.exec.xml
+++ b/man/systemd.exec.xml
@@ -1020,22 +1020,23 @@
       <varlistentry>
         <term><varname>ProtectSystem=</varname></term>
 
-        <listitem><para>Takes a boolean argument or
-        <literal>full</literal>. If true, mounts the
-        <filename>/usr</filename> and <filename>/boot</filename>
-        directories read-only for processes invoked by this unit. If
-        set to <literal>full</literal>, the <filename>/etc</filename>
-        directory is mounted read-only, too. This setting ensures that
-        any modification of the vendor-supplied operating system (and
-        optionally its configuration) is prohibited for the service.
-        It is recommended to enable this setting for all long-running
-        services, unless they are involved with system updates or need
-        to modify the operating system in other ways. Note however
-        that processes retaining the CAP_SYS_ADMIN capability can undo
-        the effect of this setting. This setting is hence particularly
-        useful for daemons which have this capability removed, for
-        example with <varname>CapabilityBoundingSet=</varname>.
-        Defaults to off.</para></listitem>
+        <listitem><para>Takes a boolean argument or the special values <literal>full</literal> or
+        <literal>strict</literal>. If true, mounts the <filename>/usr</filename> and <filename>/boot</filename>
+        directories read-only for processes invoked by this unit. If set to <literal>full</literal>, the
+        <filename>/etc</filename> directory is mounted read-only, too. If set to <literal>strict</literal> the entire
+        file system hierarchy is mounted read-only, except for the API file system subtrees <filename>/dev</filename>,
+        <filename>/proc</filename> and <filename>/sys</filename> (protect these directories using
+        <varname>PrivateDevices=</varname>, <varname>ProtectKernelTunables=</varname>,
+        <varname>ProtectControlGroups=</varname>). This setting ensures that any modification of the vendor-supplied
+        operating system (and optionally its configuration, and local mounts) is prohibited for the service.  It is
+        recommended to enable this setting for all long-running services, unless they are involved with system updates
+        or need to modify the operating system in other ways. If this option is used,
+        <varname>ReadWritePaths=</varname> may be used to exclude specific directories from being made read-only. Note
+        that processes retaining the <constant>CAP_SYS_ADMIN</constant> capability (and with no system call filter that
+        prohibits mount-related system calls applied) can undo the effect of this setting. This setting is hence
+        particularly useful for daemons which have either the <literal>@mount</literal> set filtered using
+        <varname>SystemCallFilter=</varname>, or have the <constant>CAP_SYS_ADMIN</constant> capability removed, for
+        example with <varname>CapabilityBoundingSet=</varname>.  Defaults to off.</para></listitem>
       </varlistentry>
 
       <varlistentry>
diff --git a/src/core/namespace.c b/src/core/namespace.c
index e08d7459c5..498cd139bf 100644
--- a/src/core/namespace.c
+++ b/src/core/namespace.c
@@ -472,9 +472,11 @@ int setup_namespace(
                 private_dev +
                 (protect_sysctl ? 3 : 0) +
                 (protect_cgroups != protect_sysctl) +
-                (protect_home != PROTECT_HOME_NO ? 3 : 0) +
-                (protect_system != PROTECT_SYSTEM_NO ? 2 : 0) +
-                (protect_system == PROTECT_SYSTEM_FULL ? 1 : 0);
+                (protect_home != PROTECT_HOME_NO || protect_system == PROTECT_SYSTEM_STRICT ? 3 : 0) +
+                (protect_system == PROTECT_SYSTEM_STRICT ?
+                 (2 + !private_dev + !protect_sysctl) :
+                 ((protect_system != PROTECT_SYSTEM_NO ? 3 : 0) +
+                  (protect_system == PROTECT_SYSTEM_FULL ? 1 : 0)));
 
         if (n > 0) {
                 m = mounts = (BindMount *) alloca0(n * sizeof(BindMount));
@@ -529,9 +531,13 @@ int setup_namespace(
                         m++;
                 }
 
-                if (protect_home != PROTECT_HOME_NO) {
+                if (protect_home != PROTECT_HOME_NO || protect_system == PROTECT_SYSTEM_STRICT) {
                         const char *home_dir, *run_user_dir, *root_dir;
 
+                        /* If protection of $HOME and $XDG_RUNTIME_DIR is requested, then go for it. If we are in
+                         * strict system protection mode, then also add entries for these directories, but mark them
+                         * writable. This is because we want ProtectHome= and ProtectSystem= to be fully orthogonal. */
+
                         home_dir = prefix_roota(root_directory, "/home");
                         home_dir = strjoina("-", home_dir);
                         run_user_dir = prefix_roota(root_directory, "/run/user");
@@ -540,22 +546,53 @@ int setup_namespace(
                         root_dir = strjoina("-", root_dir);
 
                         r = append_mounts(&m, STRV_MAKE(home_dir, run_user_dir, root_dir),
-                                protect_home == PROTECT_HOME_READ_ONLY ? READONLY : INACCESSIBLE);
+                                protect_home == PROTECT_HOME_READ_ONLY ? READONLY :
+                                protect_home == PROTECT_HOME_YES ? INACCESSIBLE : READWRITE);
                         if (r < 0)
                                 return r;
                 }
 
-                if (protect_system != PROTECT_SYSTEM_NO) {
-                        const char *usr_dir, *boot_dir, *etc_dir;
+                if (protect_system == PROTECT_SYSTEM_STRICT) {
+                        /* In strict mode, we mount everything read-only, except for /proc, /dev, /sys which are the
+                         * kernel API VFS, which are left writable, but PrivateDevices= + ProtectKernelTunables=
+                         * protect those, and these options should be fully orthogonal. (And of course /home and
+                         * friends are also left writable, as ProtectHome= shall manage those, orthogonally, see
+                         * above). */
+
+                        m->path = prefix_roota(root_directory, "/");
+                        m->mode = READONLY;
+                        m++;
+
+                        m->path = prefix_roota(root_directory, "/proc");
+                        m->mode = READWRITE;
+                        m++;
+
+                        if (!private_dev) {
+                                m->path = prefix_roota(root_directory, "/dev");
+                                m->mode = READWRITE;
+                                m++;
+                        }
+                        if (!protect_sysctl) {
+                                m->path = prefix_roota(root_directory, "/sys");
+                                m->mode = READWRITE;
+                                m++;
+                        }
+
+                } else if (protect_system != PROTECT_SYSTEM_NO) {
+                        const char *usr_dir, *boot_dir, *efi_dir, *etc_dir;
+
+                        /* In any other mode we simply mark the relevant three directories read-only. */
 
                         usr_dir = prefix_roota(root_directory, "/usr");
                         boot_dir = prefix_roota(root_directory, "/boot");
                         boot_dir = strjoina("-", boot_dir);
+                        efi_dir = prefix_roota(root_directory, "/efi");
+                        efi_dir = strjoina("-", efi_dir);
                         etc_dir = prefix_roota(root_directory, "/etc");
 
                         r = append_mounts(&m, protect_system == PROTECT_SYSTEM_FULL
-                                ? STRV_MAKE(usr_dir, boot_dir, etc_dir)
-                                : STRV_MAKE(usr_dir, boot_dir), READONLY);
+                                          ? STRV_MAKE(usr_dir, boot_dir, efi_dir, etc_dir)
+                                          : STRV_MAKE(usr_dir, boot_dir, efi_dir), READONLY);
                         if (r < 0)
                                 return r;
                 }
@@ -780,6 +817,7 @@ static const char *const protect_system_table[_PROTECT_SYSTEM_MAX] = {
         [PROTECT_SYSTEM_NO] = "no",
         [PROTECT_SYSTEM_YES] = "yes",
         [PROTECT_SYSTEM_FULL] = "full",
+        [PROTECT_SYSTEM_STRICT] = "strict",
 };
 
 DEFINE_STRING_TABLE_LOOKUP(protect_system, ProtectSystem);
diff --git a/src/core/namespace.h b/src/core/namespace.h
index 3845336287..6505bcc499 100644
--- a/src/core/namespace.h
+++ b/src/core/namespace.h
@@ -35,6 +35,7 @@ typedef enum ProtectSystem {
         PROTECT_SYSTEM_NO,
         PROTECT_SYSTEM_YES,
         PROTECT_SYSTEM_FULL,
+        PROTECT_SYSTEM_STRICT,
         _PROTECT_SYSTEM_MAX,
         _PROTECT_SYSTEM_INVALID = -1
 } ProtectSystem;
-- 
cgit v1.2.3-54-g00ecf


From 63bb64a056113d4be5fefb16604accf08c8c204a Mon Sep 17 00:00:00 2001
From: Lennart Poettering <lennart@poettering.net>
Date: Thu, 25 Aug 2016 16:12:46 +0200
Subject: core: imply ProtectHome=read-only and ProtectSystem=strict if
 DynamicUser=1

Let's make sure that services that use DynamicUser=1 cannot leave files in the
file system should the system accidentally have a world-writable directory
somewhere.

This effectively ensures that directories need to be whitelisted rather than
blacklisted for access when DynamicUser=1 is set.
---
 man/systemd.exec.xml | 12 ++++++++----
 src/core/unit.c      |  6 ++++++
 2 files changed, 14 insertions(+), 4 deletions(-)

diff --git a/man/systemd.exec.xml b/man/systemd.exec.xml
index 1b672fe0c9..e4d9c0ef1b 100644
--- a/man/systemd.exec.xml
+++ b/man/systemd.exec.xml
@@ -160,14 +160,18 @@
         use. However, UID/GIDs are recycled after a unit is terminated. Care should be taken that any processes running
         as part of a unit for which dynamic users/groups are enabled do not leave files or directories owned by these
         users/groups around, as a different unit might get the same UID/GID assigned later on, and thus gain access to
-        these files or directories. If <varname>DynamicUser=</varname> is enabled, <varname>RemoveIPC=</varname> and
+        these files or directories. If <varname>DynamicUser=</varname> is enabled, <varname>RemoveIPC=</varname>,
         <varname>PrivateTmp=</varname> are implied. This ensures that the lifetime of IPC objects and temporary files
         created by the executed processes is bound to the runtime of the service, and hence the lifetime of the dynamic
         user/group. Since <filename>/tmp</filename> and <filename>/var/tmp</filename> are usually the only
         world-writable directories on a system this ensures that a unit making use of dynamic user/group allocation
-        cannot leave files around after unit termination. Use <varname>RuntimeDirectory=</varname> (see below) in order
-        to assign a writable runtime directory to a service, owned by the dynamic user/group and removed automatically
-        when the unit is terminated. Defaults to off.</para></listitem>
+        cannot leave files around after unit termination. Moreover <varname>ProtectSystem=strict</varname> and
+        <varname>ProtectHome=read-only</varname> are implied, thus prohibiting the service from writing to arbitrary file
+        system locations. In order to allow the service to write to certain directories, they have to be whitelisted
+        using <varname>ReadWritePaths=</varname>, but care must be taken so that UID/GID recycling doesn't
+        create security issues involving files created by the service. Use <varname>RuntimeDirectory=</varname> (see
+        below) in order to assign a writable runtime directory to a service, owned by the dynamic user/group and
+        removed automatically when the unit is terminated. Defaults to off.</para></listitem>
       </varlistentry>
 
       <varlistentry>
diff --git a/src/core/unit.c b/src/core/unit.c
index de22f657c6..5d284a359d 100644
--- a/src/core/unit.c
+++ b/src/core/unit.c
@@ -3377,8 +3377,14 @@ int unit_patch_contexts(Unit *u) {
                                         return -ENOMEM;
                         }
 
+                        /* If the dynamic user option is on, let's make sure that the unit can't leave its UID/GID
+                         * around in the file system or on IPC objects. Hence enforce a strict sandbox. */
+
                         ec->private_tmp = true;
                         ec->remove_ipc = true;
+                        ec->protect_system = PROTECT_SYSTEM_STRICT;
+                        if (ec->protect_home == PROTECT_HOME_NO)
+                                ec->protect_home = PROTECT_HOME_READ_ONLY;
                 }
         }
 
-- 
cgit v1.2.3-54-g00ecf


From 920a7899ded2711e5ff4fe367a60a4fefca6767f Mon Sep 17 00:00:00 2001
From: Lennart Poettering <lennart@poettering.net>
Date: Thu, 25 Aug 2016 16:25:49 +0200
Subject: nspawn: let's mount /proc/sysrq-trigger read-only by default

LXC does this, and we should probably too. Better safe than sorry.
---
 src/nspawn/nspawn-mount.c | 28 +++++++++++++++-------------
 1 file changed, 15 insertions(+), 13 deletions(-)

diff --git a/src/nspawn/nspawn-mount.c b/src/nspawn/nspawn-mount.c
index 8457357003..25d38aa742 100644
--- a/src/nspawn/nspawn-mount.c
+++ b/src/nspawn/nspawn-mount.c
@@ -314,19 +314,21 @@ int mount_all(const char *dest,
         } MountPoint;
 
         static const MountPoint mount_table[] = {
-                { "proc",            "/proc",           "proc",  NULL,        MS_NOSUID|MS_NOEXEC|MS_NODEV,                              true,  true,  false },
-                { "/proc/sys",       "/proc/sys",       NULL,    NULL,        MS_BIND,                                                   true,  true,  false },   /* Bind mount first ...*/
-                { "/proc/sys/net",   "/proc/sys/net",   NULL,    NULL,        MS_BIND,                                                   true,  true,  true  },   /* (except for this) */
-                { NULL,              "/proc/sys",       NULL,    NULL,        MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, true,  true,  false },   /* ... then, make it r/o */
-                { "tmpfs",           "/sys",            "tmpfs", "mode=755",  MS_NOSUID|MS_NOEXEC|MS_NODEV,                              true,  false, true  },
-                { "sysfs",           "/sys",            "sysfs", NULL,        MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV,                    true,  false, false },
-                { "tmpfs",           "/dev",            "tmpfs", "mode=755",  MS_NOSUID|MS_STRICTATIME,                                  true,  false, false },
-                { "tmpfs",           "/dev/shm",        "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME,                         true,  false, false },
-                { "tmpfs",           "/run",            "tmpfs", "mode=755",  MS_NOSUID|MS_NODEV|MS_STRICTATIME,                         true,  false, false },
-                { "tmpfs",           "/tmp",            "tmpfs", "mode=1777", MS_STRICTATIME,                                            true,  false, false },
+                { "proc",                "/proc",               "proc",  NULL,        MS_NOSUID|MS_NOEXEC|MS_NODEV,                              true,  true,  false },
+                { "/proc/sys",           "/proc/sys",           NULL,    NULL,        MS_BIND,                                                   true,  true,  false },   /* Bind mount first ...*/
+                { "/proc/sys/net",       "/proc/sys/net",       NULL,    NULL,        MS_BIND,                                                   true,  true,  true  },   /* (except for this) */
+                { NULL,                  "/proc/sys",           NULL,    NULL,        MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, true,  true,  false },   /* ... then, make it r/o */
+                { "/proc/sysrq-trigger", "/proc/sysrq-trigger", NULL,    NULL,        MS_BIND,                                                   false, true,  false },   /* Bind mount first ...*/
+                { NULL,                  "/proc/sysrq-trigger", NULL,    NULL,        MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, false, true,  false },   /* ... then, make it r/o */
+                { "tmpfs",               "/sys",                "tmpfs", "mode=755",  MS_NOSUID|MS_NOEXEC|MS_NODEV,                              true,  false, true  },
+                { "sysfs",               "/sys",                "sysfs", NULL,        MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV,                    true,  false, false },
+                { "tmpfs",               "/dev",                "tmpfs", "mode=755",  MS_NOSUID|MS_STRICTATIME,                                  true,  false, false },
+                { "tmpfs",               "/dev/shm",            "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME,                         true,  false, false },
+                { "tmpfs",               "/run",                "tmpfs", "mode=755",  MS_NOSUID|MS_NODEV|MS_STRICTATIME,                         true,  false, false },
+                { "tmpfs",               "/tmp",                "tmpfs", "mode=1777", MS_STRICTATIME,                                            true,  false, false },
 #ifdef HAVE_SELINUX
-                { "/sys/fs/selinux", "/sys/fs/selinux", NULL,     NULL,       MS_BIND,                                                   false, false, false },  /* Bind mount first */
-                { NULL,              "/sys/fs/selinux", NULL,     NULL,       MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, false, false, false },  /* Then, make it r/o */
+                { "/sys/fs/selinux",     "/sys/fs/selinux",     NULL,     NULL,       MS_BIND,                                                   false, false, false },  /* Bind mount first */
+                { NULL,                  "/sys/fs/selinux",     NULL,     NULL,       MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, false, false, false },  /* Then, make it r/o */
 #endif
         };
 
@@ -356,7 +358,7 @@ int mount_all(const char *dest,
                         continue;
 
                 r = mkdir_p(where, 0755);
-                if (r < 0) {
+                if (r < 0 && r != -EEXIST) {
                         if (mount_table[k].fatal)
                                 return log_error_errno(r, "Failed to create directory %s: %m", where);
 
-- 
cgit v1.2.3-54-g00ecf


From 096424d1230e0a0339735c51b43949809e972430 Mon Sep 17 00:00:00 2001
From: Lennart Poettering <lennart@poettering.net>
Date: Thu, 25 Aug 2016 17:29:12 +0200
Subject: execute: drop group privileges only after setting up namespace

If PrivateDevices=yes is set, the namespace code creates device nodes in /dev
that should be owned by the host's root, hence let's make sure we set up the
namespace before dropping group privileges.
---
 src/core/execute.c | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/src/core/execute.c b/src/core/execute.c
index 20e74ec8a6..ae251b2a4c 100644
--- a/src/core/execute.c
+++ b/src/core/execute.c
@@ -2291,14 +2291,9 @@ static int exec_child(
         }
         accum_env = strv_env_clean(accum_env);
 
-        umask(context->umask);
+        (void) umask(context->umask);
 
         if ((params->flags & EXEC_APPLY_PERMISSIONS) && !command->privileged) {
-                r = enforce_groups(context, username, gid);
-                if (r < 0) {
-                        *exit_status = EXIT_GROUP;
-                        return r;
-                }
 #ifdef HAVE_SMACK
                 if (context->smack_process_label) {
                         r = mac_smack_apply_pid(0, context->smack_process_label);
@@ -2395,6 +2390,14 @@ static int exec_child(
                 }
         }
 
+        if ((params->flags & EXEC_APPLY_PERMISSIONS) && !command->privileged) {
+                r = enforce_groups(context, username, gid);
+                if (r < 0) {
+                        *exit_status = EXIT_GROUP;
+                        return r;
+                }
+        }
+
         if (context->working_directory_home)
                 wd = home;
         else if (context->working_directory)
-- 
cgit v1.2.3-54-g00ecf


From 1e4e94c8819e2fe3a7217690c0590dba8ab0be9e Mon Sep 17 00:00:00 2001
From: Lennart Poettering <lennart@poettering.net>
Date: Thu, 25 Aug 2016 17:30:47 +0200
Subject: namespace: invoke unshare() only after checking all parameters

Let's create the new namespace only after we validated and processed all
parameters, right before we start with actually mounting things.

This way, the window where we can roll back is larger (not that it matters
IRL...)
---
 src/core/namespace.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/core/namespace.c b/src/core/namespace.c
index 498cd139bf..356d3c8121 100644
--- a/src/core/namespace.c
+++ b/src/core/namespace.c
@@ -462,9 +462,6 @@ int setup_namespace(
         if (mount_flags == 0)
                 mount_flags = MS_SHARED;
 
-        if (unshare(CLONE_NEWNS) < 0)
-                return -errno;
-
         n = !!tmp_dir + !!var_tmp_dir +
                 strv_length(read_write_paths) +
                 strv_length(read_only_paths) +
@@ -606,6 +603,9 @@ int setup_namespace(
                 drop_nop(mounts, &n);
         }
 
+        if (unshare(CLONE_NEWNS) < 0)
+                return -errno;
+
         if (n > 0 || root_directory) {
                 /* Remount / as SLAVE so that nothing now mounted in the namespace
                    shows up in the parent */
-- 
cgit v1.2.3-54-g00ecf


From d944dc9553009822deaddec76814f5642a6a8176 Mon Sep 17 00:00:00 2001
From: Lennart Poettering <lennart@poettering.net>
Date: Sat, 24 Sep 2016 12:41:30 +0200
Subject: namespace: chase symlinks for mounts to set up in userspace

This adds logic to chase symlinks for all mount points that shall be created in
a namespace environment in userspace, instead of leaving this to the kernel.
This has the advantage that we can correctly handle absolute symlinks that
shall be taken relative to a specific root directory. Moreover, we can properly
handle mounts created on symlinked files or directories as we can merge their
mounts as necessary.

(This also drops the "done" flag in the namespace logic, which was never
actually working, but was supposed to permit a partial rollback of the
namespace logic, which however is only mildly useful as it wasn't clear in
which case it would or would not be able to roll back.)

Fixes: #3867
---
 src/basic/fs-util.c     | 187 ++++++++++++++++++++++++++++++++++++++++++++++++
 src/basic/fs-util.h     |   2 +
 src/core/namespace.c    | 118 +++++++++++++++++++-----------
 src/test/test-fs-util.c |  96 ++++++++++++++++++++++++-
 src/test/test-ns.c      |  10 ++-
 5 files changed, 367 insertions(+), 46 deletions(-)

diff --git a/src/basic/fs-util.c b/src/basic/fs-util.c
index ce87257bc1..86d9ad7e36 100644
--- a/src/basic/fs-util.c
+++ b/src/basic/fs-util.c
@@ -597,3 +597,190 @@ int inotify_add_watch_fd(int fd, int what, uint32_t mask) {
 
         return r;
 }
+
+int chase_symlinks(const char *path, const char *_root, char **ret) {
+        _cleanup_free_ char *buffer = NULL, *done = NULL, *root = NULL;
+        _cleanup_close_ int fd = -1;
+        unsigned max_follow = 32; /* how many symlinks to follow before giving up and returning ELOOP */
+        char *todo;
+        int r;
+
+        assert(path);
+
+        /* This is a lot like canonicalize_file_name(), but takes an additional "root" parameter, that allows following
+         * symlinks relative to a root directory, instead of the root of the host.
+         *
+         * Note that "root" matters only if we encounter an absolute symlink, it's unused otherwise. Most importantly
+         * this means the path parameter passed in is not prefixed by it.
+         *
+         * Algorithmically this operates on two path buffers: "done" are the components of the path we already
+         * processed and resolved symlinks, "." and ".." of. "todo" are the components of the path we still need to
+         * process. On each iteration, we move one component from "todo" to "done", processing its special meaning
+         * each time. The "todo" path always starts with at least one slash, the "done" path always ends in no
+         * slash. We always keep an O_PATH fd to the component we are currently processing, thus keeping lookup races
+         * at a minimum. */
+
+        r = path_make_absolute_cwd(path, &buffer);
+        if (r < 0)
+                return r;
+
+        if (_root) {
+                r = path_make_absolute_cwd(_root, &root);
+                if (r < 0)
+                        return r;
+        }
+
+        fd = open("/", O_CLOEXEC|O_NOFOLLOW|O_PATH);
+        if (fd < 0)
+                return -errno;
+
+        todo = buffer;
+        for (;;) {
+                _cleanup_free_ char *first = NULL;
+                _cleanup_close_ int child = -1;
+                struct stat st;
+                size_t n, m;
+
+                /* Determine length of first component in the path */
+                n = strspn(todo, "/");                  /* The slashes */
+                m = n + strcspn(todo + n, "/");         /* The entire length of the component */
+
+                /* Extract the first component. */
+                first = strndup(todo, m);
+                if (!first)
+                        return -ENOMEM;
+
+                todo += m;
+
+                /* Just a single slash? Then we reached the end. */
+                if (isempty(first) || path_equal(first, "/"))
+                        break;
+
+                /* Just a dot? Then let's eat this up. */
+                if (path_equal(first, "/."))
+                        continue;
+
+                /* Two dots? Then chop off the last bit of what we already found out. */
+                if (path_equal(first, "/..")) {
+                        _cleanup_free_ char *parent = NULL;
+                        int fd_parent = -1;
+
+                        if (isempty(done) || path_equal(done, "/"))
+                                return -EINVAL;
+
+                        parent = dirname_malloc(done);
+                        if (!parent)
+                                return -ENOMEM;
+
+                        /* Don't allow this to leave the root dir */
+                        if (root &&
+                            path_startswith(done, root) &&
+                            !path_startswith(parent, root))
+                                return -EINVAL;
+
+                        free(done);
+                        done = parent;
+                        parent = NULL;
+
+                        fd_parent = openat(fd, "..", O_CLOEXEC|O_NOFOLLOW|O_PATH);
+                        if (fd_parent < 0)
+                                return -errno;
+
+                        safe_close(fd);
+                        fd = fd_parent;
+
+                        continue;
+                }
+
+                /* Otherwise let's see what this is. */
+                child = openat(fd, first + n, O_CLOEXEC|O_NOFOLLOW|O_PATH);
+                if (child < 0)
+                        return -errno;
+
+                if (fstat(child, &st) < 0)
+                        return -errno;
+
+                if (S_ISLNK(st.st_mode)) {
+                        _cleanup_free_ char *destination = NULL;
+
+                        /* This is a symlink, in this case read the destination. But let's make sure we don't follow
+                         * symlinks without bounds. */
+                        if (--max_follow <= 0)
+                                return -ELOOP;
+
+                        r = readlinkat_malloc(fd, first + n, &destination);
+                        if (r < 0)
+                                return r;
+                        if (isempty(destination))
+                                return -EINVAL;
+
+                        if (path_is_absolute(destination)) {
+
+                                /* An absolute destination. Start the loop from the beginning, but use the root
+                                 * directory as base. */
+
+                                safe_close(fd);
+                                fd = open(root ?: "/", O_CLOEXEC|O_NOFOLLOW|O_PATH);
+                                if (fd < 0)
+                                        return -errno;
+
+                                free(buffer);
+                                buffer = destination;
+                                destination = NULL;
+
+                                todo = buffer;
+                                free(done);
+
+                                /* Note that we do not revalidate the root, we take it as is. */
+                                if (isempty(root))
+                                        done = NULL;
+                                else {
+                                        done = strdup(root);
+                                        if (!done)
+                                                return -ENOMEM;
+                                }
+
+                        } else {
+                                char *joined;
+
+                                /* A relative destination. If so, we'll prefix what's left to do with what
+                                 * we just read, and start the loop again, but remain in the current directory. */
+
+                                joined = strjoin("/", destination, todo, NULL);
+                                if (!joined)
+                                        return -ENOMEM;
+
+                                free(buffer);
+                                todo = buffer = joined;
+                        }
+
+                        continue;
+                }
+
+                /* If this is not a symlink, then let's just add the name we read to what we already verified. */
+                if (!done) {
+                        done = first;
+                        first = NULL;
+                } else {
+                        if (!strextend(&done, first, NULL))
+                                return -ENOMEM;
+                }
+
+                /* And iterate again, but go one directory further down. */
+                safe_close(fd);
+                fd = child;
+                child = -1;
+        }
+
+        if (!done) {
+                /* Special case, turn the empty string into "/", to indicate the root directory. */
+                done = strdup("/");
+                if (!done)
+                        return -ENOMEM;
+        }
+
+        *ret = done;
+        done = NULL;
+
+        return 0;
+}
diff --git a/src/basic/fs-util.h b/src/basic/fs-util.h
index 2c3b9a1c74..31df47cf1e 100644
--- a/src/basic/fs-util.h
+++ b/src/basic/fs-util.h
@@ -77,3 +77,5 @@ union inotify_event_buffer {
 };
 
 int inotify_add_watch_fd(int fd, int what, uint32_t mask);
+
+int chase_symlinks(const char *path, const char *_root, char **ret);
diff --git a/src/core/namespace.c b/src/core/namespace.c
index 356d3c8121..d3ab2e8e3e 100644
--- a/src/core/namespace.c
+++ b/src/core/namespace.c
@@ -29,6 +29,7 @@
 #include "alloc-util.h"
 #include "dev-setup.h"
 #include "fd-util.h"
+#include "fs-util.h"
 #include "loopback-setup.h"
 #include "missing.h"
 #include "mkdir.h"
@@ -57,9 +58,9 @@ typedef enum MountMode {
 } MountMode;
 
 typedef struct BindMount {
-        const char *path;
+        const char *path; /* stack memory, doesn't need to be freed explicitly */
+        char *chased; /* malloc()ed memory, needs to be freed */
         MountMode mode;
-        bool done;
         bool ignore;
 } BindMount;
 
@@ -71,7 +72,6 @@ static int append_mounts(BindMount **p, char **strv, MountMode mode) {
         STRV_FOREACH(i, strv) {
 
                 (*p)->ignore = false;
-                (*p)->done = false;
 
                 if ((mode == INACCESSIBLE || mode == READONLY || mode == READWRITE) && (*i)[0] == '-') {
                         (*p)->ignore = true;
@@ -360,11 +360,8 @@ static int apply_mount(
                  * inaccessible path. */
                 (void) umount_recursive(m->path, 0);
 
-                if (lstat(m->path, &target) < 0) {
-                        if (m->ignore && errno == ENOENT)
-                                return 0;
+                if (lstat(m->path, &target) < 0)
                         return log_debug_errno(errno, "Failed to lstat() %s to determine what to mount over it: %m", m->path);
-                }
 
                 what = mode_to_inaccessible_node(target.st_mode);
                 if (!what) {
@@ -378,11 +375,8 @@ static int apply_mount(
         case READWRITE:
 
                 r = path_is_mount_point(m->path, 0);
-                if (r < 0) {
-                        if (m->ignore && errno == ENOENT)
-                                return 0;
+                if (r < 0)
                         return log_debug_errno(r, "Failed to determine whether %s is already a mount point: %m", m->path);
-                }
                 if (r > 0) /* Nothing to do here, it is already a mount. We just later toggle the MS_RDONLY bit for the mount point if needed. */
                         return 0;
 
@@ -407,12 +401,8 @@ static int apply_mount(
 
         assert(what);
 
-        if (mount(what, m->path, NULL, MS_BIND|MS_REC, NULL) < 0) {
-                if (m->ignore && errno == ENOENT)
-                        return 0;
-
+        if (mount(what, m->path, NULL, MS_BIND|MS_REC, NULL) < 0)
                 return log_debug_errno(errno, "Failed to mount %s to %s: %m", what, m->path);
-        }
 
         log_debug("Successfully mounted %s to %s", what, m->path);
         return 0;
@@ -435,12 +425,43 @@ static int make_read_only(BindMount *m, char **blacklist) {
          * already stays this way. This improves compatibility with container managers, where we won't attempt to undo
          * read-only mounts already applied. */
 
-        if (m->ignore && r == -ENOENT)
-                return 0;
-
         return r;
 }
 
+static int chase_all_symlinks(const char *root_directory, BindMount *m, unsigned *n) {
+        BindMount *f, *t;
+        int r;
+
+        assert(m);
+        assert(n);
+
+        /* Since mount() will always follow symlinks and we need to take the different root directory into account we
+         * chase the symlinks on our own first. This call will do so for all entries and remove all entries where we
+         * can't resolve the path, and which have been marked for such removal. */
+
+        for (f = m, t = m; f < m+*n; f++) {
+
+                r = chase_symlinks(f->path, root_directory, &f->chased);
+                if (r == -ENOENT && f->ignore) /* Doesn't exist? Then remove it! */
+                        continue;
+                if (r < 0)
+                        return log_debug_errno(r, "Failed to chase symlinks for %s: %m", f->path);
+
+                if (path_equal(f->path, f->chased))
+                        f->chased = mfree(f->chased);
+                else {
+                        log_debug("Chased %s → %s", f->path, f->chased);
+                        f->path = f->chased;
+                }
+
+                *t = *f;
+                t++;
+        }
+
+        *n = t - m;
+        return 0;
+}
+
 int setup_namespace(
                 const char* root_directory,
                 char** read_write_paths,
@@ -456,6 +477,7 @@ int setup_namespace(
                 unsigned long mount_flags) {
 
         BindMount *m, *mounts = NULL;
+        bool make_slave = false;
         unsigned n;
         int r = 0;
 
@@ -475,6 +497,9 @@ int setup_namespace(
                  ((protect_system != PROTECT_SYSTEM_NO ? 3 : 0) +
                   (protect_system == PROTECT_SYSTEM_FULL ? 1 : 0)));
 
+        if (root_directory || n > 0)
+                make_slave = true;
+
         if (n > 0) {
                 m = mounts = (BindMount *) alloca0(n * sizeof(BindMount));
                 r = append_mounts(&m, read_write_paths, READWRITE);
@@ -596,6 +621,13 @@ int setup_namespace(
 
                 assert(mounts + n == m);
 
+                /* Resolve symlinks manually first, as mount() will always follow them relative to the host's
+                 * root. Moreover we want to suppress duplicates based on the resolved paths. This of course is a bit
+                 * racy. */
+                r = chase_all_symlinks(root_directory, mounts, &n);
+                if (r < 0)
+                        goto finish;
+
                 qsort(mounts, n, sizeof(BindMount), mount_path_compare);
 
                 drop_duplicates(mounts, &n);
@@ -603,20 +635,26 @@ int setup_namespace(
                 drop_nop(mounts, &n);
         }
 
-        if (unshare(CLONE_NEWNS) < 0)
-                return -errno;
+        if (unshare(CLONE_NEWNS) < 0) {
+                r = -errno;
+                goto finish;
+        }
 
-        if (n > 0 || root_directory) {
+        if (make_slave) {
                 /* Remount / as SLAVE so that nothing now mounted in the namespace
                    shows up in the parent */
-                if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0)
-                        return -errno;
+                if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
+                        r = -errno;
+                        goto finish;
+                }
         }
 
         if (root_directory) {
                 /* Turn directory into bind mount */
-                if (mount(root_directory, root_directory, NULL, MS_BIND|MS_REC, NULL) < 0)
-                        return -errno;
+                if (mount(root_directory, root_directory, NULL, MS_BIND|MS_REC, NULL) < 0) {
+                        r = -errno;
+                        goto finish;
+                }
         }
 
         if (n > 0) {
@@ -627,7 +665,7 @@ int setup_namespace(
                 for (m = mounts; m < mounts + n; ++m) {
                         r = apply_mount(m, tmp_dir, var_tmp_dir);
                         if (r < 0)
-                                goto fail;
+                                goto finish;
                 }
 
                 /* Create a blacklist we can pass to bind_mount_recursive() */
@@ -640,34 +678,30 @@ int setup_namespace(
                 for (m = mounts; m < mounts + n; ++m) {
                         r = make_read_only(m, blacklist);
                         if (r < 0)
-                                goto fail;
+                                goto finish;
                 }
         }
 
         if (root_directory) {
                 /* MS_MOVE does not work on MS_SHARED so the remount MS_SHARED will be done later */
                 r = mount_move_root(root_directory);
-                if (r < 0) /* at this point, we cannot rollback */
-                        return r;
+                if (r < 0)
+                        goto finish;
         }
 
         /* Remount / as the desired mode. Not that this will not
          * reestablish propagation from our side to the host, since
          * what's disconnected is disconnected. */
-        if (mount(NULL, "/", NULL, mount_flags | MS_REC, NULL) < 0)
-                return -errno; /* at this point, we cannot rollback */
+        if (mount(NULL, "/", NULL, mount_flags | MS_REC, NULL) < 0) {
+                r = -errno;
+                goto finish;
+        }
 
-        return 0;
+        r = 0;
 
-fail:
-        if (n > 0) {
-                for (m = mounts; m < mounts + n; ++m) {
-                        if (!m->done)
-                                continue;
-
-                        (void) umount2(m->path, MNT_DETACH);
-                }
-        }
+finish:
+        for (m = mounts; m < mounts + n; m++)
+                free(m->chased);
 
         return r;
 }
diff --git a/src/test/test-fs-util.c b/src/test/test-fs-util.c
index b35a2ea2c8..53a3cdc663 100644
--- a/src/test/test-fs-util.c
+++ b/src/test/test-fs-util.c
@@ -20,16 +20,109 @@
 #include <unistd.h>
 
 #include "alloc-util.h"
-#include "fileio.h"
 #include "fd-util.h"
+#include "fileio.h"
 #include "fs-util.h"
 #include "macro.h"
 #include "mkdir.h"
+#include "path-util.h"
 #include "rm-rf.h"
 #include "string-util.h"
 #include "strv.h"
 #include "util.h"
 
+static void test_chase_symlinks(void) {
+        _cleanup_free_ char *result = NULL;
+        char temp[] = "/tmp/test-chase.XXXXXX";
+        const char *top, *p, *q;
+        int r;
+
+        assert_se(mkdtemp(temp));
+
+        top = strjoina(temp, "/top");
+        assert_se(mkdir(top, 0700) >= 0);
+
+        p = strjoina(top, "/dot");
+        assert_se(symlink(".", p) >= 0);
+
+        p = strjoina(top, "/dotdot");
+        assert_se(symlink("..", p) >= 0);
+
+        p = strjoina(top, "/dotdota");
+        assert_se(symlink("../a", p) >= 0);
+
+        p = strjoina(temp, "/a");
+        assert_se(symlink("b", p) >= 0);
+
+        p = strjoina(temp, "/b");
+        assert_se(symlink("/usr", p) >= 0);
+
+        p = strjoina(temp, "/start");
+        assert_se(symlink("top/dot/dotdota", p) >= 0);
+
+        r = chase_symlinks(p, NULL, &result);
+        assert_se(r >= 0);
+        assert_se(path_equal(result, "/usr"));
+
+        result = mfree(result);
+        r = chase_symlinks(p, temp, &result);
+        assert_se(r == -ENOENT);
+
+        q = strjoina(temp, "/usr");
+        assert_se(mkdir(q, 0700) >= 0);
+
+        r = chase_symlinks(p, temp, &result);
+        assert_se(r >= 0);
+        assert_se(path_equal(result, q));
+
+        p = strjoina(temp, "/slash");
+        assert_se(symlink("/", p) >= 0);
+
+        result = mfree(result);
+        r = chase_symlinks(p, NULL, &result);
+        assert_se(r >= 0);
+        assert_se(path_equal(result, "/"));
+
+        result = mfree(result);
+        r = chase_symlinks(p, temp, &result);
+        assert_se(r >= 0);
+        assert_se(path_equal(result, temp));
+
+        p = strjoina(temp, "/slashslash");
+        assert_se(symlink("///usr///", p) >= 0);
+
+        result = mfree(result);
+        r = chase_symlinks(p, NULL, &result);
+        assert_se(r >= 0);
+        assert_se(path_equal(result, "/usr"));
+
+        result = mfree(result);
+        r = chase_symlinks(p, temp, &result);
+        assert_se(r >= 0);
+        assert_se(path_equal(result, q));
+
+        result = mfree(result);
+        r = chase_symlinks("/etc/./.././", NULL, &result);
+        assert_se(r >= 0);
+        assert_se(path_equal(result, "/"));
+
+        result = mfree(result);
+        r = chase_symlinks("/etc/./.././", "/etc", &result);
+        assert_se(r == -EINVAL);
+
+        result = mfree(result);
+        r = chase_symlinks("/etc/machine-id/foo", NULL, &result);
+        assert_se(r == -ENOTDIR);
+
+        result = mfree(result);
+        p = strjoina(temp, "/recursive-symlink");
+        assert_se(symlink("recursive-symlink", p) >= 0);
+        r = chase_symlinks(p, NULL, &result);
+        assert_se(r == -ELOOP);
+
+        assert_se(rm_rf(temp, REMOVE_ROOT|REMOVE_PHYSICAL) >= 0);
+}
+
 static void test_unlink_noerrno(void) {
         char name[] = "/tmp/test-close_nointr.XXXXXX";
         int fd;
@@ -144,6 +237,7 @@ int main(int argc, char *argv[]) {
         test_readlink_and_make_absolute();
         test_get_files_in_directory();
         test_var_tmp();
+        test_chase_symlinks();
 
         return 0;
 }
diff --git a/src/test/test-ns.c b/src/test/test-ns.c
index 03a24620af..c4d4da6d05 100644
--- a/src/test/test-ns.c
+++ b/src/test/test-ns.c
@@ -26,14 +26,18 @@
 int main(int argc, char *argv[]) {
         const char * const writable[] = {
                 "/home",
-                "/home/lennart/projects/foobar", /* this should be masked automatically */
+                "-/home/lennart/projects/foobar", /* this should be masked automatically */
                 NULL
         };
 
         const char * const readonly[] = {
-                "/",
-                "/usr",
+                /* "/", */
+                /* "/usr", */
                 "/boot",
+                "/lib",
+                "/usr/lib",
+                "-/lib64",
+                "-/usr/lib64",
                 NULL
         };
 
-- 
cgit v1.2.3-54-g00ecf


From 8f1ad200f010dc2106f7e3ff5879f0330ee36996 Mon Sep 17 00:00:00 2001
From: Lennart Poettering <lennart@poettering.net>
Date: Fri, 26 Aug 2016 11:27:38 +0200
Subject: namespace: don't make the root directory of a namespace a mount if it
 already is one

Let's not stack mounts needlessly.
---
 src/core/namespace.c | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/src/core/namespace.c b/src/core/namespace.c
index d3ab2e8e3e..a7451ffbdc 100644
--- a/src/core/namespace.c
+++ b/src/core/namespace.c
@@ -650,10 +650,15 @@ int setup_namespace(
         }
 
         if (root_directory) {
-                /* Turn directory into bind mount */
-                if (mount(root_directory, root_directory, NULL, MS_BIND|MS_REC, NULL) < 0) {
-                        r = -errno;
+                /* Turn directory into bind mount, if it isn't one yet */
+                r = path_is_mount_point(root_directory, AT_SYMLINK_FOLLOW);
+                if (r < 0)
                         goto finish;
+                if (r == 0) {
+                        if (mount(root_directory, root_directory, NULL, MS_BIND|MS_REC, NULL) < 0) {
+                                r = -errno;
+                                goto finish;
+                        }
                 }
         }
 
-- 
cgit v1.2.3-54-g00ecf


From b2656f1b1ca94fc8b6a0eb44986df78d23ff7950 Mon Sep 17 00:00:00 2001
From: Lennart Poettering <lennart@poettering.net>
Date: Fri, 26 Aug 2016 12:22:23 +0200
Subject: man: in user-facing documentation don't reference C function names

Let's drop the reference to the cap_from_name() function in the documentation
for the capabilities setting, as it is hardly helpful. Our readers are not
necessarily C hackers knowing the semantics of cap_from_name(). Moreover, the
strings we accept are just the plain capability names as listed in
capabilities(7) hence there's really no point in confusing the user with
anything else.
---
 man/systemd.exec.xml | 64 +++++++++++++++++++++-------------------------------
 1 file changed, 26 insertions(+), 38 deletions(-)

diff --git a/man/systemd.exec.xml b/man/systemd.exec.xml
index e4d9c0ef1b..67182f17dc 100644
--- a/man/systemd.exec.xml
+++ b/man/systemd.exec.xml
@@ -821,49 +821,37 @@
         <listitem><para>Controls which capabilities to include in the capability bounding set for the executed
         process. See <citerefentry
         project='man-pages'><refentrytitle>capabilities</refentrytitle><manvolnum>7</manvolnum></citerefentry> for
-        details. Takes a whitespace-separated list of capability names as read by <citerefentry
-        project='mankier'><refentrytitle>cap_from_name</refentrytitle><manvolnum>3</manvolnum></citerefentry>,
-        e.g. <constant>CAP_SYS_ADMIN</constant>, <constant>CAP_DAC_OVERRIDE</constant>,
-        <constant>CAP_SYS_PTRACE</constant>. Capabilities listed will be included in the bounding set, all others are
-        removed. If the list of capabilities is prefixed with <literal>~</literal>, all but the listed capabilities
-        will be included, the effect of the assignment inverted. Note that this option also affects the respective
-        capabilities in the effective, permitted and inheritable capability sets. If this option is not used, the
-        capability bounding set is not modified on process execution, hence no limits on the capabilities of the
-        process are enforced. This option may appear more than once, in which case the bounding sets are merged. If the
-        empty string is assigned to this option, the bounding set is reset to the empty capability set, and all prior
-        settings have no effect.  If set to <literal>~</literal> (without any further argument), the bounding set is
-        reset to the full set of available capabilities, also undoing any previous settings. This does not affect
-        commands prefixed with <literal>+</literal>.</para></listitem>
+        details. Takes a whitespace-separated list of capability names, e.g. <constant>CAP_SYS_ADMIN</constant>,
+        <constant>CAP_DAC_OVERRIDE</constant>, <constant>CAP_SYS_PTRACE</constant>. Capabilities listed will be
+        included in the bounding set, all others are removed. If the list of capabilities is prefixed with
+        <literal>~</literal>, all but the listed capabilities will be included, the effect of the assignment
+        inverted. Note that this option also affects the respective capabilities in the effective, permitted and
+        inheritable capability sets. If this option is not used, the capability bounding set is not modified on process
+        execution, hence no limits on the capabilities of the process are enforced. This option may appear more than
+        once, in which case the bounding sets are merged. If the empty string is assigned to this option, the bounding
+        set is reset to the empty capability set, and all prior settings have no effect.  If set to
+        <literal>~</literal> (without any further argument), the bounding set is reset to the full set of available
+        capabilities, also undoing any previous settings. This does not affect commands prefixed with
+        <literal>+</literal>.</para></listitem>
       </varlistentry>
 
       <varlistentry>
         <term><varname>AmbientCapabilities=</varname></term>
 
-        <listitem><para>Controls which capabilities to include in the
-        ambient capability set for the executed process. Takes a
-        whitespace-separated list of capability names as read by
-        <citerefentry project='mankier'><refentrytitle>cap_from_name</refentrytitle><manvolnum>3</manvolnum></citerefentry>,
-        e.g. <constant>CAP_SYS_ADMIN</constant>,
-        <constant>CAP_DAC_OVERRIDE</constant>,
-        <constant>CAP_SYS_PTRACE</constant>. This option may appear more than
-        once in which case the ambient capability sets are merged.
-        If the list of capabilities is prefixed with <literal>~</literal>, all
-        but the listed capabilities will be included, the effect of the
-        assignment inverted. If the empty string is
-        assigned to this option, the ambient capability set is reset to
-        the empty capability set, and all prior settings have no effect.
-        If set to <literal>~</literal> (without any further argument), the
-        ambient capability set is reset to the full set of available
-        capabilities, also undoing any previous settings. Note that adding
-        capabilities to ambient capability set adds them to the process's
-        inherited capability set.
-        </para><para>
-        Ambient capability sets are useful if you want to execute a process
-        as a non-privileged user but still want to give it some capabilities.
-        Note that in this case option <constant>keep-caps</constant> is
-        automatically added to <varname>SecureBits=</varname> to retain the
-        capabilities over the user change. <varname>AmbientCapabilities=</varname> does not affect
-        commands prefixed with <literal>+</literal>.</para></listitem>
+        <listitem><para>Controls which capabilities to include in the ambient capability set for the executed
+        process. Takes a whitespace-separated list of capability names, e.g. <constant>CAP_SYS_ADMIN</constant>,
+        <constant>CAP_DAC_OVERRIDE</constant>, <constant>CAP_SYS_PTRACE</constant>. This option may appear more than
+        once in which case the ambient capability sets are merged.  If the list of capabilities is prefixed with
+        <literal>~</literal>, all but the listed capabilities will be included, the effect of the assignment
+        inverted. If the empty string is assigned to this option, the ambient capability set is reset to the empty
+        capability set, and all prior settings have no effect.  If set to <literal>~</literal> (without any further
+        argument), the ambient capability set is reset to the full set of available capabilities, also undoing any
+        previous settings. Note that adding capabilities to ambient capability set adds them to the process's inherited
+        capability set.  </para><para> Ambient capability sets are useful if you want to execute a process as a
+        non-privileged user but still want to give it some capabilities.  Note that in this case option
+        <constant>keep-caps</constant> is automatically added to <varname>SecureBits=</varname> to retain the
+        capabilities over the user change. <varname>AmbientCapabilities=</varname> does not affect commands prefixed
+        with <literal>+</literal>.</para></listitem>
       </varlistentry>
 
       <varlistentry>
-- 
cgit v1.2.3-54-g00ecf


From effbd6d2eadb61bd236d118afc7901940c4c6b37 Mon Sep 17 00:00:00 2001
From: Lennart Poettering <lennart@poettering.net>
Date: Fri, 26 Aug 2016 12:24:37 +0200
Subject: man: rework documentation for ReadOnlyPaths= and related settings

This reworks the documentation for ReadOnlyPaths=, ReadWritePaths=,
InaccessiblePaths=. It no longer claims that we'd follow symlinks relative to
the host file system. (Which wasn't true actually, as we didn't follow symlinks
at all in the most recent releases, and we now do follow them, but relative to
RootDirectory=).

This also replaces all references to the fact that all fs namespacing options
can be undone with enough privileges and disable propagation by a single one in
the documentation of ReadOnlyPaths= and friends, and then directs the reader to
this in all other places.

Moreover a hint is added to the documentation of SystemCallFilter=, suggesting
usage of ~@mount in case any of the fs namespacing related options are used.
---
 man/systemd.exec.xml | 214 ++++++++++++++++++++++-----------------------------
 1 file changed, 92 insertions(+), 122 deletions(-)

diff --git a/man/systemd.exec.xml b/man/systemd.exec.xml
index 67182f17dc..84f81fe38e 100644
--- a/man/systemd.exec.xml
+++ b/man/systemd.exec.xml
@@ -877,48 +877,34 @@
         <term><varname>ReadOnlyPaths=</varname></term>
         <term><varname>InaccessiblePaths=</varname></term>
 
-        <listitem><para>Sets up a new file system namespace for
-        executed processes. These options may be used to limit access
-        a process might have to the main file system hierarchy. Each
-        setting takes a space-separated list of paths relative to
-        the host's root directory (i.e. the system running the service manager).
-        Note that if entries contain symlinks, they are resolved from the host's root directory as well.
-        Entries (files or directories) listed in
-        <varname>ReadWritePaths=</varname> are accessible from
-        within the namespace with the same access rights as from
-        outside. Entries listed in
-        <varname>ReadOnlyPaths=</varname> are accessible for
-        reading only, writing will be refused even if the usual file
-        access controls would permit this. Entries listed in
-        <varname>InaccessiblePaths=</varname> will be made
-        inaccessible for processes inside the namespace, and may not
-        countain any other mountpoints, including those specified by
-        <varname>ReadWritePaths=</varname> or
-        <varname>ReadOnlyPaths=</varname>.
-        Note that restricting access with these options does not extend
-        to submounts of a directory that are created later on.
-        Non-directory paths can be specified as well. These
-        options may be specified more than once, in which case all
-        paths listed will have limited access from within the
-        namespace. If the empty string is assigned to this option, the
-        specific list is reset, and all prior assignments have no
-        effect.</para>
-        <para>Paths in
-        <varname>ReadOnlyPaths=</varname>
-        and
-        <varname>InaccessiblePaths=</varname>
-        may be prefixed with
-        <literal>-</literal>, in which case
-        they will be ignored when they do not
-        exist. Note that using this
-        setting will disconnect propagation of
-        mounts from the service to the host
-        (propagation in the opposite direction
-        continues to work). This means that
-        this setting may not be used for
-        services which shall be able to
-        install mount points in the main mount
-        namespace.</para></listitem>
+        <listitem><para>Sets up a new file system namespace for executed processes. These options may be used to limit
+        access a process might have to the file system hierarchy. Each setting takes a space-separated list of paths
+        relative to the host's root directory (i.e. the system running the service manager).  Note that if paths
+        contain symlinks, they are resolved relative to the root directory set with
+        <varname>RootDirectory=</varname>.</para>
+
+        <para>Paths listed in <varname>ReadWritePaths=</varname> are accessible from within the namespace with the same
+        access modes as from outside of it. Paths listed in <varname>ReadOnlyPaths=</varname> are accessible for
+        reading only, writing will be refused even if the usual file access controls would permit this. Nest
+        <varname>ReadWritePaths=</varname> inside of <varname>ReadOnlyPaths=</varname> in order to provide writable
+        subdirectories within read-only directories. Use <varname>ReadWritePaths=</varname> in order to whitelist
+        specific paths for write access if <varname>ProtectSystem=strict</varname> is used. Paths listed in
+        <varname>InaccessiblePaths=</varname> will be made inaccessible for processes inside the namespace (along with
+        everything below them in the file system hierarchy).</para>
+
+        <para>Note that restricting access with these options does not extend to submounts of a directory that are
+        created later on.  Non-directory paths may be specified as well. These options may be specified more than once,
+        in which case all paths listed will have limited access from within the namespace. If the empty string is
+        assigned to this option, the specific list is reset, and all prior assignments have no effect.</para>
+
+        <para>Paths in <varname>ReadOnlyPaths=</varname> and <varname>InaccessiblePaths=</varname> may be prefixed with
+        <literal>-</literal>, in which case they will be ignored when they do not exist. Note that using this setting
+        will disconnect propagation of mounts from the service to the host (propagation in the opposite direction
+        continues to work). This means that this setting may not be used for services which shall be able to install
+        mount points in the main mount namespace. Note that the effect of these settings may be undone by privileged
+        processes. In order to set up an effective sandboxed environment for a unit it is thus recommended to combine
+        these settings with either <varname>CapabilityBoundingSet=~CAP_SYS_ADMIN</varname> or
+        <varname>SystemCallFilter=~@mount</varname>.</para></listitem>
       </varlistentry>
 
       <varlistentry>
@@ -933,37 +919,30 @@
         private <filename>/tmp</filename> and <filename>/var/tmp</filename> namespace by using the
         <varname>JoinsNamespaceOf=</varname> directive, see
         <citerefentry><refentrytitle>systemd.unit</refentrytitle><manvolnum>5</manvolnum></citerefentry> for
-        details. Note that using this setting will disconnect propagation of mounts from the service to the host
-        (propagation in the opposite direction continues to work).  This means that this setting may not be used for
-        services which shall be able to install mount points in the main mount namespace. This setting is implied if
-        <varname>DynamicUser=</varname> is set.</para></listitem>
+        details. This setting is implied if <varname>DynamicUser=</varname> is set. For this setting the same
+        restrictions regarding mount propagation and privileges apply as for <varname>ReadOnlyPaths=</varname> and
+        related calls, see above.</para></listitem>
+
       </varlistentry>
 
       <varlistentry>
         <term><varname>PrivateDevices=</varname></term>
 
-        <listitem><para>Takes a boolean argument. If true, sets up a
-        new /dev namespace for the executed processes and only adds
-        API pseudo devices such as <filename>/dev/null</filename>,
-        <filename>/dev/zero</filename> or
-        <filename>/dev/random</filename> (as well as the pseudo TTY
-        subsystem) to it, but no physical devices such as
-        <filename>/dev/sda</filename>. This is useful to securely turn
-        off physical device access by the executed process. Defaults
-        to false. Enabling this option will also remove
-        <constant>CAP_MKNOD</constant> from the capability bounding
-        set for the unit (see above), and set
-        <varname>DevicePolicy=closed</varname> (see
+        <listitem><para>Takes a boolean argument. If true, sets up a new /dev namespace for the executed processes and
+        only adds API pseudo devices such as <filename>/dev/null</filename>, <filename>/dev/zero</filename> or
+        <filename>/dev/random</filename> (as well as the pseudo TTY subsystem) to it, but no physical devices such as
+        <filename>/dev/sda</filename>. This is useful to securely turn off physical device access by the executed
+        process. Defaults to false. Enabling this option will also remove <constant>CAP_MKNOD</constant> from the
+        capability bounding set for the unit (see above), and set <varname>DevicePolicy=closed</varname> (see
         <citerefentry><refentrytitle>systemd.resource-control</refentrytitle><manvolnum>5</manvolnum></citerefentry>
-        for details). Note that using this setting will disconnect
-        propagation of mounts from the service to the host
-        (propagation in the opposite direction continues to work).
-        This means that this setting may not be used for services
-        which shall be able to install mount points in the main mount
-        namespace. The /dev namespace will be mounted read-only and 'noexec'.
-        The latter may break old programs which try to set up executable
-        memory by using <citerefentry><refentrytitle>mmap</refentrytitle><manvolnum>2</manvolnum></citerefentry>
-        of <filename>/dev/zero</filename> instead of using <constant>MAP_ANON</constant>.</para></listitem>
+        for details). Note that using this setting will disconnect propagation of mounts from the service to the host
+        (propagation in the opposite direction continues to work).  This means that this setting may not be used for
+        services which shall be able to install mount points in the main mount namespace. The /dev namespace will be
+        mounted read-only and 'noexec'.  The latter may break old programs which try to set up executable memory by
+        using <citerefentry><refentrytitle>mmap</refentrytitle><manvolnum>2</manvolnum></citerefentry> of
+        <filename>/dev/zero</filename> instead of using <constant>MAP_ANON</constant>. This setting is implied if
+        <varname>DynamicUser=</varname> is set. For this setting the same restrictions regarding mount propagation and
+        privileges apply as for <varname>ReadOnlyPaths=</varname> and related calls, see above.</para></listitem>
       </varlistentry>
 
       <varlistentry>
@@ -1023,33 +1002,23 @@
         operating system (and optionally its configuration, and local mounts) is prohibited for the service.  It is
         recommended to enable this setting for all long-running services, unless they are involved with system updates
         or need to modify the operating system in other ways. If this option is used,
-        <varname>ReadWritePaths=</varname> may be used to exclude specific directories from being made read-only. Note
-        that processes retaining the <constant>CAP_SYS_ADMIN</constant> capability (and with no system call filter that
-        prohibits mount-related system calls applied) can undo the effect of this setting. This setting is hence
-        particularly useful for daemons which have this either the <literal>@mount</literal> set filtered using
-        <varname>SystemCallFilter=</varname>, or have the <constant>CAP_SYS_ADMIN</constant> capability removed, for
-        example with <varname>CapabilityBoundingSet=</varname>.  Defaults to off.</para></listitem>
+        <varname>ReadWritePaths=</varname> may be used to exclude specific directories from being made read-only. This
+        setting is implied if <varname>DynamicUser=</varname> is set. For this setting the same restrictions regarding
+        mount propagation and privileges apply as for <varname>ReadOnlyPaths=</varname> and related calls, see
+        above. Defaults to off.</para></listitem>
       </varlistentry>
 
       <varlistentry>
         <term><varname>ProtectHome=</varname></term>
 
-        <listitem><para>Takes a boolean argument or
-        <literal>read-only</literal>. If true, the directories
-        <filename>/home</filename>, <filename>/root</filename> and
-        <filename>/run/user</filename>
-        are made inaccessible and empty for processes invoked by this
-        unit. If set to <literal>read-only</literal>, the three
-        directories are made read-only instead. It is recommended to
-        enable this setting for all long-running services (in
-        particular network-facing ones), to ensure they cannot get
-        access to private user data, unless the services actually
-        require access to the user's private data. Note however that
-        processes retaining the CAP_SYS_ADMIN capability can undo the
-        effect of this setting. This setting is hence particularly
-        useful for daemons which have this capability removed, for
-        example with <varname>CapabilityBoundingSet=</varname>.
-        Defaults to off.</para></listitem>
+        <listitem><para>Takes a boolean argument or <literal>read-only</literal>. If true, the directories
+        <filename>/home</filename>, <filename>/root</filename> and <filename>/run/user</filename> are made inaccessible
+        and empty for processes invoked by this unit. If set to <literal>read-only</literal>, the three directories are
+        made read-only instead. It is recommended to enable this setting for all long-running services (in particular
+        network-facing ones), to ensure they cannot get access to private user data, unless the services actually
+        require access to the user's private data. This setting is implied if <varname>DynamicUser=</varname> is
+        set. For this setting the same restrictions regarding mount propagation and privileges apply as for
+        <varname>ReadOnlyPaths=</varname> and related calls, see above.</para></listitem>
       </varlistentry>
 
       <varlistentry>
@@ -1059,48 +1028,41 @@
         <filename>/proc/sys</filename> and <filename>/sys</filename> will be made read-only to all processes of the
         unit. Usually, tunable kernel variables should only be written at boot-time, with the
         <citerefentry><refentrytitle>sysctl.d</refentrytitle><manvolnum>5</manvolnum></citerefentry> mechanism. Almost
-        no services need to write to these at runtime; it is hence recommended to turn this on for most
-        services. Defaults to off.</para></listitem>
+        no services need to write to these at runtime; it is hence recommended to turn this on for most services. For
+        this setting the same restrictions regarding mount propagation and privileges apply as for
+        <varname>ReadOnlyPaths=</varname> and related calls, see above. Defaults to off.</para></listitem>
       </varlistentry>
 
       <varlistentry>
         <term><varname>ProtectControlGroups=</varname></term>
 
-        <listitem><para>Takes a boolean argument. If true, the Linux Control Groups ("cgroups") hierarchies accessible
-        through <filename>/sys/fs/cgroup</filename> will be made read-only to all processes of the unit. Except for
-        container managers no services should require write access to the control groups hierarchies; it is hence
-        recommended to turn this on for most services. Defaults to off.</para></listitem>
+        <listitem><para>Takes a boolean argument. If true, the Linux Control Groups (<citerefentry
+        project='man-pages'><refentrytitle>cgroups</refentrytitle><manvolnum>7</manvolnum></citerefentry>) hierarchies
+        accessible through <filename>/sys/fs/cgroup</filename> will be made read-only to all processes of the
+        unit. Except for container managers no services should require write access to the control groups hierarchies;
+        it is hence recommended to turn this on for most services. For this setting the same restrictions regarding
+        mount propagation and privileges apply as for <varname>ReadOnlyPaths=</varname> and related calls, see
+        above. Defaults to off.</para></listitem>
       </varlistentry>
 
       <varlistentry>
         <term><varname>MountFlags=</varname></term>
 
-        <listitem><para>Takes a mount propagation flag:
-        <option>shared</option>, <option>slave</option> or
-        <option>private</option>, which control whether mounts in the
-        file system namespace set up for this unit's processes will
-        receive or propagate mounts or unmounts. See
-        <citerefentry project='man-pages'><refentrytitle>mount</refentrytitle><manvolnum>2</manvolnum></citerefentry>
-        for details. Defaults to <option>shared</option>. Use
-        <option>shared</option> to ensure that mounts and unmounts are
-        propagated from the host to the container and vice versa. Use
-        <option>slave</option> to run processes so that none of their
-        mounts and unmounts will propagate to the host. Use
-        <option>private</option> to also ensure that no mounts and
-        unmounts from the host will propagate into the unit processes'
-        namespace. Note that <option>slave</option> means that file
-        systems mounted on the host might stay mounted continuously in
-        the unit's namespace, and thus keep the device busy. Note that
-        the file system namespace related options
-        (<varname>PrivateTmp=</varname>,
-        <varname>PrivateDevices=</varname>,
-        <varname>ProtectSystem=</varname>,
-        <varname>ProtectHome=</varname>,
-        <varname>ReadOnlyPaths=</varname>,
-        <varname>InaccessiblePaths=</varname> and
-        <varname>ReadWritePaths=</varname>) require that mount
-        and unmount propagation from the unit's file system namespace
-        is disabled, and hence downgrade <option>shared</option> to
+        <listitem><para>Takes a mount propagation flag: <option>shared</option>, <option>slave</option> or
+        <option>private</option>, which control whether mounts in the file system namespace set up for this unit's
+        processes will receive or propagate mounts or unmounts. See <citerefentry
+        project='man-pages'><refentrytitle>mount</refentrytitle><manvolnum>2</manvolnum></citerefentry> for
+        details. Defaults to <option>shared</option>. Use <option>shared</option> to ensure that mounts and unmounts
+        are propagated from the host to the container and vice versa. Use <option>slave</option> to run processes so
+        that none of their mounts and unmounts will propagate to the host. Use <option>private</option> to also ensure
+        that no mounts and unmounts from the host will propagate into the unit processes' namespace. Note that
+        <option>slave</option> means that file systems mounted on the host might stay mounted continuously in the
+        unit's namespace, and thus keep the device busy. Note that the file system namespace related options
+        (<varname>PrivateTmp=</varname>, <varname>PrivateDevices=</varname>, <varname>ProtectSystem=</varname>,
+        <varname>ProtectHome=</varname>, <varname>ProtectKernelTunables=</varname>,
+        <varname>ProtectControlGroups=</varname>, <varname>ReadOnlyPaths=</varname>,
+        <varname>InaccessiblePaths=</varname>, <varname>ReadWritePaths=</varname>) require that mount and unmount
+        propagation from the unit's file system namespace is disabled, and hence downgrade <option>shared</option> to
         <option>slave</option>. </para></listitem>
       </varlistentry>
 
@@ -1335,7 +1297,15 @@
         </table>
 
         Note, that as new system calls are added to the kernel, additional system calls might be added to the groups
-        above, so the contents of the sets may change between systemd versions.</para></listitem>
+        above, so the contents of the sets may change between systemd versions.</para>
+
+        <para>It is recommended to combine the file system namespacing related options with
+        <varname>SystemCallFilter=~@mount</varname>, in order to prohibit the unit's processes to undo the
+        mappings. Specifically these are the options <varname>PrivateTmp=</varname>,
+        <varname>PrivateDevices=</varname>, <varname>ProtectSystem=</varname>, <varname>ProtectHome=</varname>,
+        <varname>ProtectKernelTunables=</varname>, <varname>ProtectControlGroups=</varname>,
+        <varname>ReadOnlyPaths=</varname>, <varname>InaccessiblePaths=</varname> and
+        <varname>ReadWritePaths=</varname>.</para></listitem>
       </varlistentry>
 
       <varlistentry>
-- 
cgit v1.2.3-54-g00ecf


From 81c8aceed4a0cabd605788e46a266cc4cefdc16a Mon Sep 17 00:00:00 2001
From: Lennart Poettering <lennart@poettering.net>
Date: Fri, 26 Aug 2016 12:29:28 +0200
Subject: man: the exit code/signal is stored in $EXIT_CODE, not $EXIT_STATUS

---
 man/systemd.exec.xml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/man/systemd.exec.xml b/man/systemd.exec.xml
index 84f81fe38e..6811e7cc53 100644
--- a/man/systemd.exec.xml
+++ b/man/systemd.exec.xml
@@ -1612,8 +1612,8 @@
         <varname>ExecStop=</varname> and <varname>ExecStopPost=</varname> processes, and encodes the service
         "result". Currently, the following values are defined: <literal>timeout</literal> (in case of an operation
         timeout), <literal>exit-code</literal> (if a service process exited with a non-zero exit code; see
-        <varname>$EXIT_STATUS</varname> below for the actual exit status returned), <literal>signal</literal> (if a
-        service process was terminated abnormally by a signal; see <varname>$EXIT_STATUS</varname> below for the actual
+        <varname>$EXIT_CODE</varname> below for the actual exit code returned), <literal>signal</literal> (if a
+        service process was terminated abnormally by a signal; see <varname>$EXIT_CODE</varname> below for the actual
         signal used for the termination), <literal>core-dump</literal> (if a service process terminated abnormally and
         dumped core), <literal>watchdog</literal> (if the watchdog keep-alive ping was enabled for the service but it
         missed the deadline), or <literal>resources</literal> (a catch-all condition in case a system operation
-- 
cgit v1.2.3-54-g00ecf


From 6757c06a1a8dd3755338ca76e598e0d81dc164f2 Mon Sep 17 00:00:00 2001
From: Lennart Poettering <lennart@poettering.net>
Date: Fri, 26 Aug 2016 12:29:52 +0200
Subject: man: shorten the exit status table a bit

Let's merge a couple of columns, to make the table a bit shorter. This
effectively just drops whitespace, not contents, but makes the currently
humungous table much much more compact.
---
 man/systemd.exec.xml | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/man/systemd.exec.xml b/man/systemd.exec.xml
index 6811e7cc53..403aa471c8 100644
--- a/man/systemd.exec.xml
+++ b/man/systemd.exec.xml
@@ -1658,32 +1658,32 @@
               <row>
                 <entry morerows="1" valign="top"><literal>timeout</literal></entry>
                 <entry valign="top"><literal>killed</literal></entry>
-                <entry><literal>TERM</literal><sbr/><literal>KILL</literal></entry>
+                <entry><literal>TERM</literal>, <literal>KILL</literal></entry>
               </row>
 
               <row>
                 <entry valign="top"><literal>exited</literal></entry>
-                <entry><literal>0</literal><sbr/><literal>1</literal><sbr/><literal>2</literal><sbr/><literal
-                >3</literal><sbr/>…<sbr/><literal>255</literal></entry>
+                <entry><literal>0</literal>, <literal>1</literal>, <literal>2</literal>, <literal
+                >3</literal>, …, <literal>255</literal></entry>
               </row>
 
               <row>
                 <entry valign="top"><literal>exit-code</literal></entry>
                 <entry valign="top"><literal>exited</literal></entry>
-                <entry><literal>0</literal><sbr/><literal>1</literal><sbr/><literal>2</literal><sbr/><literal
-                >3</literal><sbr/>…<sbr/><literal>255</literal></entry>
+                <entry><literal>0</literal>, <literal>1</literal>, <literal>2</literal>, <literal
+                >3</literal>, …, <literal>255</literal></entry>
               </row>
 
               <row>
                 <entry valign="top"><literal>signal</literal></entry>
                 <entry valign="top"><literal>killed</literal></entry>
-                <entry><literal>HUP</literal><sbr/><literal>INT</literal><sbr/><literal>KILL</literal><sbr/>…</entry>
+                <entry><literal>HUP</literal>, <literal>INT</literal>, <literal>KILL</literal>, …</entry>
               </row>
 
               <row>
                 <entry valign="top"><literal>core-dump</literal></entry>
                 <entry valign="top"><literal>dumped</literal></entry>
-                <entry><literal>ABRT</literal><sbr/><literal>SEGV</literal><sbr/><literal>QUIT</literal><sbr/>…</entry>
+                <entry><literal>ABRT</literal>, <literal>SEGV</literal>, <literal>QUIT</literal>, …</entry>
               </row>
 
               <row>
@@ -1693,12 +1693,12 @@
               </row>
               <row>
                 <entry><literal>killed</literal></entry>
-                <entry><literal>TERM</literal><sbr/><literal>KILL</literal></entry>
+                <entry><literal>TERM</literal>, <literal>KILL</literal></entry>
               </row>
               <row>
                 <entry><literal>exited</literal></entry>
-                <entry><literal>0</literal><sbr/><literal>1</literal><sbr/><literal>2</literal><sbr/><literal
-                >3</literal><sbr/>…<sbr/><literal>255</literal></entry>
+                <entry><literal>0</literal>, <literal>1</literal>, <literal>2</literal>, <literal
+                >3</literal>, …, <literal>255</literal></entry>
               </row>
 
               <row>
-- 
cgit v1.2.3-54-g00ecf


From f6eb19a474fdee780d5f2a4b62b5a55e6cbef4de Mon Sep 17 00:00:00 2001
From: Lennart Poettering <lennart@poettering.net>
Date: Fri, 26 Aug 2016 12:45:10 +0200
Subject: units: permit importd to mount stuff

Fixes #3996
---
 units/systemd-importd.service.in | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/units/systemd-importd.service.in b/units/systemd-importd.service.in
index 0f5489e7e3..332ee910d1 100644
--- a/units/systemd-importd.service.in
+++ b/units/systemd-importd.service.in
@@ -18,4 +18,4 @@ NoNewPrivileges=yes
 WatchdogSec=3min
 KillMode=mixed
 MemoryDenyWriteExecute=yes
-SystemCallFilter=~@clock @cpu-emulation @debug @keyring @module @mount @obsolete @raw-io
+SystemCallFilter=~@clock @cpu-emulation @debug @keyring @module @obsolete @raw-io
-- 
cgit v1.2.3-54-g00ecf


From 0c28d51ac84973904e5f780b024adf8108e69fa1 Mon Sep 17 00:00:00 2001
From: Lennart Poettering <lennart@poettering.net>
Date: Fri, 26 Aug 2016 13:23:27 +0200
Subject: units: further lock down our long-running services

Let's make this an exercise in dogfooding: let's turn on more security
features for all our long-running services.

Specifically:

- Turn on RestrictRealtime=yes for all of them

- Turn on ProtectKernelTunables=yes and ProtectControlGroups=yes for most of
  them

- Turn on RestrictAddressFamilies= for all of them, but different sets of
  address families for each

Also, always order settings in the unit files, that the various sandboxing
features are close together.

Add a couple of missing, older settings for a number of unit files.

Note that this change turns off AF_INET/AF_INET6 from udevd, thus effectively
turning off networking from udev rule commands. Since this might break stuff
(that is already broken I'd argue) this is documented in NEWS.
---
 units/systemd-hostnamed.service.in        |  6 +++++-
 units/systemd-importd.service.in          |  6 ++++--
 units/systemd-journal-gatewayd.service.in |  5 +++++
 units/systemd-journal-remote.service.in   | 13 +++++++++----
 units/systemd-journal-upload.service.in   | 12 +++++++++---
 units/systemd-journald.service.in         |  4 +++-
 units/systemd-localed.service.in          |  6 +++++-
 units/systemd-logind.service.in           |  4 +++-
 units/systemd-machined.service.in         |  4 +++-
 units/systemd-networkd.service.m4.in      |  5 ++++-
 units/systemd-resolved.service.m4.in      |  8 +++++++-
 units/systemd-timedated.service.in        |  6 +++++-
 units/systemd-timesyncd.service.in        |  6 +++++-
 units/systemd-udevd.service.in            |  5 ++++-
 14 files changed, 71 insertions(+), 19 deletions(-)

diff --git a/units/systemd-hostnamed.service.in b/units/systemd-hostnamed.service.in
index 0b03a589ea..edc5a1722a 100644
--- a/units/systemd-hostnamed.service.in
+++ b/units/systemd-hostnamed.service.in
@@ -13,12 +13,16 @@ Documentation=http://www.freedesktop.org/wiki/Software/systemd/hostnamed
 [Service]
 ExecStart=@rootlibexecdir@/systemd-hostnamed
 BusName=org.freedesktop.hostname1
-CapabilityBoundingSet=CAP_SYS_ADMIN
 WatchdogSec=3min
+CapabilityBoundingSet=CAP_SYS_ADMIN
 PrivateTmp=yes
 PrivateDevices=yes
 PrivateNetwork=yes
 ProtectSystem=yes
 ProtectHome=yes
+ProtectControlGroups=yes
+ProtectKernelTunables=yes
 MemoryDenyWriteExecute=yes
+RestrictRealtime=yes
+RestrictAddressFamilies=AF_UNIX
 SystemCallFilter=~@clock @cpu-emulation @debug @keyring @module @mount @obsolete @raw-io
diff --git a/units/systemd-importd.service.in b/units/systemd-importd.service.in
index 332ee910d1..ac27c2bcba 100644
--- a/units/systemd-importd.service.in
+++ b/units/systemd-importd.service.in
@@ -13,9 +13,11 @@ Documentation=http://www.freedesktop.org/wiki/Software/systemd/importd
 [Service]
 ExecStart=@rootlibexecdir@/systemd-importd
 BusName=org.freedesktop.import1
-CapabilityBoundingSet=CAP_CHOWN CAP_FOWNER CAP_FSETID CAP_MKNOD CAP_SETFCAP CAP_SYS_ADMIN CAP_SETPCAP CAP_DAC_OVERRIDE
-NoNewPrivileges=yes
 WatchdogSec=3min
 KillMode=mixed
+CapabilityBoundingSet=CAP_CHOWN CAP_FOWNER CAP_FSETID CAP_MKNOD CAP_SETFCAP CAP_SYS_ADMIN CAP_SETPCAP CAP_DAC_OVERRIDE
+NoNewPrivileges=yes
 MemoryDenyWriteExecute=yes
+RestrictRealtime=yes
+RestrictAddressFamilies=AF_UNIX AF_INET AF_INET6
 SystemCallFilter=~@clock @cpu-emulation @debug @keyring @module @obsolete @raw-io
diff --git a/units/systemd-journal-gatewayd.service.in b/units/systemd-journal-gatewayd.service.in
index f4f845841d..efefaa4244 100644
--- a/units/systemd-journal-gatewayd.service.in
+++ b/units/systemd-journal-gatewayd.service.in
@@ -20,6 +20,11 @@ PrivateDevices=yes
 PrivateNetwork=yes
 ProtectSystem=full
 ProtectHome=yes
+ProtectControlGroups=yes
+ProtectKernelTunables=yes
+MemoryDenyWriteExecute=yes
+RestrictRealtime=yes
+RestrictAddressFamilies=AF_UNIX AF_INET AF_INET6
 
 # If there are many split upjournal files we need a lot of fds to
 # access them all and combine
diff --git a/units/systemd-journal-remote.service.in b/units/systemd-journal-remote.service.in
index fdf3da4b64..753dd6c158 100644
--- a/units/systemd-journal-remote.service.in
+++ b/units/systemd-journal-remote.service.in
@@ -11,15 +11,20 @@ Documentation=man:systemd-journal-remote(8) man:journal-remote.conf(5)
 Requires=systemd-journal-remote.socket
 
 [Service]
-ExecStart=@rootlibexecdir@/systemd-journal-remote \
-          --listen-https=-3 \
-          --output=/var/log/journal/remote/
+ExecStart=@rootlibexecdir@/systemd-journal-remote --listen-https=-3 --output=/var/log/journal/remote/
 User=systemd-journal-remote
 Group=systemd-journal-remote
+WatchdogSec=3min
 PrivateTmp=yes
 PrivateDevices=yes
 PrivateNetwork=yes
-WatchdogSec=3min
+ProtectSystem=full
+ProtectHome=yes
+ProtectControlGroups=yes
+ProtectKernelTunables=yes
+MemoryDenyWriteExecute=yes
+RestrictRealtime=yes
+RestrictAddressFamilies=AF_UNIX AF_INET AF_INET6
 
 [Install]
 Also=systemd-journal-remote.socket
diff --git a/units/systemd-journal-upload.service.in b/units/systemd-journal-upload.service.in
index 1f488ff425..c709543af5 100644
--- a/units/systemd-journal-upload.service.in
+++ b/units/systemd-journal-upload.service.in
@@ -11,13 +11,19 @@ Documentation=man:systemd-journal-upload(8)
 After=network.target
 
 [Service]
-ExecStart=@rootlibexecdir@/systemd-journal-upload \
-          --save-state
+ExecStart=@rootlibexecdir@/systemd-journal-upload --save-state
 User=systemd-journal-upload
 SupplementaryGroups=systemd-journal
+WatchdogSec=3min
 PrivateTmp=yes
 PrivateDevices=yes
-WatchdogSec=3min
+ProtectSystem=full
+ProtectHome=yes
+ProtectControlGroups=yes
+ProtectKernelTunables=yes
+MemoryDenyWriteExecute=yes
+RestrictRealtime=yes
+RestrictAddressFamilies=AF_UNIX AF_INET AF_INET6
 
 # If there are many split up journal files we need a lot of fds to
 # access them all and combine
diff --git a/units/systemd-journald.service.in b/units/systemd-journald.service.in
index 08ace8ae44..712ce55483 100644
--- a/units/systemd-journald.service.in
+++ b/units/systemd-journald.service.in
@@ -21,10 +21,12 @@ Restart=always
 RestartSec=0
 NotifyAccess=all
 StandardOutput=null
-CapabilityBoundingSet=CAP_SYS_ADMIN CAP_DAC_OVERRIDE CAP_SYS_PTRACE CAP_SYSLOG CAP_AUDIT_CONTROL CAP_AUDIT_READ CAP_CHOWN CAP_DAC_READ_SEARCH CAP_FOWNER CAP_SETUID CAP_SETGID CAP_MAC_OVERRIDE
 WatchdogSec=3min
 FileDescriptorStoreMax=1024
+CapabilityBoundingSet=CAP_SYS_ADMIN CAP_DAC_OVERRIDE CAP_SYS_PTRACE CAP_SYSLOG CAP_AUDIT_CONTROL CAP_AUDIT_READ CAP_CHOWN CAP_DAC_READ_SEARCH CAP_FOWNER CAP_SETUID CAP_SETGID CAP_MAC_OVERRIDE
 MemoryDenyWriteExecute=yes
+RestrictRealtime=yes
+RestrictAddressFamilies=AF_UNIX AF_NETLINK
 SystemCallFilter=~@clock @cpu-emulation @debug @keyring @module @mount @obsolete @raw-io
 
 # Increase the default a bit in order to allow many simultaneous
diff --git a/units/systemd-localed.service.in b/units/systemd-localed.service.in
index 1f3151c2b5..df829e1164 100644
--- a/units/systemd-localed.service.in
+++ b/units/systemd-localed.service.in
@@ -13,12 +13,16 @@ Documentation=http://www.freedesktop.org/wiki/Software/systemd/localed
 [Service]
 ExecStart=@rootlibexecdir@/systemd-localed
 BusName=org.freedesktop.locale1
-CapabilityBoundingSet=
 WatchdogSec=3min
+CapabilityBoundingSet=
 PrivateTmp=yes
 PrivateDevices=yes
 PrivateNetwork=yes
 ProtectSystem=yes
 ProtectHome=yes
+ProtectControlGroups=yes
+ProtectKernelTunables=yes
 MemoryDenyWriteExecute=yes
+RestrictRealtime=yes
+RestrictAddressFamilies=AF_UNIX
 SystemCallFilter=~@clock @cpu-emulation @debug @keyring @module @mount @obsolete @raw-io
diff --git a/units/systemd-logind.service.in b/units/systemd-logind.service.in
index bee08d011f..0b6de35733 100644
--- a/units/systemd-logind.service.in
+++ b/units/systemd-logind.service.in
@@ -23,9 +23,11 @@ ExecStart=@rootlibexecdir@/systemd-logind
 Restart=always
 RestartSec=0
 BusName=org.freedesktop.login1
-CapabilityBoundingSet=CAP_SYS_ADMIN CAP_MAC_ADMIN CAP_AUDIT_CONTROL CAP_CHOWN CAP_KILL CAP_DAC_READ_SEARCH CAP_DAC_OVERRIDE CAP_FOWNER CAP_SYS_TTY_CONFIG
 WatchdogSec=3min
+CapabilityBoundingSet=CAP_SYS_ADMIN CAP_MAC_ADMIN CAP_AUDIT_CONTROL CAP_CHOWN CAP_KILL CAP_DAC_READ_SEARCH CAP_DAC_OVERRIDE CAP_FOWNER CAP_SYS_TTY_CONFIG
 MemoryDenyWriteExecute=yes
+RestrictRealtime=yes
+RestrictAddressFamilies=AF_UNIX AF_NETLINK AF_INET AF_INET6
 SystemCallFilter=~@clock @cpu-emulation @debug @keyring @module @obsolete @raw-io
 
 # Increase the default a bit in order to allow many simultaneous
diff --git a/units/systemd-machined.service.in b/units/systemd-machined.service.in
index dcf9f347b7..911ead79ee 100644
--- a/units/systemd-machined.service.in
+++ b/units/systemd-machined.service.in
@@ -15,9 +15,11 @@ After=machine.slice
 [Service]
 ExecStart=@rootlibexecdir@/systemd-machined
 BusName=org.freedesktop.machine1
-CapabilityBoundingSet=CAP_KILL CAP_SYS_PTRACE CAP_SYS_ADMIN CAP_SETGID CAP_SYS_CHROOT CAP_DAC_READ_SEARCH CAP_DAC_OVERRIDE CAP_CHOWN CAP_FOWNER CAP_FSETID CAP_MKNOD
 WatchdogSec=3min
+CapabilityBoundingSet=CAP_KILL CAP_SYS_PTRACE CAP_SYS_ADMIN CAP_SETGID CAP_SYS_CHROOT CAP_DAC_READ_SEARCH CAP_DAC_OVERRIDE CAP_CHOWN CAP_FOWNER CAP_FSETID CAP_MKNOD
 MemoryDenyWriteExecute=yes
+RestrictRealtime=yes
+RestrictAddressFamilies=AF_UNIX AF_NETLINK AF_INET AF_INET6
 SystemCallFilter=~@clock @cpu-emulation @debug @keyring @module @obsolete @raw-io
 
 # Note that machined cannot be placed in a mount namespace, since it
diff --git a/units/systemd-networkd.service.m4.in b/units/systemd-networkd.service.m4.in
index 38d967d2d1..a968d8bd45 100644
--- a/units/systemd-networkd.service.m4.in
+++ b/units/systemd-networkd.service.m4.in
@@ -27,11 +27,14 @@ Type=notify
 Restart=on-failure
 RestartSec=0
 ExecStart=@rootlibexecdir@/systemd-networkd
+WatchdogSec=3min
 CapabilityBoundingSet=CAP_NET_ADMIN CAP_NET_BIND_SERVICE CAP_NET_BROADCAST CAP_NET_RAW CAP_SETUID CAP_SETGID CAP_SETPCAP CAP_CHOWN CAP_DAC_OVERRIDE CAP_FOWNER
 ProtectSystem=full
 ProtectHome=yes
-WatchdogSec=3min
+ProtectControlGroups=yes
 MemoryDenyWriteExecute=yes
+RestrictRealtime=yes
+RestrictAddressFamilies=AF_UNIX AF_NETLINK AF_INET AF_INET6 AF_PACKET
 SystemCallFilter=~@clock @cpu-emulation @debug @keyring @module @mount @obsolete @raw-io
 
 [Install]
diff --git a/units/systemd-resolved.service.m4.in b/units/systemd-resolved.service.m4.in
index 15ab56a066..0f0440ddaf 100644
--- a/units/systemd-resolved.service.m4.in
+++ b/units/systemd-resolved.service.m4.in
@@ -23,11 +23,17 @@ Type=notify
 Restart=always
 RestartSec=0
 ExecStart=@rootlibexecdir@/systemd-resolved
+WatchdogSec=3min
 CapabilityBoundingSet=CAP_SETUID CAP_SETGID CAP_SETPCAP CAP_CHOWN CAP_DAC_OVERRIDE CAP_FOWNER CAP_NET_RAW CAP_NET_BIND_SERVICE
+PrivateTmp=yes
+PrivateDevices=yes
 ProtectSystem=full
 ProtectHome=yes
-WatchdogSec=3min
+ProtectControlGroups=yes
+ProtectKernelTunables=yes
 MemoryDenyWriteExecute=yes
+RestrictRealtime=yes
+RestrictAddressFamilies=AF_UNIX AF_NETLINK AF_INET AF_INET6
 SystemCallFilter=~@clock @cpu-emulation @debug @keyring @module @mount @obsolete @raw-io
 
 [Install]
diff --git a/units/systemd-timedated.service.in b/units/systemd-timedated.service.in
index bc1795d747..e8c4d5ed4b 100644
--- a/units/systemd-timedated.service.in
+++ b/units/systemd-timedated.service.in
@@ -13,10 +13,14 @@ Documentation=http://www.freedesktop.org/wiki/Software/systemd/timedated
 [Service]
 ExecStart=@rootlibexecdir@/systemd-timedated
 BusName=org.freedesktop.timedate1
-CapabilityBoundingSet=CAP_SYS_TIME
 WatchdogSec=3min
+CapabilityBoundingSet=CAP_SYS_TIME
 PrivateTmp=yes
 ProtectSystem=yes
 ProtectHome=yes
+ProtectControlGroups=yes
+ProtectKernelTunables=yes
 MemoryDenyWriteExecute=yes
+RestrictRealtime=yes
+RestrictAddressFamilies=AF_UNIX
 SystemCallFilter=~@cpu-emulation @debug @keyring @module @mount @obsolete @raw-io
diff --git a/units/systemd-timesyncd.service.in b/units/systemd-timesyncd.service.in
index df1e339196..9a6c6ea60d 100644
--- a/units/systemd-timesyncd.service.in
+++ b/units/systemd-timesyncd.service.in
@@ -22,13 +22,17 @@ Type=notify
 Restart=always
 RestartSec=0
 ExecStart=@rootlibexecdir@/systemd-timesyncd
+WatchdogSec=3min
 CapabilityBoundingSet=CAP_SYS_TIME CAP_SETUID CAP_SETGID CAP_SETPCAP CAP_CHOWN CAP_DAC_OVERRIDE CAP_FOWNER
 PrivateTmp=yes
 PrivateDevices=yes
 ProtectSystem=full
 ProtectHome=yes
-WatchdogSec=3min
+ProtectControlGroups=yes
+ProtectKernelTunables=yes
 MemoryDenyWriteExecute=yes
+RestrictRealtime=yes
+RestrictAddressFamilies=AF_UNIX AF_INET AF_INET6
 SystemCallFilter=~@cpu-emulation @debug @keyring @module @mount @obsolete @raw-io
 
 [Install]
diff --git a/units/systemd-udevd.service.in b/units/systemd-udevd.service.in
index 67e4c5fcd7..cb2d8ba775 100644
--- a/units/systemd-udevd.service.in
+++ b/units/systemd-udevd.service.in
@@ -21,7 +21,10 @@ Sockets=systemd-udevd-control.socket systemd-udevd-kernel.socket
 Restart=always
 RestartSec=0
 ExecStart=@rootlibexecdir@/systemd-udevd
-MountFlags=slave
 KillMode=mixed
 WatchdogSec=3min
 TasksMax=infinity
+MountFlags=slave
+MemoryDenyWriteExecute=yes
+RestrictRealtime=yes
+RestrictAddressFamilies=AF_UNIX AF_NETLINK
-- 
cgit v1.2.3-54-g00ecf


From 1ecdba149bab8346b611e2ccacfe66e58a7b863c Mon Sep 17 00:00:00 2001
From: Lennart Poettering <lennart@poettering.net>
Date: Mon, 19 Sep 2016 21:29:06 +0200
Subject: NEWS: update news about systemd-udevd.service

---
 NEWS | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/NEWS b/NEWS
index 178ccf9b04..5f3f76df4f 100644
--- a/NEWS
+++ b/NEWS
@@ -137,6 +137,20 @@ CHANGES WITH 232 in spe
           $SYSTEMD_NSPAWN_SHARE_NS_UTS may be used to control the unsharing of
           individual namespaces.
 
+        * systemd-udevd.service is now run in a Seccomp-based sandbox that
+          prohibits access to AF_INET and AF_INET6 sockets and thus access to
+          the network. This might break code that runs from udev rules that
+          tries to talk to the network. Doing that is generally a bad idea and
+          unsafe due to a variety of reasons. It's also racy as device
+          management would race against network configuration. It is
+          recommended to rework such rules to use the SYSTEMD_WANTS property on
+          the relevant devices to pull in a proper systemd service (which can
+          be sandboxed differently and ordered correctly after the network
+          having come up). If that's not possible consider reverting this
+          sandboxing feature locally by removing the RestrictAddressFamilies=
+          setting from the systemd-udevd.service unit file, or adding AF_INET
+          and AF_INET6 to it.
+
 CHANGES WITH 231:
 
         * In service units the various ExecXYZ= settings have been extended
-- 
cgit v1.2.3-54-g00ecf


From ba128bb809cc59ca60db65f0c09bd7f48876fa83 Mon Sep 17 00:00:00 2001
From: Lennart Poettering <lennart@poettering.net>
Date: Fri, 26 Aug 2016 16:39:04 +0200
Subject: execute: filter low-level I/O syscalls if PrivateDevices= is set

If device access is restricted via PrivateDevices=, let's also block the
various low-level I/O syscalls at the same time, so that we know that the
minimal set of devices in our virtualized /dev are really everything the unit
can access.
---
 src/core/execute.c | 65 +++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 64 insertions(+), 1 deletion(-)

diff --git a/src/core/execute.c b/src/core/execute.c
index ae251b2a4c..a20e9ea829 100644
--- a/src/core/execute.c
+++ b/src/core/execute.c
@@ -1422,12 +1422,67 @@ finish:
         return r;
 }
 
+static int apply_private_devices(Unit *u, const ExecContext *c) {
+
+        static const int device_syscalls[] = {
+                SCMP_SYS(ioperm),
+                SCMP_SYS(iopl),
+                SCMP_SYS(pciconfig_iobase),
+                SCMP_SYS(pciconfig_read),
+                SCMP_SYS(pciconfig_write),
+#ifdef __NR_s390_pci_mmio_read
+                SCMP_SYS(s390_pci_mmio_read),
+#endif
+#ifdef __NR_s390_pci_mmio_write
+                SCMP_SYS(s390_pci_mmio_write),
+#endif
+        };
+
+        scmp_filter_ctx *seccomp;
+        unsigned i;
+        int r;
+
+        assert(c);
+
+        /* If PrivateDevices= is set, also turn off iopl and friends. */
+
+        if (skip_seccomp_unavailable(u, "PrivateDevices="))
+                return 0;
+
+        seccomp = seccomp_init(SCMP_ACT_ALLOW);
+        if (!seccomp)
+                return -ENOMEM;
+
+        r = seccomp_add_secondary_archs(seccomp);
+        if (r < 0)
+                goto finish;
+
+        for (i = 0; i < ELEMENTSOF(device_syscalls); i++) {
+                r = seccomp_rule_add(
+                                seccomp,
+                                SCMP_ACT_ERRNO(EPERM),
+                                device_syscalls[i],
+                                0);
+                if (r < 0)
+                        goto finish;
+        }
+
+        r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
+        if (r < 0)
+                goto finish;
+
+        r = seccomp_load(seccomp);
+
+finish:
+        seccomp_release(seccomp);
+        return r;
+}
+
 #endif
 
 static void do_idle_pipe_dance(int idle_pipe[4]) {
         assert(idle_pipe);
 
-
         idle_pipe[1] = safe_close(idle_pipe[1]);
         idle_pipe[2] = safe_close(idle_pipe[2]);
 
@@ -2584,6 +2639,14 @@ static int exec_child(
                         }
                 }
 
+                if (context->private_devices) {
+                        r = apply_private_devices(unit, context);
+                        if (r < 0) {
+                                *exit_status = EXIT_SECCOMP;
+                                return r;
+                        }
+                }
+
                 if (context_has_syscall_filters(context)) {
                         r = apply_seccomp(unit, context);
                         if (r < 0) {
-- 
cgit v1.2.3-54-g00ecf


From 0439746492e5839cfa1cdd76b9d23711eb1f451b Mon Sep 17 00:00:00 2001
From: Lennart Poettering <lennart@poettering.net>
Date: Wed, 24 Aug 2016 20:53:56 +0200
Subject: Update TODO

---
 TODO | 38 +++++++++++---------------------------
 1 file changed, 11 insertions(+), 27 deletions(-)

diff --git a/TODO b/TODO
index e7391f0bfe..a47f4c488b 100644
--- a/TODO
+++ b/TODO
@@ -32,6 +32,8 @@ Janitorial Clean-ups:
 
 Features:
 
+* switch to ProtectSystem=strict for all our long-running services where that's possible
+
 * introduce an "invocation ID" for units, that is randomly generated, and
   identifies each runtime-cycle of a unit. It should be set freshly each time
   we traverse inactive → activating/active, and should be the primary key to
@@ -40,8 +42,9 @@ Features:
   the cgroup of a services. The former is accessible without privileges, the
   latter ensures the ID cannot be faked.
 
-* Introduce ProtectSystem=strict for making the entire OS hierarchy read-only
-  except for a select few
+* If RootDirectory= is used, mount /proc, /sys, /dev into it, if not mounted yet
+
+* Permit masking specific netlink APIs with RestrictAddressFamily=
 
 * nspawn: start UID allocation loop from hash of container name
 
@@ -55,15 +58,13 @@ Features:
 
 * ProtectClock= (drops CAP_SYS_TIMES, adds seecomp filters for settimeofday, adjtimex), sets DeviceAllow o /dev/rtc
 
-* ProtectMount= (drop mount/umount/pivot_root from seccomp, disallow fuse via DeviceAllow, imply Mountflags=slave)
-
-* ProtectDevices= should also take iopl/ioperm/pciaccess away
+* ProtectKernelModules= (drops CAP_SYS_MODULE and filters the kmod syscalls)
 
-* ProtectKeyRing= to take keyring calls away
+* ProtectTracing= (drops CAP_SYS_PTRACE, blocks ptrace syscall, makes /sys/kernel/tracing go away)
 
-* ProtectControlGroups= which mounts all of /sys/fs/cgroup read-only
+* ProtectMount= (drop mount/umount/pivot_root from seccomp, disallow fuse via DeviceAllow, imply Mountflags=slave)
 
-* ProtectKernelTunables= which mounts /sys and /proc/sys read-only
+* ProtectKeyRing= to take keyring calls away
 
 * RemoveKeyRing= to remove all keyring entries of the specified user
 
@@ -72,9 +73,6 @@ Features:
 
 * Add BindDirectory= for allowing arbitrary, private bind mounts for services
 
-* Beef up RootDirectory= to use namespacing/bind mounts as soon as fs
-  namespaces are enabled by the service
-
 * Add RootImage= for mounting a disk image or file as root directory
 
 * RestrictNamespaces= or so in services (taking away the ability to create namespaces, with setns, unshare, clone)
@@ -180,7 +178,7 @@ Features:
 * implement a per-service firewall based on net_cls
 
 * Port various tools to make use of verbs.[ch], where applicable: busctl,
-  bootctl, coredumpctl, hostnamectl, localectl, systemd-analyze, timedatectl
+  coredumpctl, hostnamectl, localectl, systemd-analyze, timedatectl
 
 * hostnamectl: show root image uuid
 
@@ -293,9 +291,6 @@ Features:
 
 * MessageQueueMessageSize= (and suchlike) should use parse_iec_size().
 
-* "busctl status" works only as root on dbus1, since we cannot read
-  /proc/$PID/exe
-
 * implement Distribute= in socket units to allow running multiple
   service instances processing the listening socket, and open this up
   for ReusePort=
@@ -306,8 +301,6 @@ Features:
   and passes this back to PID1 via SCM_RIGHTS. This also could be used
   to allow Chown/chgrp on sockets without requiring NSS in PID 1.
 
-* New service property: maximum CPU runtime for a service
-
 * introduce bus call FreezeUnit(s, b), as well as "systemctl freeze
   $UNIT" and "systemctl thaw $UNIT" as wrappers around this. The calls
   should SIGSTOP all unit processes in a loop until all processes of
@@ -344,12 +337,10 @@ Features:
   error. Currently, we just ignore it and read the unit from the search
   path anyway.
 
-* refuse boot if /etc/os-release is missing or /etc/machine-id cannot be set up
+* refuse boot if /usr/lib/os-release is missing or /etc/machine-id cannot be set up
 
 * btrfs raid assembly: some .device jobs stay stuck in the queue
 
-* make sure gdm does not use multi-user-x but the new default X configuration file, and then remove multi-user-x from systemd
-
 * man: the documentation of Restart= currently is very misleading and suggests the tools from ExecStartPre= might get restarted.
 
 * load .d/*.conf dropins for device units
@@ -606,9 +597,6 @@ Features:
 * currently x-systemd.timeout is lost in the initrd, since crypttab is copied into dracut, but fstab is not
 
 * nspawn:
-  - to allow "linking" of nspawn containers, extend --network-bridge= so
-    that it can dynamically create bridge interfaces that are refcounted
-    by the containers on them. For each group of containers to link together
   - nspawn -x should support ephemeral instances of gpt images
   - emulate /dev/kmsg using CUSE and turn off the syslog syscall
     with seccomp. That should provide us with a useful log buffer that
@@ -617,8 +605,6 @@ Features:
   - as soon as networkd has a bus interface, hook up --network-interface=,
     --network-bridge= with networkd, to trigger netdev creation should an
     interface be missing
-  - don't copy /etc/resolv.conf from host into container unless we are in
-    shared-network mode
   - a nice way to boot up without machine id set, so that it is set at boot
     automatically for supporting --ephemeral. Maybe hash the host machine id
     together with the machine name to generate the machine id for the container
@@ -684,7 +670,6 @@ Features:
 
 * coredump:
   - save coredump in Windows/Mozilla minidump format
-  - move PID 1 segfaults to /var/lib/systemd/coredump?
 
 * support crash reporting operation modes (https://live.gnome.org/GnomeOS/Design/Whiteboards/ProblemReporting)
 
@@ -751,7 +736,6 @@ Features:
   - GC unreferenced jobs (such as .device jobs)
   - move PAM code into its own binary
   - when we automatically restart a service, ensure we restart its rdeps, too.
-  - for services: do not set $HOME in services unless requested
   - hide PAM options in fragment parser when compile time disabled
   - Support --test based on current system state
   - If we show an error about a unit (such as not showing up) and it has no Description string, then show a description string generated form the reverse of unit_name_mangle().
-- 
cgit v1.2.3-54-g00ecf


From 54500613a46023fe991f424e21ed15948b9a74f5 Mon Sep 17 00:00:00 2001
From: Lennart Poettering <lennart@poettering.net>
Date: Fri, 26 Aug 2016 17:25:08 +0200
Subject: main: minor simplification

---
 src/core/main.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/src/core/main.c b/src/core/main.c
index 803307c9d5..be0cb0b6d1 100644
--- a/src/core/main.c
+++ b/src/core/main.c
@@ -996,10 +996,8 @@ static int parse_argv(int argc, char *argv[]) {
 
                 case ARG_MACHINE_ID:
                         r = set_machine_id(optarg);
-                        if (r < 0) {
-                                log_error("MachineID '%s' is not valid.", optarg);
-                                return r;
-                        }
+                        if (r < 0)
+                                return log_error_errno(r, "MachineID '%s' is not valid.", optarg);
                         break;
 
                 case 'h':
-- 
cgit v1.2.3-54-g00ecf


From cd2902c9546eabfffcf5d6de4d0bd4dfe6a4d427 Mon Sep 17 00:00:00 2001
From: Lennart Poettering <lennart@poettering.net>
Date: Fri, 26 Aug 2016 17:25:40 +0200
Subject: namespace: drop all mounts outside of the new root directory

There's no point in mounting these, if they are outside of the root directory
we'll move to.
---
 src/core/namespace.c | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)

diff --git a/src/core/namespace.c b/src/core/namespace.c
index a7451ffbdc..c9b2154985 100644
--- a/src/core/namespace.c
+++ b/src/core/namespace.c
@@ -199,6 +199,31 @@ static void drop_nop(BindMount *m, unsigned *n) {
         *n = t - m;
 }
 
+static void drop_outside_root(const char *root_directory, BindMount *m, unsigned *n) {
+        BindMount *f, *t;
+
+        assert(m);
+        assert(n);
+
+        if (!root_directory)
+                return;
+
+        /* Drops all mounts that are outside of the root directory. */
+
+        for (f = m, t = m; f < m+*n; f++) {
+
+                if (!path_startswith(f->path, root_directory)) {
+                        log_debug("%s is outside of root directory.", f->path);
+                        continue;
+                }
+
+                *t = *f;
+                t++;
+        }
+
+        *n = t - m;
+}
+
 static int mount_dev(BindMount *m) {
         static const char devnodes[] =
                 "/dev/null\0"
@@ -631,6 +656,7 @@ int setup_namespace(
                 qsort(mounts, n, sizeof(BindMount), mount_path_compare);
 
                 drop_duplicates(mounts, &n);
+                drop_outside_root(root_directory, mounts, &n);
                 drop_inaccessible(mounts, &n);
                 drop_nop(mounts, &n);
         }
-- 
cgit v1.2.3-54-g00ecf


From cefc33aee299fa214f093d3d1b4c171ac3b30dde Mon Sep 17 00:00:00 2001
From: Lennart Poettering <lennart@poettering.net>
Date: Fri, 26 Aug 2016 17:40:42 +0200
Subject: execute: move SMACK setup code into its own function

While we are at it, move PAM code #ifdeffery into setup_pam() to simplify the
main execution logic a bit.
---
 src/core/execute.c | 74 ++++++++++++++++++++++++++++++++++--------------------
 1 file changed, 47 insertions(+), 27 deletions(-)

diff --git a/src/core/execute.c b/src/core/execute.c
index a20e9ea829..0488ba2ca9 100644
--- a/src/core/execute.c
+++ b/src/core/execute.c
@@ -837,6 +837,8 @@ static int null_conv(
         return PAM_CONV_ERR;
 }
 
+#endif
+
 static int setup_pam(
                 const char *name,
                 const char *user,
@@ -845,6 +847,8 @@ static int setup_pam(
                 char ***env,
                 int fds[], unsigned n_fds) {
 
+#ifdef HAVE_PAM
+
         static const struct pam_conv conv = {
                 .conv = null_conv,
                 .appdata_ptr = NULL
@@ -1038,8 +1042,10 @@ fail:
         closelog();
 
         return r;
-}
+#else
+        return 0;
 #endif
+}
 
 static void rename_process_from_path(const char *path) {
         char process_name[11];
@@ -1875,6 +1881,42 @@ static int setup_runtime_directory(
         return 0;
 }
 
+static int setup_smack(
+                const ExecContext *context,
+                const ExecCommand *command) {
+
+#ifdef HAVE_SMACK
+        int r;
+
+        assert(context);
+        assert(command);
+
+        if (!mac_smack_use())
+                return 0;
+
+        if (context->smack_process_label) {
+                r = mac_smack_apply_pid(0, context->smack_process_label);
+                if (r < 0)
+                        return r;
+        }
+#ifdef SMACK_DEFAULT_PROCESS_LABEL
+        else {
+                _cleanup_free_ char *exec_label = NULL;
+
+                r = mac_smack_read(command->path, SMACK_ATTR_EXEC, &exec_label);
+                if (r < 0 && r != -ENODATA && r != -EOPNOTSUPP)
+                        return r;
+
+                r = mac_smack_apply_pid(0, exec_label ? : SMACK_DEFAULT_PROCESS_LABEL);
+                if (r < 0)
+                        return r;
+        }
+#endif
+#endif
+
+        return 0;
+}
+
 static int compile_read_write_paths(
                 const ExecContext *context,
                 const ExecParameters *params,
@@ -2349,33 +2391,12 @@ static int exec_child(
         (void) umask(context->umask);
 
         if ((params->flags & EXEC_APPLY_PERMISSIONS) && !command->privileged) {
-#ifdef HAVE_SMACK
-                if (context->smack_process_label) {
-                        r = mac_smack_apply_pid(0, context->smack_process_label);
-                        if (r < 0) {
-                                *exit_status = EXIT_SMACK_PROCESS_LABEL;
-                                return r;
-                        }
+                r = setup_smack(context, command);
+                if (r < 0) {
+                        *exit_status = EXIT_SMACK_PROCESS_LABEL;
+                        return r;
                 }
-#ifdef SMACK_DEFAULT_PROCESS_LABEL
-                else {
-                        _cleanup_free_ char *exec_label = NULL;
 
-                        r = mac_smack_read(command->path, SMACK_ATTR_EXEC, &exec_label);
-                        if (r < 0 && r != -ENODATA && r != -EOPNOTSUPP) {
-                                *exit_status = EXIT_SMACK_PROCESS_LABEL;
-                                return r;
-                        }
-
-                        r = mac_smack_apply_pid(0, exec_label ? : SMACK_DEFAULT_PROCESS_LABEL);
-                        if (r < 0) {
-                                *exit_status = EXIT_SMACK_PROCESS_LABEL;
-                                return r;
-                        }
-                }
-#endif
-#endif
-#ifdef HAVE_PAM
                 if (context->pam_name && username) {
                         r = setup_pam(context->pam_name, username, uid, context->tty_path, &accum_env, fds, n_fds);
                         if (r < 0) {
@@ -2383,7 +2404,6 @@ static int exec_child(
                                 return r;
                         }
                 }
-#endif
         }
 
         if (context->private_network && runtime && runtime->netns_storage_socket[0] >= 0) {
-- 
cgit v1.2.3-54-g00ecf


From 9c94d52e0919e4d7999e49b9ba2654a9e2ca4543 Mon Sep 17 00:00:00 2001
From: Djalal Harouni <tixxdz@opendz.org>
Date: Sun, 25 Sep 2016 11:03:21 +0200
Subject: core:namespace: minor improvements to append_mounts()

---
 src/core/namespace.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/core/namespace.c b/src/core/namespace.c
index c9b2154985..8de774e6f6 100644
--- a/src/core/namespace.c
+++ b/src/core/namespace.c
@@ -70,12 +70,11 @@ static int append_mounts(BindMount **p, char **strv, MountMode mode) {
         assert(p);
 
         STRV_FOREACH(i, strv) {
+                bool ignore = false;
 
-                (*p)->ignore = false;
-
-                if ((mode == INACCESSIBLE || mode == READONLY || mode == READWRITE) && (*i)[0] == '-') {
-                        (*p)->ignore = true;
+                if (IN_SET(mode, INACCESSIBLE, READONLY, READWRITE) && startswith(*i, "-")) {
                         (*i)++;
+                        ignore = true;
                 }
 
                 if (!path_is_absolute(*i))
@@ -83,6 +82,7 @@ static int append_mounts(BindMount **p, char **strv, MountMode mode) {
 
                 (*p)->path = *i;
                 (*p)->mode = mode;
+                (*p)->ignore = ignore;
                 (*p)++;
         }
 
-- 
cgit v1.2.3-54-g00ecf


From 11a30cec2a9b6168b024c06720ad238dd1390794 Mon Sep 17 00:00:00 2001
From: Djalal Harouni <tixxdz@opendz.org>
Date: Sun, 25 Sep 2016 11:16:44 +0200
Subject: core:namespace: put paths protected by ProtectKernelTunables= in
 their own table

Instead of having all these paths everywhere, put the ones that are
protected by ProtectKernelTunables= into their own table. This way it
is easy to add paths and track which ones are protected.
---
 src/core/namespace.c | 54 ++++++++++++++++++++++++++++++++++------------------
 1 file changed, 35 insertions(+), 19 deletions(-)

diff --git a/src/core/namespace.c b/src/core/namespace.c
index 8de774e6f6..13f6aeba51 100644
--- a/src/core/namespace.c
+++ b/src/core/namespace.c
@@ -61,9 +61,23 @@ typedef struct BindMount {
         const char *path; /* stack memory, doesn't need to be freed explicitly */
         char *chased; /* malloc()ed memory, needs to be freed */
         MountMode mode;
-        bool ignore;
+        bool ignore; /* Ignore if path does not exist */
 } BindMount;
 
+typedef struct TargetMount {
+        const char *path;
+        MountMode mode;
+        bool ignore; /* Ignore if path does not exist */
+} TargetMount;
+
+/* ProtectKernelTunables= option and the related filesystem APIs */
+static const TargetMount protect_kernel_tunables_table[] = {
+        { "/proc/sys",                  READONLY,       false },
+        { "/proc/sysrq-trigger",        READONLY,       true  },
+        { "/sys",                       READONLY,       false },
+        { "/sys/fs/cgroup",             READWRITE,      false }, /* READONLY is set by ProtectControlGroups= option */
+};
+
 static int append_mounts(BindMount **p, char **strv, MountMode mode) {
         char **i;
 
@@ -89,6 +103,20 @@ static int append_mounts(BindMount **p, char **strv, MountMode mode) {
         return 0;
 }
 
+static void append_protect_kernel_tunables(BindMount **p, const char *root_directory) {
+        unsigned int i;
+
+        assert(p);
+
+        for (i = 0; i < ELEMENTSOF(protect_kernel_tunables_table); i++) {
+                const TargetMount *t = &protect_kernel_tunables_table[i];
+                (*p)->path = prefix_roota(root_directory, t->path);
+                (*p)->mode = t->mode;
+                (*p)->ignore = t->ignore;
+                (*p)++;
+        }
+}
+
 static int mount_path_compare(const void *a, const void *b) {
         const BindMount *p = a, *q = b;
         int d;
@@ -514,8 +542,8 @@ int setup_namespace(
                 strv_length(read_only_paths) +
                 strv_length(inaccessible_paths) +
                 private_dev +
-                (protect_sysctl ? 3 : 0) +
-                (protect_cgroups != protect_sysctl) +
+                (protect_sysctl ? ELEMENTSOF(protect_kernel_tunables_table) : 0) +
+                (protect_cgroups ? 1 : 0) +
                 (protect_home != PROTECT_HOME_NO || protect_system == PROTECT_SYSTEM_STRICT ? 3 : 0) +
                 (protect_system == PROTECT_SYSTEM_STRICT ?
                  (2 + !private_dev + !protect_sysctl) :
@@ -557,24 +585,12 @@ int setup_namespace(
                         m++;
                 }
 
-                if (protect_sysctl) {
-                        m->path = prefix_roota(root_directory, "/proc/sys");
-                        m->mode = READONLY;
-                        m++;
-
-                        m->path = prefix_roota(root_directory, "/proc/sysrq-trigger");
-                        m->mode = READONLY;
-                        m->ignore = true; /* Not always compiled into the kernel */
-                        m++;
+                if (protect_sysctl)
+                        append_protect_kernel_tunables(&m, root_directory);
 
-                        m->path = prefix_roota(root_directory, "/sys");
-                        m->mode = READONLY;
-                        m++;
-                }
-
-                if (protect_cgroups != protect_sysctl) {
+                if (protect_cgroups) {
                         m->path = prefix_roota(root_directory, "/sys/fs/cgroup");
-                        m->mode = protect_cgroups ? READONLY : READWRITE;
+                        m->mode = READONLY;
                         m++;
                 }
 
-- 
cgit v1.2.3-54-g00ecf


From 2652c6c10394623b2c3e2ed5d4616c85918d140c Mon Sep 17 00:00:00 2001
From: Djalal Harouni <tixxdz@opendz.org>
Date: Sun, 25 Sep 2016 11:25:00 +0200
Subject: core:namespace: simplify mount calculation

Move the mount calculation into its own function. The logic is already
smart enough to drop nop and duplicate mounts later on; this change
improves code readability.
---
 src/core/namespace.c | 47 ++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 36 insertions(+), 11 deletions(-)
---
 src/core/namespace.c | 46 ++++++++++++++++++++++++++++++++++------------
 1 file changed, 34 insertions(+), 12 deletions(-)

diff --git a/src/core/namespace.c b/src/core/namespace.c
index 13f6aeba51..8aa8b83c88 100644
--- a/src/core/namespace.c
+++ b/src/core/namespace.c
@@ -515,6 +515,32 @@ static int chase_all_symlinks(const char *root_directory, BindMount *m, unsigned
         return 0;
 }
 
+static unsigned namespace_calculate_mounts(
+                char** read_write_paths,
+                char** read_only_paths,
+                char** inaccessible_paths,
+                const char* tmp_dir,
+                const char* var_tmp_dir,
+                bool private_dev,
+                bool protect_sysctl,
+                bool protect_cgroups,
+                ProtectHome protect_home,
+                ProtectSystem protect_system) {
+
+        return !!tmp_dir + !!var_tmp_dir +
+                strv_length(read_write_paths) +
+                strv_length(read_only_paths) +
+                strv_length(inaccessible_paths) +
+                private_dev +
+                (protect_sysctl ? ELEMENTSOF(protect_kernel_tunables_table) : 0) +
+                (protect_cgroups ? 1 : 0) +
+                (protect_home != PROTECT_HOME_NO || protect_system == PROTECT_SYSTEM_STRICT ? 3 : 0) +
+                (protect_system == PROTECT_SYSTEM_STRICT ?
+                 (2 + !private_dev + !protect_sysctl) :
+                 ((protect_system != PROTECT_SYSTEM_NO ? 3 : 0) +
+                  (protect_system == PROTECT_SYSTEM_FULL ? 1 : 0)));
+}
+
 int setup_namespace(
                 const char* root_directory,
                 char** read_write_paths,
@@ -537,19 +563,15 @@ int setup_namespace(
         if (mount_flags == 0)
                 mount_flags = MS_SHARED;
 
-        n = !!tmp_dir + !!var_tmp_dir +
-                strv_length(read_write_paths) +
-                strv_length(read_only_paths) +
-                strv_length(inaccessible_paths) +
-                private_dev +
-                (protect_sysctl ? ELEMENTSOF(protect_kernel_tunables_table) : 0) +
-                (protect_cgroups ? 1 : 0) +
-                (protect_home != PROTECT_HOME_NO || protect_system == PROTECT_SYSTEM_STRICT ? 3 : 0) +
-                (protect_system == PROTECT_SYSTEM_STRICT ?
-                 (2 + !private_dev + !protect_sysctl) :
-                 ((protect_system != PROTECT_SYSTEM_NO ? 3 : 0) +
-                  (protect_system == PROTECT_SYSTEM_FULL ? 1 : 0)));
+        n = namespace_calculate_mounts(read_write_paths,
+                                       read_only_paths,
+                                       inaccessible_paths,
+                                       tmp_dir, var_tmp_dir,
+                                       private_dev, protect_sysctl,
+                                       protect_cgroups, protect_home,
+                                       protect_system);
 
+        /* Set mount slave mode */
         if (root_directory || n > 0)
                 make_slave = true;
 
-- 
cgit v1.2.3-54-g00ecf


From e778185bb55320e8242b57c19079377fe33e01bc Mon Sep 17 00:00:00 2001
From: Djalal Harouni <tixxdz@opendz.org>
Date: Mon, 19 Sep 2016 21:46:17 +0200
Subject: doc: documentation fixes for ReadWritePaths= and
 ProtectKernelTunables=

Documentation fixes for ReadWritePaths= and ProtectKernelTunables=
as reported by Evgeny Vereshchagin.
---
 man/systemd.exec.xml | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/man/systemd.exec.xml b/man/systemd.exec.xml
index 403aa471c8..79ceee3ec0 100644
--- a/man/systemd.exec.xml
+++ b/man/systemd.exec.xml
@@ -897,14 +897,14 @@
         in which case all paths listed will have limited access from within the namespace. If the empty string is
         assigned to this option, the specific list is reset, and all prior assignments have no effect.</para>
 
-        <para>Paths in <varname>ReadOnlyPaths=</varname> and <varname>InaccessiblePaths=</varname> may be prefixed with
-        <literal>-</literal>, in which case they will be ignored when they do not exist. Note that using this setting
-        will disconnect propagation of mounts from the service to the host (propagation in the opposite direction
-        continues to work). This means that this setting may not be used for services which shall be able to install
-        mount points in the main mount namespace. Note that the effect of these settings may be undone by privileged
-        processes. In order to set up an effective sandboxed environment for a unit it is thus recommended to combine
-        these settings with either <varname>CapabilityBoundingSet=~CAP_SYS_ADMIN</varname> or
-        <varname>SystemCallFilter=~@mount</varname>.</para></listitem>
+        <para>Paths in <varname>ReadWritePaths=</varname>, <varname>ReadOnlyPaths=</varname> and
+        <varname>InaccessiblePaths=</varname> may be prefixed with <literal>-</literal>, in which case they will be ignored
+        when they do not exist. Note that using this setting will disconnect propagation of mounts from the service to
+        the host (propagation in the opposite direction continues to work). This means that this setting may not be used
+        for services which shall be able to install mount points in the main mount namespace. Note that the effect of
+        these settings may be undone by privileged processes. In order to set up an effective sandboxed environment for
+        a unit it is thus recommended to combine these settings with either
+        <varname>CapabilityBoundingSet=~CAP_SYS_ADMIN</varname> or <varname>SystemCallFilter=~@mount</varname>.</para></listitem>
       </varlistentry>
 
       <varlistentry>
@@ -1025,11 +1025,11 @@
         <term><varname>ProtectKernelTunables=</varname></term>
 
         <listitem><para>Takes a boolean argument. If true, kernel variables accessible through
-        <filename>/proc/sys</filename> and <filename>/sys</filename> will be made read-only to all processes of the
-        unit. Usually, tunable kernel variables should only be written at boot-time, with the
-        <citerefentry><refentrytitle>sysctl.d</refentrytitle><manvolnum>5</manvolnum></citerefentry> mechanism. Almost
-        no services need to write to these at runtime; it is hence recommended to turn this on for most services. For
-        this setting the same restrictions regarding mount propagation and privileges apply as for
+        <filename>/proc/sys</filename>, <filename>/sys</filename> and <filename>/proc/sysrq-trigger</filename> will be
+        made read-only to all processes of the unit. Usually, tunable kernel variables should only be written at
+        boot-time, with the <citerefentry><refentrytitle>sysctl.d</refentrytitle><manvolnum>5</manvolnum></citerefentry>
+        mechanism. Almost no services need to write to these at runtime; it is hence recommended to turn this on for
+        most services. For this setting the same restrictions regarding mount propagation and privileges apply as for
         <varname>ReadOnlyPaths=</varname> and related calls, see above. Defaults to off.</para></listitem>
       </varlistentry>
 
-- 
cgit v1.2.3-54-g00ecf


From 9221aec8d09f3b55a08fcbe8012e48129474ab54 Mon Sep 17 00:00:00 2001
From: Djalal Harouni <tixxdz@opendz.org>
Date: Mon, 19 Sep 2016 21:46:17 +0200
Subject: doc: explicitly document that /dev/mem and /dev/port are blocked by
 PrivateDevices=true

---
 man/systemd.exec.xml | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/man/systemd.exec.xml b/man/systemd.exec.xml
index 79ceee3ec0..a3a431c82b 100644
--- a/man/systemd.exec.xml
+++ b/man/systemd.exec.xml
@@ -931,9 +931,10 @@
         <listitem><para>Takes a boolean argument. If true, sets up a new /dev namespace for the executed processes and
         only adds API pseudo devices such as <filename>/dev/null</filename>, <filename>/dev/zero</filename> or
         <filename>/dev/random</filename> (as well as the pseudo TTY subsystem) to it, but no physical devices such as
-        <filename>/dev/sda</filename>. This is useful to securely turn off physical device access by the executed
-        process. Defaults to false. Enabling this option will also remove <constant>CAP_MKNOD</constant> from the
-        capability bounding set for the unit (see above), and set <varname>DevicePolicy=closed</varname> (see
+        <filename>/dev/sda</filename>, system memory <filename>/dev/mem</filename>, system ports
+        <filename>/dev/port</filename> and others. This is useful to securely turn off physical device access by the
+        executed process. Defaults to false. Enabling this option will also remove <constant>CAP_MKNOD</constant> from
+        the capability bounding set for the unit (see above), and set <varname>DevicePolicy=closed</varname> (see
         <citerefentry><refentrytitle>systemd.resource-control</refentrytitle><manvolnum>5</manvolnum></citerefentry>
         for details). Note that using this setting will disconnect propagation of mounts from the service to the host
         (propagation in the opposite direction continues to work).  This means that this setting may not be used for
-- 
cgit v1.2.3-54-g00ecf


From 49accde7bd915944d99c947dca0cf26ae0f24165 Mon Sep 17 00:00:00 2001
From: Djalal Harouni <tixxdz@opendz.org>
Date: Sun, 25 Sep 2016 11:30:11 +0200
Subject: core:sandbox: add more /proc/* entries to ProtectKernelTunables=

Make ALSA entries, latency interface, mtrr, apm/acpi, suspend interface,
filesystems configuration and IRQ tuning readonly.

Most of these interfaces should nowadays be in /sys, but they are still
available through /proc, so just protect them. This patch does not touch
/proc/net/...
---
 man/systemd.exec.xml |  6 ++++--
 src/core/namespace.c | 11 +++++++++++
 2 files changed, 15 insertions(+), 2 deletions(-)

diff --git a/man/systemd.exec.xml b/man/systemd.exec.xml
index a3a431c82b..f19e7f6ee9 100644
--- a/man/systemd.exec.xml
+++ b/man/systemd.exec.xml
@@ -1026,8 +1026,10 @@
         <term><varname>ProtectKernelTunables=</varname></term>
 
         <listitem><para>Takes a boolean argument. If true, kernel variables accessible through
-        <filename>/proc/sys</filename>, <filename>/sys</filename> and <filename>/proc/sysrq-trigger</filename> will be
-        made read-only to all processes of the unit. Usually, tunable kernel variables should only be written at
+        <filename>/proc/sys</filename>, <filename>/sys</filename>, <filename>/proc/sysrq-trigger</filename>,
+        <filename>/proc/latency_stats</filename>, <filename>/proc/acpi</filename>,
+        <filename>/proc/timer_stats</filename>, <filename>/proc/fs</filename> and <filename>/proc/irq</filename> will
+        be made read-only to all processes of the unit. Usually, tunable kernel variables should only be written at
         boot-time, with the <citerefentry><refentrytitle>sysctl.d</refentrytitle><manvolnum>5</manvolnum></citerefentry>
         mechanism. Almost no services need to write to these at runtime; it is hence recommended to turn this on for
         most services. For this setting the same restrictions regarding mount propagation and privileges apply as for
diff --git a/src/core/namespace.c b/src/core/namespace.c
index 8aa8b83c88..3234fab4bc 100644
--- a/src/core/namespace.c
+++ b/src/core/namespace.c
@@ -74,7 +74,18 @@ typedef struct TargetMount {
 static const TargetMount protect_kernel_tunables_table[] = {
         { "/proc/sys",                  READONLY,       false },
         { "/proc/sysrq-trigger",        READONLY,       true  },
+        { "/proc/latency_stats",        READONLY,       true  },
+        { "/proc/mtrr",                 READONLY,       true  },
+        { "/proc/apm",                  READONLY,       true  },
+        { "/proc/acpi",                 READONLY,       true  },
+        { "/proc/timer_stats",          READONLY,       true  },
+        { "/proc/asound",               READONLY,       true  },
+        { "/proc/bus",                  READONLY,       true  },
+        { "/proc/fs",                   READONLY,       true  },
+        { "/proc/irq",                  READONLY,       true  },
         { "/sys",                       READONLY,       false },
+        { "/sys/kernel/debug",          READONLY,       true  },
+        { "/sys/kernel/tracing",        READONLY,       true  },
         { "/sys/fs/cgroup",             READWRITE,      false }, /* READONLY is set by ProtectControlGroups= option */
 };
 
-- 
cgit v1.2.3-54-g00ecf


From f471b2afa11c97e48a4b6756f7254f88cc436960 Mon Sep 17 00:00:00 2001
From: Djalal Harouni <tixxdz@opendz.org>
Date: Sun, 25 Sep 2016 12:21:25 +0200
Subject: core: simplify ProtectSystem= implementation

ProtectSystem= with all its different modes and other options like
PrivateDevices= + ProtectKernelTunables= + ProtectHome= are orthogonal;
however, it is currently a bit hard to see that from the implementation.
Simplify it by giving each mode its own table with all paths and
references to other Protect options.

With this change some entries are duplicated, but we do not care since
duplicate mounts are first sorted by the most restrictive mode then
cleaned.
---
 src/core/namespace.c | 171 ++++++++++++++++++++++++++++++++++-----------------
 1 file changed, 113 insertions(+), 58 deletions(-)

diff --git a/src/core/namespace.c b/src/core/namespace.c
index 3234fab4bc..985e343096 100644
--- a/src/core/namespace.c
+++ b/src/core/namespace.c
@@ -70,6 +70,14 @@ typedef struct TargetMount {
         bool ignore; /* Ignore if path does not exist */
 } TargetMount;
 
+/*
+ * The following Protect tables are to protect paths and mark some of them
+ * READONLY, in case a path is covered by an option from another table, then
+ * it is marked READWRITE in the current one, and the more restrictive mode is
+ * applied from that other table. This way all options can be combined in a
+ * safe and comprehensible way for users.
+ */
+
 /* ProtectKernelTunables= option and the related filesystem APIs */
 static const TargetMount protect_kernel_tunables_table[] = {
         { "/proc/sys",                  READONLY,       false },
@@ -89,6 +97,45 @@ static const TargetMount protect_kernel_tunables_table[] = {
         { "/sys/fs/cgroup",             READWRITE,      false }, /* READONLY is set by ProtectControlGroups= option */
 };
 
+/* ProtectSystem=yes table */
+static const TargetMount protect_system_yes_table[] = {
+        { "/usr",       READONLY,       false },
+        { "/boot",      READONLY,       true  },
+        { "/efi",       READONLY,       true  },
+};
+
+/* ProtectSystem=full includes ProtectSystem=yes */
+static const TargetMount protect_system_full_table[] = {
+        { "/usr",       READONLY,       false },
+        { "/boot",      READONLY,       true  },
+        { "/efi",       READONLY,       true  },
+        { "/etc",       READONLY,       false },
+};
+
+/*
+ * ProtectSystem=strict table. In this strict mode, we mount everything
+ * read-only, except for /proc, /dev, /sys which are the kernel API VFS,
+ * which are left writable, but PrivateDevices= + ProtectKernelTunables=
+ * protect those, and these options should be fully orthogonal.
+ * (And of course /home and friends are also left writable, as ProtectHome=
+ * shall manage those, orthogonally).
+ */
+static const TargetMount protect_system_strict_table[] = {
+        { "/",          READONLY,       false },
+        { "/proc",      READWRITE,      false },      /* ProtectKernelTunables= */
+        { "/sys",       READWRITE,      false },      /* ProtectKernelTunables= */
+        { "/dev",       READWRITE,      false },      /* PrivateDevices= */
+        { "/home",      READWRITE,      true  },      /* ProtectHome= */
+        { "/run/user",  READWRITE,      true  },      /* ProtectHome= */
+        { "/root",      READWRITE,      true  },      /* ProtectHome= */
+};
+
+static void set_bind_mount(BindMount **p, const char *path, MountMode mode, bool ignore) {
+        (*p)->path = path;
+        (*p)->mode = mode;
+        (*p)->ignore = ignore;
+}
+
 static int append_mounts(BindMount **p, char **strv, MountMode mode) {
         char **i;
 
@@ -105,27 +152,71 @@ static int append_mounts(BindMount **p, char **strv, MountMode mode) {
                 if (!path_is_absolute(*i))
                         return -EINVAL;
 
-                (*p)->path = *i;
-                (*p)->mode = mode;
-                (*p)->ignore = ignore;
+                set_bind_mount(p, *i, mode, ignore);
                 (*p)++;
         }
 
         return 0;
 }
 
-static void append_protect_kernel_tunables(BindMount **p, const char *root_directory) {
-        unsigned int i;
+static int append_target_mounts(BindMount **p, const char *root_directory, const TargetMount *mounts, const size_t size) {
+        unsigned i;
 
         assert(p);
+        assert(mounts);
 
-        for (i = 0; i < ELEMENTSOF(protect_kernel_tunables_table); i++) {
-                const TargetMount *t = &protect_kernel_tunables_table[i];
-                (*p)->path = prefix_roota(root_directory, t->path);
-                (*p)->mode = t->mode;
-                (*p)->ignore = t->ignore;
+        for (i = 0; i < size; i++) {
+                /*
+                 * Here we assume that the ignore field is set during
+                 * declaration we do not support "-" at the beginning.
+                 */
+                const TargetMount *m = &mounts[i];
+                const char *path = prefix_roota(root_directory, m->path);
+
+                if (!path_is_absolute(path))
+                        return -EINVAL;
+
+                set_bind_mount(p, path, m->mode, m->ignore);
                 (*p)++;
         }
+
+        return 0;
+}
+
+static int append_protect_kernel_tunables(BindMount **p, const char *root_directory) {
+        assert(p);
+
+        return append_target_mounts(p, root_directory, protect_kernel_tunables_table,
+                                    ELEMENTSOF(protect_kernel_tunables_table));
+}
+
+static int append_protect_system(BindMount **p, const char *root_directory, ProtectSystem protect_system) {
+        int r = 0;
+
+        assert(p);
+
+        if (protect_system == PROTECT_SYSTEM_NO)
+                return 0;
+
+        switch (protect_system) {
+        case PROTECT_SYSTEM_STRICT:
+                r = append_target_mounts(p, root_directory, protect_system_strict_table,
+                                         ELEMENTSOF(protect_system_strict_table));
+                break;
+        case PROTECT_SYSTEM_YES:
+                r = append_target_mounts(p, root_directory, protect_system_yes_table,
+                                         ELEMENTSOF(protect_system_yes_table));
+                break;
+        case PROTECT_SYSTEM_FULL:
+                r = append_target_mounts(p, root_directory, protect_system_full_table,
+                                         ELEMENTSOF(protect_system_full_table));
+                break;
+        default:
+                r = -EINVAL;
+                break;
+        }
+
+        return r;
 }
 
 static int mount_path_compare(const void *a, const void *b) {
@@ -538,6 +629,14 @@ static unsigned namespace_calculate_mounts(
                 ProtectHome protect_home,
                 ProtectSystem protect_system) {
 
+        unsigned protect_system_cnt =
+                (protect_system == PROTECT_SYSTEM_STRICT ?
+                 ELEMENTSOF(protect_system_strict_table) :
+                 ((protect_system == PROTECT_SYSTEM_FULL) ?
+                  ELEMENTSOF(protect_system_full_table) :
+                  ((protect_system == PROTECT_SYSTEM_YES) ?
+                   ELEMENTSOF(protect_system_yes_table) : 0)));
+
         return !!tmp_dir + !!var_tmp_dir +
                 strv_length(read_write_paths) +
                 strv_length(read_only_paths) +
@@ -546,10 +645,7 @@ static unsigned namespace_calculate_mounts(
                 (protect_sysctl ? ELEMENTSOF(protect_kernel_tunables_table) : 0) +
                 (protect_cgroups ? 1 : 0) +
                 (protect_home != PROTECT_HOME_NO || protect_system == PROTECT_SYSTEM_STRICT ? 3 : 0) +
-                (protect_system == PROTECT_SYSTEM_STRICT ?
-                 (2 + !private_dev + !protect_sysctl) :
-                 ((protect_system != PROTECT_SYSTEM_NO ? 3 : 0) +
-                  (protect_system == PROTECT_SYSTEM_FULL ? 1 : 0)));
+                protect_system_cnt;
 }
 
 int setup_namespace(
@@ -648,50 +744,9 @@ int setup_namespace(
                                 return r;
                 }
 
-                if (protect_system == PROTECT_SYSTEM_STRICT) {
-                        /* In strict mode, we mount everything read-only, except for /proc, /dev, /sys which are the
-                         * kernel API VFS, which are left writable, but PrivateDevices= + ProtectKernelTunables=
-                         * protect those, and these options should be fully orthogonal. (And of course /home and
-                         * friends are also left writable, as ProtectHome= shall manage those, orthogonally, see
-                         * above). */
-
-                        m->path = prefix_roota(root_directory, "/");
-                        m->mode = READONLY;
-                        m++;
-
-                        m->path = prefix_roota(root_directory, "/proc");
-                        m->mode = READWRITE;
-                        m++;
-
-                        if (!private_dev) {
-                                m->path = prefix_roota(root_directory, "/dev");
-                                m->mode = READWRITE;
-                                m++;
-                        }
-                        if (!protect_sysctl) {
-                                m->path = prefix_roota(root_directory, "/sys");
-                                m->mode = READWRITE;
-                                m++;
-                        }
-
-                } else if (protect_system != PROTECT_SYSTEM_NO) {
-                        const char *usr_dir, *boot_dir, *efi_dir, *etc_dir;
-
-                        /* In any other mode we simply mark the relevant three directories ready-only. */
-
-                        usr_dir = prefix_roota(root_directory, "/usr");
-                        boot_dir = prefix_roota(root_directory, "/boot");
-                        boot_dir = strjoina("-", boot_dir);
-                        efi_dir = prefix_roota(root_directory, "/efi");
-                        efi_dir = strjoina("-", efi_dir);
-                        etc_dir = prefix_roota(root_directory, "/etc");
-
-                        r = append_mounts(&m, protect_system == PROTECT_SYSTEM_FULL
-                                          ? STRV_MAKE(usr_dir, boot_dir, efi_dir, etc_dir)
-                                          : STRV_MAKE(usr_dir, boot_dir, efi_dir), READONLY);
-                        if (r < 0)
-                                return r;
-                }
+                r = append_protect_system(&m, root_directory, protect_system);
+                if (r < 0)
+                        return r;
 
                 assert(mounts + n == m);
 
-- 
cgit v1.2.3-54-g00ecf


From b6c432ca7ed930c7e9078ac2266ae439aa242632 Mon Sep 17 00:00:00 2001
From: Djalal Harouni <tixxdz@opendz.org>
Date: Sun, 25 Sep 2016 12:41:16 +0200
Subject: core:namespace: simplify ProtectHome= implementation

As with the previous patch, simplify ProtectHome= and don't worry about
duplicates: they will be sorted by most restrictive mode and cleaned up.
---
 src/core/namespace.c | 75 +++++++++++++++++++++++++++++++++++++---------------
 1 file changed, 53 insertions(+), 22 deletions(-)

diff --git a/src/core/namespace.c b/src/core/namespace.c
index 985e343096..43a2f4ba6e 100644
--- a/src/core/namespace.c
+++ b/src/core/namespace.c
@@ -97,6 +97,23 @@ static const TargetMount protect_kernel_tunables_table[] = {
         { "/sys/fs/cgroup",             READWRITE,      false }, /* READONLY is set by ProtectControlGroups= option */
 };
 
+/*
+ * ProtectHome=read-only table, protect $HOME and $XDG_RUNTIME_DIR and rest of
+ * system should be protected by ProtectSystem=
+ */
+static const TargetMount protect_home_read_only_table[] = {
+        { "/home",      READONLY,       true },
+        { "/run/user",  READONLY,       true },
+        { "/root",      READONLY,       true },
+};
+
+/* ProtectHome=yes table */
+static const TargetMount protect_home_yes_table[] = {
+        { "/home",      INACCESSIBLE,   true },
+        { "/run/user",  INACCESSIBLE,   true },
+        { "/root",      INACCESSIBLE,   true },
+};
+
 /* ProtectSystem=yes table */
 static const TargetMount protect_system_yes_table[] = {
         { "/usr",       READONLY,       false },
@@ -190,6 +207,31 @@ static int append_protect_kernel_tunables(BindMount **p, const char *root_direct
                                     ELEMENTSOF(protect_kernel_tunables_table));
 }
 
+static int append_protect_home(BindMount **p, const char *root_directory, ProtectHome protect_home) {
+        int r = 0;
+
+        assert(p);
+
+        if (protect_home == PROTECT_HOME_NO)
+                return 0;
+
+        switch (protect_home) {
+        case PROTECT_HOME_READ_ONLY:
+                r = append_target_mounts(p, root_directory, protect_home_read_only_table,
+                                         ELEMENTSOF(protect_home_read_only_table));
+                break;
+        case PROTECT_HOME_YES:
+                r = append_target_mounts(p, root_directory, protect_home_yes_table,
+                                         ELEMENTSOF(protect_home_yes_table));
+                break;
+        default:
+                r = -EINVAL;
+                break;
+        }
+
+        return r;
+}
+
 static int append_protect_system(BindMount **p, const char *root_directory, ProtectSystem protect_system) {
         int r = 0;
 
@@ -629,6 +671,7 @@ static unsigned namespace_calculate_mounts(
                 ProtectHome protect_home,
                 ProtectSystem protect_system) {
 
+        unsigned protect_home_cnt;
         unsigned protect_system_cnt =
                 (protect_system == PROTECT_SYSTEM_STRICT ?
                  ELEMENTSOF(protect_system_strict_table) :
@@ -637,6 +680,12 @@ static unsigned namespace_calculate_mounts(
                   ((protect_system == PROTECT_SYSTEM_YES) ?
                    ELEMENTSOF(protect_system_yes_table) : 0)));
 
+        protect_home_cnt =
+                (protect_home == PROTECT_HOME_YES ?
+                 ELEMENTSOF(protect_home_yes_table) :
+                 ((protect_home == PROTECT_HOME_READ_ONLY) ?
+                  ELEMENTSOF(protect_home_read_only_table) : 0));
+
         return !!tmp_dir + !!var_tmp_dir +
                 strv_length(read_write_paths) +
                 strv_length(read_only_paths) +
@@ -644,8 +693,7 @@ static unsigned namespace_calculate_mounts(
                 private_dev +
                 (protect_sysctl ? ELEMENTSOF(protect_kernel_tunables_table) : 0) +
                 (protect_cgroups ? 1 : 0) +
-                (protect_home != PROTECT_HOME_NO || protect_system == PROTECT_SYSTEM_STRICT ? 3 : 0) +
-                protect_system_cnt;
+                protect_home_cnt + protect_system_cnt;
 }
 
 int setup_namespace(
@@ -723,26 +771,9 @@ int setup_namespace(
                         m++;
                 }
 
-                if (protect_home != PROTECT_HOME_NO || protect_system == PROTECT_SYSTEM_STRICT) {
-                        const char *home_dir, *run_user_dir, *root_dir;
-
-                        /* If protection of $HOME and $XDG_RUNTIME_DIR is requested, then go for it. If we are in
-                         * strict system protection mode, then also add entries for these directories, but mark them
-                         * writable. This is because we want ProtectHome= and ProtectSystem= to be fully orthogonal. */
-
-                        home_dir = prefix_roota(root_directory, "/home");
-                        home_dir = strjoina("-", home_dir);
-                        run_user_dir = prefix_roota(root_directory, "/run/user");
-                        run_user_dir = strjoina("-", run_user_dir);
-                        root_dir = prefix_roota(root_directory, "/root");
-                        root_dir = strjoina("-", root_dir);
-
-                        r = append_mounts(&m, STRV_MAKE(home_dir, run_user_dir, root_dir),
-                                protect_home == PROTECT_HOME_READ_ONLY ? READONLY :
-                                protect_home == PROTECT_HOME_YES ? INACCESSIBLE : READWRITE);
-                        if (r < 0)
-                                return r;
-                }
+                r = append_protect_home(&m, root_directory, protect_home);
+                if (r < 0)
+                        return r;
 
                 r = append_protect_system(&m, root_directory, protect_system);
                 if (r < 0)
-- 
cgit v1.2.3-54-g00ecf


From 8f81a5f61bcf745bae3acad599d7a9da686643e3 Mon Sep 17 00:00:00 2001
From: Djalal Harouni <tixxdz@opendz.org>
Date: Sun, 25 Sep 2016 12:52:27 +0200
Subject: core: Use @raw-io syscall group to filter I/O syscalls when
 PrivateDevices= is set

Instead of having a local syscall list, use the @raw-io group which
contains the same set of syscalls to filter.
---
 man/systemd.exec.xml |  6 ++++--
 src/core/execute.c   | 55 +++++++++++++++++++++++++++++++++-------------------
 2 files changed, 39 insertions(+), 22 deletions(-)

diff --git a/man/systemd.exec.xml b/man/systemd.exec.xml
index f19e7f6ee9..f70e5c36d4 100644
--- a/man/systemd.exec.xml
+++ b/man/systemd.exec.xml
@@ -933,8 +933,10 @@
         <filename>/dev/random</filename> (as well as the pseudo TTY subsystem) to it, but no physical devices such as
         <filename>/dev/sda</filename>, system memory <filename>/dev/mem</filename>, system ports
         <filename>/dev/port</filename> and others. This is useful to securely turn off physical device access by the
-        executed process. Defaults to false. Enabling this option will also remove <constant>CAP_MKNOD</constant> from
-        the capability bounding set for the unit (see above), and set <varname>DevicePolicy=closed</varname> (see
+        executed process. Defaults to false. Enabling this option will install a system call filter to block low-level
+        I/O system calls that are grouped in the <varname>@raw-io</varname> set, will also remove
+        <constant>CAP_MKNOD</constant> from the capability bounding set for the unit (see above), and set
+        <varname>DevicePolicy=closed</varname> (see
         <citerefentry><refentrytitle>systemd.resource-control</refentrytitle><manvolnum>5</manvolnum></citerefentry>
         for details). Note that using this setting will disconnect propagation of mounts from the service to the host
         (propagation in the opposite direction continues to work).  This means that this setting may not be used for
diff --git a/src/core/execute.c b/src/core/execute.c
index 0488ba2ca9..3da7ef3be6 100644
--- a/src/core/execute.c
+++ b/src/core/execute.c
@@ -1429,28 +1429,15 @@ finish:
 }
 
 static int apply_private_devices(Unit *u, const ExecContext *c) {
-
-        static const int device_syscalls[] = {
-                SCMP_SYS(ioperm),
-                SCMP_SYS(iopl),
-                SCMP_SYS(pciconfig_iobase),
-                SCMP_SYS(pciconfig_read),
-                SCMP_SYS(pciconfig_write),
-#ifdef __NR_s390_pci_mmio_read
-                SCMP_SYS(s390_pci_mmio_read),
-#endif
-#ifdef __NR_s390_pci_mmio_write
-                SCMP_SYS(s390_pci_mmio_write),
-#endif
-        };
-
+        const SystemCallFilterSet *set;
         scmp_filter_ctx *seccomp;
-        unsigned i;
+        const char *sys;
+        bool syscalls_found = false;
         int r;
 
         assert(c);
 
-        /* If PrivateDevices= is set, also turn off iopl and friends. */
+        /* If PrivateDevices= is set, also turn off iopl and all @raw-io syscalls. */
 
         if (skip_seccomp_unavailable(u, "PrivateDevices="))
                 return 0;
@@ -1463,12 +1450,40 @@ static int apply_private_devices(Unit *u, const ExecContext *c) {
         if (r < 0)
                 goto finish;
 
-        for (i = 0; i < ELEMENTSOF(device_syscalls); i++) {
+        for (set = syscall_filter_sets; set->set_name; set++)
+                if (streq(set->set_name, "@raw-io")) {
+                        syscalls_found = true;
+                        break;
+                }
+
+        /* We should never fail here */
+        if (!syscalls_found) {
+                r = -EOPNOTSUPP;
+                goto finish;
+        }
+
+        NULSTR_FOREACH(sys, set->value) {
+                int id;
+                bool add = true;
+
+#ifndef __NR_s390_pci_mmio_read
+                if (streq(sys, "s390_pci_mmio_read"))
+                        add = false;
+#endif
+#ifndef __NR_s390_pci_mmio_write
+                if (streq(sys, "s390_pci_mmio_write"))
+                        add = false;
+#endif
+
+                if (!add)
+                        continue;
+
+                id = seccomp_syscall_resolve_name(sys);
+
                 r = seccomp_rule_add(
                                 seccomp,
                                 SCMP_ACT_ERRNO(EPERM),
-                                device_syscalls[i],
-                                0);
+                                id, 0);
                 if (r < 0)
                         goto finish;
         }
-- 
cgit v1.2.3-54-g00ecf


From 615a1f4b26f3c7d10ad9ea638341a6920a6bc435 Mon Sep 17 00:00:00 2001
From: Djalal Harouni <tixxdz@opendz.org>
Date: Sun, 25 Sep 2016 13:04:30 +0200
Subject: test: add CAP_MKNOD tests for PrivateDevices=

---
 Makefile.am                                                    |  2 ++
 src/test/test-execute.c                                        | 10 ++++++++++
 .../exec-privatedevices-no-capability-mknod.service            |  7 +++++++
 .../exec-privatedevices-yes-capability-mknod.service           |  7 +++++++
 4 files changed, 26 insertions(+)
 create mode 100644 test/test-execute/exec-privatedevices-no-capability-mknod.service
 create mode 100644 test/test-execute/exec-privatedevices-yes-capability-mknod.service

diff --git a/Makefile.am b/Makefile.am
index e823a5c515..66dbbeca0e 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -1639,6 +1639,8 @@ EXTRA_DIST += \
 	test/test-execute/exec-personality-aarch64.service \
 	test/test-execute/exec-privatedevices-no.service \
 	test/test-execute/exec-privatedevices-yes.service \
+	test/test-execute/exec-privatedevices-no-capability-mknod.service \
+	test/test-execute/exec-privatedevices-yes-capability-mknod.service \
 	test/test-execute/exec-privatetmp-no.service \
 	test/test-execute/exec-privatetmp-yes.service \
 	test/test-execute/exec-spec-interpolation.service \
diff --git a/src/test/test-execute.c b/src/test/test-execute.c
index 25489cefbc..2bc1854485 100644
--- a/src/test/test-execute.c
+++ b/src/test/test-execute.c
@@ -133,6 +133,15 @@ static void test_exec_privatedevices(Manager *m) {
         test(m, "exec-privatedevices-no.service", 0, CLD_EXITED);
 }
 
+static void test_exec_privatedevices_capabilities(Manager *m) {
+        if (detect_container() > 0) {
+                log_notice("testing in container, skipping private device tests");
+                return;
+        }
+        test(m, "exec-privatedevices-yes-capability-mknod.service", 0, CLD_EXITED);
+        test(m, "exec-privatedevices-no-capability-mknod.service", 0, CLD_EXITED);
+}
+
 static void test_exec_systemcallfilter(Manager *m) {
 #ifdef HAVE_SECCOMP
         if (!is_seccomp_available())
@@ -345,6 +354,7 @@ int main(int argc, char *argv[]) {
                 test_exec_ignoresigpipe,
                 test_exec_privatetmp,
                 test_exec_privatedevices,
+                test_exec_privatedevices_capabilities,
                 test_exec_privatenetwork,
                 test_exec_systemcallfilter,
                 test_exec_systemcallerrornumber,
diff --git a/test/test-execute/exec-privatedevices-no-capability-mknod.service b/test/test-execute/exec-privatedevices-no-capability-mknod.service
new file mode 100644
index 0000000000..6d39469da8
--- /dev/null
+++ b/test/test-execute/exec-privatedevices-no-capability-mknod.service
@@ -0,0 +1,7 @@
+[Unit]
+Description=Test CAP_MKNOD capability for PrivateDevices=no
+
+[Service]
+PrivateDevices=no
+ExecStart=/bin/sh -x -c 'capsh --print | grep cap_mknod'
+Type=oneshot
diff --git a/test/test-execute/exec-privatedevices-yes-capability-mknod.service b/test/test-execute/exec-privatedevices-yes-capability-mknod.service
new file mode 100644
index 0000000000..fb1fc2875a
--- /dev/null
+++ b/test/test-execute/exec-privatedevices-yes-capability-mknod.service
@@ -0,0 +1,7 @@
+[Unit]
+Description=Test CAP_MKNOD capability for PrivateDevices=yes
+
+[Service]
+PrivateDevices=yes
+ExecStart=/bin/sh -x -c '! capsh --print | grep cap_mknod'
+Type=oneshot
-- 
cgit v1.2.3-54-g00ecf


From f78b36f016b5f3e6ce1dfbdfcb78ba227ff8ccac Mon Sep 17 00:00:00 2001
From: Djalal Harouni <tixxdz@opendz.org>
Date: Sun, 25 Sep 2016 19:24:25 +0200
Subject: test: add tests for simple ReadOnlyPaths= case

---
 Makefile.am                                  | 1 +
 src/test/test-execute.c                      | 5 +++++
 test/test-execute/exec-readonlypaths.service | 7 +++++++
 3 files changed, 13 insertions(+)
 create mode 100644 test/test-execute/exec-readonlypaths.service

diff --git a/Makefile.am b/Makefile.am
index 66dbbeca0e..0417a0511f 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -1643,6 +1643,7 @@ EXTRA_DIST += \
 	test/test-execute/exec-privatedevices-yes-capability-mknod.service \
 	test/test-execute/exec-privatetmp-no.service \
 	test/test-execute/exec-privatetmp-yes.service \
+	test/test-execute/exec-readonlypaths.service \
 	test/test-execute/exec-spec-interpolation.service \
 	test/test-execute/exec-systemcallerrornumber.service \
 	test/test-execute/exec-systemcallfilter-failing2.service \
diff --git a/src/test/test-execute.c b/src/test/test-execute.c
index 2bc1854485..aa8544e21a 100644
--- a/src/test/test-execute.c
+++ b/src/test/test-execute.c
@@ -142,6 +142,10 @@ static void test_exec_privatedevices_capabilities(Manager *m) {
         test(m, "exec-privatedevices-no-capability-mknod.service", 0, CLD_EXITED);
 }
 
+static void test_exec_readonlypaths(Manager *m) {
+        test(m, "exec-readonlypaths.service", 0, CLD_EXITED);
+}
+
 static void test_exec_systemcallfilter(Manager *m) {
 #ifdef HAVE_SECCOMP
         if (!is_seccomp_available())
@@ -355,6 +359,7 @@ int main(int argc, char *argv[]) {
                 test_exec_privatetmp,
                 test_exec_privatedevices,
                 test_exec_privatedevices_capabilities,
+                test_exec_readonlypaths,
                 test_exec_privatenetwork,
                 test_exec_systemcallfilter,
                 test_exec_systemcallerrornumber,
diff --git a/test/test-execute/exec-readonlypaths.service b/test/test-execute/exec-readonlypaths.service
new file mode 100644
index 0000000000..6866fdc700
--- /dev/null
+++ b/test/test-execute/exec-readonlypaths.service
@@ -0,0 +1,7 @@
+[Unit]
+Description=Test for ReadOnlyPaths=
+
+[Service]
+ReadOnlyPaths=/etc -/i-dont-exist /usr
+ExecStart=/bin/sh -x -c 'test ! -w /etc && test ! -w /usr && test ! -e /i-dont-exist && test -w /var'
+Type=oneshot
-- 
cgit v1.2.3-54-g00ecf


From cdfbd1fb26eb75fe6beca47dce7e5e348b077d97 Mon Sep 17 00:00:00 2001
From: Djalal Harouni <tixxdz@opendz.org>
Date: Sun, 25 Sep 2016 19:50:25 +0200
Subject: test: make sure that {readonly|inaccessible|readwrite}paths
 disconnect mount propagation

Better safe.
---
 Makefile.am                                                   |  3 +++
 src/test/test-execute.c                                       | 11 +++++++++++
 .../exec-inaccessiblepaths-mount-propagation.service          |  7 +++++++
 .../test-execute/exec-readonlypaths-mount-propagation.service |  7 +++++++
 .../exec-readwritepaths-mount-propagation.service             |  7 +++++++
 5 files changed, 35 insertions(+)
 create mode 100644 test/test-execute/exec-inaccessiblepaths-mount-propagation.service
 create mode 100644 test/test-execute/exec-readonlypaths-mount-propagation.service
 create mode 100644 test/test-execute/exec-readwritepaths-mount-propagation.service

diff --git a/Makefile.am b/Makefile.am
index 0417a0511f..9185bae7b7 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -1644,6 +1644,9 @@ EXTRA_DIST += \
 	test/test-execute/exec-privatetmp-no.service \
 	test/test-execute/exec-privatetmp-yes.service \
 	test/test-execute/exec-readonlypaths.service \
+	test/test-execute/exec-readonlypaths-mount-propagation.service \
+	test/test-execute/exec-readwritepaths-mount-propagation.service \
+	test/test-execute/exec-inaccessiblepaths-mount-propagation.service \
 	test/test-execute/exec-spec-interpolation.service \
 	test/test-execute/exec-systemcallerrornumber.service \
 	test/test-execute/exec-systemcallfilter-failing2.service \
diff --git a/src/test/test-execute.c b/src/test/test-execute.c
index aa8544e21a..8b4ff22495 100644
--- a/src/test/test-execute.c
+++ b/src/test/test-execute.c
@@ -144,6 +144,15 @@ static void test_exec_privatedevices_capabilities(Manager *m) {
 
 static void test_exec_readonlypaths(Manager *m) {
         test(m, "exec-readonlypaths.service", 0, CLD_EXITED);
+        test(m, "exec-readonlypaths-mount-propagation.service", 0, CLD_EXITED);
+}
+
+static void test_exec_readwritepaths(Manager *m) {
+        test(m, "exec-readwritepaths-mount-propagation.service", 0, CLD_EXITED);
+}
+
+static void test_exec_inaccessiblepaths(Manager *m) {
+        test(m, "exec-inaccessiblepaths-mount-propagation.service", 0, CLD_EXITED);
 }
 
 static void test_exec_systemcallfilter(Manager *m) {
@@ -360,6 +369,8 @@ int main(int argc, char *argv[]) {
                 test_exec_privatedevices,
                 test_exec_privatedevices_capabilities,
                 test_exec_readonlypaths,
+                test_exec_readwritepaths,
+                test_exec_inaccessiblepaths,
                 test_exec_privatenetwork,
                 test_exec_systemcallfilter,
                 test_exec_systemcallerrornumber,
diff --git a/test/test-execute/exec-inaccessiblepaths-mount-propagation.service b/test/test-execute/exec-inaccessiblepaths-mount-propagation.service
new file mode 100644
index 0000000000..23c6ff3f93
--- /dev/null
+++ b/test/test-execute/exec-inaccessiblepaths-mount-propagation.service
@@ -0,0 +1,7 @@
+[Unit]
+Description=Test to make sure that InaccessiblePaths= disconnect mount propagation
+
+[Service]
+InaccessiblePaths=-/i-dont-exist
+ExecStart=/bin/sh -x -c 'mkdir -p /TEST; mount -t tmpfs tmpfs /TEST; grep TEST /proc/self/mountinfo && ! grep TEST /proc/$${PPID}/mountinfo && ! grep TEST /proc/1/mountinfo'
+Type=oneshot
diff --git a/test/test-execute/exec-readonlypaths-mount-propagation.service b/test/test-execute/exec-readonlypaths-mount-propagation.service
new file mode 100644
index 0000000000..237cbb2efb
--- /dev/null
+++ b/test/test-execute/exec-readonlypaths-mount-propagation.service
@@ -0,0 +1,7 @@
+[Unit]
+Description=Test to make sure that passing ReadOnlyPaths= disconnect mount propagation
+
+[Service]
+ReadOnlyPaths=-/i-dont-exist
+ExecStart=/bin/sh -x -c 'mkdir -p /TEST; mount -t tmpfs tmpfs /TEST; grep TEST /proc/self/mountinfo && ! grep TEST /proc/$${PPID}/mountinfo && ! grep TEST /proc/1/mountinfo'
+Type=oneshot
diff --git a/test/test-execute/exec-readwritepaths-mount-propagation.service b/test/test-execute/exec-readwritepaths-mount-propagation.service
new file mode 100644
index 0000000000..466ce6c747
--- /dev/null
+++ b/test/test-execute/exec-readwritepaths-mount-propagation.service
@@ -0,0 +1,7 @@
+[Unit]
+Description=Test to make sure that passing ReadWritePaths= disconnect mount propagation
+
+[Service]
+ReadWritePaths=-/i-dont-exist
+ExecStart=/bin/sh -x -c 'mkdir -p /TEST; mount -t tmpfs tmpfs /TEST; grep TEST /proc/self/mountinfo && ! grep TEST /proc/$${PPID}/mountinfo && ! grep TEST /proc/1/mountinfo'
+Type=oneshot
-- 
cgit v1.2.3-54-g00ecf