From 72246c2a654ead7f7ee6e7799161e2e46dc0b84b Mon Sep 17 00:00:00 2001
From: Lennart Poettering <lennart@poettering.net>
Date: Mon, 22 Aug 2016 19:01:14 +0200
Subject: core: enforce seccomp for secondary archs too, for all rules

Let's make sure that all our rules apply to all archs the local kernel
supports.
---
 src/core/execute.c | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'src/core')
diff --git a/src/core/execute.c b/src/core/execute.c
index 2026137721..ee734e8445 100644
--- a/src/core/execute.c
+++ b/src/core/execute.c
@@ -1273,6 +1273,10 @@ static int apply_memory_deny_write_execute(const Unit* u, const ExecContext *c)
         if (!seccomp)
                 return -ENOMEM;
 
+        r = seccomp_add_secondary_archs(seccomp);
+        if (r < 0)
+                goto finish;
+
         r = seccomp_rule_add(
                         seccomp,
                         SCMP_ACT_ERRNO(EPERM),
@@ -1322,6 +1326,10 @@ static int apply_restrict_realtime(const Unit* u, const ExecContext *c) {
         if (!seccomp)
                 return -ENOMEM;
 
+        r = seccomp_add_secondary_archs(seccomp);
+        if (r < 0)
+                goto finish;
+
         /* Determine the highest policy constant we want to allow */
         for (i = 0; i < ELEMENTSOF(permitted_policies); i++)
                 if (permitted_policies[i] > max_policy)
-- 
cgit v1.2.3-54-g00ecf


From 59eeb84ba65483c5543d1bc840c2ac75642ef638 Mon Sep 17 00:00:00 2001
From: Lennart Poettering <lennart@poettering.net>
Date: Mon, 22 Aug 2016 18:43:59 +0200
Subject: core: add two new service settings ProtectKernelTunables= and
 ProtectControlGroups=

If enabled, these will block write access to /sys, /proc/sys and
/sys/fs/cgroup.
---
 man/systemd.exec.xml                  |  20 +++++++
 src/core/dbus-execute.c               |   9 ++-
 src/core/execute.c                    | 100 ++++++++++++++++++++++++++++++----
 src/core/execute.h                    |   2 +
 src/core/load-fragment-gperf.gperf.m4 |   2 +
 src/core/namespace.c                  |  36 ++++++++++--
 src/core/namespace.h                  |   2 +
 src/shared/bus-unit-util.c            |   2 +-
 src/test/test-ns.c                    |   2 +
 9 files changed, 159 insertions(+), 16 deletions(-)

(limited to 'src/core')

diff --git a/man/systemd.exec.xml b/man/systemd.exec.xml
index bcedebd5bb..07128b489e 100644
--- a/man/systemd.exec.xml
+++ b/man/systemd.exec.xml
@@ -1059,6 +1059,26 @@
         Defaults to off.</para></listitem>
       </varlistentry>
 
+      <varlistentry>
+        <term><varname>ProtectKernelTunables=</varname></term>
+
+        <listitem><para>Takes a boolean argument. If true, kernel variables accessible through
+        <filename>/proc/sys</filename> and <filename>/sys</filename> will be made read-only to all processes of the
+        unit. Usually, tunable kernel variables should only be written at boot-time, with the
+        <citerefentry><refentrytitle>sysctl.d</refentrytitle><manvolnum>5</manvolnum></citerefentry> mechanism. Almost
+        no services need to write to these at runtime; it is hence recommended to turn this on for most
+        services. Defaults to off.</para></listitem>
+      </varlistentry>
+
+      <varlistentry>
+        <term><varname>ProtectControlGroups=</varname></term>
+
+        <listitem><para>Takes a boolean argument. If true, the Linux Control Groups ("cgroups") hierarchies accessible
+        through <filename>/sys/fs/cgroup</filename> will be made read-only to all processes of the unit. Except for
+        container managers, no services should require write access to the control groups hierarchies; it is hence
+        recommended to turn this on for most services. Defaults to off.</para></listitem>
+      </varlistentry>
+
       <varlistentry>
         <term><varname>MountFlags=</varname></term>
 
diff --git a/src/core/dbus-execute.c b/src/core/dbus-execute.c
index 7e33a2d201..eec4500c8c 100644
--- a/src/core/dbus-execute.c
+++ b/src/core/dbus-execute.c
@@ -707,6 +707,8 @@ const sd_bus_vtable bus_exec_vtable[] = {
         SD_BUS_PROPERTY("MountFlags", "t", bus_property_get_ulong, offsetof(ExecContext, mount_flags), SD_BUS_VTABLE_PROPERTY_CONST),
         SD_BUS_PROPERTY("PrivateTmp", "b", bus_property_get_bool, offsetof(ExecContext, private_tmp), SD_BUS_VTABLE_PROPERTY_CONST),
         SD_BUS_PROPERTY("PrivateDevices", "b", bus_property_get_bool, offsetof(ExecContext, private_devices), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("ProtectKernelTunables", "b", bus_property_get_bool, offsetof(ExecContext, protect_kernel_tunables), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("ProtectControlGroups", "b", bus_property_get_bool, offsetof(ExecContext, protect_control_groups), SD_BUS_VTABLE_PROPERTY_CONST),
         SD_BUS_PROPERTY("PrivateNetwork", "b", bus_property_get_bool, offsetof(ExecContext, private_network), SD_BUS_VTABLE_PROPERTY_CONST),
         SD_BUS_PROPERTY("PrivateUsers", "b", bus_property_get_bool, offsetof(ExecContext, private_users), SD_BUS_VTABLE_PROPERTY_CONST),
         SD_BUS_PROPERTY("ProtectHome", "s", bus_property_get_protect_home, offsetof(ExecContext, protect_home), SD_BUS_VTABLE_PROPERTY_CONST),
@@ -1072,7 +1074,8 @@ int bus_exec_context_set_transient_property(
                               "IgnoreSIGPIPE", "TTYVHangup", "TTYReset",
                               "PrivateTmp", "PrivateDevices", "PrivateNetwork", "PrivateUsers",
                               "NoNewPrivileges", "SyslogLevelPrefix", "MemoryDenyWriteExecute",
-                              "RestrictRealtime", "DynamicUser", "RemoveIPC")) {
+                              "RestrictRealtime", "DynamicUser", "RemoveIPC", "ProtectKernelTunables",
+                              "ProtectControlGroups")) {
                 int b;
 
                 r = sd_bus_message_read(message, "b", &b);
@@ -1106,6 +1109,10 @@ int bus_exec_context_set_transient_property(
                                 c->dynamic_user = b;
                         else if (streq(name, "RemoveIPC"))
                                 c->remove_ipc = b;
+                        else if (streq(name, "ProtectKernelTunables"))
+                                c->protect_kernel_tunables = b;
+                        else if (streq(name, "ProtectControlGroups"))
+                                c->protect_control_groups = b;
 
                         unit_write_drop_in_private_format(u, mode, name, "%s=%s", name, yes_no(b));
                 }
diff --git a/src/core/execute.c b/src/core/execute.c
index ee734e8445..609b69a859 100644
--- a/src/core/execute.c
+++ b/src/core/execute.c
@@ -1383,6 +1383,45 @@ finish:
         return r;
 }
 
+static int apply_protect_sysctl(Unit *u, const ExecContext *c) {
+        scmp_filter_ctx *seccomp;
+        int r;
+
+        assert(c);
+
+        /* Turn off the legacy sysctl() system call. Many distributions turn this off while building the kernel, but
+         * let's protect even those systems where this is left on in the kernel. */
+
+        if (skip_seccomp_unavailable(u, "ProtectKernelTunables="))
+                return 0;
+
+        seccomp = seccomp_init(SCMP_ACT_ALLOW);
+        if (!seccomp)
+                return -ENOMEM;
+
+        r = seccomp_add_secondary_archs(seccomp);
+        if (r < 0)
+                goto finish;
+
+        r = seccomp_rule_add(
+                        seccomp,
+                        SCMP_ACT_ERRNO(EPERM),
+                        SCMP_SYS(_sysctl),
+                        0);
+        if (r < 0)
+                goto finish;
+
+        r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
+        if (r < 0)
+                goto finish;
+
+        r = seccomp_load(seccomp);
+
+finish:
+        seccomp_release(seccomp);
+        return r;
+}
+
 #endif
 
 static void do_idle_pipe_dance(int idle_pipe[4]) {
@@ -1589,7 +1628,9 @@ static bool exec_needs_mount_namespace(
 
         if (context->private_devices ||
             context->protect_system != PROTECT_SYSTEM_NO ||
-            context->protect_home != PROTECT_HOME_NO)
+            context->protect_home != PROTECT_HOME_NO ||
+            context->protect_kernel_tunables ||
+            context->protect_control_groups)
                 return true;
 
         return false;
@@ -1804,6 +1845,37 @@ static int close_remaining_fds(
         return close_all_fds(dont_close, n_dont_close);
 }
 
+static bool context_has_address_families(const ExecContext *c) {
+        assert(c);
+
+        return c->address_families_whitelist ||
+                !set_isempty(c->address_families);
+}
+
+static bool context_has_syscall_filters(const ExecContext *c) {
+        assert(c);
+
+        return c->syscall_whitelist ||
+                !set_isempty(c->syscall_filter) ||
+                !set_isempty(c->syscall_archs);
+}
+
+static bool context_has_no_new_privileges(const ExecContext *c) {
+        assert(c);
+
+        if (c->no_new_privileges)
+                return true;
+
+        if (have_effective_cap(CAP_SYS_ADMIN)) /* if we are privileged, we don't need NNP */
+                return false;
+
+        return context_has_address_families(c) || /* we need NNP if we have any form of seccomp and are unprivileged */
+                c->memory_deny_write_execute ||
+                c->restrict_realtime ||
+                c->protect_kernel_tunables ||
+                context_has_syscall_filters(c);
+}
+
 static int send_user_lookup(
                 Unit *unit,
                 int user_lookup_fd,
@@ -2255,6 +2327,8 @@ static int exec_child(
                                 tmp,
                                 var,
                                 context->private_devices,
+                                context->protect_kernel_tunables,
+                                context->protect_control_groups,
                                 context->protect_home,
                                 context->protect_system,
                                 context->mount_flags);
@@ -2343,11 +2417,6 @@ static int exec_child(
 
         if ((params->flags & EXEC_APPLY_PERMISSIONS) && !command->privileged) {
 
-                bool use_address_families = context->address_families_whitelist ||
-                        !set_isempty(context->address_families);
-                bool use_syscall_filter = context->syscall_whitelist ||
-                        !set_isempty(context->syscall_filter) ||
-                        !set_isempty(context->syscall_archs);
                 int secure_bits = context->secure_bits;
 
                 for (i = 0; i < _RLIMIT_MAX; i++) {
@@ -2424,15 +2493,14 @@ static int exec_child(
                                 return -errno;
                         }
 
-                if (context->no_new_privileges ||
-                    (!have_effective_cap(CAP_SYS_ADMIN) && (use_address_families || context->memory_deny_write_execute || context->restrict_realtime || use_syscall_filter)))
+                if (context_has_no_new_privileges(context))
                         if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0) {
                                 *exit_status = EXIT_NO_NEW_PRIVILEGES;
                                 return -errno;
                         }
 
 #ifdef HAVE_SECCOMP
-                if (use_address_families) {
+                if (context_has_address_families(context)) {
                         r = apply_address_families(unit, context);
                         if (r < 0) {
                                 *exit_status = EXIT_ADDRESS_FAMILIES;
@@ -2456,7 +2524,15 @@ static int exec_child(
                         }
                 }
 
-                if (use_syscall_filter) {
+                if (context->protect_kernel_tunables) {
+                        r = apply_protect_sysctl(unit, context);
+                        if (r < 0) {
+                                *exit_status = EXIT_SECCOMP;
+                                return r;
+                        }
+                }
+
+                if (context_has_syscall_filters(context)) {
                         r = apply_seccomp(unit, context);
                         if (r < 0) {
                                 *exit_status = EXIT_SECCOMP;
@@ -2888,6 +2964,8 @@ void exec_context_dump(ExecContext *c, FILE* f, const char *prefix) {
                 "%sNonBlocking: %s\n"
                 "%sPrivateTmp: %s\n"
                 "%sPrivateDevices: %s\n"
+                "%sProtectKernelTunables: %s\n"
+                "%sProtectControlGroups: %s\n"
                 "%sPrivateNetwork: %s\n"
                 "%sPrivateUsers: %s\n"
                 "%sProtectHome: %s\n"
@@ -2901,6 +2979,8 @@ void exec_context_dump(ExecContext *c, FILE* f, const char *prefix) {
                 prefix, yes_no(c->non_blocking),
                 prefix, yes_no(c->private_tmp),
                 prefix, yes_no(c->private_devices),
+                prefix, yes_no(c->protect_kernel_tunables),
+                prefix, yes_no(c->protect_control_groups),
                 prefix, yes_no(c->private_network),
                 prefix, yes_no(c->private_users),
                 prefix, protect_home_to_string(c->protect_home),
diff --git a/src/core/execute.h b/src/core/execute.h
index 6082c42aba..449180c903 100644
--- a/src/core/execute.h
+++ b/src/core/execute.h
@@ -174,6 +174,8 @@ struct ExecContext {
         bool private_users;
         ProtectSystem protect_system;
         ProtectHome protect_home;
+        bool protect_kernel_tunables;
+        bool protect_control_groups;
 
         bool no_new_privileges;
 
diff --git a/src/core/load-fragment-gperf.gperf.m4 b/src/core/load-fragment-gperf.gperf.m4
index 2e6c965aec..c49c1d6732 100644
--- a/src/core/load-fragment-gperf.gperf.m4
+++ b/src/core/load-fragment-gperf.gperf.m4
@@ -89,6 +89,8 @@ $1.ReadOnlyPaths,                config_parse_namespace_path_strv,   0,
 $1.InaccessiblePaths,            config_parse_namespace_path_strv,   0,                             offsetof($1, exec_context.inaccessible_paths)
 $1.PrivateTmp,                   config_parse_bool,                  0,                             offsetof($1, exec_context.private_tmp)
 $1.PrivateDevices,               config_parse_bool,                  0,                             offsetof($1, exec_context.private_devices)
+$1.ProtectKernelTunables,        config_parse_bool,                  0,                             offsetof($1, exec_context.protect_kernel_tunables)
+$1.ProtectControlGroups,         config_parse_bool,                  0,                             offsetof($1, exec_context.protect_control_groups)
 $1.PrivateNetwork,               config_parse_bool,                  0,                             offsetof($1, exec_context.private_network)
 $1.PrivateUsers,                 config_parse_bool,                  0,                             offsetof($1, exec_context.private_users)
 $1.ProtectSystem,                config_parse_protect_system,        0,                             offsetof($1, exec_context)
diff --git a/src/core/namespace.c b/src/core/namespace.c
index 52a2505d94..f2768aeb28 100644
--- a/src/core/namespace.c
+++ b/src/core/namespace.c
@@ -53,7 +53,7 @@ typedef enum MountMode {
         PRIVATE_TMP,
         PRIVATE_VAR_TMP,
         PRIVATE_DEV,
-        READWRITE
+        READWRITE,
 } MountMode;
 
 typedef struct BindMount {
@@ -366,6 +366,8 @@ int setup_namespace(
                 const char* tmp_dir,
                 const char* var_tmp_dir,
                 bool private_dev,
+                bool protect_sysctl,
+                bool protect_cgroups,
                 ProtectHome protect_home,
                 ProtectSystem protect_system,
                 unsigned long mount_flags) {
@@ -385,6 +387,8 @@ int setup_namespace(
                 strv_length(read_only_paths) +
                 strv_length(inaccessible_paths) +
                 private_dev +
+                (protect_sysctl ? 3 : 0) +
+                (protect_cgroups != protect_sysctl) +
                 (protect_home != PROTECT_HOME_NO ? 3 : 0) +
                 (protect_system != PROTECT_SYSTEM_NO ? 2 : 0) +
                 (protect_system == PROTECT_SYSTEM_FULL ? 1 : 0);
@@ -421,6 +425,27 @@ int setup_namespace(
                         m++;
                 }
 
+                if (protect_sysctl) {
+                        m->path = prefix_roota(root_directory, "/proc/sys");
+                        m->mode = READONLY;
+                        m++;
+
+                        m->path = prefix_roota(root_directory, "/proc/sysrq-trigger");
+                        m->mode = READONLY;
+                        m->ignore = true; /* Not always compiled into the kernel */
+                        m++;
+
+                        m->path = prefix_roota(root_directory, "/sys");
+                        m->mode = READONLY;
+                        m++;
+                }
+
+                if (protect_cgroups != protect_sysctl) {
+                        m->path = prefix_roota(root_directory, "/sys/fs/cgroup");
+                        m->mode = protect_cgroups ? READONLY : READWRITE;
+                        m++;
+                }
+
                 if (protect_home != PROTECT_HOME_NO) {
                         const char *home_dir, *run_user_dir, *root_dir;
 
@@ -505,9 +530,12 @@ int setup_namespace(
 
 fail:
         if (n > 0) {
-                for (m = mounts; m < mounts + n; ++m)
-                        if (m->done)
-                                (void) umount2(m->path, MNT_DETACH);
+                for (m = mounts; m < mounts + n; ++m) {
+                        if (!m->done)
+                                continue;
+
+                        (void) umount2(m->path, MNT_DETACH);
+                }
         }
 
         return r;
diff --git a/src/core/namespace.h b/src/core/namespace.h
index 1aedf5f208..3845336287 100644
--- a/src/core/namespace.h
+++ b/src/core/namespace.h
@@ -46,6 +46,8 @@ int setup_namespace(const char *chroot,
                     const char *tmp_dir,
                     const char *var_tmp_dir,
                     bool private_dev,
+                    bool protect_sysctl,
+                    bool protect_cgroups,
                     ProtectHome protect_home,
                     ProtectSystem protect_system,
                     unsigned long mount_flags);
diff --git a/src/shared/bus-unit-util.c b/src/shared/bus-unit-util.c
index feb4a06737..c6bd2f145c 100644
--- a/src/shared/bus-unit-util.c
+++ b/src/shared/bus-unit-util.c
@@ -204,7 +204,7 @@ int bus_append_unit_property_assignment(sd_bus_message *m, const char *assignmen
                               "IgnoreSIGPIPE", "TTYVHangup", "TTYReset", "RemainAfterExit",
                               "PrivateTmp", "PrivateDevices", "PrivateNetwork", "PrivateUsers", "NoNewPrivileges",
                               "SyslogLevelPrefix", "Delegate", "RemainAfterElapse", "MemoryDenyWriteExecute",
-                              "RestrictRealtime", "DynamicUser", "RemoveIPC")) {
+                              "RestrictRealtime", "DynamicUser", "RemoveIPC", "ProtectKernelTunables", "ProtectControlGroups")) {
 
                 r = parse_boolean(eq);
                 if (r < 0)
diff --git a/src/test/test-ns.c b/src/test/test-ns.c
index 9248f2987c..05f243c75c 100644
--- a/src/test/test-ns.c
+++ b/src/test/test-ns.c
@@ -69,6 +69,8 @@ int main(int argc, char *argv[]) {
                             tmp_dir,
                             var_tmp_dir,
                             true,
+                            true,
+                            true,
                             PROTECT_HOME_NO,
                             PROTECT_SYSTEM_NO,
                             0);
-- 
cgit v1.2.3-54-g00ecf


From fe3c2583bee339b6744872dc1897e6486d5bd7e0 Mon Sep 17 00:00:00 2001
From: Lennart Poettering <lennart@poettering.net>
Date: Wed, 24 Aug 2016 23:17:42 +0200
Subject: namespace: make sure InaccessibleDirectories= masks all mounts
 further down

If a dir is marked to be inaccessible then everything below it should be masked
by it.
---
 src/core/namespace.c | 44 ++++++++++++++++++++++++++++++++++++++++----
 src/test/test-ns.c   |  4 +++-
 2 files changed, 43 insertions(+), 5 deletions(-)

(limited to 'src/core')

diff --git a/src/core/namespace.c b/src/core/namespace.c
index f2768aeb28..102fe576f3 100644
--- a/src/core/namespace.c
+++ b/src/core/namespace.c
@@ -116,16 +116,47 @@ static void drop_duplicates(BindMount *m, unsigned *n) {
         assert(m);
         assert(n);
 
+        /* Drops duplicate entries. Expects that the array is properly ordered already. */
+
         for (f = m, t = m, previous = NULL; f < m+*n; f++) {
 
-                /* The first one wins */
-                if (previous && path_equal(f->path, previous->path))
+                /* The first one wins (which is the one with the more restrictive mode), see mount_path_compare()
+                 * above. */
+                if (previous && path_equal(f->path, previous->path)) {
+                        log_debug("%s is duplicate.", f->path);
                         continue;
+                }
 
                 *t = *f;
-
                 previous = t;
+                t++;
+        }
+
+        *n = t - m;
+}
+
+static void drop_inaccessible(BindMount *m, unsigned *n) {
+        BindMount *f, *t;
+        const char *clear = NULL;
+
+        assert(m);
+        assert(n);
+
+        /* Drops all entries obstructed by another entry further up the tree. Expects that the array is properly
+         * ordered already. */
+
+        for (f = m, t = m; f < m+*n; f++) {
+
+                /* If we found a path set for INACCESSIBLE earlier, and this entry has it as prefix we should drop
+                 * it, as inaccessible paths really should drop the entire subtree. */
+                if (clear && path_startswith(f->path, clear)) {
+                        log_debug("%s is masked by %s.", f->path, clear);
+                        continue;
+                }
 
+                clear = f->mode == INACCESSIBLE ? f->path : NULL;
+
+                *t = *f;
                 t++;
         }
 
@@ -282,6 +313,8 @@ static int apply_mount(
 
         assert(m);
 
+        log_debug("Applying namespace mount on %s", m->path);
+
         switch (m->mode) {
 
         case INACCESSIBLE:
@@ -289,7 +322,7 @@ static int apply_mount(
                 /* First, get rid of everything that is below if there
                  * is anything... Then, overmount it with an
                  * inaccessible path. */
-                umount_recursive(m->path, 0);
+                (void) umount_recursive(m->path, 0);
 
                 if (lstat(m->path, &target) < 0) {
                         if (m->ignore && errno == ENOENT)
@@ -303,6 +336,7 @@ static int apply_mount(
                         return -ELOOP;
                 }
                 break;
+
         case READONLY:
         case READWRITE:
                 /* Nothing to mount here, we just later toggle the
@@ -480,7 +514,9 @@ int setup_namespace(
                 assert(mounts + n == m);
 
                 qsort(mounts, n, sizeof(BindMount), mount_path_compare);
+
                 drop_duplicates(mounts, &n);
+                drop_inaccessible(mounts, &n);
         }
 
         if (n > 0 || root_directory) {
diff --git a/src/test/test-ns.c b/src/test/test-ns.c
index 05f243c75c..03a24620af 100644
--- a/src/test/test-ns.c
+++ b/src/test/test-ns.c
@@ -26,6 +26,7 @@
 int main(int argc, char *argv[]) {
         const char * const writable[] = {
                 "/home",
+                "/home/lennart/projects/foobar", /* this should be masked automatically */
                 NULL
         };
 
@@ -42,11 +43,12 @@ int main(int argc, char *argv[]) {
         };
         char *root_directory;
         char *projects_directory;
-
         int r;
         char tmp_dir[] = "/tmp/systemd-private-XXXXXX",
              var_tmp_dir[] = "/var/tmp/systemd-private-XXXXXX";
 
+        log_set_max_level(LOG_DEBUG);
+
         assert_se(mkdtemp(tmp_dir));
         assert_se(mkdtemp(var_tmp_dir));
 
-- 
cgit v1.2.3-54-g00ecf


From 07689d5d2c07ee434437de5e39bf0abaa772818b Mon Sep 17 00:00:00 2001
From: Lennart Poettering <lennart@poettering.net>
Date: Thu, 25 Aug 2016 10:12:57 +0200
Subject: execute: split out creation of runtime dirs into its own functions

---
 src/core/execute.c | 57 +++++++++++++++++++++++++++++++++---------------------
 1 file changed, 35 insertions(+), 22 deletions(-)

(limited to 'src/core')

diff --git a/src/core/execute.c b/src/core/execute.c
index 609b69a859..3877293b4f 100644
--- a/src/core/execute.c
+++ b/src/core/execute.c
@@ -1789,6 +1789,37 @@ static int setup_private_users(uid_t uid, gid_t gid) {
         return 0;
 }
 
+static int setup_runtime_directory(
+                const ExecContext *context,
+                const ExecParameters *params,
+                uid_t uid,
+                gid_t gid) {
+
+        char **rt;
+        int r;
+
+        assert(context);
+        assert(params);
+
+        STRV_FOREACH(rt, context->runtime_directory) {
+                _cleanup_free_ char *p;
+
+                p = strjoin(params->runtime_prefix, "/", *rt, NULL);
+                if (!p)
+                        return -ENOMEM;
+
+                r = mkdir_p_label(p, context->runtime_directory_mode);
+                if (r < 0)
+                        return r;
+
+                r = chmod_and_chown(p, context->runtime_directory_mode, uid, gid);
+                if (r < 0)
+                        return r;
+        }
+
+        return 0;
+}
+
 static void append_socket_pair(int *array, unsigned *n, int pair[2]) {
         assert(array);
         assert(n);
@@ -2188,28 +2219,10 @@ static int exec_child(
         }
 
         if (!strv_isempty(context->runtime_directory) && params->runtime_prefix) {
-                char **rt;
-
-                STRV_FOREACH(rt, context->runtime_directory) {
-                        _cleanup_free_ char *p;
-
-                        p = strjoin(params->runtime_prefix, "/", *rt, NULL);
-                        if (!p) {
-                                *exit_status = EXIT_RUNTIME_DIRECTORY;
-                                return -ENOMEM;
-                        }
-
-                        r = mkdir_p_label(p, context->runtime_directory_mode);
-                        if (r < 0) {
-                                *exit_status = EXIT_RUNTIME_DIRECTORY;
-                                return r;
-                        }
-
-                        r = chmod_and_chown(p, context->runtime_directory_mode, uid, gid);
-                        if (r < 0) {
-                                *exit_status = EXIT_RUNTIME_DIRECTORY;
-                                return r;
-                        }
+                r = setup_runtime_directory(context, params, uid, gid);
+                if (r < 0) {
+                        *exit_status = EXIT_RUNTIME_DIRECTORY;
+                        return r;
                 }
         }
 
-- 
cgit v1.2.3-54-g00ecf


From be39ccf3a0d4d15324af1de4d8552a1d65f40808 Mon Sep 17 00:00:00 2001
From: Lennart Poettering <lennart@poettering.net>
Date: Thu, 25 Aug 2016 10:24:10 +0200
Subject: execute: move suppression of HOME=/ and SHELL=/bin/nologin into
 user-util.c

This adds a new call get_user_creds_clean(), which is just like
get_user_creds() but returns NULL in the home/shell parameters if they contain
no useful information. This code previously lived in execute.c, but by
generalizing this we can reuse it in run.c.
---
 src/basic/user-util.c | 32 +++++++++++++++++++++++++++++++-
 src/basic/user-util.h |  1 +
 src/core/execute.c    | 14 +++-----------
 src/run/run.c         | 18 +++++++++++-------
 4 files changed, 46 insertions(+), 19 deletions(-)

(limited to 'src/core')

diff --git a/src/basic/user-util.c b/src/basic/user-util.c
index 122d9a0c7c..0522bce1d1 100644
--- a/src/basic/user-util.c
+++ b/src/basic/user-util.c
@@ -31,14 +31,15 @@
 #include <unistd.h>
 #include <utmp.h>
 
-#include "missing.h"
 #include "alloc-util.h"
 #include "fd-util.h"
 #include "formats-util.h"
 #include "macro.h"
+#include "missing.h"
 #include "parse-util.h"
 #include "path-util.h"
 #include "string-util.h"
+#include "strv.h"
 #include "user-util.h"
 #include "utf8.h"
 
@@ -175,6 +176,35 @@ int get_user_creds(
         return 0;
 }
 
+int get_user_creds_clean(
+                const char **username,
+                uid_t *uid, gid_t *gid,
+                const char **home,
+                const char **shell) {
+
+        int r;
+
+        /* Like get_user_creds(), but resets home/shell to NULL if they don't contain anything relevant. */
+
+        r = get_user_creds(username, uid, gid, home, shell);
+        if (r < 0)
+                return r;
+
+        if (shell &&
+            (isempty(*shell) || PATH_IN_SET(*shell,
+                                            "/bin/nologin",
+                                            "/sbin/nologin",
+                                            "/usr/bin/nologin",
+                                            "/usr/sbin/nologin")))
+                *shell = NULL;
+
+        if (home &&
+            (isempty(*home) || path_equal(*home, "/")))
+                *home = NULL;
+
+        return 0;
+}
+
 int get_group_creds(const char **groupname, gid_t *gid) {
         struct group *g;
         gid_t id;
diff --git a/src/basic/user-util.h b/src/basic/user-util.h
index f569363811..6c61f63cae 100644
--- a/src/basic/user-util.h
+++ b/src/basic/user-util.h
@@ -40,6 +40,7 @@ char* getlogname_malloc(void);
 char* getusername_malloc(void);
 
 int get_user_creds(const char **username, uid_t *uid, gid_t *gid, const char **home, const char **shell);
+int get_user_creds_clean(const char **username, uid_t *uid, gid_t *gid, const char **home, const char **shell);
 int get_group_creds(const char **groupname, gid_t *gid);
 
 char* uid_to_name(uid_t uid);
diff --git a/src/core/execute.c b/src/core/execute.c
index 3877293b4f..c7a3ea39e7 100644
--- a/src/core/execute.c
+++ b/src/core/execute.c
@@ -2051,22 +2051,14 @@ static int exec_child(
         } else {
                 if (context->user) {
                         username = context->user;
-                        r = get_user_creds(&username, &uid, &gid, &home, &shell);
+                        r = get_user_creds_clean(&username, &uid, &gid, &home, &shell);
                         if (r < 0) {
                                 *exit_status = EXIT_USER;
                                 return r;
                         }
 
-                        /* Don't set $HOME or $SHELL if they are are not particularly enlightening anyway. */
-                        if (isempty(home) || path_equal(home, "/"))
-                                home = NULL;
-
-                        if (isempty(shell) || PATH_IN_SET(shell,
-                                                          "/bin/nologin",
-                                                          "/sbin/nologin",
-                                                          "/usr/bin/nologin",
-                                                          "/usr/sbin/nologin"))
-                                shell = NULL;
+                        /* Note that we don't set $HOME or $SHELL if they are not particularly enlightening anyway
+                         * (i.e. are "/" or "/bin/nologin"). */
                 }
 
                 if (context->group) {
diff --git a/src/run/run.c b/src/run/run.c
index 2dd229868c..81b53fdfab 100644
--- a/src/run/run.c
+++ b/src/run/run.c
@@ -1168,17 +1168,21 @@ static int start_transient_scope(
                 uid_t uid;
                 gid_t gid;
 
-                r = get_user_creds(&arg_exec_user, &uid, &gid, &home, &shell);
+                r = get_user_creds_clean(&arg_exec_user, &uid, &gid, &home, &shell);
                 if (r < 0)
                         return log_error_errno(r, "Failed to resolve user %s: %m", arg_exec_user);
 
-                r = strv_extendf(&user_env, "HOME=%s", home);
-                if (r < 0)
-                        return log_oom();
+                if (home) {
+                        r = strv_extendf(&user_env, "HOME=%s", home);
+                        if (r < 0)
+                                return log_oom();
+                }
 
-                r = strv_extendf(&user_env, "SHELL=%s", shell);
-                if (r < 0)
-                        return log_oom();
+                if (shell) {
+                        r = strv_extendf(&user_env, "SHELL=%s", shell);
+                        if (r < 0)
+                                return log_oom();
+                }
 
                 r = strv_extendf(&user_env, "USER=%s", arg_exec_user);
                 if (r < 0)
-- 
cgit v1.2.3-54-g00ecf


From 3fbe8dbe41ad662d7cae0525f6fd62a66d2c5ec5 Mon Sep 17 00:00:00 2001
From: Lennart Poettering <lennart@poettering.net>
Date: Thu, 25 Aug 2016 10:42:38 +0200
Subject: execute: if RuntimeDirectory= is set, it should be writable

Implicitly make all dirs set with RuntimeDirectory= writable, as the concept
otherwise makes no sense.
---
 src/core/execute.c | 48 ++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 46 insertions(+), 2 deletions(-)

(limited to 'src/core')

diff --git a/src/core/execute.c b/src/core/execute.c
index c7a3ea39e7..20e74ec8a6 100644
--- a/src/core/execute.c
+++ b/src/core/execute.c
@@ -1820,6 +1820,44 @@ static int setup_runtime_directory(
         return 0;
 }
 
+static int compile_read_write_paths(
+                const ExecContext *context,
+                const ExecParameters *params,
+                char ***ret) {
+
+        _cleanup_strv_free_ char **l = NULL;
+        char **rt;
+
+        /* Compile the list of writable paths. This is the combination of the explicitly configured paths, plus all
+         * runtime directories. */
+
+        if (strv_isempty(context->read_write_paths) &&
+            strv_isempty(context->runtime_directory)) {
+                *ret = NULL; /* NOP if neither is set */
+                return 0;
+        }
+
+        l = strv_copy(context->read_write_paths);
+        if (!l)
+                return -ENOMEM;
+
+        STRV_FOREACH(rt, context->runtime_directory) {
+                char *s;
+
+                s = strjoin(params->runtime_prefix, "/", *rt, NULL);
+                if (!s)
+                        return -ENOMEM;
+
+                if (strv_consume(&l, s) < 0)
+                        return -ENOMEM;
+        }
+
+        *ret = l;
+        l = NULL;
+
+        return 0;
+}
+
 static void append_socket_pair(int *array, unsigned *n, int pair[2]) {
         assert(array);
         assert(n);
@@ -2307,8 +2345,8 @@ static int exec_child(
         }
 
         needs_mount_namespace = exec_needs_mount_namespace(context, params, runtime);
-
         if (needs_mount_namespace) {
+                _cleanup_free_ char **rw = NULL;
                 char *tmp = NULL, *var = NULL;
 
                 /* The runtime struct only contains the parent
@@ -2324,9 +2362,15 @@ static int exec_child(
                                 var = strjoina(runtime->var_tmp_dir, "/tmp");
                 }
 
+                r = compile_read_write_paths(context, params, &rw);
+                if (r < 0) {
+                        *exit_status = EXIT_NAMESPACE;
+                        return r;
+                }
+
                 r = setup_namespace(
                                 (params->flags & EXEC_APPLY_CHROOT) ? context->root_directory : NULL,
-                                context->read_write_paths,
+                                rw,
                                 context->read_only_paths,
                                 context->inaccessible_paths,
                                 tmp,
-- 
cgit v1.2.3-54-g00ecf


From 6ee1a919cf9013a695da2a01ae67327b996a6ef6 Mon Sep 17 00:00:00 2001
From: Lennart Poettering <lennart@poettering.net>
Date: Thu, 25 Aug 2016 10:44:09 +0200
Subject: namespace: simplify mount_path_compare() a bit

---
 src/core/namespace.c | 20 +++++++++-----------
 1 file changed, 9 insertions(+), 11 deletions(-)

(limited to 'src/core')

diff --git a/src/core/namespace.c b/src/core/namespace.c
index 102fe576f3..74201caa10 100644
--- a/src/core/namespace.c
+++ b/src/core/namespace.c
@@ -93,21 +93,19 @@ static int mount_path_compare(const void *a, const void *b) {
         const BindMount *p = a, *q = b;
         int d;
 
+        /* If the paths are not equal, then order prefixes first */
         d = path_compare(p->path, q->path);
+        if (d != 0)
+                return d;
 
-        if (d == 0) {
-                /* If the paths are equal, check the mode */
-                if (p->mode < q->mode)
-                        return -1;
-
-                if (p->mode > q->mode)
-                        return 1;
+        /* If the paths are equal, check the mode */
+        if (p->mode < q->mode)
+                return -1;
 
-                return 0;
-        }
+        if (p->mode > q->mode)
+                return 1;
 
-        /* If the paths are not equal, then order prefixes first */
-        return d;
+        return 0;
 }
 
 static void drop_duplicates(BindMount *m, unsigned *n) {
-- 
cgit v1.2.3-54-g00ecf


From 7648a565d14dfb5516d93bacf0d87de2de5b5d91 Mon Sep 17 00:00:00 2001
From: Lennart Poettering <lennart@poettering.net>
Date: Thu, 25 Aug 2016 11:29:32 +0200
Subject: namespace: when enforcing fs namespace restrictions suppress
 redundant mounts

If /foo is marked to be read-only, and /foo/bar too, then the latter may be
suppressed as it has no effect.
---
 src/core/namespace.c | 39 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 39 insertions(+)

(limited to 'src/core')

diff --git a/src/core/namespace.c b/src/core/namespace.c
index 74201caa10..72f850b2f2 100644
--- a/src/core/namespace.c
+++ b/src/core/namespace.c
@@ -161,6 +161,44 @@ static void drop_inaccessible(BindMount *m, unsigned *n) {
         *n = t - m;
 }
 
+static void drop_nop(BindMount *m, unsigned *n) {
+        BindMount *f, *t;
+
+        assert(m);
+        assert(n);
+
+        /* Drops all entries which have an immediate parent that has the same type, as they are redundant. Assumes the
+         * list is ordered by prefixes. */
+
+        for (f = m, t = m; f < m+*n; f++) {
+
+                /* Only suppress such subtrees for READONLY and READWRITE entries */
+                if (IN_SET(f->mode, READONLY, READWRITE)) {
+                        BindMount *p;
+                        bool found = false;
+
+                        /* Now let's find the first parent of the entry we are looking at. */
+                        for (p = t-1; p >= m; p--) {
+                                if (path_startswith(f->path, p->path)) {
+                                        found = true;
+                                        break;
+                                }
+                        }
+
+                        /* We found it, let's see if it's the same mode, if so, we can drop this entry */
+                        if (found && p->mode == f->mode) {
+                                log_debug("%s is redundant by %s", f->path, p->path);
+                                continue;
+                        }
+                }
+
+                *t = *f;
+                t++;
+        }
+
+        *n = t - m;
+}
+
 static int mount_dev(BindMount *m) {
         static const char devnodes[] =
                 "/dev/null\0"
@@ -515,6 +553,7 @@ int setup_namespace(
 
                 drop_duplicates(mounts, &n);
                 drop_inaccessible(mounts, &n);
+                drop_nop(mounts, &n);
         }
 
         if (n > 0 || root_directory) {
-- 
cgit v1.2.3-54-g00ecf


From 6b7c9f8bce4679c89f3b89cacfd4932c0aeadad4 Mon Sep 17 00:00:00 2001
From: Lennart Poettering <lennart@poettering.net>
Date: Sun, 25 Sep 2016 10:40:51 +0200
Subject: namespace: rework how ReadWritePaths= is applied

Previously, if ReadWritePaths= was nested inside a ReadOnlyPaths=
specification, then we'd first recursively apply the ReadOnlyPaths= paths, and
make everything below read-only, only in order to then flip the read-only bit
again for the subdirs listed in ReadWritePaths= below it.

This is not only ugly (as for the dirs in question we first turn on the RO bit,
only to turn it off again immediately after), but also problematic in
containers, where a container manager might have marked a set of dirs read-only
and this code will undo this if ReadWritePaths= is set for any of them.

With this patch behaviour in this regard is altered: ReadOnlyPaths= will not be
applied to the children listed in ReadWritePaths= in the first place, so that
we do not need to turn off the RO bit for those after all.

This means that ReadWritePaths=/ReadOnlyPaths= may only be used to turn on the
RO bit, but never to turn it off again. Or to say this differently: if some
dirs are marked read-only via some external tool, then ReadWritePaths= will not
undo it.

This is not only the safer option, but also more in line with what the man page
currently claims:

        "Entries (files or directories) listed in ReadWritePaths= are
        accessible from within the namespace with the same access rights as
        from outside."

To implement this change bind_remount_recursive() gained a new "blacklist"
string list parameter, which when passed may contain subdirs that shall be
excluded from the read-only mounting.

A number of functions are updated to add more debug logging to make this more
digestible.
---
 src/basic/mount-util.c    | 71 ++++++++++++++++++++++++++++++++---------------
 src/basic/mount-util.h    |  2 +-
 src/core/namespace.c      | 66 ++++++++++++++++++++++++++++---------------
 src/nspawn/nspawn-mount.c |  6 ++--
 src/nspawn/nspawn.c       |  2 +-
 5 files changed, 96 insertions(+), 51 deletions(-)

(limited to 'src/core')

diff --git a/src/basic/mount-util.c b/src/basic/mount-util.c
index bfa04394fe..b9affb4e70 100644
--- a/src/basic/mount-util.c
+++ b/src/basic/mount-util.c
@@ -36,6 +36,7 @@
 #include "set.h"
 #include "stdio-util.h"
 #include "string-util.h"
+#include "strv.h"
 
 static int fd_fdinfo_mnt_id(int fd, const char *filename, int flags, int *mnt_id) {
         char path[strlen("/proc/self/fdinfo/") + DECIMAL_STR_MAX(int)];
@@ -287,10 +288,12 @@ int umount_recursive(const char *prefix, int flags) {
                                 continue;
 
                         if (umount2(p, flags) < 0) {
-                                r = -errno;
+                                r = log_debug_errno(errno, "Failed to umount %s: %m", p);
                                 continue;
                         }
 
+                        log_debug("Successfully unmounted %s", p);
+
                         again = true;
                         n++;
 
@@ -311,24 +314,21 @@ static int get_mount_flags(const char *path, unsigned long *flags) {
         return 0;
 }
 
-int bind_remount_recursive(const char *prefix, bool ro) {
+int bind_remount_recursive(const char *prefix, bool ro, char **blacklist) {
         _cleanup_set_free_free_ Set *done = NULL;
         _cleanup_free_ char *cleaned = NULL;
         int r;
 
-        /* Recursively remount a directory (and all its submounts)
-         * read-only or read-write. If the directory is already
-         * mounted, we reuse the mount and simply mark it
-         * MS_BIND|MS_RDONLY (or remove the MS_RDONLY for read-write
-         * operation). If it isn't we first make it one. Afterwards we
-         * apply MS_BIND|MS_RDONLY (or remove MS_RDONLY) to all
-         * submounts we can access, too. When mounts are stacked on
-         * the same mount point we only care for each individual
-         * "top-level" mount on each point, as we cannot
-         * influence/access the underlying mounts anyway. We do not
-         * have any effect on future submounts that might get
-         * propagated, they migt be writable. This includes future
-         * submounts that have been triggered via autofs. */
+        /* Recursively remount a directory (and all its submounts) read-only or read-write. If the directory is already
+         * mounted, we reuse the mount and simply mark it MS_BIND|MS_RDONLY (or remove the MS_RDONLY for read-write
+         * operation). If it isn't we first make it one. Afterwards we apply MS_BIND|MS_RDONLY (or remove MS_RDONLY) to
+         * all submounts we can access, too. When mounts are stacked on the same mount point we only care for each
+         * individual "top-level" mount on each point, as we cannot influence/access the underlying mounts anyway. We
+         * do not have any effect on future submounts that might get propagated, they might be writable. This includes
+         * future submounts that have been triggered via autofs.
+         *
+         * If the "blacklist" parameter is specified it may contain a list of subtrees to exclude from the
+         * remount operation. Note that we'll ignore the blacklist for the top-level path. */
 
         cleaned = strdup(prefix);
         if (!cleaned)
@@ -385,6 +385,33 @@ int bind_remount_recursive(const char *prefix, bool ro) {
                         if (r < 0)
                                 return r;
 
+                        if (!path_startswith(p, cleaned))
+                                continue;
+
+                        /* Ignore this mount if it is blacklisted, but only if it isn't the top-level mount we shall
+                         * operate on. */
+                        if (!path_equal(cleaned, p)) {
+                                bool blacklisted = false;
+                                char **i;
+
+                                STRV_FOREACH(i, blacklist) {
+
+                                        if (path_equal(*i, cleaned))
+                                                continue;
+
+                                        if (!path_startswith(*i, cleaned))
+                                                continue;
+
+                                        if (path_startswith(p, *i)) {
+                                                blacklisted = true;
+                                                log_debug("Not remounting %s, because blacklisted by %s, called for %s", p, *i, cleaned);
+                                                break;
+                                        }
+                                }
+                                if (blacklisted)
+                                        continue;
+                        }
+
                         /* Let's ignore autofs mounts.  If they aren't
                          * triggered yet, we want to avoid triggering
                          * them, as we don't make any guarantees for
@@ -396,12 +423,9 @@ int bind_remount_recursive(const char *prefix, bool ro) {
                                 continue;
                         }
 
-                        if (path_startswith(p, cleaned) &&
-                            !set_contains(done, p)) {
-
+                        if (!set_contains(done, p)) {
                                 r = set_consume(todo, p);
                                 p = NULL;
-
                                 if (r == -EEXIST)
                                         continue;
                                 if (r < 0)
@@ -418,8 +442,7 @@ int bind_remount_recursive(const char *prefix, bool ro) {
 
                 if (!set_contains(done, cleaned) &&
                     !set_contains(todo, cleaned)) {
-                        /* The prefix directory itself is not yet a
-                         * mount, make it one. */
+                        /* The prefix directory itself is not yet a mount, make it one. */
                         if (mount(cleaned, cleaned, NULL, MS_BIND|MS_REC, NULL) < 0)
                                 return -errno;
 
@@ -430,6 +453,8 @@ int bind_remount_recursive(const char *prefix, bool ro) {
                         if (mount(NULL, prefix, NULL, orig_flags|MS_BIND|MS_REMOUNT|(ro ? MS_RDONLY : 0), NULL) < 0)
                                 return -errno;
 
+                        log_debug("Made top-level directory %s a mount point.", prefix);
+
                         x = strdup(cleaned);
                         if (!x)
                                 return -ENOMEM;
@@ -447,8 +472,7 @@ int bind_remount_recursive(const char *prefix, bool ro) {
                         if (r < 0)
                                 return r;
 
-                        /* Deal with mount points that are obstructed by a
-                         * later mount */
+                        /* Deal with mount points that are obstructed by a later mount */
                         r = path_is_mount_point(x, 0);
                         if (r == -ENOENT || r == 0)
                                 continue;
@@ -463,6 +487,7 @@ int bind_remount_recursive(const char *prefix, bool ro) {
                         if (mount(NULL, x, NULL, orig_flags|MS_BIND|MS_REMOUNT|(ro ? MS_RDONLY : 0), NULL) < 0)
                                 return -errno;
 
+                        log_debug("Remounted %s read-only.", x);
                 }
         }
 }
diff --git a/src/basic/mount-util.h b/src/basic/mount-util.h
index f46989ebb3..74730de663 100644
--- a/src/basic/mount-util.h
+++ b/src/basic/mount-util.h
@@ -35,7 +35,7 @@ int path_is_mount_point(const char *path, int flags);
 int repeat_unmount(const char *path, int flags);
 
 int umount_recursive(const char *target, int flags);
-int bind_remount_recursive(const char *prefix, bool ro);
+int bind_remount_recursive(const char *prefix, bool ro, char **blacklist);
 
 int mount_move_root(const char *path);
 
diff --git a/src/core/namespace.c b/src/core/namespace.c
index 72f850b2f2..b0dab9459e 100644
--- a/src/core/namespace.c
+++ b/src/core/namespace.c
@@ -375,9 +375,19 @@ static int apply_mount(
 
         case READONLY:
         case READWRITE:
-                /* Nothing to mount here, we just later toggle the
-                 * MS_RDONLY bit for the mount point */
-                return 0;
+
+                r = path_is_mount_point(m->path, 0);
+                if (r < 0) {
+                        if (m->ignore && errno == ENOENT)
+                                return 0;
+                        return log_debug_errno(r, "Failed to determine whether %s is already a mount point: %m", m->path);
+                }
+                if (r > 0) /* Nothing to do here, it is already a mount. We just later toggle the MS_RDONLY bit for the mount point if needed. */
+                        return 0;
+
+                /* This isn't a mount point yet, let's make it one. */
+                what = m->path;
+                break;
 
         case PRIVATE_TMP:
                 what = tmp_dir;
@@ -396,31 +406,33 @@ static int apply_mount(
 
         assert(what);
 
-        r = mount(what, m->path, NULL, MS_BIND|MS_REC, NULL);
-        if (r >= 0) {
-                log_debug("Successfully mounted %s to %s", what, m->path);
-                return r;
-        } else {
+        if (mount(what, m->path, NULL, MS_BIND|MS_REC, NULL) < 0) {
                 if (m->ignore && errno == ENOENT)
                         return 0;
+
                 return log_debug_errno(errno, "Failed to mount %s to %s: %m", what, m->path);
         }
+
+        log_debug("Successfully mounted %s to %s", what, m->path);
+        return 0;
 }
 
-static int make_read_only(BindMount *m) {
-        int r;
+static int make_read_only(BindMount *m, char **blacklist) {
+        int r = 0;
 
         assert(m);
 
         if (IN_SET(m->mode, INACCESSIBLE, READONLY))
-                r = bind_remount_recursive(m->path, true);
-        else if (IN_SET(m->mode, READWRITE, PRIVATE_TMP, PRIVATE_VAR_TMP, PRIVATE_DEV)) {
-                r = bind_remount_recursive(m->path, false);
-                if (r == 0 && m->mode == PRIVATE_DEV) /* can be readonly but the submounts can't*/
-                        if (mount(NULL, m->path, NULL, MS_REMOUNT|DEV_MOUNT_OPTIONS|MS_RDONLY, NULL) < 0)
-                                r = -errno;
+                r = bind_remount_recursive(m->path, true, blacklist);
+        else if (m->mode == PRIVATE_DEV) { /* Can be readonly but the submounts can't*/
+                if (mount(NULL, m->path, NULL, MS_REMOUNT|DEV_MOUNT_OPTIONS|MS_RDONLY, NULL) < 0)
+                        r = -errno;
         } else
-                r = 0;
+                return 0;
+
+        /* Note that we only turn on the MS_RDONLY flag here, we never turn it off. Something that was marked read-only
+         * already stays this way. This improves compatibility with container managers, where we won't attempt to undo
+         * read-only mounts already applied. */
 
         if (m->ignore && r == -ENOENT)
                 return 0;
@@ -570,14 +582,25 @@ int setup_namespace(
         }
 
         if (n > 0) {
+                char **blacklist;
+                unsigned j;
+
+                /* First round, add in all special mounts we need */
                 for (m = mounts; m < mounts + n; ++m) {
                         r = apply_mount(m, tmp_dir, var_tmp_dir);
                         if (r < 0)
                                 goto fail;
                 }
 
+                /* Create a blacklist we can pass to bind_remount_recursive() */
+                blacklist = newa(char*, n+1);
+                for (j = 0; j < n; j++)
+                        blacklist[j] = (char*) mounts[j].path;
+                blacklist[j] = NULL;
+
+                /* Second round, flip the ro bits if necessary. */
                 for (m = mounts; m < mounts + n; ++m) {
-                        r = make_read_only(m);
+                        r = make_read_only(m, blacklist);
                         if (r < 0)
                                 goto fail;
                 }
@@ -586,9 +609,7 @@ int setup_namespace(
         if (root_directory) {
                 /* MS_MOVE does not work on MS_SHARED so the remount MS_SHARED will be done later */
                 r = mount_move_root(root_directory);
-
-                /* at this point, we cannot rollback */
-                if (r < 0)
+                if (r < 0) /* at this point, we cannot rollback */
                         return r;
         }
 
@@ -596,8 +617,7 @@ int setup_namespace(
          * reestablish propagation from our side to the host, since
          * what's disconnected is disconnected. */
         if (mount(NULL, "/", NULL, mount_flags | MS_REC, NULL) < 0)
-                /* at this point, we cannot rollback */
-                return -errno;
+                return -errno; /* at this point, we cannot rollback */
 
         return 0;
 
diff --git a/src/nspawn/nspawn-mount.c b/src/nspawn/nspawn-mount.c
index 295b75341f..8457357003 100644
--- a/src/nspawn/nspawn-mount.c
+++ b/src/nspawn/nspawn-mount.c
@@ -476,7 +476,7 @@ static int mount_bind(const char *dest, CustomMount *m) {
                 return log_error_errno(errno, "mount(%s) failed: %m", where);
 
         if (m->read_only) {
-                r = bind_remount_recursive(where, true);
+                r = bind_remount_recursive(where, true, NULL);
                 if (r < 0)
                         return log_error_errno(r, "Read-only bind mount failed: %m");
         }
@@ -990,7 +990,7 @@ int setup_volatile_state(
         /* --volatile=state means we simply overmount /var
            with a tmpfs, and the rest read-only. */
 
-        r = bind_remount_recursive(directory, true);
+        r = bind_remount_recursive(directory, true, NULL);
         if (r < 0)
                 return log_error_errno(r, "Failed to remount %s read-only: %m", directory);
 
@@ -1065,7 +1065,7 @@ int setup_volatile(
 
         bind_mounted = true;
 
-        r = bind_remount_recursive(t, true);
+        r = bind_remount_recursive(t, true, NULL);
         if (r < 0) {
                 log_error_errno(r, "Failed to remount %s read-only: %m", t);
                 goto fail;
diff --git a/src/nspawn/nspawn.c b/src/nspawn/nspawn.c
index 0d61d34ebf..1f3e1f2dac 100644
--- a/src/nspawn/nspawn.c
+++ b/src/nspawn/nspawn.c
@@ -3019,7 +3019,7 @@ static int outer_child(
                 return r;
 
         if (arg_read_only) {
-                r = bind_remount_recursive(directory, true);
+                r = bind_remount_recursive(directory, true, NULL);
                 if (r < 0)
                         return log_error_errno(r, "Failed to make tree read-only: %m");
         }
-- 
cgit v1.2.3-54-g00ecf


From 160cfdbed3eb23b6bc3c17613685b756f23be4a1 Mon Sep 17 00:00:00 2001
From: Lennart Poettering <lennart@poettering.net>
Date: Thu, 25 Aug 2016 15:51:37 +0200
Subject: namespace: add some debug logging when enforcing InaccessiblePaths=

---
 src/core/namespace.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'src/core')

diff --git a/src/core/namespace.c b/src/core/namespace.c
index b0dab9459e..e08d7459c5 100644
--- a/src/core/namespace.c
+++ b/src/core/namespace.c
@@ -345,7 +345,6 @@ static int apply_mount(
 
         const char *what;
         int r;
-        struct stat target;
 
         assert(m);
 
@@ -353,7 +352,8 @@ static int apply_mount(
 
         switch (m->mode) {
 
-        case INACCESSIBLE:
+        case INACCESSIBLE: {
+                struct stat target;
 
                 /* First, get rid of everything that is below if there
                  * is anything... Then, overmount it with an
@@ -363,7 +363,7 @@ static int apply_mount(
                 if (lstat(m->path, &target) < 0) {
                         if (m->ignore && errno == ENOENT)
                                 return 0;
-                        return -errno;
+                        return log_debug_errno(errno, "Failed to lstat() %s to determine what to mount over it: %m", m->path);
                 }
 
                 what = mode_to_inaccessible_node(target.st_mode);
@@ -372,6 +372,7 @@ static int apply_mount(
                         return -ELOOP;
                 }
                 break;
+        }
 
         case READONLY:
         case READWRITE:
-- 
cgit v1.2.3-54-g00ecf


From 3f815163ff8fdcdbd329680580df36f94e15325d Mon Sep 17 00:00:00 2001
From: Lennart Poettering <lennart@poettering.net>
Date: Thu, 25 Aug 2016 15:57:21 +0200
Subject: core: introduce ProtectSystem=strict

Let's tighten our sandbox a bit more: with this change ProtectSystem= gains a
new setting "strict". If set, the entire directory tree of the system is
mounted read-only, but the API file systems /proc, /dev, /sys are excluded
(they may be managed with PrivateDevices= and ProtectKernelTunables=). Also,
/home and /root are excluded as those are left for ProtectHome= to manage.

In this mode, all "real" file systems (i.e. non-API file systems) are mounted
read-only, and specific directories may only be excluded via
ReadWriteDirectories=, thus implementing an effective whitelist instead of
blacklist of writable directories.

While we are at it, also add /efi to the list of paths always affected by
ProtectSystem=. This is a follow-up for
b52a109ad38cd37b660ccd5394ff5c171a5e5355 which added /efi as alternative for
/boot. Our namespacing logic should respect that too.
---
 man/systemd.exec.xml | 33 ++++++++++++++++---------------
 src/core/namespace.c | 56 +++++++++++++++++++++++++++++++++++++++++++---------
 src/core/namespace.h |  1 +
 3 files changed, 65 insertions(+), 25 deletions(-)

(limited to 'src/core')

diff --git a/man/systemd.exec.xml b/man/systemd.exec.xml
index 07128b489e..1b672fe0c9 100644
--- a/man/systemd.exec.xml
+++ b/man/systemd.exec.xml
@@ -1020,22 +1020,23 @@
       <varlistentry>
         <term><varname>ProtectSystem=</varname></term>
 
-        <listitem><para>Takes a boolean argument or
-        <literal>full</literal>. If true, mounts the
-        <filename>/usr</filename> and <filename>/boot</filename>
-        directories read-only for processes invoked by this unit. If
-        set to <literal>full</literal>, the <filename>/etc</filename>
-        directory is mounted read-only, too. This setting ensures that
-        any modification of the vendor-supplied operating system (and
-        optionally its configuration) is prohibited for the service.
-        It is recommended to enable this setting for all long-running
-        services, unless they are involved with system updates or need
-        to modify the operating system in other ways. Note however
-        that processes retaining the CAP_SYS_ADMIN capability can undo
-        the effect of this setting. This setting is hence particularly
-        useful for daemons which have this capability removed, for
-        example with <varname>CapabilityBoundingSet=</varname>.
-        Defaults to off.</para></listitem>
+        <listitem><para>Takes a boolean argument or the special values <literal>full</literal> or
+        <literal>strict</literal>. If true, mounts the <filename>/usr</filename> and <filename>/boot</filename>
+        directories read-only for processes invoked by this unit. If set to <literal>full</literal>, the
+        <filename>/etc</filename> directory is mounted read-only, too. If set to <literal>strict</literal> the entire
+        file system hierarchy is mounted read-only, except for the API file system subtrees <filename>/dev</filename>,
+        <filename>/proc</filename> and <filename>/sys</filename> (protect these directories using
+        <varname>PrivateDevices=</varname>, <varname>ProtectKernelTunables=</varname>,
+        <varname>ProtectControlGroups=</varname>). This setting ensures that any modification of the vendor-supplied
+        operating system (and optionally its configuration, and local mounts) is prohibited for the service.  It is
+        recommended to enable this setting for all long-running services, unless they are involved with system updates
+        or need to modify the operating system in other ways. If this option is used,
+        <varname>ReadWritePaths=</varname> may be used to exclude specific directories from being made read-only. Note
+        that processes retaining the <constant>CAP_SYS_ADMIN</constant> capability (and with no system call filter that
+        prohibits mount-related system calls applied) can undo the effect of this setting. This setting is hence
+        particularly useful for daemons which either have the <literal>@mount</literal> set filtered using
+        <varname>SystemCallFilter=</varname>, or have the <constant>CAP_SYS_ADMIN</constant> capability removed, for
+        example with <varname>CapabilityBoundingSet=</varname>.  Defaults to off.</para></listitem>
       </varlistentry>
 
       <varlistentry>
diff --git a/src/core/namespace.c b/src/core/namespace.c
index e08d7459c5..498cd139bf 100644
--- a/src/core/namespace.c
+++ b/src/core/namespace.c
@@ -472,9 +472,11 @@ int setup_namespace(
                 private_dev +
                 (protect_sysctl ? 3 : 0) +
                 (protect_cgroups != protect_sysctl) +
-                (protect_home != PROTECT_HOME_NO ? 3 : 0) +
-                (protect_system != PROTECT_SYSTEM_NO ? 2 : 0) +
-                (protect_system == PROTECT_SYSTEM_FULL ? 1 : 0);
+                (protect_home != PROTECT_HOME_NO || protect_system == PROTECT_SYSTEM_STRICT ? 3 : 0) +
+                (protect_system == PROTECT_SYSTEM_STRICT ?
+                 (2 + !private_dev + !protect_sysctl) :
+                 ((protect_system != PROTECT_SYSTEM_NO ? 3 : 0) +
+                  (protect_system == PROTECT_SYSTEM_FULL ? 1 : 0)));
 
         if (n > 0) {
                 m = mounts = (BindMount *) alloca0(n * sizeof(BindMount));
@@ -529,9 +531,13 @@ int setup_namespace(
                         m++;
                 }
 
-                if (protect_home != PROTECT_HOME_NO) {
+                if (protect_home != PROTECT_HOME_NO || protect_system == PROTECT_SYSTEM_STRICT) {
                         const char *home_dir, *run_user_dir, *root_dir;
 
+                        /* If protection of $HOME and $XDG_RUNTIME_DIR is requested, then go for it. If we are in
+                         * strict system protection mode, then also add entries for these directories, but mark them
+                         * writable. This is because we want ProtectHome= and ProtectSystem= to be fully orthogonal. */
+
                         home_dir = prefix_roota(root_directory, "/home");
                         home_dir = strjoina("-", home_dir);
                         run_user_dir = prefix_roota(root_directory, "/run/user");
@@ -540,22 +546,53 @@ int setup_namespace(
                         root_dir = strjoina("-", root_dir);
 
                         r = append_mounts(&m, STRV_MAKE(home_dir, run_user_dir, root_dir),
-                                protect_home == PROTECT_HOME_READ_ONLY ? READONLY : INACCESSIBLE);
+                                protect_home == PROTECT_HOME_READ_ONLY ? READONLY :
+                                protect_home == PROTECT_HOME_YES ? INACCESSIBLE : READWRITE);
                         if (r < 0)
                                 return r;
                 }
 
-                if (protect_system != PROTECT_SYSTEM_NO) {
-                        const char *usr_dir, *boot_dir, *etc_dir;
+                if (protect_system == PROTECT_SYSTEM_STRICT) {
+                        /* In strict mode, we mount everything read-only, except for /proc, /dev, /sys which are the
+                         * kernel API VFS, which are left writable, but PrivateDevices= + ProtectKernelTunables=
+                         * protect those, and these options should be fully orthogonal. (And of course /home and
+                         * friends are also left writable, as ProtectHome= shall manage those, orthogonally, see
+                         * above). */
+
+                        m->path = prefix_roota(root_directory, "/");
+                        m->mode = READONLY;
+                        m++;
+
+                        m->path = prefix_roota(root_directory, "/proc");
+                        m->mode = READWRITE;
+                        m++;
+
+                        if (!private_dev) {
+                                m->path = prefix_roota(root_directory, "/dev");
+                                m->mode = READWRITE;
+                                m++;
+                        }
+                        if (!protect_sysctl) {
+                                m->path = prefix_roota(root_directory, "/sys");
+                                m->mode = READWRITE;
+                                m++;
+                        }
+
+                } else if (protect_system != PROTECT_SYSTEM_NO) {
+                        const char *usr_dir, *boot_dir, *efi_dir, *etc_dir;
+
+                        /* In any other mode we simply mark the relevant three directories read-only. */
 
                         usr_dir = prefix_roota(root_directory, "/usr");
                         boot_dir = prefix_roota(root_directory, "/boot");
                         boot_dir = strjoina("-", boot_dir);
+                        efi_dir = prefix_roota(root_directory, "/efi");
+                        efi_dir = strjoina("-", efi_dir);
                         etc_dir = prefix_roota(root_directory, "/etc");
 
                         r = append_mounts(&m, protect_system == PROTECT_SYSTEM_FULL
-                                ? STRV_MAKE(usr_dir, boot_dir, etc_dir)
-                                : STRV_MAKE(usr_dir, boot_dir), READONLY);
+                                          ? STRV_MAKE(usr_dir, boot_dir, efi_dir, etc_dir)
+                                          : STRV_MAKE(usr_dir, boot_dir, efi_dir), READONLY);
                         if (r < 0)
                                 return r;
                 }
@@ -780,6 +817,7 @@ static const char *const protect_system_table[_PROTECT_SYSTEM_MAX] = {
         [PROTECT_SYSTEM_NO] = "no",
         [PROTECT_SYSTEM_YES] = "yes",
         [PROTECT_SYSTEM_FULL] = "full",
+        [PROTECT_SYSTEM_STRICT] = "strict",
 };
 
 DEFINE_STRING_TABLE_LOOKUP(protect_system, ProtectSystem);
diff --git a/src/core/namespace.h b/src/core/namespace.h
index 3845336287..6505bcc499 100644
--- a/src/core/namespace.h
+++ b/src/core/namespace.h
@@ -35,6 +35,7 @@ typedef enum ProtectSystem {
         PROTECT_SYSTEM_NO,
         PROTECT_SYSTEM_YES,
         PROTECT_SYSTEM_FULL,
+        PROTECT_SYSTEM_STRICT,
         _PROTECT_SYSTEM_MAX,
         _PROTECT_SYSTEM_INVALID = -1
 } ProtectSystem;
-- 
cgit v1.2.3-54-g00ecf


From 63bb64a056113d4be5fefb16604accf08c8c204a Mon Sep 17 00:00:00 2001
From: Lennart Poettering <lennart@poettering.net>
Date: Thu, 25 Aug 2016 16:12:46 +0200
Subject: core: imply ProtectHome=read-only and ProtectSystem=strict if
 DynamicUser=1

Let's make sure that services that use DynamicUser=1 cannot leave files in the
file system should the system accidentally have a world-writable directory
somewhere.

This effectively ensures that directories need to be whitelisted rather than
blacklisted for access when DynamicUser=1 is set.
---
 man/systemd.exec.xml | 12 ++++++++----
 src/core/unit.c      |  6 ++++++
 2 files changed, 14 insertions(+), 4 deletions(-)

(limited to 'src/core')

diff --git a/man/systemd.exec.xml b/man/systemd.exec.xml
index 1b672fe0c9..e4d9c0ef1b 100644
--- a/man/systemd.exec.xml
+++ b/man/systemd.exec.xml
@@ -160,14 +160,18 @@
         use. However, UID/GIDs are recycled after a unit is terminated. Care should be taken that any processes running
         as part of a unit for which dynamic users/groups are enabled do not leave files or directories owned by these
         users/groups around, as a different unit might get the same UID/GID assigned later on, and thus gain access to
-        these files or directories. If <varname>DynamicUser=</varname> is enabled, <varname>RemoveIPC=</varname> and
+        these files or directories. If <varname>DynamicUser=</varname> is enabled, <varname>RemoveIPC=</varname>,
         <varname>PrivateTmp=</varname> are implied. This ensures that the lifetime of IPC objects and temporary files
         created by the executed processes is bound to the runtime of the service, and hence the lifetime of the dynamic
         user/group. Since <filename>/tmp</filename> and <filename>/var/tmp</filename> are usually the only
         world-writable directories on a system this ensures that a unit making use of dynamic user/group allocation
-        cannot leave files around after unit termination. Use <varname>RuntimeDirectory=</varname> (see below) in order
-        to assign a writable runtime directory to a service, owned by the dynamic user/group and removed automatically
-        when the unit is terminated. Defaults to off.</para></listitem>
+        cannot leave files around after unit termination. Moreover <varname>ProtectSystem=strict</varname> and
+        <varname>ProtectHome=read-only</varname> are implied, thus prohibiting the service from writing to arbitrary file
+        system locations. In order to allow the service to write to certain directories, they have to be whitelisted
+        using <varname>ReadWritePaths=</varname>, but care must be taken so that UID/GID recycling doesn't
+        create security issues involving files created by the service. Use <varname>RuntimeDirectory=</varname> (see
+        below) in order to assign a writable runtime directory to a service, owned by the dynamic user/group and
+        removed automatically when the unit is terminated. Defaults to off.</para></listitem>
       </varlistentry>
 
       <varlistentry>
diff --git a/src/core/unit.c b/src/core/unit.c
index de22f657c6..5d284a359d 100644
--- a/src/core/unit.c
+++ b/src/core/unit.c
@@ -3377,8 +3377,14 @@ int unit_patch_contexts(Unit *u) {
                                         return -ENOMEM;
                         }
 
+                        /* If the dynamic user option is on, let's make sure that the unit can't leave its UID/GID
+                         * around in the file system or on IPC objects. Hence enforce a strict sandbox. */
+
                         ec->private_tmp = true;
                         ec->remove_ipc = true;
+                        ec->protect_system = PROTECT_SYSTEM_STRICT;
+                        if (ec->protect_home == PROTECT_HOME_NO)
+                                ec->protect_home = PROTECT_HOME_READ_ONLY;
                 }
         }
 
-- 
cgit v1.2.3-54-g00ecf


From 096424d1230e0a0339735c51b43949809e972430 Mon Sep 17 00:00:00 2001
From: Lennart Poettering <lennart@poettering.net>
Date: Thu, 25 Aug 2016 17:29:12 +0200
Subject: execute: drop group privileges only after setting up namespace

If PrivateDevices=yes is set, the namespace code creates device nodes in /dev
that should be owned by the host's root, hence let's make sure we set up the
namespace before dropping group privileges.
---
 src/core/execute.c | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

(limited to 'src/core')

diff --git a/src/core/execute.c b/src/core/execute.c
index 20e74ec8a6..ae251b2a4c 100644
--- a/src/core/execute.c
+++ b/src/core/execute.c
@@ -2291,14 +2291,9 @@ static int exec_child(
         }
         accum_env = strv_env_clean(accum_env);
 
-        umask(context->umask);
+        (void) umask(context->umask);
 
         if ((params->flags & EXEC_APPLY_PERMISSIONS) && !command->privileged) {
-                r = enforce_groups(context, username, gid);
-                if (r < 0) {
-                        *exit_status = EXIT_GROUP;
-                        return r;
-                }
 #ifdef HAVE_SMACK
                 if (context->smack_process_label) {
                         r = mac_smack_apply_pid(0, context->smack_process_label);
@@ -2395,6 +2390,14 @@ static int exec_child(
                 }
         }
 
+        if ((params->flags & EXEC_APPLY_PERMISSIONS) && !command->privileged) {
+                r = enforce_groups(context, username, gid);
+                if (r < 0) {
+                        *exit_status = EXIT_GROUP;
+                        return r;
+                }
+        }
+
         if (context->working_directory_home)
                 wd = home;
         else if (context->working_directory)
-- 
cgit v1.2.3-54-g00ecf


From 1e4e94c8819e2fe3a7217690c0590dba8ab0be9e Mon Sep 17 00:00:00 2001
From: Lennart Poettering <lennart@poettering.net>
Date: Thu, 25 Aug 2016 17:30:47 +0200
Subject: namespace: invoke unshare() only after checking all parameters

Let's create the new namespace only after we validated and processed all
parameters, right before we start with actually mounting things.

This way, the window where we can roll back is larger (not that it matters
IRL...)
---
 src/core/namespace.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'src/core')

diff --git a/src/core/namespace.c b/src/core/namespace.c
index 498cd139bf..356d3c8121 100644
--- a/src/core/namespace.c
+++ b/src/core/namespace.c
@@ -462,9 +462,6 @@ int setup_namespace(
         if (mount_flags == 0)
                 mount_flags = MS_SHARED;
 
-        if (unshare(CLONE_NEWNS) < 0)
-                return -errno;
-
         n = !!tmp_dir + !!var_tmp_dir +
                 strv_length(read_write_paths) +
                 strv_length(read_only_paths) +
@@ -606,6 +603,9 @@ int setup_namespace(
                 drop_nop(mounts, &n);
         }
 
+        if (unshare(CLONE_NEWNS) < 0)
+                return -errno;
+
         if (n > 0 || root_directory) {
                 /* Remount / as SLAVE so that nothing now mounted in the namespace
                    shows up in the parent */
-- 
cgit v1.2.3-54-g00ecf


From d944dc9553009822deaddec76814f5642a6a8176 Mon Sep 17 00:00:00 2001
From: Lennart Poettering <lennart@poettering.net>
Date: Sat, 24 Sep 2016 12:41:30 +0200
Subject: namespace: chase symlinks for mounts to set up in userspace

This adds logic to chase symlinks for all mount points that shall be created in
a namespace environment in userspace, instead of leaving this to the kernel.
This has the advantage that we can correctly handle absolute symlinks that
shall be taken relative to a specific root directory. Moreover, we can properly
handle mounts created on symlinked files or directories as we can merge their
mounts as necessary.

(This also drops the "done" flag in the namespace logic, which was never
actually working, but was supposed to permit a partial rollback of the
namespace logic, which however is only mildly useful as it wasn't clear in
which case it would or would not be able to roll back.)

Fixes: #3867
---
 src/basic/fs-util.c     | 187 ++++++++++++++++++++++++++++++++++++++++++++++++
 src/basic/fs-util.h     |   2 +
 src/core/namespace.c    | 118 +++++++++++++++++++-----------
 src/test/test-fs-util.c |  96 ++++++++++++++++++++++++-
 src/test/test-ns.c      |  10 ++-
 5 files changed, 367 insertions(+), 46 deletions(-)

(limited to 'src/core')

diff --git a/src/basic/fs-util.c b/src/basic/fs-util.c
index ce87257bc1..86d9ad7e36 100644
--- a/src/basic/fs-util.c
+++ b/src/basic/fs-util.c
@@ -597,3 +597,190 @@ int inotify_add_watch_fd(int fd, int what, uint32_t mask) {
 
         return r;
 }
+
+int chase_symlinks(const char *path, const char *_root, char **ret) {
+        _cleanup_free_ char *buffer = NULL, *done = NULL, *root = NULL;
+        _cleanup_close_ int fd = -1;
+        unsigned max_follow = 32; /* how many symlinks to follow before giving up and returning ELOOP */
+        char *todo;
+        int r;
+
+        assert(path);
+
+        /* This is a lot like canonicalize_file_name(), but takes an additional "root" parameter, that allows following
+         * symlinks relative to a root directory, instead of the root of the host.
+         *
+         * Note that "root" matters only if we encounter an absolute symlink, it's unused otherwise. Most importantly
+         * this means the path parameter passed in is not prefixed by it.
+         *
+         * Algorithmically this operates on two path buffers: "done" are the components of the path we already
+         * processed and resolved symlinks, "." and ".." of. "todo" are the components of the path we still need to
+         * process. On each iteration, we move one component from "todo" to "done", processing its special meaning
+         * each time. The "todo" path always starts with at least one slash, the "done" path always ends in no
+         * slash. We always keep an O_PATH fd to the component we are currently processing, thus keeping lookup races
+         * at a minimum. */
+
+        r = path_make_absolute_cwd(path, &buffer);
+        if (r < 0)
+                return r;
+
+        if (_root) {
+                r = path_make_absolute_cwd(_root, &root);
+                if (r < 0)
+                        return r;
+        }
+
+        fd = open("/", O_CLOEXEC|O_NOFOLLOW|O_PATH);
+        if (fd < 0)
+                return -errno;
+
+        todo = buffer;
+        for (;;) {
+                _cleanup_free_ char *first = NULL;
+                _cleanup_close_ int child = -1;
+                struct stat st;
+                size_t n, m;
+
+                /* Determine length of first component in the path */
+                n = strspn(todo, "/");                  /* The slashes */
+                m = n + strcspn(todo + n, "/");         /* The entire length of the component */
+
+                /* Extract the first component. */
+                first = strndup(todo, m);
+                if (!first)
+                        return -ENOMEM;
+
+                todo += m;
+
+                /* Just a single slash? Then we reached the end. */
+                if (isempty(first) || path_equal(first, "/"))
+                        break;
+
+                /* Just a dot? Then let's eat this up. */
+                if (path_equal(first, "/."))
+                        continue;
+
+                /* Two dots? Then chop off the last bit of what we already found out. */
+                if (path_equal(first, "/..")) {
+                        _cleanup_free_ char *parent = NULL;
+                        int fd_parent = -1;
+
+                        if (isempty(done) || path_equal(done, "/"))
+                                return -EINVAL;
+
+                        parent = dirname_malloc(done);
+                        if (!parent)
+                                return -ENOMEM;
+
+                        /* Don't allow this to leave the root dir */
+                        if (root &&
+                            path_startswith(done, root) &&
+                            !path_startswith(parent, root))
+                                return -EINVAL;
+
+                        free(done);
+                        done = parent;
+                        parent = NULL;
+
+                        fd_parent = openat(fd, "..", O_CLOEXEC|O_NOFOLLOW|O_PATH);
+                        if (fd_parent < 0)
+                                return -errno;
+
+                        safe_close(fd);
+                        fd = fd_parent;
+
+                        continue;
+                }
+
+                /* Otherwise let's see what this is. */
+                child = openat(fd, first + n, O_CLOEXEC|O_NOFOLLOW|O_PATH);
+                if (child < 0)
+                        return -errno;
+
+                if (fstat(child, &st) < 0)
+                        return -errno;
+
+                if (S_ISLNK(st.st_mode)) {
+                        _cleanup_free_ char *destination = NULL;
+
+                        /* This is a symlink, in this case read the destination. But let's make sure we don't follow
+                         * symlinks without bounds. */
+                        if (--max_follow <= 0)
+                                return -ELOOP;
+
+                        r = readlinkat_malloc(fd, first + n, &destination);
+                        if (r < 0)
+                                return r;
+                        if (isempty(destination))
+                                return -EINVAL;
+
+                        if (path_is_absolute(destination)) {
+
+                                /* An absolute destination. Start the loop from the beginning, but use the root
+                                 * directory as base. */
+
+                                safe_close(fd);
+                                fd = open(root ?: "/", O_CLOEXEC|O_NOFOLLOW|O_PATH);
+                                if (fd < 0)
+                                        return -errno;
+
+                                free(buffer);
+                                buffer = destination;
+                                destination = NULL;
+
+                                todo = buffer;
+                                free(done);
+
+                                /* Note that we do not revalidate the root, we take it as is. */
+                                if (isempty(root))
+                                        done = NULL;
+                                else {
+                                        done = strdup(root);
+                                        if (!done)
+                                                return -ENOMEM;
+                                }
+
+                        } else {
+                                char *joined;
+
+                                /* A relative destination. In this case we prefix what's left to do with what we just
+                                 * read, and start the loop again, but remain in the current directory. */
+
+                                joined = strjoin("/", destination, todo, NULL);
+                                if (!joined)
+                                        return -ENOMEM;
+
+                                free(buffer);
+                                todo = buffer = joined;
+                        }
+
+                        continue;
+                }
+
+                /* If this is not a symlink, then let's just add the name we read to what we already verified. */
+                if (!done) {
+                        done = first;
+                        first = NULL;
+                } else {
+                        if (!strextend(&done, first, NULL))
+                                return -ENOMEM;
+                }
+
+                /* And iterate again, but go one directory further down. */
+                safe_close(fd);
+                fd = child;
+                child = -1;
+        }
+
+        if (!done) {
+                /* Special case, turn the empty string into "/", to indicate the root directory. */
+                done = strdup("/");
+                if (!done)
+                        return -ENOMEM;
+        }
+
+        *ret = done;
+        done = NULL;
+
+        return 0;
+}
diff --git a/src/basic/fs-util.h b/src/basic/fs-util.h
index 2c3b9a1c74..31df47cf1e 100644
--- a/src/basic/fs-util.h
+++ b/src/basic/fs-util.h
@@ -77,3 +77,5 @@ union inotify_event_buffer {
 };
 
 int inotify_add_watch_fd(int fd, int what, uint32_t mask);
+
+int chase_symlinks(const char *path, const char *_root, char **ret);
diff --git a/src/core/namespace.c b/src/core/namespace.c
index 356d3c8121..d3ab2e8e3e 100644
--- a/src/core/namespace.c
+++ b/src/core/namespace.c
@@ -29,6 +29,7 @@
 #include "alloc-util.h"
 #include "dev-setup.h"
 #include "fd-util.h"
+#include "fs-util.h"
 #include "loopback-setup.h"
 #include "missing.h"
 #include "mkdir.h"
@@ -57,9 +58,9 @@ typedef enum MountMode {
 } MountMode;
 
 typedef struct BindMount {
-        const char *path;
+        const char *path; /* stack memory, doesn't need to be freed explicitly */
+        char *chased; /* malloc()ed memory, needs to be freed */
         MountMode mode;
-        bool done;
         bool ignore;
 } BindMount;
 
@@ -71,7 +72,6 @@ static int append_mounts(BindMount **p, char **strv, MountMode mode) {
         STRV_FOREACH(i, strv) {
 
                 (*p)->ignore = false;
-                (*p)->done = false;
 
                 if ((mode == INACCESSIBLE || mode == READONLY || mode == READWRITE) && (*i)[0] == '-') {
                         (*p)->ignore = true;
@@ -360,11 +360,8 @@ static int apply_mount(
                  * inaccessible path. */
                 (void) umount_recursive(m->path, 0);
 
-                if (lstat(m->path, &target) < 0) {
-                        if (m->ignore && errno == ENOENT)
-                                return 0;
+                if (lstat(m->path, &target) < 0)
                         return log_debug_errno(errno, "Failed to lstat() %s to determine what to mount over it: %m", m->path);
-                }
 
                 what = mode_to_inaccessible_node(target.st_mode);
                 if (!what) {
@@ -378,11 +375,8 @@ static int apply_mount(
         case READWRITE:
 
                 r = path_is_mount_point(m->path, 0);
-                if (r < 0) {
-                        if (m->ignore && errno == ENOENT)
-                                return 0;
+                if (r < 0)
                         return log_debug_errno(r, "Failed to determine whether %s is already a mount point: %m", m->path);
-                }
                 if (r > 0) /* Nothing to do here, it is already a mount. We just later toggle the MS_RDONLY bit for the mount point if needed. */
                         return 0;
 
@@ -407,12 +401,8 @@ static int apply_mount(
 
         assert(what);
 
-        if (mount(what, m->path, NULL, MS_BIND|MS_REC, NULL) < 0) {
-                if (m->ignore && errno == ENOENT)
-                        return 0;
-
+        if (mount(what, m->path, NULL, MS_BIND|MS_REC, NULL) < 0)
                 return log_debug_errno(errno, "Failed to mount %s to %s: %m", what, m->path);
-        }
 
         log_debug("Successfully mounted %s to %s", what, m->path);
         return 0;
@@ -435,12 +425,43 @@ static int make_read_only(BindMount *m, char **blacklist) {
          * already stays this way. This improves compatibility with container managers, where we won't attempt to undo
          * read-only mounts already applied. */
 
-        if (m->ignore && r == -ENOENT)
-                return 0;
-
         return r;
 }
 
+static int chase_all_symlinks(const char *root_directory, BindMount *m, unsigned *n) {
+        BindMount *f, *t;
+        int r;
+
+        assert(m);
+        assert(n);
+
+        /* Since mount() will always follow symlinks and we need to take the different root directory into account we
+         * chase the symlinks on our own first. This call will do so for all entries and remove all entries where we
+         * can't resolve the path, and which have been marked for such removal. */
+
+        for (f = m, t = m; f < m+*n; f++) {
+
+                r = chase_symlinks(f->path, root_directory, &f->chased);
+                if (r == -ENOENT && f->ignore) /* Doesn't exist? Then remove it! */
+                        continue;
+                if (r < 0)
+                        return log_debug_errno(r, "Failed to chase symlinks for %s: %m", f->path);
+
+                if (path_equal(f->path, f->chased))
+                        f->chased = mfree(f->chased);
+                else {
+                        log_debug("Chased %s → %s", f->path, f->chased);
+                        f->path = f->chased;
+                }
+
+                *t = *f;
+                t++;
+        }
+
+        *n = t - m;
+        return 0;
+}
+
 int setup_namespace(
                 const char* root_directory,
                 char** read_write_paths,
@@ -456,6 +477,7 @@ int setup_namespace(
                 unsigned long mount_flags) {
 
         BindMount *m, *mounts = NULL;
+        bool make_slave = false;
         unsigned n;
         int r = 0;
 
@@ -475,6 +497,9 @@ int setup_namespace(
                  ((protect_system != PROTECT_SYSTEM_NO ? 3 : 0) +
                   (protect_system == PROTECT_SYSTEM_FULL ? 1 : 0)));
 
+        if (root_directory || n > 0)
+                make_slave = true;
+
         if (n > 0) {
                 m = mounts = (BindMount *) alloca0(n * sizeof(BindMount));
                 r = append_mounts(&m, read_write_paths, READWRITE);
@@ -596,6 +621,13 @@ int setup_namespace(
 
                 assert(mounts + n == m);
 
+                /* Resolve symlinks manually first, as mount() will always follow them relative to the host's
+                 * root. Moreover we want to suppress duplicates based on the resolved paths. This of course is a bit
+                 * racy. */
+                r = chase_all_symlinks(root_directory, mounts, &n);
+                if (r < 0)
+                        goto finish;
+
                 qsort(mounts, n, sizeof(BindMount), mount_path_compare);
 
                 drop_duplicates(mounts, &n);
@@ -603,20 +635,26 @@ int setup_namespace(
                 drop_nop(mounts, &n);
         }
 
-        if (unshare(CLONE_NEWNS) < 0)
-                return -errno;
+        if (unshare(CLONE_NEWNS) < 0) {
+                r = -errno;
+                goto finish;
+        }
 
-        if (n > 0 || root_directory) {
+        if (make_slave) {
                 /* Remount / as SLAVE so that nothing now mounted in the namespace
                    shows up in the parent */
-                if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0)
-                        return -errno;
+                if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
+                        r = -errno;
+                        goto finish;
+                }
         }
 
         if (root_directory) {
                 /* Turn directory into bind mount */
-                if (mount(root_directory, root_directory, NULL, MS_BIND|MS_REC, NULL) < 0)
-                        return -errno;
+                if (mount(root_directory, root_directory, NULL, MS_BIND|MS_REC, NULL) < 0) {
+                        r = -errno;
+                        goto finish;
+                }
         }
 
         if (n > 0) {
@@ -627,7 +665,7 @@ int setup_namespace(
                 for (m = mounts; m < mounts + n; ++m) {
                         r = apply_mount(m, tmp_dir, var_tmp_dir);
                         if (r < 0)
-                                goto fail;
+                                goto finish;
                 }
 
                 /* Create a blacklist we can pass to bind_mount_recursive() */
@@ -640,34 +678,30 @@ int setup_namespace(
                 for (m = mounts; m < mounts + n; ++m) {
                         r = make_read_only(m, blacklist);
                         if (r < 0)
-                                goto fail;
+                                goto finish;
                 }
         }
 
         if (root_directory) {
                 /* MS_MOVE does not work on MS_SHARED so the remount MS_SHARED will be done later */
                 r = mount_move_root(root_directory);
-                if (r < 0) /* at this point, we cannot rollback */
-                        return r;
+                if (r < 0)
+                        goto finish;
         }
 
         /* Remount / as the desired mode. Not that this will not
          * reestablish propagation from our side to the host, since
          * what's disconnected is disconnected. */
-        if (mount(NULL, "/", NULL, mount_flags | MS_REC, NULL) < 0)
-                return -errno; /* at this point, we cannot rollback */
+        if (mount(NULL, "/", NULL, mount_flags | MS_REC, NULL) < 0) {
+                r = -errno;
+                goto finish;
+        }
 
-        return 0;
+        r = 0;
 
-fail:
-        if (n > 0) {
-                for (m = mounts; m < mounts + n; ++m) {
-                        if (!m->done)
-                                continue;
-
-                        (void) umount2(m->path, MNT_DETACH);
-                }
-        }
+finish:
+        for (m = mounts; m < mounts + n; m++)
+                free(m->chased);
 
         return r;
 }
diff --git a/src/test/test-fs-util.c b/src/test/test-fs-util.c
index b35a2ea2c8..53a3cdc663 100644
--- a/src/test/test-fs-util.c
+++ b/src/test/test-fs-util.c
@@ -20,16 +20,109 @@
 #include <unistd.h>
 
 #include "alloc-util.h"
-#include "fileio.h"
 #include "fd-util.h"
+#include "fileio.h"
 #include "fs-util.h"
 #include "macro.h"
 #include "mkdir.h"
+#include "path-util.h"
 #include "rm-rf.h"
 #include "string-util.h"
 #include "strv.h"
 #include "util.h"
 
+static void test_chase_symlinks(void) {
+        _cleanup_free_ char *result = NULL;
+        char temp[] = "/tmp/test-chase.XXXXXX";
+        const char *top, *p, *q;
+        int r;
+
+        assert_se(mkdtemp(temp));
+
+        top = strjoina(temp, "/top");
+        assert_se(mkdir(top, 0700) >= 0);
+
+        p = strjoina(top, "/dot");
+        assert_se(symlink(".", p) >= 0);
+
+        p = strjoina(top, "/dotdot");
+        assert_se(symlink("..", p) >= 0);
+
+        p = strjoina(top, "/dotdota");
+        assert_se(symlink("../a", p) >= 0);
+
+        p = strjoina(temp, "/a");
+        assert_se(symlink("b", p) >= 0);
+
+        p = strjoina(temp, "/b");
+        assert_se(symlink("/usr", p) >= 0);
+
+        p = strjoina(temp, "/start");
+        assert_se(symlink("top/dot/dotdota", p) >= 0);
+
+        r = chase_symlinks(p, NULL, &result);
+        assert_se(r >= 0);
+        assert_se(path_equal(result, "/usr"));
+
+        result = mfree(result);
+        r = chase_symlinks(p, temp, &result);
+        assert_se(r == -ENOENT);
+
+        q = strjoina(temp, "/usr");
+        assert_se(mkdir(q, 0700) >= 0);
+
+        r = chase_symlinks(p, temp, &result);
+        assert_se(r >= 0);
+        assert_se(path_equal(result, q));
+
+        p = strjoina(temp, "/slash");
+        assert_se(symlink("/", p) >= 0);
+
+        result = mfree(result);
+        r = chase_symlinks(p, NULL, &result);
+        assert_se(r >= 0);
+        assert_se(path_equal(result, "/"));
+
+        result = mfree(result);
+        r = chase_symlinks(p, temp, &result);
+        assert_se(r >= 0);
+        assert_se(path_equal(result, temp));
+
+        p = strjoina(temp, "/slashslash");
+        assert_se(symlink("///usr///", p) >= 0);
+
+        result = mfree(result);
+        r = chase_symlinks(p, NULL, &result);
+        assert_se(r >= 0);
+        assert_se(path_equal(result, "/usr"));
+
+        result = mfree(result);
+        r = chase_symlinks(p, temp, &result);
+        assert_se(r >= 0);
+        assert_se(path_equal(result, q));
+
+        result = mfree(result);
+        r = chase_symlinks("/etc/./.././", NULL, &result);
+        assert_se(r >= 0);
+        assert_se(path_equal(result, "/"));
+
+        result = mfree(result);
+        r = chase_symlinks("/etc/./.././", "/etc", &result);
+        assert_se(r == -EINVAL);
+
+        result = mfree(result);
+        r = chase_symlinks("/etc/machine-id/foo", NULL, &result);
+        assert_se(r == -ENOTDIR);
+
+        result = mfree(result);
+        p = strjoina(temp, "/recursive-symlink");
+        assert_se(symlink("recursive-symlink", p) >= 0);
+        r = chase_symlinks(p, NULL, &result);
+        assert_se(r == -ELOOP);
+
+        assert_se(rm_rf(temp, REMOVE_ROOT|REMOVE_PHYSICAL) >= 0);
+}
+
 static void test_unlink_noerrno(void) {
         char name[] = "/tmp/test-close_nointr.XXXXXX";
         int fd;
@@ -144,6 +237,7 @@ int main(int argc, char *argv[]) {
         test_readlink_and_make_absolute();
         test_get_files_in_directory();
         test_var_tmp();
+        test_chase_symlinks();
 
         return 0;
 }
diff --git a/src/test/test-ns.c b/src/test/test-ns.c
index 03a24620af..c4d4da6d05 100644
--- a/src/test/test-ns.c
+++ b/src/test/test-ns.c
@@ -26,14 +26,18 @@
 int main(int argc, char *argv[]) {
         const char * const writable[] = {
                 "/home",
-                "/home/lennart/projects/foobar", /* this should be masked automatically */
+                "-/home/lennart/projects/foobar", /* this should be masked automatically */
                 NULL
         };
 
         const char * const readonly[] = {
-                "/",
-                "/usr",
+                /* "/", */
+                /* "/usr", */
                 "/boot",
+                "/lib",
+                "/usr/lib",
+                "-/lib64",
+                "-/usr/lib64",
                 NULL
         };
 
-- 
cgit v1.2.3-54-g00ecf


From 8f1ad200f010dc2106f7e3ff5879f0330ee36996 Mon Sep 17 00:00:00 2001
From: Lennart Poettering <lennart@poettering.net>
Date: Fri, 26 Aug 2016 11:27:38 +0200
Subject: namespace: don't make the root directory of a namespace a mount if it
 already is one

Let's not stack mounts needlessly.
---
 src/core/namespace.c | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

(limited to 'src/core')

diff --git a/src/core/namespace.c b/src/core/namespace.c
index d3ab2e8e3e..a7451ffbdc 100644
--- a/src/core/namespace.c
+++ b/src/core/namespace.c
@@ -650,10 +650,15 @@ int setup_namespace(
         }
 
         if (root_directory) {
-                /* Turn directory into bind mount */
-                if (mount(root_directory, root_directory, NULL, MS_BIND|MS_REC, NULL) < 0) {
-                        r = -errno;
+                /* Turn directory into bind mount, if it isn't one yet */
+                r = path_is_mount_point(root_directory, AT_SYMLINK_FOLLOW);
+                if (r < 0)
                         goto finish;
+                if (r == 0) {
+                        if (mount(root_directory, root_directory, NULL, MS_BIND|MS_REC, NULL) < 0) {
+                                r = -errno;
+                                goto finish;
+                        }
                 }
         }
 
-- 
cgit v1.2.3-54-g00ecf


From ba128bb809cc59ca60db65f0c09bd7f48876fa83 Mon Sep 17 00:00:00 2001
From: Lennart Poettering <lennart@poettering.net>
Date: Fri, 26 Aug 2016 16:39:04 +0200
Subject: execute: filter low-level I/O syscalls if PrivateDevices= is set

If device access is restricted via PrivateDevices=, let's also block the
various low-level I/O syscalls at the same time, so that we know that the
minimal set of devices in our virtualized /dev are really everything the unit
can access.
---
 src/core/execute.c | 65 +++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 64 insertions(+), 1 deletion(-)

(limited to 'src/core')

diff --git a/src/core/execute.c b/src/core/execute.c
index ae251b2a4c..a20e9ea829 100644
--- a/src/core/execute.c
+++ b/src/core/execute.c
@@ -1422,12 +1422,67 @@ finish:
         return r;
 }
 
+static int apply_private_devices(Unit *u, const ExecContext *c) {
+
+        static const int device_syscalls[] = {
+                SCMP_SYS(ioperm),
+                SCMP_SYS(iopl),
+                SCMP_SYS(pciconfig_iobase),
+                SCMP_SYS(pciconfig_read),
+                SCMP_SYS(pciconfig_write),
+#ifdef __NR_s390_pci_mmio_read
+                SCMP_SYS(s390_pci_mmio_read),
+#endif
+#ifdef __NR_s390_pci_mmio_write
+                SCMP_SYS(s390_pci_mmio_write),
+#endif
+        };
+
+        scmp_filter_ctx *seccomp;
+        unsigned i;
+        int r;
+
+        assert(c);
+
+        /* If PrivateDevices= is set, also turn off iopl and friends. */
+
+        if (skip_seccomp_unavailable(u, "PrivateDevices="))
+                return 0;
+
+        seccomp = seccomp_init(SCMP_ACT_ALLOW);
+        if (!seccomp)
+                return -ENOMEM;
+
+        r = seccomp_add_secondary_archs(seccomp);
+        if (r < 0)
+                goto finish;
+
+        for (i = 0; i < ELEMENTSOF(device_syscalls); i++) {
+                r = seccomp_rule_add(
+                                seccomp,
+                                SCMP_ACT_ERRNO(EPERM),
+                                device_syscalls[i],
+                                0);
+                if (r < 0)
+                        goto finish;
+        }
+
+        r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
+        if (r < 0)
+                goto finish;
+
+        r = seccomp_load(seccomp);
+
+finish:
+        seccomp_release(seccomp);
+        return r;
+}
+
 #endif
 
 static void do_idle_pipe_dance(int idle_pipe[4]) {
         assert(idle_pipe);
 
-
         idle_pipe[1] = safe_close(idle_pipe[1]);
         idle_pipe[2] = safe_close(idle_pipe[2]);
 
@@ -2584,6 +2639,14 @@ static int exec_child(
                         }
                 }
 
+                if (context->private_devices) {
+                        r = apply_private_devices(unit, context);
+                        if (r < 0) {
+                                *exit_status = EXIT_SECCOMP;
+                                return r;
+                        }
+                }
+
                 if (context_has_syscall_filters(context)) {
                         r = apply_seccomp(unit, context);
                         if (r < 0) {
-- 
cgit v1.2.3-54-g00ecf


From 54500613a46023fe991f424e21ed15948b9a74f5 Mon Sep 17 00:00:00 2001
From: Lennart Poettering <lennart@poettering.net>
Date: Fri, 26 Aug 2016 17:25:08 +0200
Subject: main: minor simplification

---
 src/core/main.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

(limited to 'src/core')

diff --git a/src/core/main.c b/src/core/main.c
index 803307c9d5..be0cb0b6d1 100644
--- a/src/core/main.c
+++ b/src/core/main.c
@@ -996,10 +996,8 @@ static int parse_argv(int argc, char *argv[]) {
 
                 case ARG_MACHINE_ID:
                         r = set_machine_id(optarg);
-                        if (r < 0) {
-                                log_error("MachineID '%s' is not valid.", optarg);
-                                return r;
-                        }
+                        if (r < 0)
+                                return log_error_errno(r, "MachineID '%s' is not valid.", optarg);
                         break;
 
                 case 'h':
-- 
cgit v1.2.3-54-g00ecf


From cd2902c9546eabfffcf5d6de4d0bd4dfe6a4d427 Mon Sep 17 00:00:00 2001
From: Lennart Poettering <lennart@poettering.net>
Date: Fri, 26 Aug 2016 17:25:40 +0200
Subject: namespace: drop all mounts outside of the new root directory

There's no point in mounting these, if they are outside of the root directory
we'll move to.
---
 src/core/namespace.c | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)

(limited to 'src/core')

diff --git a/src/core/namespace.c b/src/core/namespace.c
index a7451ffbdc..c9b2154985 100644
--- a/src/core/namespace.c
+++ b/src/core/namespace.c
@@ -199,6 +199,31 @@ static void drop_nop(BindMount *m, unsigned *n) {
         *n = t - m;
 }
 
+static void drop_outside_root(const char *root_directory, BindMount *m, unsigned *n) {
+        BindMount *f, *t;
+
+        assert(m);
+        assert(n);
+
+        if (!root_directory)
+                return;
+
+        /* Drops all mounts that are outside of the root directory. */
+
+        for (f = m, t = m; f < m+*n; f++) {
+
+                if (!path_startswith(f->path, root_directory)) {
+                        log_debug("%s is outside of root directory.", f->path);
+                        continue;
+                }
+
+                *t = *f;
+                t++;
+        }
+
+        *n = t - m;
+}
+
 static int mount_dev(BindMount *m) {
         static const char devnodes[] =
                 "/dev/null\0"
@@ -631,6 +656,7 @@ int setup_namespace(
                 qsort(mounts, n, sizeof(BindMount), mount_path_compare);
 
                 drop_duplicates(mounts, &n);
+                drop_outside_root(root_directory, mounts, &n);
                 drop_inaccessible(mounts, &n);
                 drop_nop(mounts, &n);
         }
-- 
cgit v1.2.3-54-g00ecf


From cefc33aee299fa214f093d3d1b4c171ac3b30dde Mon Sep 17 00:00:00 2001
From: Lennart Poettering <lennart@poettering.net>
Date: Fri, 26 Aug 2016 17:40:42 +0200
Subject: execute: move SMACK setup code into its own function

While we are at it, move PAM code #ifdeffery into setup_pam() to simplify the
main execution logic a bit.
---
 src/core/execute.c | 74 ++++++++++++++++++++++++++++++++++--------------------
 1 file changed, 47 insertions(+), 27 deletions(-)

(limited to 'src/core')

diff --git a/src/core/execute.c b/src/core/execute.c
index a20e9ea829..0488ba2ca9 100644
--- a/src/core/execute.c
+++ b/src/core/execute.c
@@ -837,6 +837,8 @@ static int null_conv(
         return PAM_CONV_ERR;
 }
 
+#endif
+
 static int setup_pam(
                 const char *name,
                 const char *user,
@@ -845,6 +847,8 @@ static int setup_pam(
                 char ***env,
                 int fds[], unsigned n_fds) {
 
+#ifdef HAVE_PAM
+
         static const struct pam_conv conv = {
                 .conv = null_conv,
                 .appdata_ptr = NULL
@@ -1038,8 +1042,10 @@ fail:
         closelog();
 
         return r;
-}
+#else
+        return 0;
 #endif
+}
 
 static void rename_process_from_path(const char *path) {
         char process_name[11];
@@ -1875,6 +1881,42 @@ static int setup_runtime_directory(
         return 0;
 }
 
+static int setup_smack(
+                const ExecContext *context,
+                const ExecCommand *command) {
+
+#ifdef HAVE_SMACK
+        int r;
+
+        assert(context);
+        assert(command);
+
+        if (!mac_smack_use())
+                return 0;
+
+        if (context->smack_process_label) {
+                r = mac_smack_apply_pid(0, context->smack_process_label);
+                if (r < 0)
+                        return r;
+        }
+#ifdef SMACK_DEFAULT_PROCESS_LABEL
+        else {
+                _cleanup_free_ char *exec_label = NULL;
+
+                r = mac_smack_read(command->path, SMACK_ATTR_EXEC, &exec_label);
+                if (r < 0 && r != -ENODATA && r != -EOPNOTSUPP)
+                        return r;
+
+                r = mac_smack_apply_pid(0, exec_label ? : SMACK_DEFAULT_PROCESS_LABEL);
+                if (r < 0)
+                        return r;
+        }
+#endif
+#endif
+
+        return 0;
+}
+
 static int compile_read_write_paths(
                 const ExecContext *context,
                 const ExecParameters *params,
@@ -2349,33 +2391,12 @@ static int exec_child(
         (void) umask(context->umask);
 
         if ((params->flags & EXEC_APPLY_PERMISSIONS) && !command->privileged) {
-#ifdef HAVE_SMACK
-                if (context->smack_process_label) {
-                        r = mac_smack_apply_pid(0, context->smack_process_label);
-                        if (r < 0) {
-                                *exit_status = EXIT_SMACK_PROCESS_LABEL;
-                                return r;
-                        }
+                r = setup_smack(context, command);
+                if (r < 0) {
+                        *exit_status = EXIT_SMACK_PROCESS_LABEL;
+                        return r;
                 }
-#ifdef SMACK_DEFAULT_PROCESS_LABEL
-                else {
-                        _cleanup_free_ char *exec_label = NULL;
 
-                        r = mac_smack_read(command->path, SMACK_ATTR_EXEC, &exec_label);
-                        if (r < 0 && r != -ENODATA && r != -EOPNOTSUPP) {
-                                *exit_status = EXIT_SMACK_PROCESS_LABEL;
-                                return r;
-                        }
-
-                        r = mac_smack_apply_pid(0, exec_label ? : SMACK_DEFAULT_PROCESS_LABEL);
-                        if (r < 0) {
-                                *exit_status = EXIT_SMACK_PROCESS_LABEL;
-                                return r;
-                        }
-                }
-#endif
-#endif
-#ifdef HAVE_PAM
                 if (context->pam_name && username) {
                         r = setup_pam(context->pam_name, username, uid, context->tty_path, &accum_env, fds, n_fds);
                         if (r < 0) {
@@ -2383,7 +2404,6 @@ static int exec_child(
                                 return r;
                         }
                 }
-#endif
         }
 
         if (context->private_network && runtime && runtime->netns_storage_socket[0] >= 0) {
-- 
cgit v1.2.3-54-g00ecf


From 9c94d52e0919e4d7999e49b9ba2654a9e2ca4543 Mon Sep 17 00:00:00 2001
From: Djalal Harouni <tixxdz@opendz.org>
Date: Sun, 25 Sep 2016 11:03:21 +0200
Subject: core:namespace: minor improvements to append_mounts()

---
 src/core/namespace.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'src/core')

diff --git a/src/core/namespace.c b/src/core/namespace.c
index c9b2154985..8de774e6f6 100644
--- a/src/core/namespace.c
+++ b/src/core/namespace.c
@@ -70,12 +70,11 @@ static int append_mounts(BindMount **p, char **strv, MountMode mode) {
         assert(p);
 
         STRV_FOREACH(i, strv) {
+                bool ignore = false;
 
-                (*p)->ignore = false;
-
-                if ((mode == INACCESSIBLE || mode == READONLY || mode == READWRITE) && (*i)[0] == '-') {
-                        (*p)->ignore = true;
+                if (IN_SET(mode, INACCESSIBLE, READONLY, READWRITE) && startswith(*i, "-")) {
                         (*i)++;
+                        ignore = true;
                 }
 
                 if (!path_is_absolute(*i))
@@ -83,6 +82,7 @@ static int append_mounts(BindMount **p, char **strv, MountMode mode) {
 
                 (*p)->path = *i;
                 (*p)->mode = mode;
+                (*p)->ignore = ignore;
                 (*p)++;
         }
 
-- 
cgit v1.2.3-54-g00ecf


From 11a30cec2a9b6168b024c06720ad238dd1390794 Mon Sep 17 00:00:00 2001
From: Djalal Harouni <tixxdz@opendz.org>
Date: Sun, 25 Sep 2016 11:16:44 +0200
Subject: core:namespace: put paths protected by ProtectKernelTunables= in
 their own table

Instead of having all these paths everywhere, put the ones that are
protected by ProtectKernelTunables= into their own table. This way it
is easy to add paths and track which ones are protected.
---
 src/core/namespace.c | 54 ++++++++++++++++++++++++++++++++++------------------
 1 file changed, 35 insertions(+), 19 deletions(-)

(limited to 'src/core')

diff --git a/src/core/namespace.c b/src/core/namespace.c
index 8de774e6f6..13f6aeba51 100644
--- a/src/core/namespace.c
+++ b/src/core/namespace.c
@@ -61,9 +61,23 @@ typedef struct BindMount {
         const char *path; /* stack memory, doesn't need to be freed explicitly */
         char *chased; /* malloc()ed memory, needs to be freed */
         MountMode mode;
-        bool ignore;
+        bool ignore; /* Ignore if path does not exist */
 } BindMount;
 
+typedef struct TargetMount {
+        const char *path;
+        MountMode mode;
+        bool ignore; /* Ignore if path does not exist */
+} TargetMount;
+
+/* ProtectKernelTunables= option and the related filesystem APIs */
+static const TargetMount protect_kernel_tunables_table[] = {
+        { "/proc/sys",                  READONLY,       false },
+        { "/proc/sysrq-trigger",        READONLY,       true  },
+        { "/sys",                       READONLY,       false },
+        { "/sys/fs/cgroup",             READWRITE,      false }, /* READONLY is set by ProtectControlGroups= option */
+};
+
 static int append_mounts(BindMount **p, char **strv, MountMode mode) {
         char **i;
 
@@ -89,6 +103,20 @@ static int append_mounts(BindMount **p, char **strv, MountMode mode) {
         return 0;
 }
 
+static void append_protect_kernel_tunables(BindMount **p, const char *root_directory) {
+        unsigned int i;
+
+        assert(p);
+
+        for (i = 0; i < ELEMENTSOF(protect_kernel_tunables_table); i++) {
+                const TargetMount *t = &protect_kernel_tunables_table[i];
+                (*p)->path = prefix_roota(root_directory, t->path);
+                (*p)->mode = t->mode;
+                (*p)->ignore = t->ignore;
+                (*p)++;
+        }
+}
+
 static int mount_path_compare(const void *a, const void *b) {
         const BindMount *p = a, *q = b;
         int d;
@@ -514,8 +542,8 @@ int setup_namespace(
                 strv_length(read_only_paths) +
                 strv_length(inaccessible_paths) +
                 private_dev +
-                (protect_sysctl ? 3 : 0) +
-                (protect_cgroups != protect_sysctl) +
+                (protect_sysctl ? ELEMENTSOF(protect_kernel_tunables_table) : 0) +
+                (protect_cgroups ? 1 : 0) +
                 (protect_home != PROTECT_HOME_NO || protect_system == PROTECT_SYSTEM_STRICT ? 3 : 0) +
                 (protect_system == PROTECT_SYSTEM_STRICT ?
                  (2 + !private_dev + !protect_sysctl) :
@@ -557,24 +585,12 @@ int setup_namespace(
                         m++;
                 }
 
-                if (protect_sysctl) {
-                        m->path = prefix_roota(root_directory, "/proc/sys");
-                        m->mode = READONLY;
-                        m++;
-
-                        m->path = prefix_roota(root_directory, "/proc/sysrq-trigger");
-                        m->mode = READONLY;
-                        m->ignore = true; /* Not always compiled into the kernel */
-                        m++;
+                if (protect_sysctl)
+                        append_protect_kernel_tunables(&m, root_directory);
 
-                        m->path = prefix_roota(root_directory, "/sys");
-                        m->mode = READONLY;
-                        m++;
-                }
-
-                if (protect_cgroups != protect_sysctl) {
+                if (protect_cgroups) {
                         m->path = prefix_roota(root_directory, "/sys/fs/cgroup");
-                        m->mode = protect_cgroups ? READONLY : READWRITE;
+                        m->mode = READONLY;
                         m++;
                 }
 
-- 
cgit v1.2.3-54-g00ecf


From 2652c6c10394623b2c3e2ed5d4616c85918d140c Mon Sep 17 00:00:00 2001
From: Djalal Harouni <tixxdz@opendz.org>
Date: Sun, 25 Sep 2016 11:25:00 +0200
Subject: core:namespace: simplify mount calculation

Move the mount calculation out into its own function. The logic is already
smart enough to drop nop and duplicate mounts later on, so this change only
improves code readability.
---
 src/core/namespace.c | 47 ++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 36 insertions(+), 11 deletions(-)
---
 src/core/namespace.c | 46 ++++++++++++++++++++++++++++++++++------------
 1 file changed, 34 insertions(+), 12 deletions(-)

(limited to 'src/core')

diff --git a/src/core/namespace.c b/src/core/namespace.c
index 13f6aeba51..8aa8b83c88 100644
--- a/src/core/namespace.c
+++ b/src/core/namespace.c
@@ -515,6 +515,32 @@ static int chase_all_symlinks(const char *root_directory, BindMount *m, unsigned
         return 0;
 }
 
+static unsigned namespace_calculate_mounts(
+                char** read_write_paths,
+                char** read_only_paths,
+                char** inaccessible_paths,
+                const char* tmp_dir,
+                const char* var_tmp_dir,
+                bool private_dev,
+                bool protect_sysctl,
+                bool protect_cgroups,
+                ProtectHome protect_home,
+                ProtectSystem protect_system) {
+
+        return !!tmp_dir + !!var_tmp_dir +
+                strv_length(read_write_paths) +
+                strv_length(read_only_paths) +
+                strv_length(inaccessible_paths) +
+                private_dev +
+                (protect_sysctl ? ELEMENTSOF(protect_kernel_tunables_table) : 0) +
+                (protect_cgroups ? 1 : 0) +
+                (protect_home != PROTECT_HOME_NO || protect_system == PROTECT_SYSTEM_STRICT ? 3 : 0) +
+                (protect_system == PROTECT_SYSTEM_STRICT ?
+                 (2 + !private_dev + !protect_sysctl) :
+                 ((protect_system != PROTECT_SYSTEM_NO ? 3 : 0) +
+                  (protect_system == PROTECT_SYSTEM_FULL ? 1 : 0)));
+}
+
 int setup_namespace(
                 const char* root_directory,
                 char** read_write_paths,
@@ -537,19 +563,15 @@ int setup_namespace(
         if (mount_flags == 0)
                 mount_flags = MS_SHARED;
 
-        n = !!tmp_dir + !!var_tmp_dir +
-                strv_length(read_write_paths) +
-                strv_length(read_only_paths) +
-                strv_length(inaccessible_paths) +
-                private_dev +
-                (protect_sysctl ? ELEMENTSOF(protect_kernel_tunables_table) : 0) +
-                (protect_cgroups ? 1 : 0) +
-                (protect_home != PROTECT_HOME_NO || protect_system == PROTECT_SYSTEM_STRICT ? 3 : 0) +
-                (protect_system == PROTECT_SYSTEM_STRICT ?
-                 (2 + !private_dev + !protect_sysctl) :
-                 ((protect_system != PROTECT_SYSTEM_NO ? 3 : 0) +
-                  (protect_system == PROTECT_SYSTEM_FULL ? 1 : 0)));
+        n = namespace_calculate_mounts(read_write_paths,
+                                       read_only_paths,
+                                       inaccessible_paths,
+                                       tmp_dir, var_tmp_dir,
+                                       private_dev, protect_sysctl,
+                                       protect_cgroups, protect_home,
+                                       protect_system);
 
+        /* Set mount slave mode */
         if (root_directory || n > 0)
                 make_slave = true;
 
-- 
cgit v1.2.3-54-g00ecf


From 49accde7bd915944d99c947dca0cf26ae0f24165 Mon Sep 17 00:00:00 2001
From: Djalal Harouni <tixxdz@opendz.org>
Date: Sun, 25 Sep 2016 11:30:11 +0200
Subject: core:sandbox: add more /proc/* entries to ProtectKernelTunables=

Make ALSA entries, latency interface, mtrr, apm/acpi, suspend interface,
filesystems configuration and IRQ tuning readonly.

Most of these interfaces should nowadays be in /sys, but they are still
available through /proc, so just protect them. This patch does not touch
/proc/net/...
---
 man/systemd.exec.xml |  6 ++++--
 src/core/namespace.c | 11 +++++++++++
 2 files changed, 15 insertions(+), 2 deletions(-)

(limited to 'src/core')

diff --git a/man/systemd.exec.xml b/man/systemd.exec.xml
index a3a431c82b..f19e7f6ee9 100644
--- a/man/systemd.exec.xml
+++ b/man/systemd.exec.xml
@@ -1026,8 +1026,10 @@
         <term><varname>ProtectKernelTunables=</varname></term>
 
         <listitem><para>Takes a boolean argument. If true, kernel variables accessible through
-        <filename>/proc/sys</filename>, <filename>/sys</filename> and <filename>/proc/sysrq-trigger</filename> will be
-        made read-only to all processes of the unit. Usually, tunable kernel variables should only be written at
+        <filename>/proc/sys</filename>, <filename>/sys</filename>, <filename>/proc/sysrq-trigger</filename>,
+        <filename>/proc/latency_stats</filename>, <filename>/proc/acpi</filename>,
+        <filename>/proc/timer_stats</filename>, <filename>/proc/fs</filename> and <filename>/proc/irq</filename> will
+        be made read-only to all processes of the unit. Usually, tunable kernel variables should only be written at
         boot-time, with the <citerefentry><refentrytitle>sysctl.d</refentrytitle><manvolnum>5</manvolnum></citerefentry>
         mechanism. Almost no services need to write to these at runtime; it is hence recommended to turn this on for
         most services. For this setting the same restrictions regarding mount propagation and privileges apply as for
diff --git a/src/core/namespace.c b/src/core/namespace.c
index 8aa8b83c88..3234fab4bc 100644
--- a/src/core/namespace.c
+++ b/src/core/namespace.c
@@ -74,7 +74,18 @@ typedef struct TargetMount {
 static const TargetMount protect_kernel_tunables_table[] = {
         { "/proc/sys",                  READONLY,       false },
         { "/proc/sysrq-trigger",        READONLY,       true  },
+        { "/proc/latency_stats",        READONLY,       true  },
+        { "/proc/mtrr",                 READONLY,       true  },
+        { "/proc/apm",                  READONLY,       true  },
+        { "/proc/acpi",                 READONLY,       true  },
+        { "/proc/timer_stats",          READONLY,       true  },
+        { "/proc/asound",               READONLY,       true  },
+        { "/proc/bus",                  READONLY,       true  },
+        { "/proc/fs",                   READONLY,       true  },
+        { "/proc/irq",                  READONLY,       true  },
         { "/sys",                       READONLY,       false },
+        { "/sys/kernel/debug",          READONLY,       true  },
+        { "/sys/kernel/tracing",        READONLY,       true  },
         { "/sys/fs/cgroup",             READWRITE,      false }, /* READONLY is set by ProtectControlGroups= option */
 };
 
-- 
cgit v1.2.3-54-g00ecf


From f471b2afa11c97e48a4b6756f7254f88cc436960 Mon Sep 17 00:00:00 2001
From: Djalal Harouni <tixxdz@opendz.org>
Date: Sun, 25 Sep 2016 12:21:25 +0200
Subject: core: simplify ProtectSystem= implementation

ProtectSystem= with all its different modes and other options like
PrivateDevices= + ProtectKernelTunables= + ProtectHome= are orthogonal;
however, it is currently a bit hard to see that from the implementation.
Simplify it by giving each mode its own table with all paths and
references to other Protect options.

With this change some entries are duplicated, but we do not care since
duplicate mounts are first sorted by the most restrictive mode then
cleaned.
---
 src/core/namespace.c | 171 ++++++++++++++++++++++++++++++++++-----------------
 1 file changed, 113 insertions(+), 58 deletions(-)

(limited to 'src/core')

diff --git a/src/core/namespace.c b/src/core/namespace.c
index 3234fab4bc..985e343096 100644
--- a/src/core/namespace.c
+++ b/src/core/namespace.c
@@ -70,6 +70,14 @@ typedef struct TargetMount {
         bool ignore; /* Ignore if path does not exist */
 } TargetMount;
 
+/*
+ * The following Protect tables are to protect paths and mark some of them
+ * READONLY, in case a path is covered by an option from another table, then
+ * it is marked READWRITE in the current one, and the more restrictive mode is
+ * applied from that other table. This way all options can be combined in a
+ * safe and comprehensible way for users.
+ */
+
 /* ProtectKernelTunables= option and the related filesystem APIs */
 static const TargetMount protect_kernel_tunables_table[] = {
         { "/proc/sys",                  READONLY,       false },
@@ -89,6 +97,45 @@ static const TargetMount protect_kernel_tunables_table[] = {
         { "/sys/fs/cgroup",             READWRITE,      false }, /* READONLY is set by ProtectControlGroups= option */
 };
 
+/* ProtectSystem=yes table */
+static const TargetMount protect_system_yes_table[] = {
+        { "/usr",       READONLY,       false },
+        { "/boot",      READONLY,       true  },
+        { "/efi",       READONLY,       true  },
+};
+
+/* ProtectSystem=full includes ProtectSystem=yes */
+static const TargetMount protect_system_full_table[] = {
+        { "/usr",       READONLY,       false },
+        { "/boot",      READONLY,       true  },
+        { "/efi",       READONLY,       true  },
+        { "/etc",       READONLY,       false },
+};
+
+/*
+ * ProtectSystem=strict table. In this strict mode, we mount everything
+ * read-only, except for /proc, /dev, /sys which are the kernel API VFS,
+ * which are left writable, but PrivateDevices= + ProtectKernelTunables=
+ * protect those, and these options should be fully orthogonal.
+ * (And of course /home and friends are also left writable, as ProtectHome=
+ * shall manage those, orthogonally).
+ */
+static const TargetMount protect_system_strict_table[] = {
+        { "/",          READONLY,       false },
+        { "/proc",      READWRITE,      false },      /* ProtectKernelTunables= */
+        { "/sys",       READWRITE,      false },      /* ProtectKernelTunables= */
+        { "/dev",       READWRITE,      false },      /* PrivateDevices= */
+        { "/home",      READWRITE,      true  },      /* ProtectHome= */
+        { "/run/user",  READWRITE,      true  },      /* ProtectHome= */
+        { "/root",      READWRITE,      true  },      /* ProtectHome= */
+};
+
+static void set_bind_mount(BindMount **p, const char *path, MountMode mode, bool ignore) {
+        (*p)->path = path;
+        (*p)->mode = mode;
+        (*p)->ignore = ignore;
+}
+
 static int append_mounts(BindMount **p, char **strv, MountMode mode) {
         char **i;
 
@@ -105,27 +152,71 @@ static int append_mounts(BindMount **p, char **strv, MountMode mode) {
                 if (!path_is_absolute(*i))
                         return -EINVAL;
 
-                (*p)->path = *i;
-                (*p)->mode = mode;
-                (*p)->ignore = ignore;
+                set_bind_mount(p, *i, mode, ignore);
                 (*p)++;
         }
 
         return 0;
 }
 
-static void append_protect_kernel_tunables(BindMount **p, const char *root_directory) {
-        unsigned int i;
+static int append_target_mounts(BindMount **p, const char *root_directory, const TargetMount *mounts, const size_t size) {
+        unsigned i;
 
         assert(p);
+        assert(mounts);
 
-        for (i = 0; i < ELEMENTSOF(protect_kernel_tunables_table); i++) {
-                const TargetMount *t = &protect_kernel_tunables_table[i];
-                (*p)->path = prefix_roota(root_directory, t->path);
-                (*p)->mode = t->mode;
-                (*p)->ignore = t->ignore;
+        for (i = 0; i < size; i++) {
+                /*
+                 * Here we assume that the ignore field is set during
+                 * declaration; we do not support "-" at the beginning.
+                 */
+                const TargetMount *m = &mounts[i];
+                const char *path = prefix_roota(root_directory, m->path);
+
+                if (!path_is_absolute(path))
+                        return -EINVAL;
+
+                set_bind_mount(p, path, m->mode, m->ignore);
                 (*p)++;
         }
+
+        return 0;
+}
+
+static int append_protect_kernel_tunables(BindMount **p, const char *root_directory) {
+        assert(p);
+
+        return append_target_mounts(p, root_directory, protect_kernel_tunables_table,
+                                    ELEMENTSOF(protect_kernel_tunables_table));
+}
+
+static int append_protect_system(BindMount **p, const char *root_directory, ProtectSystem protect_system) {
+        int r = 0;
+
+        assert(p);
+
+        if (protect_system == PROTECT_SYSTEM_NO)
+                return 0;
+
+        switch (protect_system) {
+        case PROTECT_SYSTEM_STRICT:
+                r = append_target_mounts(p, root_directory, protect_system_strict_table,
+                                         ELEMENTSOF(protect_system_strict_table));
+                break;
+        case PROTECT_SYSTEM_YES:
+                r = append_target_mounts(p, root_directory, protect_system_yes_table,
+                                         ELEMENTSOF(protect_system_yes_table));
+                break;
+        case PROTECT_SYSTEM_FULL:
+                r = append_target_mounts(p, root_directory, protect_system_full_table,
+                                         ELEMENTSOF(protect_system_full_table));
+                break;
+        default:
+                r = -EINVAL;
+                break;
+        }
+
+        return r;
 }
 
 static int mount_path_compare(const void *a, const void *b) {
@@ -538,6 +629,14 @@ static unsigned namespace_calculate_mounts(
                 ProtectHome protect_home,
                 ProtectSystem protect_system) {
 
+        unsigned protect_system_cnt =
+                (protect_system == PROTECT_SYSTEM_STRICT ?
+                 ELEMENTSOF(protect_system_strict_table) :
+                 ((protect_system == PROTECT_SYSTEM_FULL) ?
+                  ELEMENTSOF(protect_system_full_table) :
+                  ((protect_system == PROTECT_SYSTEM_YES) ?
+                   ELEMENTSOF(protect_system_yes_table) : 0)));
+
         return !!tmp_dir + !!var_tmp_dir +
                 strv_length(read_write_paths) +
                 strv_length(read_only_paths) +
@@ -546,10 +645,7 @@ static unsigned namespace_calculate_mounts(
                 (protect_sysctl ? ELEMENTSOF(protect_kernel_tunables_table) : 0) +
                 (protect_cgroups ? 1 : 0) +
                 (protect_home != PROTECT_HOME_NO || protect_system == PROTECT_SYSTEM_STRICT ? 3 : 0) +
-                (protect_system == PROTECT_SYSTEM_STRICT ?
-                 (2 + !private_dev + !protect_sysctl) :
-                 ((protect_system != PROTECT_SYSTEM_NO ? 3 : 0) +
-                  (protect_system == PROTECT_SYSTEM_FULL ? 1 : 0)));
+                protect_system_cnt;
 }
 
 int setup_namespace(
@@ -648,50 +744,9 @@ int setup_namespace(
                                 return r;
                 }
 
-                if (protect_system == PROTECT_SYSTEM_STRICT) {
-                        /* In strict mode, we mount everything read-only, except for /proc, /dev, /sys which are the
-                         * kernel API VFS, which are left writable, but PrivateDevices= + ProtectKernelTunables=
-                         * protect those, and these options should be fully orthogonal. (And of course /home and
-                         * friends are also left writable, as ProtectHome= shall manage those, orthogonally, see
-                         * above). */
-
-                        m->path = prefix_roota(root_directory, "/");
-                        m->mode = READONLY;
-                        m++;
-
-                        m->path = prefix_roota(root_directory, "/proc");
-                        m->mode = READWRITE;
-                        m++;
-
-                        if (!private_dev) {
-                                m->path = prefix_roota(root_directory, "/dev");
-                                m->mode = READWRITE;
-                                m++;
-                        }
-                        if (!protect_sysctl) {
-                                m->path = prefix_roota(root_directory, "/sys");
-                                m->mode = READWRITE;
-                                m++;
-                        }
-
-                } else if (protect_system != PROTECT_SYSTEM_NO) {
-                        const char *usr_dir, *boot_dir, *efi_dir, *etc_dir;
-
-                        /* In any other mode we simply mark the relevant three directories ready-only. */
-
-                        usr_dir = prefix_roota(root_directory, "/usr");
-                        boot_dir = prefix_roota(root_directory, "/boot");
-                        boot_dir = strjoina("-", boot_dir);
-                        efi_dir = prefix_roota(root_directory, "/efi");
-                        efi_dir = strjoina("-", efi_dir);
-                        etc_dir = prefix_roota(root_directory, "/etc");
-
-                        r = append_mounts(&m, protect_system == PROTECT_SYSTEM_FULL
-                                          ? STRV_MAKE(usr_dir, boot_dir, efi_dir, etc_dir)
-                                          : STRV_MAKE(usr_dir, boot_dir, efi_dir), READONLY);
-                        if (r < 0)
-                                return r;
-                }
+                r = append_protect_system(&m, root_directory, protect_system);
+                if (r < 0)
+                        return r;
 
                 assert(mounts + n == m);
 
-- 
cgit v1.2.3-54-g00ecf


From b6c432ca7ed930c7e9078ac2266ae439aa242632 Mon Sep 17 00:00:00 2001
From: Djalal Harouni <tixxdz@opendz.org>
Date: Sun, 25 Sep 2016 12:41:16 +0200
Subject: core:namespace: simplify ProtectHome= implementation

As with the previous patch, simplify ProtectHome= and don't care about
duplicates; they will be sorted by most restrictive mode and cleaned up.
---
 src/core/namespace.c | 75 +++++++++++++++++++++++++++++++++++++---------------
 1 file changed, 53 insertions(+), 22 deletions(-)

(limited to 'src/core')

diff --git a/src/core/namespace.c b/src/core/namespace.c
index 985e343096..43a2f4ba6e 100644
--- a/src/core/namespace.c
+++ b/src/core/namespace.c
@@ -97,6 +97,23 @@ static const TargetMount protect_kernel_tunables_table[] = {
         { "/sys/fs/cgroup",             READWRITE,      false }, /* READONLY is set by ProtectControlGroups= option */
 };
 
+/*
+ * ProtectHome=read-only table, protect $HOME and $XDG_RUNTIME_DIR and rest of
+ * system should be protected by ProtectSystem=
+ */
+static const TargetMount protect_home_read_only_table[] = {
+        { "/home",      READONLY,       true },
+        { "/run/user",  READONLY,       true },
+        { "/root",      READONLY,       true },
+};
+
+/* ProtectHome=yes table */
+static const TargetMount protect_home_yes_table[] = {
+        { "/home",      INACCESSIBLE,   true },
+        { "/run/user",  INACCESSIBLE,   true },
+        { "/root",      INACCESSIBLE,   true },
+};
+
 /* ProtectSystem=yes table */
 static const TargetMount protect_system_yes_table[] = {
         { "/usr",       READONLY,       false },
@@ -190,6 +207,31 @@ static int append_protect_kernel_tunables(BindMount **p, const char *root_direct
                                     ELEMENTSOF(protect_kernel_tunables_table));
 }
 
+static int append_protect_home(BindMount **p, const char *root_directory, ProtectHome protect_home) {
+        int r = 0;
+
+        assert(p);
+
+        if (protect_home == PROTECT_HOME_NO)
+                return 0;
+
+        switch (protect_home) {
+        case PROTECT_HOME_READ_ONLY:
+                r = append_target_mounts(p, root_directory, protect_home_read_only_table,
+                                         ELEMENTSOF(protect_home_read_only_table));
+                break;
+        case PROTECT_HOME_YES:
+                r = append_target_mounts(p, root_directory, protect_home_yes_table,
+                                         ELEMENTSOF(protect_home_yes_table));
+                break;
+        default:
+                r = -EINVAL;
+                break;
+        }
+
+        return r;
+}
+
 static int append_protect_system(BindMount **p, const char *root_directory, ProtectSystem protect_system) {
         int r = 0;
 
@@ -629,6 +671,7 @@ static unsigned namespace_calculate_mounts(
                 ProtectHome protect_home,
                 ProtectSystem protect_system) {
 
+        unsigned protect_home_cnt;
         unsigned protect_system_cnt =
                 (protect_system == PROTECT_SYSTEM_STRICT ?
                  ELEMENTSOF(protect_system_strict_table) :
@@ -637,6 +680,12 @@ static unsigned namespace_calculate_mounts(
                   ((protect_system == PROTECT_SYSTEM_YES) ?
                    ELEMENTSOF(protect_system_yes_table) : 0)));
 
+        protect_home_cnt =
+                (protect_home == PROTECT_HOME_YES ?
+                 ELEMENTSOF(protect_home_yes_table) :
+                 ((protect_home == PROTECT_HOME_READ_ONLY) ?
+                  ELEMENTSOF(protect_home_read_only_table) : 0));
+
         return !!tmp_dir + !!var_tmp_dir +
                 strv_length(read_write_paths) +
                 strv_length(read_only_paths) +
@@ -644,8 +693,7 @@ static unsigned namespace_calculate_mounts(
                 private_dev +
                 (protect_sysctl ? ELEMENTSOF(protect_kernel_tunables_table) : 0) +
                 (protect_cgroups ? 1 : 0) +
-                (protect_home != PROTECT_HOME_NO || protect_system == PROTECT_SYSTEM_STRICT ? 3 : 0) +
-                protect_system_cnt;
+                protect_home_cnt + protect_system_cnt;
 }
 
 int setup_namespace(
@@ -723,26 +771,9 @@ int setup_namespace(
                         m++;
                 }
 
-                if (protect_home != PROTECT_HOME_NO || protect_system == PROTECT_SYSTEM_STRICT) {
-                        const char *home_dir, *run_user_dir, *root_dir;
-
-                        /* If protection of $HOME and $XDG_RUNTIME_DIR is requested, then go for it. If we are in
-                         * strict system protection mode, then also add entries for these directories, but mark them
-                         * writable. This is because we want ProtectHome= and ProtectSystem= to be fully orthogonal. */
-
-                        home_dir = prefix_roota(root_directory, "/home");
-                        home_dir = strjoina("-", home_dir);
-                        run_user_dir = prefix_roota(root_directory, "/run/user");
-                        run_user_dir = strjoina("-", run_user_dir);
-                        root_dir = prefix_roota(root_directory, "/root");
-                        root_dir = strjoina("-", root_dir);
-
-                        r = append_mounts(&m, STRV_MAKE(home_dir, run_user_dir, root_dir),
-                                protect_home == PROTECT_HOME_READ_ONLY ? READONLY :
-                                protect_home == PROTECT_HOME_YES ? INACCESSIBLE : READWRITE);
-                        if (r < 0)
-                                return r;
-                }
+                r = append_protect_home(&m, root_directory, protect_home);
+                if (r < 0)
+                        return r;
 
                 r = append_protect_system(&m, root_directory, protect_system);
                 if (r < 0)
-- 
cgit v1.2.3-54-g00ecf


From 8f81a5f61bcf745bae3acad599d7a9da686643e3 Mon Sep 17 00:00:00 2001
From: Djalal Harouni <tixxdz@opendz.org>
Date: Sun, 25 Sep 2016 12:52:27 +0200
Subject: core: Use @raw-io syscall group to filter I/O syscalls when
 PrivateDevices= is set

Instead of having a local syscall list, use the @raw-io group, which
contains the same set of syscalls to filter.
---
 man/systemd.exec.xml |  6 ++++--
 src/core/execute.c   | 55 +++++++++++++++++++++++++++++++++-------------------
 2 files changed, 39 insertions(+), 22 deletions(-)

(limited to 'src/core')

diff --git a/man/systemd.exec.xml b/man/systemd.exec.xml
index f19e7f6ee9..f70e5c36d4 100644
--- a/man/systemd.exec.xml
+++ b/man/systemd.exec.xml
@@ -933,8 +933,10 @@
         <filename>/dev/random</filename> (as well as the pseudo TTY subsystem) to it, but no physical devices such as
         <filename>/dev/sda</filename>, system memory <filename>/dev/mem</filename>, system ports
         <filename>/dev/port</filename> and others. This is useful to securely turn off physical device access by the
-        executed process. Defaults to false. Enabling this option will also remove <constant>CAP_MKNOD</constant> from
-        the capability bounding set for the unit (see above), and set <varname>DevicePolicy=closed</varname> (see
+        executed process. Defaults to false. Enabling this option will install a system call filter to block low-level
+        I/O system calls that are grouped in the <varname>@raw-io</varname> set, will also remove
+        <constant>CAP_MKNOD</constant> from the capability bounding set for the unit (see above), and set
+        <varname>DevicePolicy=closed</varname> (see
         <citerefentry><refentrytitle>systemd.resource-control</refentrytitle><manvolnum>5</manvolnum></citerefentry>
         for details). Note that using this setting will disconnect propagation of mounts from the service to the host
         (propagation in the opposite direction continues to work).  This means that this setting may not be used for
diff --git a/src/core/execute.c b/src/core/execute.c
index 0488ba2ca9..3da7ef3be6 100644
--- a/src/core/execute.c
+++ b/src/core/execute.c
@@ -1429,28 +1429,15 @@ finish:
 }
 
 static int apply_private_devices(Unit *u, const ExecContext *c) {
-
-        static const int device_syscalls[] = {
-                SCMP_SYS(ioperm),
-                SCMP_SYS(iopl),
-                SCMP_SYS(pciconfig_iobase),
-                SCMP_SYS(pciconfig_read),
-                SCMP_SYS(pciconfig_write),
-#ifdef __NR_s390_pci_mmio_read
-                SCMP_SYS(s390_pci_mmio_read),
-#endif
-#ifdef __NR_s390_pci_mmio_write
-                SCMP_SYS(s390_pci_mmio_write),
-#endif
-        };
-
+        const SystemCallFilterSet *set;
         scmp_filter_ctx *seccomp;
-        unsigned i;
+        const char *sys;
+        bool syscalls_found = false;
         int r;
 
         assert(c);
 
-        /* If PrivateDevices= is set, also turn off iopl and friends. */
+        /* If PrivateDevices= is set, also turn off iopl and all @raw-io syscalls. */
 
         if (skip_seccomp_unavailable(u, "PrivateDevices="))
                 return 0;
@@ -1463,12 +1450,40 @@ static int apply_private_devices(Unit *u, const ExecContext *c) {
         if (r < 0)
                 goto finish;
 
-        for (i = 0; i < ELEMENTSOF(device_syscalls); i++) {
+        for (set = syscall_filter_sets; set->set_name; set++)
+                if (streq(set->set_name, "@raw-io")) {
+                        syscalls_found = true;
+                        break;
+                }
+
+        /* We should never fail here */
+        if (!syscalls_found) {
+                r = -EOPNOTSUPP;
+                goto finish;
+        }
+
+        NULSTR_FOREACH(sys, set->value) {
+                int id;
+                bool add = true;
+
+#ifndef __NR_s390_pci_mmio_read
+                if (streq(sys, "s390_pci_mmio_read"))
+                        add = false;
+#endif
+#ifndef __NR_s390_pci_mmio_write
+                if (streq(sys, "s390_pci_mmio_write"))
+                        add = false;
+#endif
+
+                if (!add)
+                        continue;
+
+                id = seccomp_syscall_resolve_name(sys);
+
                 r = seccomp_rule_add(
                                 seccomp,
                                 SCMP_ACT_ERRNO(EPERM),
-                                device_syscalls[i],
-                                0);
+                                id, 0);
                 if (r < 0)
                         goto finish;
         }
-- 
cgit v1.2.3-54-g00ecf