From 59eeb84ba65483c5543d1bc840c2ac75642ef638 Mon Sep 17 00:00:00 2001
From: Lennart Poettering <lennart@poettering.net>
Date: Mon, 22 Aug 2016 18:43:59 +0200
Subject: core: add two new service settings ProtectKernelTunables= and
 ProtectControlGroups=

If enabled, these will block write access to /sys, /proc/sys and
/sys/fs/cgroup.
---
 src/core/namespace.c | 36 ++++++++++++++++++++++++++++++++----
 1 file changed, 32 insertions(+), 4 deletions(-)

(limited to 'src/core/namespace.c')

diff --git a/src/core/namespace.c b/src/core/namespace.c
index 52a2505d94..f2768aeb28 100644
--- a/src/core/namespace.c
+++ b/src/core/namespace.c
@@ -53,7 +53,7 @@ typedef enum MountMode {
         PRIVATE_TMP,
         PRIVATE_VAR_TMP,
         PRIVATE_DEV,
-        READWRITE
+        READWRITE,
 } MountMode;
 
 typedef struct BindMount {
@@ -366,6 +366,8 @@ int setup_namespace(
                 const char* tmp_dir,
                 const char* var_tmp_dir,
                 bool private_dev,
+                bool protect_sysctl,
+                bool protect_cgroups,
                 ProtectHome protect_home,
                 ProtectSystem protect_system,
                 unsigned long mount_flags) {
@@ -385,6 +387,8 @@ int setup_namespace(
                 strv_length(read_only_paths) +
                 strv_length(inaccessible_paths) +
                 private_dev +
+                (protect_sysctl ? 3 : 0) +
+                (protect_cgroups != protect_sysctl) +
                 (protect_home != PROTECT_HOME_NO ? 3 : 0) +
                 (protect_system != PROTECT_SYSTEM_NO ? 2 : 0) +
                 (protect_system == PROTECT_SYSTEM_FULL ? 1 : 0);
@@ -421,6 +425,27 @@ int setup_namespace(
                         m++;
                 }
 
+                if (protect_sysctl) {
+                        m->path = prefix_roota(root_directory, "/proc/sys");
+                        m->mode = READONLY;
+                        m++;
+
+                        m->path = prefix_roota(root_directory, "/proc/sysrq-trigger");
+                        m->mode = READONLY;
+                        m->ignore = true; /* Not always compiled into the kernel */
+                        m++;
+
+                        m->path = prefix_roota(root_directory, "/sys");
+                        m->mode = READONLY;
+                        m++;
+                }
+
+                if (protect_cgroups != protect_sysctl) {
+                        m->path = prefix_roota(root_directory, "/sys/fs/cgroup");
+                        m->mode = protect_cgroups ? READONLY : READWRITE;
+                        m++;
+                }
+
                 if (protect_home != PROTECT_HOME_NO) {
                         const char *home_dir, *run_user_dir, *root_dir;
 
@@ -505,9 +530,12 @@ int setup_namespace(
 
 fail:
         if (n > 0) {
-                for (m = mounts; m < mounts + n; ++m)
-                        if (m->done)
-                                (void) umount2(m->path, MNT_DETACH);
+                for (m = mounts; m < mounts + n; ++m) {
+                        if (!m->done)
+                                continue;
+
+                        (void) umount2(m->path, MNT_DETACH);
+                }
         }
 
         return r;
-- 
cgit v1.2.3-54-g00ecf


From fe3c2583bee339b6744872dc1897e6486d5bd7e0 Mon Sep 17 00:00:00 2001
From: Lennart Poettering <lennart@poettering.net>
Date: Wed, 24 Aug 2016 23:17:42 +0200
Subject: namespace: make sure InaccessibleDirectories= masks all mounts
 further down

If a dir is marked to be inaccessible then everything below it should be masked
by it.
---
 src/core/namespace.c | 44 ++++++++++++++++++++++++++++++++++++++++----
 src/test/test-ns.c   |  4 +++-
 2 files changed, 43 insertions(+), 5 deletions(-)

(limited to 'src/core/namespace.c')

diff --git a/src/core/namespace.c b/src/core/namespace.c
index f2768aeb28..102fe576f3 100644
--- a/src/core/namespace.c
+++ b/src/core/namespace.c
@@ -116,16 +116,47 @@ static void drop_duplicates(BindMount *m, unsigned *n) {
         assert(m);
         assert(n);
 
+        /* Drops duplicate entries. Expects that the array is properly ordered already. */
+
         for (f = m, t = m, previous = NULL; f < m+*n; f++) {
 
-                /* The first one wins */
-                if (previous && path_equal(f->path, previous->path))
+                /* The first one wins (which is the one with the more restrictive mode), see mount_path_compare()
+                 * above. */
+                if (previous && path_equal(f->path, previous->path)) {
+                        log_debug("%s is duplicate.", f->path);
                         continue;
+                }
 
                 *t = *f;
-
                 previous = t;
+                t++;
+        }
+
+        *n = t - m;
+}
+
+static void drop_inaccessible(BindMount *m, unsigned *n) {
+        BindMount *f, *t;
+        const char *clear = NULL;
+
+        assert(m);
+        assert(n);
+
+        /* Drops all entries obstructed by another entry further up the tree. Expects that the array is properly
+         * ordered already. */
+
+        for (f = m, t = m; f < m+*n; f++) {
+
+                /* If we found a path set for INACCESSIBLE earlier, and this entry has it as prefix we should drop
+                 * it, as inaccessible paths really should drop the entire subtree. */
+                if (clear && path_startswith(f->path, clear)) {
+                        log_debug("%s is masked by %s.", f->path, clear);
+                        continue;
+                }
 
+                clear = f->mode == INACCESSIBLE ? f->path : NULL;
+
+                *t = *f;
                 t++;
         }
 
@@ -282,6 +313,8 @@ static int apply_mount(
 
         assert(m);
 
+        log_debug("Applying namespace mount on %s", m->path);
+
         switch (m->mode) {
 
         case INACCESSIBLE:
@@ -289,7 +322,7 @@ static int apply_mount(
                 /* First, get rid of everything that is below if there
                  * is anything... Then, overmount it with an
                  * inaccessible path. */
-                umount_recursive(m->path, 0);
+                (void) umount_recursive(m->path, 0);
 
                 if (lstat(m->path, &target) < 0) {
                         if (m->ignore && errno == ENOENT)
@@ -303,6 +336,7 @@ static int apply_mount(
                         return -ELOOP;
                 }
                 break;
+
         case READONLY:
         case READWRITE:
                 /* Nothing to mount here, we just later toggle the
@@ -480,7 +514,9 @@ int setup_namespace(
                 assert(mounts + n == m);
 
                 qsort(mounts, n, sizeof(BindMount), mount_path_compare);
+
                 drop_duplicates(mounts, &n);
+                drop_inaccessible(mounts, &n);
         }
 
         if (n > 0 || root_directory) {
diff --git a/src/test/test-ns.c b/src/test/test-ns.c
index 05f243c75c..03a24620af 100644
--- a/src/test/test-ns.c
+++ b/src/test/test-ns.c
@@ -26,6 +26,7 @@
 int main(int argc, char *argv[]) {
         const char * const writable[] = {
                 "/home",
+                "/home/lennart/projects/foobar", /* this should be masked automatically */
                 NULL
         };
 
@@ -42,11 +43,12 @@ int main(int argc, char *argv[]) {
         };
         char *root_directory;
         char *projects_directory;
-
         int r;
         char tmp_dir[] = "/tmp/systemd-private-XXXXXX",
              var_tmp_dir[] = "/var/tmp/systemd-private-XXXXXX";
 
+        log_set_max_level(LOG_DEBUG);
+
         assert_se(mkdtemp(tmp_dir));
         assert_se(mkdtemp(var_tmp_dir));
 
-- 
cgit v1.2.3-54-g00ecf


From 6ee1a919cf9013a695da2a01ae67327b996a6ef6 Mon Sep 17 00:00:00 2001
From: Lennart Poettering <lennart@poettering.net>
Date: Thu, 25 Aug 2016 10:44:09 +0200
Subject: namespace: simplify mount_path_compare() a bit

---
 src/core/namespace.c | 20 +++++++++-----------
 1 file changed, 9 insertions(+), 11 deletions(-)

(limited to 'src/core/namespace.c')

diff --git a/src/core/namespace.c b/src/core/namespace.c
index 102fe576f3..74201caa10 100644
--- a/src/core/namespace.c
+++ b/src/core/namespace.c
@@ -93,21 +93,19 @@ static int mount_path_compare(const void *a, const void *b) {
         const BindMount *p = a, *q = b;
         int d;
 
+        /* If the paths are not equal, then order prefixes first */
         d = path_compare(p->path, q->path);
+        if (d != 0)
+                return d;
 
-        if (d == 0) {
-                /* If the paths are equal, check the mode */
-                if (p->mode < q->mode)
-                        return -1;
-
-                if (p->mode > q->mode)
-                        return 1;
+        /* If the paths are equal, check the mode */
+        if (p->mode < q->mode)
+                return -1;
 
-                return 0;
-        }
+        if (p->mode > q->mode)
+                return 1;
 
-        /* If the paths are not equal, then order prefixes first */
-        return d;
+        return 0;
 }
 
 static void drop_duplicates(BindMount *m, unsigned *n) {
-- 
cgit v1.2.3-54-g00ecf


From 7648a565d14dfb5516d93bacf0d87de2de5b5d91 Mon Sep 17 00:00:00 2001
From: Lennart Poettering <lennart@poettering.net>
Date: Thu, 25 Aug 2016 11:29:32 +0200
Subject: namespace: when enforcing fs namespace restrictions suppress
 redundant mounts

If /foo is marked to be read-only, and /foo/bar too, then the latter may be
suppressed as it has no effect.
---
 src/core/namespace.c | 39 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 39 insertions(+)

(limited to 'src/core/namespace.c')

diff --git a/src/core/namespace.c b/src/core/namespace.c
index 74201caa10..72f850b2f2 100644
--- a/src/core/namespace.c
+++ b/src/core/namespace.c
@@ -161,6 +161,44 @@ static void drop_inaccessible(BindMount *m, unsigned *n) {
         *n = t - m;
 }
 
+static void drop_nop(BindMount *m, unsigned *n) {
+        BindMount *f, *t;
+
+        assert(m);
+        assert(n);
+
+        /* Drops all entries which have an immediate parent that has the same type, as they are redundant. Assumes the
+         * list is ordered by prefixes. */
+
+        for (f = m, t = m; f < m+*n; f++) {
+
+                /* Only suppress such subtrees for READONLY and READWRITE entries */
+                if (IN_SET(f->mode, READONLY, READWRITE)) {
+                        BindMount *p;
+                        bool found = false;
+
+                        /* Now let's find the first parent of the entry we are looking at. */
+                        for (p = t-1; p >= m; p--) {
+                                if (path_startswith(f->path, p->path)) {
+                                        found = true;
+                                        break;
+                                }
+                        }
+
+                        /* We found it, let's see if it's the same mode, if so, we can drop this entry */
+                        if (found && p->mode == f->mode) {
+                                log_debug("%s is redundant by %s", f->path, p->path);
+                                continue;
+                        }
+                }
+
+                *t = *f;
+                t++;
+        }
+
+        *n = t - m;
+}
+
 static int mount_dev(BindMount *m) {
         static const char devnodes[] =
                 "/dev/null\0"
@@ -515,6 +553,7 @@ int setup_namespace(
 
                 drop_duplicates(mounts, &n);
                 drop_inaccessible(mounts, &n);
+                drop_nop(mounts, &n);
         }
 
         if (n > 0 || root_directory) {
-- 
cgit v1.2.3-54-g00ecf


From 6b7c9f8bce4679c89f3b89cacfd4932c0aeadad4 Mon Sep 17 00:00:00 2001
From: Lennart Poettering <lennart@poettering.net>
Date: Sun, 25 Sep 2016 10:40:51 +0200
Subject: namespace: rework how ReadWritePaths= is applied

Previously, if ReadWritePaths= was nested inside a ReadOnlyPaths=
specification, then we'd first recursively apply the ReadOnlyPaths= paths, and
make everything below read-only, only in order to then flip the read-only bit
again for the subdirs listed in ReadWritePaths= below it.

This is not only ugly (as for the dirs in question we first turn on the RO bit,
only to turn it off again immediately after), but also problematic in
containers, where a container manager might have marked a set of dirs read-only
and this code will undo this if ReadWritePaths= is set for any.

With this patch behaviour in this regard is altered: ReadOnlyPaths= will not be
applied to the children listed in ReadWritePaths= in the first place, so that
we do not need to turn off the RO bit for those after all.

This means that ReadWritePaths=/ReadOnlyPaths= may only be used to turn on the
RO bit, but never to turn it off again. Or to say this differently: if some
dirs are marked read-only via some external tool, then ReadWritePaths= will not
undo it.

This is not only the safer option, but also more in-line with what the man page
currently claims:

        "Entries (files or directories) listed in ReadWritePaths= are
        accessible from within the namespace with the same access rights as
        from outside."

To implement this change bind_remount_recursive() gained a new "blacklist"
string list parameter, which when passed may contain subdirs that shall be
excluded from the read-only mounting.

A number of functions are updated to add more debug logging to make this more
digestible.
---
 src/basic/mount-util.c    | 71 ++++++++++++++++++++++++++++++++---------------
 src/basic/mount-util.h    |  2 +-
 src/core/namespace.c      | 66 ++++++++++++++++++++++++++++---------------
 src/nspawn/nspawn-mount.c |  6 ++--
 src/nspawn/nspawn.c       |  2 +-
 5 files changed, 96 insertions(+), 51 deletions(-)

(limited to 'src/core/namespace.c')

diff --git a/src/basic/mount-util.c b/src/basic/mount-util.c
index bfa04394fe..b9affb4e70 100644
--- a/src/basic/mount-util.c
+++ b/src/basic/mount-util.c
@@ -36,6 +36,7 @@
 #include "set.h"
 #include "stdio-util.h"
 #include "string-util.h"
+#include "strv.h"
 
 static int fd_fdinfo_mnt_id(int fd, const char *filename, int flags, int *mnt_id) {
         char path[strlen("/proc/self/fdinfo/") + DECIMAL_STR_MAX(int)];
@@ -287,10 +288,12 @@ int umount_recursive(const char *prefix, int flags) {
                                 continue;
 
                         if (umount2(p, flags) < 0) {
-                                r = -errno;
+                                r = log_debug_errno(errno, "Failed to umount %s: %m", p);
                                 continue;
                         }
 
+                        log_debug("Successfully unmounted %s", p);
+
                         again = true;
                         n++;
 
@@ -311,24 +314,21 @@ static int get_mount_flags(const char *path, unsigned long *flags) {
         return 0;
 }
 
-int bind_remount_recursive(const char *prefix, bool ro) {
+int bind_remount_recursive(const char *prefix, bool ro, char **blacklist) {
         _cleanup_set_free_free_ Set *done = NULL;
         _cleanup_free_ char *cleaned = NULL;
         int r;
 
-        /* Recursively remount a directory (and all its submounts)
-         * read-only or read-write. If the directory is already
-         * mounted, we reuse the mount and simply mark it
-         * MS_BIND|MS_RDONLY (or remove the MS_RDONLY for read-write
-         * operation). If it isn't we first make it one. Afterwards we
-         * apply MS_BIND|MS_RDONLY (or remove MS_RDONLY) to all
-         * submounts we can access, too. When mounts are stacked on
-         * the same mount point we only care for each individual
-         * "top-level" mount on each point, as we cannot
-         * influence/access the underlying mounts anyway. We do not
-         * have any effect on future submounts that might get
-         * propagated, they migt be writable. This includes future
-         * submounts that have been triggered via autofs. */
+        /* Recursively remount a directory (and all its submounts) read-only or read-write. If the directory is already
+         * mounted, we reuse the mount and simply mark it MS_BIND|MS_RDONLY (or remove the MS_RDONLY for read-write
+         * operation). If it isn't we first make it one. Afterwards we apply MS_BIND|MS_RDONLY (or remove MS_RDONLY) to
+         * all submounts we can access, too. When mounts are stacked on the same mount point we only care for each
+         * individual "top-level" mount on each point, as we cannot influence/access the underlying mounts anyway. We
+         * do not have any effect on future submounts that might get propagated, they might be writable. This includes
+         * future submounts that have been triggered via autofs.
+         *
+         * If the "blacklist" parameter is specified it may contain a list of subtrees to exclude from the
+         * remount operation. Note that we'll ignore the blacklist for the top-level path. */
 
         cleaned = strdup(prefix);
         if (!cleaned)
@@ -385,6 +385,33 @@ int bind_remount_recursive(const char *prefix, bool ro) {
                         if (r < 0)
                                 return r;
 
+                        if (!path_startswith(p, cleaned))
+                                continue;
+
+                        /* Ignore this mount if it is blacklisted, but only if it isn't the top-level mount we shall
+                         * operate on. */
+                        if (!path_equal(cleaned, p)) {
+                                bool blacklisted = false;
+                                char **i;
+
+                                STRV_FOREACH(i, blacklist) {
+
+                                        if (path_equal(*i, cleaned))
+                                                continue;
+
+                                        if (!path_startswith(*i, cleaned))
+                                                continue;
+
+                                        if (path_startswith(p, *i)) {
+                                                blacklisted = true;
+                                                log_debug("Not remounting %s, because blacklisted by %s, called for %s", p, *i, cleaned);
+                                                break;
+                                        }
+                                }
+                                if (blacklisted)
+                                        continue;
+                        }
+
                         /* Let's ignore autofs mounts.  If they aren't
                          * triggered yet, we want to avoid triggering
                          * them, as we don't make any guarantees for
@@ -396,12 +423,9 @@ int bind_remount_recursive(const char *prefix, bool ro) {
                                 continue;
                         }
 
-                        if (path_startswith(p, cleaned) &&
-                            !set_contains(done, p)) {
-
+                        if (!set_contains(done, p)) {
                                 r = set_consume(todo, p);
                                 p = NULL;
-
                                 if (r == -EEXIST)
                                         continue;
                                 if (r < 0)
@@ -418,8 +442,7 @@ int bind_remount_recursive(const char *prefix, bool ro) {
 
                 if (!set_contains(done, cleaned) &&
                     !set_contains(todo, cleaned)) {
-                        /* The prefix directory itself is not yet a
-                         * mount, make it one. */
+                        /* The prefix directory itself is not yet a mount, make it one. */
                         if (mount(cleaned, cleaned, NULL, MS_BIND|MS_REC, NULL) < 0)
                                 return -errno;
 
@@ -430,6 +453,8 @@ int bind_remount_recursive(const char *prefix, bool ro) {
                         if (mount(NULL, prefix, NULL, orig_flags|MS_BIND|MS_REMOUNT|(ro ? MS_RDONLY : 0), NULL) < 0)
                                 return -errno;
 
+                        log_debug("Made top-level directory %s a mount point.", prefix);
+
                         x = strdup(cleaned);
                         if (!x)
                                 return -ENOMEM;
@@ -447,8 +472,7 @@ int bind_remount_recursive(const char *prefix, bool ro) {
                         if (r < 0)
                                 return r;
 
-                        /* Deal with mount points that are obstructed by a
-                         * later mount */
+                        /* Deal with mount points that are obstructed by a later mount */
                         r = path_is_mount_point(x, 0);
                         if (r == -ENOENT || r == 0)
                                 continue;
@@ -463,6 +487,7 @@ int bind_remount_recursive(const char *prefix, bool ro) {
                         if (mount(NULL, x, NULL, orig_flags|MS_BIND|MS_REMOUNT|(ro ? MS_RDONLY : 0), NULL) < 0)
                                 return -errno;
 
+                        log_debug("Remounted %s read-only.", x);
                 }
         }
 }
diff --git a/src/basic/mount-util.h b/src/basic/mount-util.h
index f46989ebb3..74730de663 100644
--- a/src/basic/mount-util.h
+++ b/src/basic/mount-util.h
@@ -35,7 +35,7 @@ int path_is_mount_point(const char *path, int flags);
 int repeat_unmount(const char *path, int flags);
 
 int umount_recursive(const char *target, int flags);
-int bind_remount_recursive(const char *prefix, bool ro);
+int bind_remount_recursive(const char *prefix, bool ro, char **blacklist);
 
 int mount_move_root(const char *path);
 
diff --git a/src/core/namespace.c b/src/core/namespace.c
index 72f850b2f2..b0dab9459e 100644
--- a/src/core/namespace.c
+++ b/src/core/namespace.c
@@ -375,9 +375,19 @@ static int apply_mount(
 
         case READONLY:
         case READWRITE:
-                /* Nothing to mount here, we just later toggle the
-                 * MS_RDONLY bit for the mount point */
-                return 0;
+
+                r = path_is_mount_point(m->path, 0);
+                if (r < 0) {
+                        if (m->ignore && errno == ENOENT)
+                                return 0;
+                        return log_debug_errno(r, "Failed to determine whether %s is already a mount point: %m", m->path);
+                }
+                if (r > 0) /* Nothing to do here, it is already a mount. We just later toggle the MS_RDONLY bit for the mount point if needed. */
+                        return 0;
+
+                /* This isn't a mount point yet, let's make it one. */
+                what = m->path;
+                break;
 
         case PRIVATE_TMP:
                 what = tmp_dir;
@@ -396,31 +406,33 @@ static int apply_mount(
 
         assert(what);
 
-        r = mount(what, m->path, NULL, MS_BIND|MS_REC, NULL);
-        if (r >= 0) {
-                log_debug("Successfully mounted %s to %s", what, m->path);
-                return r;
-        } else {
+        if (mount(what, m->path, NULL, MS_BIND|MS_REC, NULL) < 0) {
                 if (m->ignore && errno == ENOENT)
                         return 0;
+
                 return log_debug_errno(errno, "Failed to mount %s to %s: %m", what, m->path);
         }
+
+        log_debug("Successfully mounted %s to %s", what, m->path);
+        return 0;
 }
 
-static int make_read_only(BindMount *m) {
-        int r;
+static int make_read_only(BindMount *m, char **blacklist) {
+        int r = 0;
 
         assert(m);
 
         if (IN_SET(m->mode, INACCESSIBLE, READONLY))
-                r = bind_remount_recursive(m->path, true);
-        else if (IN_SET(m->mode, READWRITE, PRIVATE_TMP, PRIVATE_VAR_TMP, PRIVATE_DEV)) {
-                r = bind_remount_recursive(m->path, false);
-                if (r == 0 && m->mode == PRIVATE_DEV) /* can be readonly but the submounts can't*/
-                        if (mount(NULL, m->path, NULL, MS_REMOUNT|DEV_MOUNT_OPTIONS|MS_RDONLY, NULL) < 0)
-                                r = -errno;
+                r = bind_remount_recursive(m->path, true, blacklist);
+        else if (m->mode == PRIVATE_DEV) { /* Can be readonly but the submounts can't*/
+                if (mount(NULL, m->path, NULL, MS_REMOUNT|DEV_MOUNT_OPTIONS|MS_RDONLY, NULL) < 0)
+                        r = -errno;
         } else
-                r = 0;
+                return 0;
+
+        /* Note that we only turn on the MS_RDONLY flag here, we never turn it off. Something that was marked read-only
+         * already stays this way. This improves compatibility with container managers, where we won't attempt to undo
+         * read-only mounts already applied. */
 
         if (m->ignore && r == -ENOENT)
                 return 0;
@@ -570,14 +582,25 @@ int setup_namespace(
         }
 
         if (n > 0) {
+                char **blacklist;
+                unsigned j;
+
+                /* First round, add in all special mounts we need */
                 for (m = mounts; m < mounts + n; ++m) {
                         r = apply_mount(m, tmp_dir, var_tmp_dir);
                         if (r < 0)
                                 goto fail;
                 }
 
+                /* Create a blacklist we can pass to bind_remount_recursive() */
+                blacklist = newa(char*, n+1);
+                for (j = 0; j < n; j++)
+                        blacklist[j] = (char*) mounts[j].path;
+                blacklist[j] = NULL;
+
+                /* Second round, flip the ro bits if necessary. */
                 for (m = mounts; m < mounts + n; ++m) {
-                        r = make_read_only(m);
+                        r = make_read_only(m, blacklist);
                         if (r < 0)
                                 goto fail;
                 }
@@ -586,9 +609,7 @@ int setup_namespace(
         if (root_directory) {
                 /* MS_MOVE does not work on MS_SHARED so the remount MS_SHARED will be done later */
                 r = mount_move_root(root_directory);
-
-                /* at this point, we cannot rollback */
-                if (r < 0)
+                if (r < 0) /* at this point, we cannot rollback */
                         return r;
         }
 
@@ -596,8 +617,7 @@ int setup_namespace(
          * reestablish propagation from our side to the host, since
          * what's disconnected is disconnected. */
         if (mount(NULL, "/", NULL, mount_flags | MS_REC, NULL) < 0)
-                /* at this point, we cannot rollback */
-                return -errno;
+                return -errno; /* at this point, we cannot rollback */
 
         return 0;
 
diff --git a/src/nspawn/nspawn-mount.c b/src/nspawn/nspawn-mount.c
index 295b75341f..8457357003 100644
--- a/src/nspawn/nspawn-mount.c
+++ b/src/nspawn/nspawn-mount.c
@@ -476,7 +476,7 @@ static int mount_bind(const char *dest, CustomMount *m) {
                 return log_error_errno(errno, "mount(%s) failed: %m", where);
 
         if (m->read_only) {
-                r = bind_remount_recursive(where, true);
+                r = bind_remount_recursive(where, true, NULL);
                 if (r < 0)
                         return log_error_errno(r, "Read-only bind mount failed: %m");
         }
@@ -990,7 +990,7 @@ int setup_volatile_state(
         /* --volatile=state means we simply overmount /var
            with a tmpfs, and the rest read-only. */
 
-        r = bind_remount_recursive(directory, true);
+        r = bind_remount_recursive(directory, true, NULL);
         if (r < 0)
                 return log_error_errno(r, "Failed to remount %s read-only: %m", directory);
 
@@ -1065,7 +1065,7 @@ int setup_volatile(
 
         bind_mounted = true;
 
-        r = bind_remount_recursive(t, true);
+        r = bind_remount_recursive(t, true, NULL);
         if (r < 0) {
                 log_error_errno(r, "Failed to remount %s read-only: %m", t);
                 goto fail;
diff --git a/src/nspawn/nspawn.c b/src/nspawn/nspawn.c
index 0d61d34ebf..1f3e1f2dac 100644
--- a/src/nspawn/nspawn.c
+++ b/src/nspawn/nspawn.c
@@ -3019,7 +3019,7 @@ static int outer_child(
                 return r;
 
         if (arg_read_only) {
-                r = bind_remount_recursive(directory, true);
+                r = bind_remount_recursive(directory, true, NULL);
                 if (r < 0)
                         return log_error_errno(r, "Failed to make tree read-only: %m");
         }
-- 
cgit v1.2.3-54-g00ecf


From 160cfdbed3eb23b6bc3c17613685b756f23be4a1 Mon Sep 17 00:00:00 2001
From: Lennart Poettering <lennart@poettering.net>
Date: Thu, 25 Aug 2016 15:51:37 +0200
Subject: namespace: add some debug logging when enforcing InaccessiblePaths=

---
 src/core/namespace.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'src/core/namespace.c')

diff --git a/src/core/namespace.c b/src/core/namespace.c
index b0dab9459e..e08d7459c5 100644
--- a/src/core/namespace.c
+++ b/src/core/namespace.c
@@ -345,7 +345,6 @@ static int apply_mount(
 
         const char *what;
         int r;
-        struct stat target;
 
         assert(m);
 
@@ -353,7 +352,8 @@ static int apply_mount(
 
         switch (m->mode) {
 
-        case INACCESSIBLE:
+        case INACCESSIBLE: {
+                struct stat target;
 
                 /* First, get rid of everything that is below if there
                  * is anything... Then, overmount it with an
@@ -363,7 +363,7 @@ static int apply_mount(
                 if (lstat(m->path, &target) < 0) {
                         if (m->ignore && errno == ENOENT)
                                 return 0;
-                        return -errno;
+                        return log_debug_errno(errno, "Failed to lstat() %s to determine what to mount over it: %m", m->path);
                 }
 
                 what = mode_to_inaccessible_node(target.st_mode);
@@ -372,6 +372,7 @@ static int apply_mount(
                         return -ELOOP;
                 }
                 break;
+        }
 
         case READONLY:
         case READWRITE:
-- 
cgit v1.2.3-54-g00ecf


From 3f815163ff8fdcdbd329680580df36f94e15325d Mon Sep 17 00:00:00 2001
From: Lennart Poettering <lennart@poettering.net>
Date: Thu, 25 Aug 2016 15:57:21 +0200
Subject: core: introduce ProtectSystem=strict

Let's tighten our sandbox a bit more: with this change ProtectSystem= gains a
new setting "strict". If set, the entire directory tree of the system is
mounted read-only, but the API file systems /proc, /dev, /sys are excluded
(they may be managed with PrivateDevices= and ProtectKernelTunables=). Also,
/home and /root are excluded as those are left for ProtectHome= to manage.

In this mode, all "real" file systems (i.e. non-API file systems) are mounted
read-only, and specific directories may only be excluded via
ReadWriteDirectories=, thus implementing an effective whitelist instead of
blacklist of writable directories.

While we are at it, also add /efi to the list of paths always affected by
ProtectSystem=. This is a follow-up for
b52a109ad38cd37b660ccd5394ff5c171a5e5355 which added /efi as alternative for
/boot. Our namespacing logic should respect that too.
---
 man/systemd.exec.xml | 33 ++++++++++++++++---------------
 src/core/namespace.c | 56 +++++++++++++++++++++++++++++++++++++++++++---------
 src/core/namespace.h |  1 +
 3 files changed, 65 insertions(+), 25 deletions(-)

(limited to 'src/core/namespace.c')

diff --git a/man/systemd.exec.xml b/man/systemd.exec.xml
index 07128b489e..1b672fe0c9 100644
--- a/man/systemd.exec.xml
+++ b/man/systemd.exec.xml
@@ -1020,22 +1020,23 @@
       <varlistentry>
         <term><varname>ProtectSystem=</varname></term>
 
-        <listitem><para>Takes a boolean argument or
-        <literal>full</literal>. If true, mounts the
-        <filename>/usr</filename> and <filename>/boot</filename>
-        directories read-only for processes invoked by this unit. If
-        set to <literal>full</literal>, the <filename>/etc</filename>
-        directory is mounted read-only, too. This setting ensures that
-        any modification of the vendor-supplied operating system (and
-        optionally its configuration) is prohibited for the service.
-        It is recommended to enable this setting for all long-running
-        services, unless they are involved with system updates or need
-        to modify the operating system in other ways. Note however
-        that processes retaining the CAP_SYS_ADMIN capability can undo
-        the effect of this setting. This setting is hence particularly
-        useful for daemons which have this capability removed, for
-        example with <varname>CapabilityBoundingSet=</varname>.
-        Defaults to off.</para></listitem>
+        <listitem><para>Takes a boolean argument or the special values <literal>full</literal> or
+        <literal>strict</literal>. If true, mounts the <filename>/usr</filename> and <filename>/boot</filename>
+        directories read-only for processes invoked by this unit. If set to <literal>full</literal>, the
+        <filename>/etc</filename> directory is mounted read-only, too. If set to <literal>strict</literal> the entire
+        file system hierarchy is mounted read-only, except for the API file system subtrees <filename>/dev</filename>,
+        <filename>/proc</filename> and <filename>/sys</filename> (protect these directories using
+        <varname>PrivateDevices=</varname>, <varname>ProtectKernelTunables=</varname>,
+        <varname>ProtectControlGroups=</varname>). This setting ensures that any modification of the vendor-supplied
+        operating system (and optionally its configuration, and local mounts) is prohibited for the service.  It is
+        recommended to enable this setting for all long-running services, unless they are involved with system updates
+        or need to modify the operating system in other ways. If this option is used,
+        <varname>ReadWritePaths=</varname> may be used to exclude specific directories from being made read-only. Note
+        that processes retaining the <constant>CAP_SYS_ADMIN</constant> capability (and with no system call filter that
+        prohibits mount-related system calls applied) can undo the effect of this setting. This setting is hence
+        particularly useful for daemons which have either the <literal>@mount</literal> set filtered using
+        <varname>SystemCallFilter=</varname>, or have the <constant>CAP_SYS_ADMIN</constant> capability removed, for
+        example with <varname>CapabilityBoundingSet=</varname>.  Defaults to off.</para></listitem>
       </varlistentry>
 
       <varlistentry>
diff --git a/src/core/namespace.c b/src/core/namespace.c
index e08d7459c5..498cd139bf 100644
--- a/src/core/namespace.c
+++ b/src/core/namespace.c
@@ -472,9 +472,11 @@ int setup_namespace(
                 private_dev +
                 (protect_sysctl ? 3 : 0) +
                 (protect_cgroups != protect_sysctl) +
-                (protect_home != PROTECT_HOME_NO ? 3 : 0) +
-                (protect_system != PROTECT_SYSTEM_NO ? 2 : 0) +
-                (protect_system == PROTECT_SYSTEM_FULL ? 1 : 0);
+                (protect_home != PROTECT_HOME_NO || protect_system == PROTECT_SYSTEM_STRICT ? 3 : 0) +
+                (protect_system == PROTECT_SYSTEM_STRICT ?
+                 (2 + !private_dev + !protect_sysctl) :
+                 ((protect_system != PROTECT_SYSTEM_NO ? 3 : 0) +
+                  (protect_system == PROTECT_SYSTEM_FULL ? 1 : 0)));
 
         if (n > 0) {
                 m = mounts = (BindMount *) alloca0(n * sizeof(BindMount));
@@ -529,9 +531,13 @@ int setup_namespace(
                         m++;
                 }
 
-                if (protect_home != PROTECT_HOME_NO) {
+                if (protect_home != PROTECT_HOME_NO || protect_system == PROTECT_SYSTEM_STRICT) {
                         const char *home_dir, *run_user_dir, *root_dir;
 
+                        /* If protection of $HOME and $XDG_RUNTIME_DIR is requested, then go for it. If we are in
+                         * strict system protection mode, then also add entries for these directories, but mark them
+                         * writable. This is because we want ProtectHome= and ProtectSystem= to be fully orthogonal. */
+
                         home_dir = prefix_roota(root_directory, "/home");
                         home_dir = strjoina("-", home_dir);
                         run_user_dir = prefix_roota(root_directory, "/run/user");
@@ -540,22 +546,53 @@ int setup_namespace(
                         root_dir = strjoina("-", root_dir);
 
                         r = append_mounts(&m, STRV_MAKE(home_dir, run_user_dir, root_dir),
-                                protect_home == PROTECT_HOME_READ_ONLY ? READONLY : INACCESSIBLE);
+                                protect_home == PROTECT_HOME_READ_ONLY ? READONLY :
+                                protect_home == PROTECT_HOME_YES ? INACCESSIBLE : READWRITE);
                         if (r < 0)
                                 return r;
                 }
 
-                if (protect_system != PROTECT_SYSTEM_NO) {
-                        const char *usr_dir, *boot_dir, *etc_dir;
+                if (protect_system == PROTECT_SYSTEM_STRICT) {
+                        /* In strict mode, we mount everything read-only, except for /proc, /dev, /sys which are the
+                         * kernel API VFS, which are left writable, but PrivateDevices= + ProtectKernelTunables=
+                         * protect those, and these options should be fully orthogonal. (And of course /home and
+                         * friends are also left writable, as ProtectHome= shall manage those, orthogonally, see
+                         * above). */
+
+                        m->path = prefix_roota(root_directory, "/");
+                        m->mode = READONLY;
+                        m++;
+
+                        m->path = prefix_roota(root_directory, "/proc");
+                        m->mode = READWRITE;
+                        m++;
+
+                        if (!private_dev) {
+                                m->path = prefix_roota(root_directory, "/dev");
+                                m->mode = READWRITE;
+                                m++;
+                        }
+                        if (!protect_sysctl) {
+                                m->path = prefix_roota(root_directory, "/sys");
+                                m->mode = READWRITE;
+                                m++;
+                        }
+
+                } else if (protect_system != PROTECT_SYSTEM_NO) {
+                        const char *usr_dir, *boot_dir, *efi_dir, *etc_dir;
+
+                        /* In any other mode we simply mark the relevant three directories read-only. */
 
                         usr_dir = prefix_roota(root_directory, "/usr");
                         boot_dir = prefix_roota(root_directory, "/boot");
                         boot_dir = strjoina("-", boot_dir);
+                        efi_dir = prefix_roota(root_directory, "/efi");
+                        efi_dir = strjoina("-", efi_dir);
                         etc_dir = prefix_roota(root_directory, "/etc");
 
                         r = append_mounts(&m, protect_system == PROTECT_SYSTEM_FULL
-                                ? STRV_MAKE(usr_dir, boot_dir, etc_dir)
-                                : STRV_MAKE(usr_dir, boot_dir), READONLY);
+                                          ? STRV_MAKE(usr_dir, boot_dir, efi_dir, etc_dir)
+                                          : STRV_MAKE(usr_dir, boot_dir, efi_dir), READONLY);
                         if (r < 0)
                                 return r;
                 }
@@ -780,6 +817,7 @@ static const char *const protect_system_table[_PROTECT_SYSTEM_MAX] = {
         [PROTECT_SYSTEM_NO] = "no",
         [PROTECT_SYSTEM_YES] = "yes",
         [PROTECT_SYSTEM_FULL] = "full",
+        [PROTECT_SYSTEM_STRICT] = "strict",
 };
 
 DEFINE_STRING_TABLE_LOOKUP(protect_system, ProtectSystem);
diff --git a/src/core/namespace.h b/src/core/namespace.h
index 3845336287..6505bcc499 100644
--- a/src/core/namespace.h
+++ b/src/core/namespace.h
@@ -35,6 +35,7 @@ typedef enum ProtectSystem {
         PROTECT_SYSTEM_NO,
         PROTECT_SYSTEM_YES,
         PROTECT_SYSTEM_FULL,
+        PROTECT_SYSTEM_STRICT,
         _PROTECT_SYSTEM_MAX,
         _PROTECT_SYSTEM_INVALID = -1
 } ProtectSystem;
-- 
cgit v1.2.3-54-g00ecf


From 1e4e94c8819e2fe3a7217690c0590dba8ab0be9e Mon Sep 17 00:00:00 2001
From: Lennart Poettering <lennart@poettering.net>
Date: Thu, 25 Aug 2016 17:30:47 +0200
Subject: namespace: invoke unshare() only after checking all parameters

Let's create the new namespace only after we validated and processed all
parameters, right before we start with actually mounting things.

This way, the window where we can roll back is larger (not that it matters
IRL...)
---
 src/core/namespace.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'src/core/namespace.c')

diff --git a/src/core/namespace.c b/src/core/namespace.c
index 498cd139bf..356d3c8121 100644
--- a/src/core/namespace.c
+++ b/src/core/namespace.c
@@ -462,9 +462,6 @@ int setup_namespace(
         if (mount_flags == 0)
                 mount_flags = MS_SHARED;
 
-        if (unshare(CLONE_NEWNS) < 0)
-                return -errno;
-
         n = !!tmp_dir + !!var_tmp_dir +
                 strv_length(read_write_paths) +
                 strv_length(read_only_paths) +
@@ -606,6 +603,9 @@ int setup_namespace(
                 drop_nop(mounts, &n);
         }
 
+        if (unshare(CLONE_NEWNS) < 0)
+                return -errno;
+
         if (n > 0 || root_directory) {
                 /* Remount / as SLAVE so that nothing now mounted in the namespace
                    shows up in the parent */
-- 
cgit v1.2.3-54-g00ecf


From d944dc9553009822deaddec76814f5642a6a8176 Mon Sep 17 00:00:00 2001
From: Lennart Poettering <lennart@poettering.net>
Date: Sat, 24 Sep 2016 12:41:30 +0200
Subject: namespace: chase symlinks for mounts to set up in userspace

This adds logic to chase symlinks for all mount points that shall be created in
a namespace environment in userspace, instead of leaving this to the kernel.
This has the advantage that we can correctly handle absolute symlinks that
shall be taken relative to a specific root directory. Moreover, we can properly
handle mounts created on symlinked files or directories as we can merge their
mounts as necessary.

(This also drops the "done" flag in the namespace logic, which was never
actually working, but was supposed to permit a partial rollback of the
namespace logic, which however is only mildly useful as it wasn't clear in
which case it would or would not be able to roll back.)

Fixes: #3867
---
 src/basic/fs-util.c     | 187 ++++++++++++++++++++++++++++++++++++++++++++++++
 src/basic/fs-util.h     |   2 +
 src/core/namespace.c    | 118 +++++++++++++++++++-----------
 src/test/test-fs-util.c |  96 ++++++++++++++++++++++++-
 src/test/test-ns.c      |  10 ++-
 5 files changed, 367 insertions(+), 46 deletions(-)

(limited to 'src/core/namespace.c')

diff --git a/src/basic/fs-util.c b/src/basic/fs-util.c
index ce87257bc1..86d9ad7e36 100644
--- a/src/basic/fs-util.c
+++ b/src/basic/fs-util.c
@@ -597,3 +597,190 @@ int inotify_add_watch_fd(int fd, int what, uint32_t mask) {
 
         return r;
 }
+
+int chase_symlinks(const char *path, const char *_root, char **ret) {
+        _cleanup_free_ char *buffer = NULL, *done = NULL, *root = NULL;
+        _cleanup_close_ int fd = -1;
+        unsigned max_follow = 32; /* how many symlinks to follow before giving up and returning ELOOP */
+        char *todo;
+        int r;
+
+        assert(path);
+
+        /* This is a lot like canonicalize_file_name(), but takes an additional "root" parameter, that allows following
+         * symlinks relative to a root directory, instead of the root of the host.
+         *
+         * Note that "root" matters only if we encounter an absolute symlink, it's unused otherwise. Most importantly
+         * this means the path parameter passed in is not prefixed by it.
+         *
+         * Algorithmically this operates on two path buffers: "done" are the components of the path we already
+         * processed and resolved symlinks, "." and ".." of. "todo" are the components of the path we still need to
+         * process. On each iteration, we move one component from "todo" to "done", processing its special meaning
+         * each time. The "todo" path always starts with at least one slash, the "done" path always ends in no
+         * slash. We always keep an O_PATH fd to the component we are currently processing, thus keeping lookup races
+         * at a minimum.
+         *
+         * Returns 0 and a newly allocated path in *ret on success, a negative errno-style error otherwise. */
+
+        r = path_make_absolute_cwd(path, &buffer);
+        if (r < 0)
+                return r;
+
+        if (_root) {
+                r = path_make_absolute_cwd(_root, &root);
+                if (r < 0)
+                        return r;
+        }
+
+        fd = open("/", O_CLOEXEC|O_NOFOLLOW|O_PATH);
+        if (fd < 0)
+                return -errno;
+
+        todo = buffer;
+        for (;;) {
+                _cleanup_free_ char *first = NULL;
+                _cleanup_close_ int child = -1;
+                struct stat st;
+                size_t n, m;
+
+                /* Determine length of first component in the path */
+                n = strspn(todo, "/");                  /* The slashes */
+                m = n + strcspn(todo + n, "/");         /* The entire length of the component */
+
+                /* Extract the first component. */
+                first = strndup(todo, m);
+                if (!first)
+                        return -ENOMEM;
+
+                todo += m;
+
+                /* Just a single slash? Then we reached the end. */
+                if (isempty(first) || path_equal(first, "/"))
+                        break;
+
+                /* Just a dot? Then let's eat this up. */
+                if (path_equal(first, "/."))
+                        continue;
+
+                /* Two dots? Then chop off the last bit of what we already found out. */
+                if (path_equal(first, "/..")) {
+                        _cleanup_free_ char *parent = NULL;
+                        int fd_parent = -1;
+
+                        if (isempty(done) || path_equal(done, "/"))
+                                return -EINVAL;
+
+                        parent = dirname_malloc(done);
+                        if (!parent)
+                                return -ENOMEM;
+
+                        /* Don't allow this to leave the root dir */
+                        if (root &&
+                            path_startswith(done, root) &&
+                            !path_startswith(parent, root))
+                                return -EINVAL;
+
+                        free(done);
+                        done = parent;
+                        parent = NULL;
+
+                        fd_parent = openat(fd, "..", O_CLOEXEC|O_NOFOLLOW|O_PATH);
+                        if (fd_parent < 0)
+                                return -errno;
+
+                        safe_close(fd);
+                        fd = fd_parent;
+
+                        continue;
+                }
+
+                /* Otherwise let's see what this is. */
+                child = openat(fd, first + n, O_CLOEXEC|O_NOFOLLOW|O_PATH);
+                if (child < 0)
+                        return -errno;
+
+                if (fstat(child, &st) < 0)
+                        return -errno;
+
+                if (S_ISLNK(st.st_mode)) {
+                        _cleanup_free_ char *destination = NULL;
+
+                        /* This is a symlink, in this case read the destination. But let's make sure we don't follow
+                         * symlinks without bounds. */
+                        if (--max_follow <= 0)
+                                return -ELOOP;
+
+                        r = readlinkat_malloc(fd, first + n, &destination);
+                        if (r < 0)
+                                return r;
+                        if (isempty(destination))
+                                return -EINVAL;
+
+                        if (path_is_absolute(destination)) {
+
+                                /* An absolute destination. Start the loop from the beginning, but use the root
+                                 * directory as base. */
+
+                                safe_close(fd);
+                                fd = open(root ?: "/", O_CLOEXEC|O_NOFOLLOW|O_PATH);
+                                if (fd < 0)
+                                        return -errno;
+
+                                free(buffer);
+                                buffer = destination;
+                                destination = NULL;
+
+                                todo = buffer;
+
+                                /* Note that we do not revalidate the root, we take it as is. Clear "done" right after
+                                 * freeing it, so that a failing strdup() cannot leave it dangling (double free). */
+                                free(done);
+                                done = NULL;
+                                if (!isempty(root)) {
+                                        done = strdup(root);
+                                        if (!done)
+                                                return -ENOMEM;
+                                }
+
+                        } else {
+                                char *joined;
+
+                                /* A relative destination. If so, this is what we'll prefix what's left to do with what
+                                 * we just read, and start the loop again, but remain in the current directory. */
+
+                                joined = strjoin("/", destination, todo, NULL);
+                                if (!joined)
+                                        return -ENOMEM;
+
+                                free(buffer);
+                                todo = buffer = joined;
+                        }
+
+                        continue;
+                }
+
+                /* If this is not a symlink, then let's just add the name we read to what we already verified. */
+                if (!done) {
+                        done = first;
+                        first = NULL;
+                } else if (!strextend(&done, first, NULL))
+                        return -ENOMEM;
+
+                /* And iterate again, but go one directory further down. */
+                safe_close(fd);
+                fd = child;
+                child = -1;
+        }
+
+        if (!done) {
+                /* Special case, turn the empty string into "/", to indicate the root directory. */
+                done = strdup("/");
+                if (!done)
+                        return -ENOMEM;
+        }
+
+        *ret = done;
+        done = NULL;
+
+        return 0;
+}
diff --git a/src/basic/fs-util.h b/src/basic/fs-util.h
index 2c3b9a1c74..31df47cf1e 100644
--- a/src/basic/fs-util.h
+++ b/src/basic/fs-util.h
@@ -77,3 +77,5 @@ union inotify_event_buffer {
 };
 
 int inotify_add_watch_fd(int fd, int what, uint32_t mask);
+
+int chase_symlinks(const char *path, const char *_root, char **ret);
diff --git a/src/core/namespace.c b/src/core/namespace.c
index 356d3c8121..d3ab2e8e3e 100644
--- a/src/core/namespace.c
+++ b/src/core/namespace.c
@@ -29,6 +29,7 @@
 #include "alloc-util.h"
 #include "dev-setup.h"
 #include "fd-util.h"
+#include "fs-util.h"
 #include "loopback-setup.h"
 #include "missing.h"
 #include "mkdir.h"
@@ -57,9 +58,9 @@ typedef enum MountMode {
 } MountMode;
 
 typedef struct BindMount {
-        const char *path;
+        const char *path; /* stack memory, doesn't need to be freed explicitly */
+        char *chased; /* malloc()ed memory, needs to be freed */
         MountMode mode;
-        bool done;
         bool ignore;
 } BindMount;
 
@@ -71,7 +72,6 @@ static int append_mounts(BindMount **p, char **strv, MountMode mode) {
         STRV_FOREACH(i, strv) {
 
                 (*p)->ignore = false;
-                (*p)->done = false;
 
                 if ((mode == INACCESSIBLE || mode == READONLY || mode == READWRITE) && (*i)[0] == '-') {
                         (*p)->ignore = true;
@@ -360,11 +360,8 @@ static int apply_mount(
                  * inaccessible path. */
                 (void) umount_recursive(m->path, 0);
 
-                if (lstat(m->path, &target) < 0) {
-                        if (m->ignore && errno == ENOENT)
-                                return 0;
+                if (lstat(m->path, &target) < 0)
                         return log_debug_errno(errno, "Failed to lstat() %s to determine what to mount over it: %m", m->path);
-                }
 
                 what = mode_to_inaccessible_node(target.st_mode);
                 if (!what) {
@@ -378,11 +375,8 @@ static int apply_mount(
         case READWRITE:
 
                 r = path_is_mount_point(m->path, 0);
-                if (r < 0) {
-                        if (m->ignore && errno == ENOENT)
-                                return 0;
+                if (r < 0)
                         return log_debug_errno(r, "Failed to determine whether %s is already a mount point: %m", m->path);
-                }
                 if (r > 0) /* Nothing to do here, it is already a mount. We just later toggle the MS_RDONLY bit for the mount point if needed. */
                         return 0;
 
@@ -407,12 +401,8 @@ static int apply_mount(
 
         assert(what);
 
-        if (mount(what, m->path, NULL, MS_BIND|MS_REC, NULL) < 0) {
-                if (m->ignore && errno == ENOENT)
-                        return 0;
-
+        if (mount(what, m->path, NULL, MS_BIND|MS_REC, NULL) < 0)
                 return log_debug_errno(errno, "Failed to mount %s to %s: %m", what, m->path);
-        }
 
         log_debug("Successfully mounted %s to %s", what, m->path);
         return 0;
@@ -435,12 +425,43 @@ static int make_read_only(BindMount *m, char **blacklist) {
          * already stays this way. This improves compatibility with container managers, where we won't attempt to undo
          * read-only mounts already applied. */
 
-        if (m->ignore && r == -ENOENT)
-                return 0;
-
         return r;
 }
 
+static int chase_all_symlinks(const char *root_directory, BindMount *m, unsigned *n) {
+        BindMount *f, *t;
+        int r;
+
+        assert(m);
+        assert(n);
+
+        /* Since mount() will always follow symlinks and we need to take the different root directory into account we
+         * chase the symlinks on our own first. This call will do so for all entries and remove all entries where we
+         * can't resolve the path, and which have been marked for such removal. */
+
+        for (f = m, t = m; f < m+*n; f++) {
+                r = chase_symlinks(f->path, root_directory, &f->chased);
+                if (r == -ENOENT && f->ignore) /* Doesn't exist? Then remove it! */
+                        continue;
+                if (r < 0)
+                        return log_debug_errno(r, "Failed to chase symlinks for %s: %m", f->path);
+
+                if (path_equal(f->path, f->chased))
+                        f->chased = mfree(f->chased);
+                else {
+                        log_debug("Chased %s → %s", f->path, f->chased);
+                        f->path = f->chased;
+                }
+                /* Compact; clear the moved-from slot so the caller's cleanup cannot free ->chased twice. */
+                *t = *f;
+                if (t++ != f)
+                        f->chased = NULL;
+        }
+
+        *n = t - m;
+        return 0;
+}
+
 int setup_namespace(
                 const char* root_directory,
                 char** read_write_paths,
@@ -456,6 +477,7 @@ int setup_namespace(
                 unsigned long mount_flags) {
 
         BindMount *m, *mounts = NULL;
+        bool make_slave = false;
         unsigned n;
         int r = 0;
 
@@ -475,6 +497,9 @@ int setup_namespace(
                  ((protect_system != PROTECT_SYSTEM_NO ? 3 : 0) +
                   (protect_system == PROTECT_SYSTEM_FULL ? 1 : 0)));
 
+        if (root_directory || n > 0)
+                make_slave = true;
+
         if (n > 0) {
                 m = mounts = (BindMount *) alloca0(n * sizeof(BindMount));
                 r = append_mounts(&m, read_write_paths, READWRITE);
@@ -596,6 +621,13 @@ int setup_namespace(
 
                 assert(mounts + n == m);
 
+                /* Resolve symlinks manually first, as mount() will always follow them relative to the host's
+                 * root. Moreover we want to suppress duplicates based on the resolved paths. This of course is a bit
+                 * racy. */
+                r = chase_all_symlinks(root_directory, mounts, &n);
+                if (r < 0)
+                        goto finish;
+
                 qsort(mounts, n, sizeof(BindMount), mount_path_compare);
 
                 drop_duplicates(mounts, &n);
@@ -603,20 +635,26 @@ int setup_namespace(
                 drop_nop(mounts, &n);
         }
 
-        if (unshare(CLONE_NEWNS) < 0)
-                return -errno;
+        if (unshare(CLONE_NEWNS) < 0) {
+                r = -errno;
+                goto finish;
+        }
 
-        if (n > 0 || root_directory) {
+        if (make_slave) {
                 /* Remount / as SLAVE so that nothing now mounted in the namespace
                    shows up in the parent */
-                if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0)
-                        return -errno;
+                if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
+                        r = -errno;
+                        goto finish;
+                }
         }
 
         if (root_directory) {
                 /* Turn directory into bind mount */
-                if (mount(root_directory, root_directory, NULL, MS_BIND|MS_REC, NULL) < 0)
-                        return -errno;
+                if (mount(root_directory, root_directory, NULL, MS_BIND|MS_REC, NULL) < 0) {
+                        r = -errno;
+                        goto finish;
+                }
         }
 
         if (n > 0) {
@@ -627,7 +665,7 @@ int setup_namespace(
                 for (m = mounts; m < mounts + n; ++m) {
                         r = apply_mount(m, tmp_dir, var_tmp_dir);
                         if (r < 0)
-                                goto fail;
+                                goto finish;
                 }
 
                 /* Create a blacklist we can pass to bind_mount_recursive() */
@@ -640,34 +678,30 @@ int setup_namespace(
                 for (m = mounts; m < mounts + n; ++m) {
                         r = make_read_only(m, blacklist);
                         if (r < 0)
-                                goto fail;
+                                goto finish;
                 }
         }
 
         if (root_directory) {
                 /* MS_MOVE does not work on MS_SHARED so the remount MS_SHARED will be done later */
                 r = mount_move_root(root_directory);
-                if (r < 0) /* at this point, we cannot rollback */
-                        return r;
+                if (r < 0)
+                        goto finish;
         }
 
         /* Remount / as the desired mode. Not that this will not
          * reestablish propagation from our side to the host, since
          * what's disconnected is disconnected. */
-        if (mount(NULL, "/", NULL, mount_flags | MS_REC, NULL) < 0)
-                return -errno; /* at this point, we cannot rollback */
+        if (mount(NULL, "/", NULL, mount_flags | MS_REC, NULL) < 0) {
+                r = -errno;
+                goto finish;
+        }
 
-        return 0;
+        r = 0;
 
-fail:
-        if (n > 0) {
-                for (m = mounts; m < mounts + n; ++m) {
-                        if (!m->done)
-                                continue;
-
-                        (void) umount2(m->path, MNT_DETACH);
-                }
-        }
+finish:
+        for (m = mounts; m < mounts + n; m++)
+                free(m->chased);
 
         return r;
 }
diff --git a/src/test/test-fs-util.c b/src/test/test-fs-util.c
index b35a2ea2c8..53a3cdc663 100644
--- a/src/test/test-fs-util.c
+++ b/src/test/test-fs-util.c
@@ -20,16 +20,109 @@
 #include <unistd.h>
 
 #include "alloc-util.h"
-#include "fileio.h"
 #include "fd-util.h"
+#include "fileio.h"
 #include "fs-util.h"
 #include "macro.h"
 #include "mkdir.h"
+#include "path-util.h"
 #include "rm-rf.h"
 #include "string-util.h"
 #include "strv.h"
 #include "util.h"
 
+static void test_chase_symlinks(void) { /* chase_symlinks() vs. a scratch symlink tree, with and without root */
+        _cleanup_free_ char *result = NULL;
+        char temp[] = "/tmp/test-chase.XXXXXX";
+        const char *top, *p, *q;
+        int r;
+
+        assert_se(mkdtemp(temp)); /* scratch directory; everything created below lives in here */
+
+        top = strjoina(temp, "/top");
+        assert_se(mkdir(top, 0700) >= 0);
+
+        p = strjoina(top, "/dot");
+        assert_se(symlink(".", p) >= 0);
+
+        p = strjoina(top, "/dotdot");
+        assert_se(symlink("..", p) >= 0);
+
+        p = strjoina(top, "/dotdota");
+        assert_se(symlink("../a", p) >= 0);
+
+        p = strjoina(temp, "/a");
+        assert_se(symlink("b", p) >= 0);
+
+        p = strjoina(temp, "/b");
+        assert_se(symlink("/usr", p) >= 0);
+
+        p = strjoina(temp, "/start");
+        assert_se(symlink("top/dot/dotdota", p) >= 0); /* start -> top/dot/dotdota -> ../a -> b -> /usr */
+
+        r = chase_symlinks(p, NULL, &result); /* no root: the absolute link target is taken as is */
+        assert_se(r >= 0);
+        assert_se(path_equal(result, "/usr"));
+
+        result = mfree(result);
+        r = chase_symlinks(p, temp, &result); /* root set: "/usr" is looked up below temp, which has no "usr" yet */
+        assert_se(r == -ENOENT);
+
+        q = strjoina(temp, "/usr");
+        assert_se(mkdir(q, 0700) >= 0);
+
+        r = chase_symlinks(p, temp, &result); /* now that temp/usr exists the same lookup is expected to succeed */
+        assert_se(r >= 0);
+        assert_se(path_equal(result, q));
+
+        p = strjoina(temp, "/slash");
+        assert_se(symlink("/", p) >= 0);
+
+        result = mfree(result);
+        r = chase_symlinks(p, NULL, &result);
+        assert_se(r >= 0);
+        assert_se(path_equal(result, "/"));
+
+        result = mfree(result);
+        r = chase_symlinks(p, temp, &result); /* with a root, a link to "/" must resolve to that root */
+        assert_se(r >= 0);
+        assert_se(path_equal(result, temp));
+
+        p = strjoina(temp, "/slashslash");
+        assert_se(symlink("///usr///", p) >= 0); /* redundant slashes in the link target */
+
+        result = mfree(result);
+        r = chase_symlinks(p, NULL, &result);
+        assert_se(r >= 0);
+        assert_se(path_equal(result, "/usr"));
+
+        result = mfree(result);
+        r = chase_symlinks(p, temp, &result);
+        assert_se(r >= 0);
+        assert_se(path_equal(result, q));
+
+        result = mfree(result);
+        r = chase_symlinks("/etc/./.././", NULL, &result); /* "." and ".." components in the input path */
+        assert_se(r >= 0);
+        assert_se(path_equal(result, "/"));
+
+        result = mfree(result);
+        r = chase_symlinks("/etc/./.././", "/etc", &result);
+        assert_se(r == -EINVAL); /* ".." must not step out of the root directory */
+
+        result = mfree(result);
+        r = chase_symlinks("/etc/machine-id/foo", NULL, &result);
+        assert_se(r == -ENOTDIR); /* NOTE(review): assumes the host has a non-directory /etc/machine-id */
+
+        result = mfree(result);
+        p = strjoina(temp, "/recursive-symlink");
+        assert_se(symlink("recursive-symlink", p) >= 0); /* a link pointing at itself */
+        r = chase_symlinks(p, NULL, &result);
+        assert_se(r == -ELOOP);
+
+        assert_se(rm_rf(temp, REMOVE_ROOT|REMOVE_PHYSICAL) >= 0); /* remove the scratch tree again */
+}
+
 static void test_unlink_noerrno(void) {
         char name[] = "/tmp/test-close_nointr.XXXXXX";
         int fd;
@@ -144,6 +237,7 @@ int main(int argc, char *argv[]) {
         test_readlink_and_make_absolute();
         test_get_files_in_directory();
         test_var_tmp();
+        test_chase_symlinks();
 
         return 0;
 }
diff --git a/src/test/test-ns.c b/src/test/test-ns.c
index 03a24620af..c4d4da6d05 100644
--- a/src/test/test-ns.c
+++ b/src/test/test-ns.c
@@ -26,14 +26,18 @@
 int main(int argc, char *argv[]) {
         const char * const writable[] = {
                 "/home",
-                "/home/lennart/projects/foobar", /* this should be masked automatically */
+                "-/home/lennart/projects/foobar", /* this should be masked automatically */
                 NULL
         };
 
         const char * const readonly[] = {
-                "/",
-                "/usr",
+                /* "/", */
+                /* "/usr", */
                 "/boot",
+                "/lib",
+                "/usr/lib",
+                "-/lib64",
+                "-/usr/lib64",
                 NULL
         };
 
-- 
cgit v1.2.3-54-g00ecf


From 8f1ad200f010dc2106f7e3ff5879f0330ee36996 Mon Sep 17 00:00:00 2001
From: Lennart Poettering <lennart@poettering.net>
Date: Fri, 26 Aug 2016 11:27:38 +0200
Subject: namespace: don't make the root directory of a namespace a mount if it
 already is one

Let's not stack mounts needlessly.
---
 src/core/namespace.c | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

(limited to 'src/core/namespace.c')

diff --git a/src/core/namespace.c b/src/core/namespace.c
index d3ab2e8e3e..a7451ffbdc 100644
--- a/src/core/namespace.c
+++ b/src/core/namespace.c
@@ -650,10 +650,15 @@ int setup_namespace(
         }
 
         if (root_directory) {
-                /* Turn directory into bind mount */
-                if (mount(root_directory, root_directory, NULL, MS_BIND|MS_REC, NULL) < 0) {
-                        r = -errno;
+                /* Turn directory into bind mount, if it isn't one yet */
+                r = path_is_mount_point(root_directory, AT_SYMLINK_FOLLOW);
+                if (r < 0)
                         goto finish;
+                if (r == 0) {
+                        if (mount(root_directory, root_directory, NULL, MS_BIND|MS_REC, NULL) < 0) {
+                                r = -errno;
+                                goto finish;
+                        }
                 }
         }
 
-- 
cgit v1.2.3-54-g00ecf


From cd2902c9546eabfffcf5d6de4d0bd4dfe6a4d427 Mon Sep 17 00:00:00 2001
From: Lennart Poettering <lennart@poettering.net>
Date: Fri, 26 Aug 2016 17:25:40 +0200
Subject: namespace: drop all mounts outside of the new root directory

There's no point in mounting these if they are outside of the root directory
we'll move to.
---
 src/core/namespace.c | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)

(limited to 'src/core/namespace.c')

diff --git a/src/core/namespace.c b/src/core/namespace.c
index a7451ffbdc..c9b2154985 100644
--- a/src/core/namespace.c
+++ b/src/core/namespace.c
@@ -199,6 +199,31 @@ static void drop_nop(BindMount *m, unsigned *n) {
         *n = t - m;
 }
 
+static void drop_outside_root(const char *root_directory, BindMount *m, unsigned *n) {
+        BindMount *f, *t;
+
+        assert(m);
+        assert(n);
+
+        if (!root_directory)
+                return;
+
+        /* Drops all mounts that are outside of the root directory. */
+
+        for (f = m, t = m; f < m+*n; f++) {
+
+                if (!path_startswith(f->path, root_directory)) {
+                        log_debug("%s is outside of root directory.", f->path);
+                        continue;
+                }
+
+                *t = *f;
+                t++;
+        }
+
+        *n = t - m;
+}
+
 static int mount_dev(BindMount *m) {
         static const char devnodes[] =
                 "/dev/null\0"
@@ -631,6 +656,7 @@ int setup_namespace(
                 qsort(mounts, n, sizeof(BindMount), mount_path_compare);
 
                 drop_duplicates(mounts, &n);
+                drop_outside_root(root_directory, mounts, &n);
                 drop_inaccessible(mounts, &n);
                 drop_nop(mounts, &n);
         }
-- 
cgit v1.2.3-54-g00ecf


From 9c94d52e0919e4d7999e49b9ba2654a9e2ca4543 Mon Sep 17 00:00:00 2001
From: Djalal Harouni <tixxdz@opendz.org>
Date: Sun, 25 Sep 2016 11:03:21 +0200
Subject: core:namespace: minor improvements to append_mounts()

---
 src/core/namespace.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'src/core/namespace.c')

diff --git a/src/core/namespace.c b/src/core/namespace.c
index c9b2154985..8de774e6f6 100644
--- a/src/core/namespace.c
+++ b/src/core/namespace.c
@@ -70,12 +70,11 @@ static int append_mounts(BindMount **p, char **strv, MountMode mode) {
         assert(p);
 
         STRV_FOREACH(i, strv) {
+                bool ignore = false;
 
-                (*p)->ignore = false;
-
-                if ((mode == INACCESSIBLE || mode == READONLY || mode == READWRITE) && (*i)[0] == '-') {
-                        (*p)->ignore = true;
+                if (IN_SET(mode, INACCESSIBLE, READONLY, READWRITE) && startswith(*i, "-")) {
                         (*i)++;
+                        ignore = true;
                 }
 
                 if (!path_is_absolute(*i))
@@ -83,6 +82,7 @@ static int append_mounts(BindMount **p, char **strv, MountMode mode) {
 
                 (*p)->path = *i;
                 (*p)->mode = mode;
+                (*p)->ignore = ignore;
                 (*p)++;
         }
 
-- 
cgit v1.2.3-54-g00ecf


From 11a30cec2a9b6168b024c06720ad238dd1390794 Mon Sep 17 00:00:00 2001
From: Djalal Harouni <tixxdz@opendz.org>
Date: Sun, 25 Sep 2016 11:16:44 +0200
Subject: core:namespace: put paths protected by ProtectKernelTunables= in
 their own table

Instead of having all these paths everywhere, put the ones that are
protected by ProtectKernelTunables= into their own table. This way it
is easy to add paths and track which ones are protected.
---
 src/core/namespace.c | 54 ++++++++++++++++++++++++++++++++++------------------
 1 file changed, 35 insertions(+), 19 deletions(-)

(limited to 'src/core/namespace.c')

diff --git a/src/core/namespace.c b/src/core/namespace.c
index 8de774e6f6..13f6aeba51 100644
--- a/src/core/namespace.c
+++ b/src/core/namespace.c
@@ -61,9 +61,23 @@ typedef struct BindMount {
         const char *path; /* stack memory, doesn't need to be freed explicitly */
         char *chased; /* malloc()ed memory, needs to be freed */
         MountMode mode;
-        bool ignore;
+        bool ignore; /* Ignore if path does not exist */
 } BindMount;
 
+typedef struct TargetMount {
+        const char *path;
+        MountMode mode;
+        bool ignore; /* Ignore if path does not exist */
+} TargetMount;
+
+/* ProtectKernelTunables= option and the related filesystem APIs */
+static const TargetMount protect_kernel_tunables_table[] = {
+        { "/proc/sys",                  READONLY,       false },
+        { "/proc/sysrq-trigger",        READONLY,       true  },
+        { "/sys",                       READONLY,       false },
+        { "/sys/fs/cgroup",             READWRITE,      false }, /* READONLY is set by ProtectControlGroups= option */
+};
+
 static int append_mounts(BindMount **p, char **strv, MountMode mode) {
         char **i;
 
@@ -89,6 +103,20 @@ static int append_mounts(BindMount **p, char **strv, MountMode mode) {
         return 0;
 }
 
+static void append_protect_kernel_tunables(BindMount **p, const char *root_directory) {
+        unsigned int i;
+
+        assert(p);
+
+        for (i = 0; i < ELEMENTSOF(protect_kernel_tunables_table); i++) {
+                const TargetMount *t = &protect_kernel_tunables_table[i];
+                (*p)->path = prefix_roota(root_directory, t->path);
+                (*p)->mode = t->mode;
+                (*p)->ignore = t->ignore;
+                (*p)++;
+        }
+}
+
 static int mount_path_compare(const void *a, const void *b) {
         const BindMount *p = a, *q = b;
         int d;
@@ -514,8 +542,8 @@ int setup_namespace(
                 strv_length(read_only_paths) +
                 strv_length(inaccessible_paths) +
                 private_dev +
-                (protect_sysctl ? 3 : 0) +
-                (protect_cgroups != protect_sysctl) +
+                (protect_sysctl ? ELEMENTSOF(protect_kernel_tunables_table) : 0) +
+                (protect_cgroups ? 1 : 0) +
                 (protect_home != PROTECT_HOME_NO || protect_system == PROTECT_SYSTEM_STRICT ? 3 : 0) +
                 (protect_system == PROTECT_SYSTEM_STRICT ?
                  (2 + !private_dev + !protect_sysctl) :
@@ -557,24 +585,12 @@ int setup_namespace(
                         m++;
                 }
 
-                if (protect_sysctl) {
-                        m->path = prefix_roota(root_directory, "/proc/sys");
-                        m->mode = READONLY;
-                        m++;
-
-                        m->path = prefix_roota(root_directory, "/proc/sysrq-trigger");
-                        m->mode = READONLY;
-                        m->ignore = true; /* Not always compiled into the kernel */
-                        m++;
+                if (protect_sysctl)
+                        append_protect_kernel_tunables(&m, root_directory);
 
-                        m->path = prefix_roota(root_directory, "/sys");
-                        m->mode = READONLY;
-                        m++;
-                }
-
-                if (protect_cgroups != protect_sysctl) {
+                if (protect_cgroups) {
                         m->path = prefix_roota(root_directory, "/sys/fs/cgroup");
-                        m->mode = protect_cgroups ? READONLY : READWRITE;
+                        m->mode = READONLY;
                         m++;
                 }
 
-- 
cgit v1.2.3-54-g00ecf


From 2652c6c10394623b2c3e2ed5d4616c85918d140c Mon Sep 17 00:00:00 2001
From: Djalal Harouni <tixxdz@opendz.org>
Date: Sun, 25 Sep 2016 11:25:00 +0200
Subject: core:namespace: simplify mount calculation

Move the mount calculation out into its own function. The logic is already
smart enough to drop nop and duplicate mounts later on, so this change only
improves code readability.
---
 src/core/namespace.c | 47 ++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 36 insertions(+), 11 deletions(-)
---
 src/core/namespace.c | 46 ++++++++++++++++++++++++++++++++++------------
 1 file changed, 34 insertions(+), 12 deletions(-)

(limited to 'src/core/namespace.c')

diff --git a/src/core/namespace.c b/src/core/namespace.c
index 13f6aeba51..8aa8b83c88 100644
--- a/src/core/namespace.c
+++ b/src/core/namespace.c
@@ -515,6 +515,32 @@ static int chase_all_symlinks(const char *root_directory, BindMount *m, unsigned
         return 0;
 }
 
+static unsigned namespace_calculate_mounts(
+                char** read_write_paths,
+                char** read_only_paths,
+                char** inaccessible_paths,
+                const char* tmp_dir,
+                const char* var_tmp_dir,
+                bool private_dev,
+                bool protect_sysctl,
+                bool protect_cgroups,
+                ProtectHome protect_home,
+                ProtectSystem protect_system) {
+
+        return !!tmp_dir + !!var_tmp_dir +
+                strv_length(read_write_paths) +
+                strv_length(read_only_paths) +
+                strv_length(inaccessible_paths) +
+                private_dev +
+                (protect_sysctl ? ELEMENTSOF(protect_kernel_tunables_table) : 0) +
+                (protect_cgroups ? 1 : 0) +
+                (protect_home != PROTECT_HOME_NO || protect_system == PROTECT_SYSTEM_STRICT ? 3 : 0) +
+                (protect_system == PROTECT_SYSTEM_STRICT ?
+                 (2 + !private_dev + !protect_sysctl) :
+                 ((protect_system != PROTECT_SYSTEM_NO ? 3 : 0) +
+                  (protect_system == PROTECT_SYSTEM_FULL ? 1 : 0)));
+}
+
 int setup_namespace(
                 const char* root_directory,
                 char** read_write_paths,
@@ -537,19 +563,15 @@ int setup_namespace(
         if (mount_flags == 0)
                 mount_flags = MS_SHARED;
 
-        n = !!tmp_dir + !!var_tmp_dir +
-                strv_length(read_write_paths) +
-                strv_length(read_only_paths) +
-                strv_length(inaccessible_paths) +
-                private_dev +
-                (protect_sysctl ? ELEMENTSOF(protect_kernel_tunables_table) : 0) +
-                (protect_cgroups ? 1 : 0) +
-                (protect_home != PROTECT_HOME_NO || protect_system == PROTECT_SYSTEM_STRICT ? 3 : 0) +
-                (protect_system == PROTECT_SYSTEM_STRICT ?
-                 (2 + !private_dev + !protect_sysctl) :
-                 ((protect_system != PROTECT_SYSTEM_NO ? 3 : 0) +
-                  (protect_system == PROTECT_SYSTEM_FULL ? 1 : 0)));
+        n = namespace_calculate_mounts(read_write_paths,
+                                       read_only_paths,
+                                       inaccessible_paths,
+                                       tmp_dir, var_tmp_dir,
+                                       private_dev, protect_sysctl,
+                                       protect_cgroups, protect_home,
+                                       protect_system);
 
+        /* Set mount slave mode */
         if (root_directory || n > 0)
                 make_slave = true;
 
-- 
cgit v1.2.3-54-g00ecf


From 49accde7bd915944d99c947dca0cf26ae0f24165 Mon Sep 17 00:00:00 2001
From: Djalal Harouni <tixxdz@opendz.org>
Date: Sun, 25 Sep 2016 11:30:11 +0200
Subject: core:sandbox: add more /proc/* entries to ProtectKernelTunables=

Make ALSA entries, latency interface, mtrr, apm/acpi, suspend interface,
filesystems configuration and IRQ tuning readonly.

Most of these interfaces should nowadays be in /sys, but they are still
available through /proc, so just protect them. This patch does not touch
/proc/net/...
---
 man/systemd.exec.xml |  6 ++++--
 src/core/namespace.c | 11 +++++++++++
 2 files changed, 15 insertions(+), 2 deletions(-)

(limited to 'src/core/namespace.c')

diff --git a/man/systemd.exec.xml b/man/systemd.exec.xml
index a3a431c82b..f19e7f6ee9 100644
--- a/man/systemd.exec.xml
+++ b/man/systemd.exec.xml
@@ -1026,8 +1026,10 @@
         <term><varname>ProtectKernelTunables=</varname></term>
 
         <listitem><para>Takes a boolean argument. If true, kernel variables accessible through
-        <filename>/proc/sys</filename>, <filename>/sys</filename> and <filename>/proc/sysrq-trigger</filename> will be
-        made read-only to all processes of the unit. Usually, tunable kernel variables should only be written at
+        <filename>/proc/sys</filename>, <filename>/sys</filename>, <filename>/proc/sysrq-trigger</filename>,
+        <filename>/proc/latency_stats</filename>, <filename>/proc/acpi</filename>,
+        <filename>/proc/timer_stats</filename>, <filename>/proc/fs</filename> and <filename>/proc/irq</filename> will
+        be made read-only to all processes of the unit. Usually, tunable kernel variables should only be written at
         boot-time, with the <citerefentry><refentrytitle>sysctl.d</refentrytitle><manvolnum>5</manvolnum></citerefentry>
         mechanism. Almost no services need to write to these at runtime; it is hence recommended to turn this on for
         most services. For this setting the same restrictions regarding mount propagation and privileges apply as for
diff --git a/src/core/namespace.c b/src/core/namespace.c
index 8aa8b83c88..3234fab4bc 100644
--- a/src/core/namespace.c
+++ b/src/core/namespace.c
@@ -74,7 +74,18 @@ typedef struct TargetMount {
 static const TargetMount protect_kernel_tunables_table[] = {
         { "/proc/sys",                  READONLY,       false },
         { "/proc/sysrq-trigger",        READONLY,       true  },
+        { "/proc/latency_stats",        READONLY,       true  },
+        { "/proc/mtrr",                 READONLY,       true  },
+        { "/proc/apm",                  READONLY,       true  },
+        { "/proc/acpi",                 READONLY,       true  },
+        { "/proc/timer_stats",          READONLY,       true  },
+        { "/proc/asound",               READONLY,       true  },
+        { "/proc/bus",                  READONLY,       true  },
+        { "/proc/fs",                   READONLY,       true  },
+        { "/proc/irq",                  READONLY,       true  },
         { "/sys",                       READONLY,       false },
+        { "/sys/kernel/debug",          READONLY,       true  },
+        { "/sys/kernel/tracing",        READONLY,       true  },
         { "/sys/fs/cgroup",             READWRITE,      false }, /* READONLY is set by ProtectControlGroups= option */
 };
 
-- 
cgit v1.2.3-54-g00ecf


From f471b2afa11c97e48a4b6756f7254f88cc436960 Mon Sep 17 00:00:00 2001
From: Djalal Harouni <tixxdz@opendz.org>
Date: Sun, 25 Sep 2016 12:21:25 +0200
Subject: core: simplify ProtectSystem= implementation

ProtectSystem= with all its different modes and other options like
PrivateDevices= + ProtectKernelTunables= + ProtectHome= are orthogonal;
however, it is currently a bit hard to see that from the implementation.
Simplify it by giving each mode its own table with all paths and
references to other Protect options.

With this change some entries are duplicated, but we do not care, since
duplicate mounts are first sorted by the most restrictive mode and then
cleaned up.
---
 src/core/namespace.c | 171 ++++++++++++++++++++++++++++++++++-----------------
 1 file changed, 113 insertions(+), 58 deletions(-)

(limited to 'src/core/namespace.c')

diff --git a/src/core/namespace.c b/src/core/namespace.c
index 3234fab4bc..985e343096 100644
--- a/src/core/namespace.c
+++ b/src/core/namespace.c
@@ -70,6 +70,14 @@ typedef struct TargetMount {
         bool ignore; /* Ignore if path does not exist */
 } TargetMount;
 
+/*
+ * The following Protect tables are to protect paths and mark some of them
+ * READONLY, in case a path is covered by an option from another table, then
+ * it is marked READWRITE in the current one, and the more restrictive mode is
+ * applied from that other table. This way all options can be combined in a
+ * safe and comprehensible way for users.
+ */
+
 /* ProtectKernelTunables= option and the related filesystem APIs */
 static const TargetMount protect_kernel_tunables_table[] = {
         { "/proc/sys",                  READONLY,       false },
@@ -89,6 +97,45 @@ static const TargetMount protect_kernel_tunables_table[] = {
         { "/sys/fs/cgroup",             READWRITE,      false }, /* READONLY is set by ProtectControlGroups= option */
 };
 
+/* ProtectSystem=yes table */
+static const TargetMount protect_system_yes_table[] = {
+        { "/usr",       READONLY,       false },
+        { "/boot",      READONLY,       true  },
+        { "/efi",       READONLY,       true  },
+};
+
+/* ProtectSystem=full includes ProtectSystem=yes */
+static const TargetMount protect_system_full_table[] = {
+        { "/usr",       READONLY,       false },
+        { "/boot",      READONLY,       true  },
+        { "/efi",       READONLY,       true  },
+        { "/etc",       READONLY,       false },
+};
+
+/*
+ * ProtectSystem=strict table. In this strict mode, we mount everything
+ * read-only, except for /proc, /dev, /sys which are the kernel API VFS,
+ * which are left writable, but PrivateDevices= + ProtectKernelTunables=
+ * protect those, and these options should be fully orthogonal.
+ * (And of course /home and friends are also left writable, as ProtectHome=
+ * shall manage those, orthogonally).
+ */
+static const TargetMount protect_system_strict_table[] = {
+        { "/",          READONLY,       false },
+        { "/proc",      READWRITE,      false },      /* ProtectKernelTunables= */
+        { "/sys",       READWRITE,      false },      /* ProtectKernelTunables= */
+        { "/dev",       READWRITE,      false },      /* PrivateDevices= */
+        { "/home",      READWRITE,      true  },      /* ProtectHome= */
+        { "/run/user",  READWRITE,      true  },      /* ProtectHome= */
+        { "/root",      READWRITE,      true  },      /* ProtectHome= */
+};
+
+static void set_bind_mount(BindMount **p, const char *path, MountMode mode, bool ignore) {
+        (*p)->path = path;
+        (*p)->mode = mode;
+        (*p)->ignore = ignore;
+}
+
 static int append_mounts(BindMount **p, char **strv, MountMode mode) {
         char **i;
 
@@ -105,27 +152,71 @@ static int append_mounts(BindMount **p, char **strv, MountMode mode) {
                 if (!path_is_absolute(*i))
                         return -EINVAL;
 
-                (*p)->path = *i;
-                (*p)->mode = mode;
-                (*p)->ignore = ignore;
+                set_bind_mount(p, *i, mode, ignore);
                 (*p)++;
         }
 
         return 0;
 }
 
-static void append_protect_kernel_tunables(BindMount **p, const char *root_directory) {
-        unsigned int i;
+static int append_target_mounts(BindMount **p, const char *root_directory, const TargetMount *mounts, const size_t size) {
+        unsigned i;
 
         assert(p);
+        assert(mounts);
 
-        for (i = 0; i < ELEMENTSOF(protect_kernel_tunables_table); i++) {
-                const TargetMount *t = &protect_kernel_tunables_table[i];
-                (*p)->path = prefix_roota(root_directory, t->path);
-                (*p)->mode = t->mode;
-                (*p)->ignore = t->ignore;
+        for (i = 0; i < size; i++) {
+                /*
+                 * Here we assume that the ignore field is set during
+                 * declaration we do not support "-" at the beginning.
+                 */
+                const TargetMount *m = &mounts[i];
+                const char *path = prefix_roota(root_directory, m->path);
+
+                if (!path_is_absolute(path))
+                        return -EINVAL;
+
+                set_bind_mount(p, path, m->mode, m->ignore);
                 (*p)++;
         }
+
+        return 0;
+}
+
+static int append_protect_kernel_tunables(BindMount **p, const char *root_directory) {
+        assert(p);
+
+        return append_target_mounts(p, root_directory, protect_kernel_tunables_table,
+                                    ELEMENTSOF(protect_kernel_tunables_table));
+}
+
+static int append_protect_system(BindMount **p, const char *root_directory, ProtectSystem protect_system) {
+        int r = 0;
+
+        assert(p);
+
+        if (protect_system == PROTECT_SYSTEM_NO)
+                return 0;
+
+        switch (protect_system) {
+        case PROTECT_SYSTEM_STRICT:
+                r = append_target_mounts(p, root_directory, protect_system_strict_table,
+                                         ELEMENTSOF(protect_system_strict_table));
+                break;
+        case PROTECT_SYSTEM_YES:
+                r = append_target_mounts(p, root_directory, protect_system_yes_table,
+                                         ELEMENTSOF(protect_system_yes_table));
+                break;
+        case PROTECT_SYSTEM_FULL:
+                r = append_target_mounts(p, root_directory, protect_system_full_table,
+                                         ELEMENTSOF(protect_system_full_table));
+                break;
+        default:
+                r = -EINVAL;
+                break;
+        }
+
+        return r;
 }
 
 static int mount_path_compare(const void *a, const void *b) {
@@ -538,6 +629,14 @@ static unsigned namespace_calculate_mounts(
                 ProtectHome protect_home,
                 ProtectSystem protect_system) {
 
+        unsigned protect_system_cnt =
+                (protect_system == PROTECT_SYSTEM_STRICT ?
+                 ELEMENTSOF(protect_system_strict_table) :
+                 ((protect_system == PROTECT_SYSTEM_FULL) ?
+                  ELEMENTSOF(protect_system_full_table) :
+                  ((protect_system == PROTECT_SYSTEM_YES) ?
+                   ELEMENTSOF(protect_system_yes_table) : 0)));
+
         return !!tmp_dir + !!var_tmp_dir +
                 strv_length(read_write_paths) +
                 strv_length(read_only_paths) +
@@ -546,10 +645,7 @@ static unsigned namespace_calculate_mounts(
                 (protect_sysctl ? ELEMENTSOF(protect_kernel_tunables_table) : 0) +
                 (protect_cgroups ? 1 : 0) +
                 (protect_home != PROTECT_HOME_NO || protect_system == PROTECT_SYSTEM_STRICT ? 3 : 0) +
-                (protect_system == PROTECT_SYSTEM_STRICT ?
-                 (2 + !private_dev + !protect_sysctl) :
-                 ((protect_system != PROTECT_SYSTEM_NO ? 3 : 0) +
-                  (protect_system == PROTECT_SYSTEM_FULL ? 1 : 0)));
+                protect_system_cnt;
 }
 
 int setup_namespace(
@@ -648,50 +744,9 @@ int setup_namespace(
                                 return r;
                 }
 
-                if (protect_system == PROTECT_SYSTEM_STRICT) {
-                        /* In strict mode, we mount everything read-only, except for /proc, /dev, /sys which are the
-                         * kernel API VFS, which are left writable, but PrivateDevices= + ProtectKernelTunables=
-                         * protect those, and these options should be fully orthogonal. (And of course /home and
-                         * friends are also left writable, as ProtectHome= shall manage those, orthogonally, see
-                         * above). */
-
-                        m->path = prefix_roota(root_directory, "/");
-                        m->mode = READONLY;
-                        m++;
-
-                        m->path = prefix_roota(root_directory, "/proc");
-                        m->mode = READWRITE;
-                        m++;
-
-                        if (!private_dev) {
-                                m->path = prefix_roota(root_directory, "/dev");
-                                m->mode = READWRITE;
-                                m++;
-                        }
-                        if (!protect_sysctl) {
-                                m->path = prefix_roota(root_directory, "/sys");
-                                m->mode = READWRITE;
-                                m++;
-                        }
-
-                } else if (protect_system != PROTECT_SYSTEM_NO) {
-                        const char *usr_dir, *boot_dir, *efi_dir, *etc_dir;
-
-                        /* In any other mode we simply mark the relevant three directories ready-only. */
-
-                        usr_dir = prefix_roota(root_directory, "/usr");
-                        boot_dir = prefix_roota(root_directory, "/boot");
-                        boot_dir = strjoina("-", boot_dir);
-                        efi_dir = prefix_roota(root_directory, "/efi");
-                        efi_dir = strjoina("-", efi_dir);
-                        etc_dir = prefix_roota(root_directory, "/etc");
-
-                        r = append_mounts(&m, protect_system == PROTECT_SYSTEM_FULL
-                                          ? STRV_MAKE(usr_dir, boot_dir, efi_dir, etc_dir)
-                                          : STRV_MAKE(usr_dir, boot_dir, efi_dir), READONLY);
-                        if (r < 0)
-                                return r;
-                }
+                r = append_protect_system(&m, root_directory, protect_system);
+                if (r < 0)
+                        return r;
 
                 assert(mounts + n == m);
 
-- 
cgit v1.2.3-54-g00ecf


From b6c432ca7ed930c7e9078ac2266ae439aa242632 Mon Sep 17 00:00:00 2001
From: Djalal Harouni <tixxdz@opendz.org>
Date: Sun, 25 Sep 2016 12:41:16 +0200
Subject: core:namespace: simplify ProtectHome= implementation

As with the previous patch, simplify ProtectHome= and don't care about
duplicates: they will be sorted by most restrictive mode and cleaned up.
---
 src/core/namespace.c | 75 +++++++++++++++++++++++++++++++++++++---------------
 1 file changed, 53 insertions(+), 22 deletions(-)

(limited to 'src/core/namespace.c')

diff --git a/src/core/namespace.c b/src/core/namespace.c
index 985e343096..43a2f4ba6e 100644
--- a/src/core/namespace.c
+++ b/src/core/namespace.c
@@ -97,6 +97,23 @@ static const TargetMount protect_kernel_tunables_table[] = {
         { "/sys/fs/cgroup",             READWRITE,      false }, /* READONLY is set by ProtectControlGroups= option */
 };
 
+/*
+ * ProtectHome=read-only table, protect $HOME and $XDG_RUNTIME_DIR and rest of
+ * system should be protected by ProtectSystem=
+ */
+static const TargetMount protect_home_read_only_table[] = {
+        { "/home",      READONLY,       true },
+        { "/run/user",  READONLY,       true },
+        { "/root",      READONLY,       true },
+};
+
+/* ProtectHome=yes table */
+static const TargetMount protect_home_yes_table[] = {
+        { "/home",      INACCESSIBLE,   true },
+        { "/run/user",  INACCESSIBLE,   true },
+        { "/root",      INACCESSIBLE,   true },
+};
+
 /* ProtectSystem=yes table */
 static const TargetMount protect_system_yes_table[] = {
         { "/usr",       READONLY,       false },
@@ -190,6 +207,31 @@ static int append_protect_kernel_tunables(BindMount **p, const char *root_direct
                                     ELEMENTSOF(protect_kernel_tunables_table));
 }
 
+static int append_protect_home(BindMount **p, const char *root_directory, ProtectHome protect_home) {
+        int r = 0;
+
+        assert(p);
+
+        if (protect_home == PROTECT_HOME_NO)
+                return 0;
+
+        switch (protect_home) {
+        case PROTECT_HOME_READ_ONLY:
+                r = append_target_mounts(p, root_directory, protect_home_read_only_table,
+                                         ELEMENTSOF(protect_home_read_only_table));
+                break;
+        case PROTECT_HOME_YES:
+                r = append_target_mounts(p, root_directory, protect_home_yes_table,
+                                         ELEMENTSOF(protect_home_yes_table));
+                break;
+        default:
+                r = -EINVAL;
+                break;
+        }
+
+        return r;
+}
+
 static int append_protect_system(BindMount **p, const char *root_directory, ProtectSystem protect_system) {
         int r = 0;
 
@@ -629,6 +671,7 @@ static unsigned namespace_calculate_mounts(
                 ProtectHome protect_home,
                 ProtectSystem protect_system) {
 
+        unsigned protect_home_cnt;
         unsigned protect_system_cnt =
                 (protect_system == PROTECT_SYSTEM_STRICT ?
                  ELEMENTSOF(protect_system_strict_table) :
@@ -637,6 +680,12 @@ static unsigned namespace_calculate_mounts(
                   ((protect_system == PROTECT_SYSTEM_YES) ?
                    ELEMENTSOF(protect_system_yes_table) : 0)));
 
+        protect_home_cnt =
+                (protect_home == PROTECT_HOME_YES ?
+                 ELEMENTSOF(protect_home_yes_table) :
+                 ((protect_home == PROTECT_HOME_READ_ONLY) ?
+                  ELEMENTSOF(protect_home_read_only_table) : 0));
+
         return !!tmp_dir + !!var_tmp_dir +
                 strv_length(read_write_paths) +
                 strv_length(read_only_paths) +
@@ -644,8 +693,7 @@ static unsigned namespace_calculate_mounts(
                 private_dev +
                 (protect_sysctl ? ELEMENTSOF(protect_kernel_tunables_table) : 0) +
                 (protect_cgroups ? 1 : 0) +
-                (protect_home != PROTECT_HOME_NO || protect_system == PROTECT_SYSTEM_STRICT ? 3 : 0) +
-                protect_system_cnt;
+                protect_home_cnt + protect_system_cnt;
 }
 
 int setup_namespace(
@@ -723,26 +771,9 @@ int setup_namespace(
                         m++;
                 }
 
-                if (protect_home != PROTECT_HOME_NO || protect_system == PROTECT_SYSTEM_STRICT) {
-                        const char *home_dir, *run_user_dir, *root_dir;
-
-                        /* If protection of $HOME and $XDG_RUNTIME_DIR is requested, then go for it. If we are in
-                         * strict system protection mode, then also add entries for these directories, but mark them
-                         * writable. This is because we want ProtectHome= and ProtectSystem= to be fully orthogonal. */
-
-                        home_dir = prefix_roota(root_directory, "/home");
-                        home_dir = strjoina("-", home_dir);
-                        run_user_dir = prefix_roota(root_directory, "/run/user");
-                        run_user_dir = strjoina("-", run_user_dir);
-                        root_dir = prefix_roota(root_directory, "/root");
-                        root_dir = strjoina("-", root_dir);
-
-                        r = append_mounts(&m, STRV_MAKE(home_dir, run_user_dir, root_dir),
-                                protect_home == PROTECT_HOME_READ_ONLY ? READONLY :
-                                protect_home == PROTECT_HOME_YES ? INACCESSIBLE : READWRITE);
-                        if (r < 0)
-                                return r;
-                }
+                r = append_protect_home(&m, root_directory, protect_home);
+                if (r < 0)
+                        return r;
 
                 r = append_protect_system(&m, root_directory, protect_system);
                 if (r < 0)
-- 
cgit v1.2.3-54-g00ecf