9 files changed, 1192 insertions, 850 deletions
diff --git a/src/systemd-nspawn/nspawn-cgroup.c b/src/systemd-nspawn/nspawn-cgroup.c
index c43d747dc3..9e793d85f1 100644
--- a/src/systemd-nspawn/nspawn-cgroup.c
+++ b/src/systemd-nspawn/nspawn-cgroup.c
@@ -20,33 +20,24 @@
 #include <sys/mount.h>
 
 #include "systemd-basic/alloc-util.h"
-#include "systemd-basic/cgroup-util.h"
 #include "systemd-basic/fd-util.h"
 #include "systemd-basic/fileio.h"
 #include "systemd-basic/mkdir.h"
+#include "systemd-basic/mount-util.h"
+#include "systemd-basic/rm-rf.h"
 #include "systemd-basic/string-util.h"
 #include "systemd-basic/strv.h"
 #include "systemd-basic/util.h"
 
 #include "nspawn-cgroup.h"
 
-int chown_cgroup(pid_t pid, uid_t uid_shift) {
-        _cleanup_free_ char *path = NULL, *fs = NULL;
+static int chown_cgroup_path(const char *path, uid_t uid_shift) {
         _cleanup_close_ int fd = -1;
         const char *fn;
-        int r;
-
-        r = cg_pid_get_path(NULL, pid, &path);
-        if (r < 0)
-                return log_error_errno(r, "Failed to get container cgroup path: %m");
 
-        r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, path, NULL, &fs);
-        if (r < 0)
-                return log_error_errno(r, "Failed to get file system path for container cgroup: %m");
-
-        fd = open(fs, O_RDONLY|O_CLOEXEC|O_DIRECTORY);
+        fd = open(path, O_RDONLY|O_CLOEXEC|O_DIRECTORY);
         if (fd < 0)
-                return log_error_errno(errno, "Failed to open %s: %m", fs);
+                return -errno;
 
         FOREACH_STRING(fn,
                        ".",
@@ -64,18 +55,37 @@ int chown_cgroup(pid_t pid, uid_t uid_shift) {
         return 0;
 }
 
-int sync_cgroup(pid_t pid, bool unified_requested) {
+int chown_cgroup(pid_t pid, uid_t uid_shift) {
+        _cleanup_free_ char *cg = NULL, *cg_fs = NULL;
+        int k;
+
+        /* Resolve the cgroup of 'pid', translate it into a cgroupfs path, then pass that path
+         * to chown_cgroup_path(). Returns 0 on success, the logged error of the failing step otherwise. */
+        k = cg_pid_get_path(NULL, pid, &cg);
+        if (k < 0)
+                return log_error_errno(k, "Failed to get container cgroup path: %m");
+        k = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, cg, NULL, &cg_fs);
+        if (k < 0)
+                return log_error_errno(k, "Failed to get file system path for container cgroup: %m");
+        k = chown_cgroup_path(cg_fs, uid_shift);
+        if (k < 0)
+                return log_error_errno(k, "Failed to chown() cgroup %s: %m", cg_fs);
+
+        return 0;
+}
+
+int sync_cgroup(pid_t pid, CGroupUnified unified_requested, uid_t uid_shift) {
         _cleanup_free_ char *cgroup = NULL;
         char tree[] = "/tmp/unifiedXXXXXX", pid_string[DECIMAL_STR_MAX(pid) + 1];
         bool undo_mount = false;
         const char *fn;
         int unified, r;
 
-        unified = cg_unified();
+        unified = cg_unified(SYSTEMD_CGROUP_CONTROLLER);
         if (unified < 0)
                 return log_error_errno(unified, "Failed to determine whether the unified hierarchy is used: %m");
 
-        if ((unified > 0) == unified_requested)
+        if ((unified > 0) == (unified_requested >= CGROUP_UNIFIED_SYSTEMD))
                 return 0;
 
         /* When the host uses the legacy cgroup setup, but the
@@ -92,33 +102,45 @@ int sync_cgroup(pid_t pid, bool unified_requested) {
                 return log_error_errno(errno, "Failed to generate temporary mount point for unified hierarchy: %m");
 
         if (unified)
-                r = mount("cgroup", tree, "cgroup", MS_NOSUID|MS_NOEXEC|MS_NODEV, "none,name=systemd,xattr");
+                r = mount_verbose(LOG_ERR, "cgroup", tree, "cgroup",
+                                  MS_NOSUID|MS_NOEXEC|MS_NODEV, "none,name=systemd,xattr");
         else
-                r = mount("cgroup", tree, "cgroup2", MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL);
-        if (r < 0) {
-                r = log_error_errno(errno, "Failed to mount unified hierarchy: %m");
+                r = mount_verbose(LOG_ERR, "cgroup", tree, "cgroup2",
+                                  MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL);
+        if (r < 0)
                 goto finish;
-        }
 
         undo_mount = true;
 
+        /* If nspawn died abruptly earlier, the cgroup hierarchy created below
+         * its unit was not cleaned up. Remove such leftovers first, see
+         * https://github.com/systemd/systemd/pull/4223#issuecomment-252519810 */
+        fn = strjoina(tree, cgroup);
+        (void) rm_rf(fn, REMOVE_ROOT|REMOVE_ONLY_DIRECTORIES);
+
         fn = strjoina(tree, cgroup, "/cgroup.procs");
         (void) mkdir_parents(fn, 0755);
 
         sprintf(pid_string, PID_FMT, pid);
         r = write_string_file(fn, pid_string, 0);
-        if (r < 0)
+        if (r < 0) {
                 log_error_errno(r, "Failed to move process: %m");
+                goto finish;
+        }
 
+        fn = strjoina(tree, cgroup);
+        r = chown_cgroup_path(fn, uid_shift);
+        if (r < 0)
+                log_error_errno(r, "Failed to chown() cgroup %s: %m", fn);
 finish:
         if (undo_mount)
-                (void) umount(tree);
+                (void) umount_verbose(tree);
 
         (void) rmdir(tree);
         return r;
 }
 
-int create_subcgroup(pid_t pid, bool unified_requested) {
+int create_subcgroup(pid_t pid, CGroupUnified unified_requested) {
         _cleanup_free_ char *cgroup = NULL;
         const char *child;
         int unified, r;
@@ -130,10 +152,10 @@ int create_subcgroup(pid_t pid, bool unified_requested) {
          * did not create a scope unit for the container move us and
          * the container into two separate subcgroups. */
 
-        if (!unified_requested)
+        if (unified_requested == CGROUP_UNIFIED_NONE)
                 return 0;
 
-        unified = cg_unified();
+        unified = cg_unified(SYSTEMD_CGROUP_CONTROLLER);
         if (unified < 0)
                 return log_error_errno(unified, "Failed to determine whether the unified hierarchy is used: %m");
         if (unified == 0)
diff --git a/src/systemd-nspawn/nspawn-cgroup.h b/src/systemd-nspawn/nspawn-cgroup.h
index 1ff35a299a..6c0ddfc7de 100644
--- a/src/systemd-nspawn/nspawn-cgroup.h
+++ b/src/systemd-nspawn/nspawn-cgroup.h
@@ -22,6 +22,8 @@
 #include <stdbool.h>
 #include <sys/types.h>
 
+#include "systemd-basic/cgroup-util.h"
+
 int chown_cgroup(pid_t pid, uid_t uid_shift);
-int sync_cgroup(pid_t pid, bool unified_requested);
-int create_subcgroup(pid_t pid, bool unified_requested);
+int sync_cgroup(pid_t pid, CGroupUnified unified_requested, uid_t uid_shift);
+int create_subcgroup(pid_t pid, CGroupUnified unified_requested);
diff --git a/src/systemd-nspawn/nspawn-mount.c b/src/systemd-nspawn/nspawn-mount.c
index 10a28ff29b..aac04efd4b 100644
--- a/src/systemd-nspawn/nspawn-mount.c
+++ b/src/systemd-nspawn/nspawn-mount.c
@@ -22,8 +22,9 @@
 #include <linux/magic.h>
 
 #include "systemd-basic/alloc-util.h"
-#include "systemd-basic/cgroup-util.h"
 #include "systemd-basic/escape.h"
+#include "systemd-basic/fd-util.h"
+#include "systemd-basic/fileio.h"
 #include "systemd-basic/fs-util.h"
 #include "systemd-basic/label.h"
 #include "systemd-basic/mkdir.h"
@@ -183,13 +184,15 @@ int tmpfs_mount_parse(CustomMount **l, unsigned *n, const char *s) {
 
 static int tmpfs_patch_options(
                 const char *options,
-                bool userns, uid_t uid_shift, uid_t uid_range,
+                bool userns,
+                uid_t uid_shift, uid_t uid_range,
+                bool patch_ids,
                 const char *selinux_apifs_context,
                 char **ret) {
 
         char *buf = NULL;
 
-        if (userns && uid_shift != 0) {
+        if ((userns && uid_shift != 0) || patch_ids) {
                 assert(uid_shift != UID_INVALID);
 
                 if (options)
@@ -220,7 +223,13 @@ static int tmpfs_patch_options(
         }
 #endif
 
+        if (!buf && options) {
+                buf = strdup(options);
+                if (!buf)
+                        return -ENOMEM;
+        }
         *ret = buf;
+
         return !!buf;
 }
 
@@ -243,8 +252,10 @@ int mount_sysfs(const char *dest) {
 
         (void) mkdir(full, 0755);
 
-        if (mount("sysfs", full, "sysfs", MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL) < 0)
-                return log_error_errno(errno, "Failed to mount sysfs to %s: %m", full);
+        r = mount_verbose(LOG_ERR, "sysfs", full, "sysfs",
+                          MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL);
+        if (r < 0)
+                return r;
 
         FOREACH_STRING(x, "block", "bus", "class", "dev", "devices", "kernel") {
                 _cleanup_free_ char *from = NULL, *to = NULL;
@@ -259,28 +270,91 @@ int mount_sysfs(const char *dest) {
 
                 (void) mkdir(to, 0755);
 
-                if (mount(from, to, NULL, MS_BIND, NULL) < 0)
-                        return log_error_errno(errno, "Failed to mount /sys/%s into place: %m", x);
+                r = mount_verbose(LOG_ERR, from, to, NULL, MS_BIND, NULL);
+                if (r < 0)
+                        return r;
 
-                if (mount(NULL, to, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, NULL) < 0)
-                        return log_error_errno(errno, "Failed to mount /sys/%s read-only: %m", x);
+                r = mount_verbose(LOG_ERR, NULL, to, NULL,
+                                  MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, NULL);
+                if (r < 0)
+                        return r;
         }
 
-        if (umount(full) < 0)
-                return log_error_errno(errno, "Failed to unmount %s: %m", full);
+        r = umount_verbose(full);
+        if (r < 0)
+                return r;
 
         if (rmdir(full) < 0)
                 return log_error_errno(errno, "Failed to remove %s: %m", full);
 
         x = prefix_roota(top, "/fs/kdbus");
-        (void) mkdir(x, 0755);
+        (void) mkdir_p(x, 0755);
+
+        /* Create the mount point for cgroups now: once /sys has been remounted
+         * read-only below, we could no longer create it.
+         */
+        if (cg_ns_supported()) {
+                x = prefix_roota(top, "/fs/cgroup");
+                (void) mkdir_p(x, 0755);
+        }
+
+        return mount_verbose(LOG_ERR, NULL, top, NULL,
+                             MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, NULL);
+}
+
+static int mkdir_userns(const char *path, mode_t mode, bool in_userns, uid_t uid_shift) {
+        int r;
+
+        assert(path);
 
-        if (mount(NULL, top, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, NULL) < 0)
-                return log_error_errno(errno, "Failed to make %s read-only: %m", top);
+        r = mkdir(path, mode);
+        if (r < 0 && errno != EEXIST)
+                return -errno;
+
+        if (!in_userns) {
+                r = lchown(path, uid_shift, uid_shift);
+                if (r < 0)
+                        return -errno;
+        }
 
         return 0;
 }
 
+static int mkdir_userns_p(const char *prefix, const char *path, mode_t mode, bool in_userns, uid_t uid_shift) {
+        const char *cursor, *end;
+
+        /* Like mkdir -p, but each directory is created through mkdir_userns(). Returns -ENOTDIR if
+         * 'prefix' is set and 'path' does not start with it; parents covered by 'prefix' are skipped. */
+        assert(path);
+
+        if (prefix && !path_startswith(path, prefix))
+                return -ENOTDIR;
+
+        /* Walk over all parent directories; the final component is handled after the loop. */
+        for (cursor = path + strspn(path, "/");;) {
+                char parent[strlen(path) + 1];
+                size_t len;
+                int k;
+
+                end = cursor + strcspn(cursor, "/");
+                cursor = end + strspn(end, "/");
+                if (!*cursor) /* Nothing follows, i.e. this was the last component. */
+                        break;
+
+                len = end - path;
+                memcpy(parent, path, len);
+                parent[len] = 0;
+
+                if (prefix && path_startswith(prefix, parent))
+                        continue;
+
+                k = mkdir_userns(parent, mode, in_userns, uid_shift);
+                if (k < 0)
+                        return k;
+        }
+        return mkdir_userns(path, mode, in_userns, uid_shift);
+}
+
 int mount_all(const char *dest,
               bool use_userns, bool in_userns,
               bool use_netns,
@@ -299,19 +373,21 @@ int mount_all(const char *dest,
         } MountPoint;
 
         static const MountPoint mount_table[] = {
-                { "proc",            "/proc",           "proc",  NULL,        MS_NOSUID|MS_NOEXEC|MS_NODEV,                              true,  true,  false },
-                { "/proc/sys",       "/proc/sys",       NULL,    NULL,        MS_BIND,                                                   true,  true,  false },   /* Bind mount first ...*/
-                { "/proc/sys/net",   "/proc/sys/net",   NULL,    NULL,        MS_BIND,                                                   true,  true,  true  },   /* (except for this) */
-                { NULL,              "/proc/sys",       NULL,    NULL,        MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, true,  true,  false },   /* ... then, make it r/o */
-                { "tmpfs",           "/sys",            "tmpfs", "mode=755",  MS_NOSUID|MS_NOEXEC|MS_NODEV,                              true,  false, true  },
-                { "sysfs",           "/sys",            "sysfs", NULL,        MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV,                    true,  false, false },
-                { "tmpfs",           "/dev",            "tmpfs", "mode=755",  MS_NOSUID|MS_STRICTATIME,                                  true,  false, false },
-                { "tmpfs",           "/dev/shm",        "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME,                         true,  false, false },
-                { "tmpfs",           "/run",            "tmpfs", "mode=755",  MS_NOSUID|MS_NODEV|MS_STRICTATIME,                         true,  false, false },
-                { "tmpfs",           "/tmp",            "tmpfs", "mode=1777", MS_STRICTATIME,                                            true,  false, false },
+                { "proc",                "/proc",               "proc",  NULL,        MS_NOSUID|MS_NOEXEC|MS_NODEV,                              true,  true,  false },
+                { "/proc/sys",           "/proc/sys",           NULL,    NULL,        MS_BIND,                                                   true,  true,  false },   /* Bind mount first ...*/
+                { "/proc/sys/net",       "/proc/sys/net",       NULL,    NULL,        MS_BIND,                                                   true,  true,  true  },   /* (except for this) */
+                { NULL,                  "/proc/sys",           NULL,    NULL,        MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, true,  true,  false },   /* ... then, make it r/o */
+                { "/proc/sysrq-trigger", "/proc/sysrq-trigger", NULL,    NULL,        MS_BIND,                                                   false, true,  false },   /* Bind mount first ...*/
+                { NULL,                  "/proc/sysrq-trigger", NULL,    NULL,        MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, false, true,  false },   /* ... then, make it r/o */
+                { "tmpfs",               "/sys",                "tmpfs", "mode=755",  MS_NOSUID|MS_NOEXEC|MS_NODEV,                              true,  false, true  },
+                { "sysfs",               "/sys",                "sysfs", NULL,        MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV,                    true,  false, false },
+                { "tmpfs",               "/dev",                "tmpfs", "mode=755",  MS_NOSUID|MS_STRICTATIME,                                  true,  false, false },
+                { "tmpfs",               "/dev/shm",            "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME,                         true,  false, false },
+                { "tmpfs",               "/run",                "tmpfs", "mode=755",  MS_NOSUID|MS_NODEV|MS_STRICTATIME,                         true,  false, false },
+                { "tmpfs",               "/tmp",                "tmpfs", "mode=1777", MS_STRICTATIME,                                            true,  false,  false },
 #ifdef HAVE_SELINUX
-                { "/sys/fs/selinux", "/sys/fs/selinux", NULL,     NULL,       MS_BIND,                                                   false, false, false },  /* Bind mount first */
-                { NULL,              "/sys/fs/selinux", NULL,     NULL,       MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, false, false, false },  /* Then, make it r/o */
+                { "/sys/fs/selinux",     "/sys/fs/selinux",     NULL,     NULL,       MS_BIND,                                                   false, false, false },  /* Bind mount first */
+                { NULL,                  "/sys/fs/selinux",     NULL,     NULL,       MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, false, false, false },  /* Then, make it r/o */
 #endif
         };
 
@@ -340,8 +416,8 @@ int mount_all(const char *dest,
                 if (mount_table[k].what && r > 0)
                         continue;
 
-                r = mkdir_p(where, 0755);
-                if (r < 0) {
+                r = mkdir_userns_p(dest, where, 0755, in_userns, uid_shift);
+                if (r < 0 && r != -EEXIST) {
                         if (mount_table[k].fatal)
                                 return log_error_errno(r, "Failed to create directory %s: %m", where);
 
@@ -351,24 +427,24 @@ int mount_all(const char *dest,
 
                 o = mount_table[k].options;
                 if (streq_ptr(mount_table[k].type, "tmpfs")) {
-                        r = tmpfs_patch_options(o, use_userns, uid_shift, uid_range, selinux_apifs_context, &options);
+                        if (in_userns)
+                                r = tmpfs_patch_options(o, use_userns, 0, uid_range, true, selinux_apifs_context, &options);
+                        else
+                                r = tmpfs_patch_options(o, use_userns, uid_shift, uid_range, false, selinux_apifs_context, &options);
                         if (r < 0)
                                 return log_oom();
                         if (r > 0)
                                 o = options;
                 }
 
-                if (mount(mount_table[k].what,
-                          where,
-                          mount_table[k].type,
-                          mount_table[k].flags,
-                          o) < 0) {
-
-                        if (mount_table[k].fatal)
-                                return log_error_errno(errno, "mount(%s) failed: %m", where);
-
-                        log_warning_errno(errno, "mount(%s) failed, ignoring: %m", where);
-                }
+                r = mount_verbose(mount_table[k].fatal ? LOG_ERR : LOG_WARNING,
+                                  mount_table[k].what,
+                                  where,
+                                  mount_table[k].type,
+                                  mount_table[k].flags,
+                                  o);
+                if (r < 0 && mount_table[k].fatal)
+                        return r;
         }
 
         return 0;
@@ -453,15 +529,15 @@ static int mount_bind(const char *dest, CustomMount *m) {
                 if (r < 0)
                         return log_error_errno(r, "Failed to create mount point %s: %m", where);
 
-        } else {
+        } else
                 return log_error_errno(errno, "Failed to stat %s: %m", where);
-        }
 
-        if (mount(m->source, where, NULL, mount_flags, mount_opts) < 0)
-                return log_error_errno(errno, "mount(%s) failed: %m", where);
+        r = mount_verbose(LOG_ERR, m->source, where, NULL, mount_flags, mount_opts);
+        if (r < 0)
+                return r;
 
         if (m->read_only) {
-                r = bind_remount_recursive(where, true);
+                r = bind_remount_recursive(where, true, NULL);
                 if (r < 0)
                         return log_error_errno(r, "Read-only bind mount failed: %m");
         }
@@ -488,15 +564,12 @@ static int mount_tmpfs(
         if (r < 0 && r != -EEXIST)
                 return log_error_errno(r, "Creating mount point for tmpfs %s failed: %m", where);
 
-        r = tmpfs_patch_options(m->options, userns, uid_shift, uid_range, selinux_apifs_context, &buf);
+        r = tmpfs_patch_options(m->options, userns, uid_shift, uid_range, false, selinux_apifs_context, &buf);
         if (r < 0)
                 return log_oom();
         options = r > 0 ? buf : m->options;
 
-        if (mount("tmpfs", where, "tmpfs", MS_NODEV|MS_STRICTATIME, options) < 0)
-                return log_error_errno(errno, "tmpfs mount to %s failed: %m", where);
-
-        return 0;
+        return mount_verbose(LOG_ERR, "tmpfs", where, "tmpfs", MS_NODEV|MS_STRICTATIME, options);
 }
 
 static char *joined_and_escaped_lower_dirs(char * const *lower) {
@@ -558,10 +631,7 @@ static int mount_overlay(const char *dest, CustomMount *m) {
                 options = strjoina("lowerdir=", lower, ",upperdir=", escaped_source, ",workdir=", escaped_work_dir);
         }
 
-        if (mount("overlay", where, "overlay", m->read_only ? MS_RDONLY : 0, options) < 0)
-                return log_error_errno(errno, "overlay mount to %s failed: %m", where);
-
-        return 0;
+        return mount_verbose(LOG_ERR, "overlay", where, "overlay", m->read_only ? MS_RDONLY : 0, options);
 }
 
 int mount_custom(
@@ -603,8 +673,52 @@ int mount_custom(
         return 0;
 }
 
-static int mount_legacy_cgroup_hierarchy(const char *dest, const char *controller, const char *hierarchy, bool read_only) {
-        char *to;
+/* Collect the controller lists found in /proc/self/cgroup into 'subsystems'. This function is
+ * called in a new cgroup namespace. Empty fields and "name=systemd" are left out.
+ */
+static int get_controllers(Set *subsystems) {
+        _cleanup_fclose_ FILE *cgroup_file = NULL;
+        char line[LINE_MAX];
+
+        assert(subsystems);
+
+        cgroup_file = fopen("/proc/self/cgroup", "re");
+        if (!cgroup_file)
+                return errno == ENOENT ? -ESRCH : -errno;
+
+        FOREACH_LINE(line, cgroup_file, return -errno) {
+                char *field, *field_end, *copy;
+                int k;
+
+                /* Lines look like "<id>:<controllers>:<path>"; isolate the middle field. */
+                field = strchr(line, ':');
+                if (!field)
+                        continue;
+
+                field_end = strchr(++field, ':');
+                if (!field_end)
+                        continue;
+                *field_end = 0;
+
+                /* Skip entries without any controller as well as the name=systemd hierarchy. */
+                if (STR_IN_SET(field, "", "name=systemd"))
+                        continue;
+
+                copy = strdup(field);
+                if (!copy)
+                        return -ENOMEM;
+
+                k = set_consume(subsystems, copy);
+                if (k < 0)
+                        return k;
+        }
+
+        return 0;
+}
+
+static int mount_legacy_cgroup_hierarchy(const char *dest, const char *controller, const char *hierarchy,
+                                         CGroupUnified unified_requested, bool read_only) {
+        const char *to, *fstype, *opts;
         int r;
 
         to = strjoina(strempty(dest), "/sys/fs/cgroup/", hierarchy);
@@ -619,23 +733,136 @@ static int mount_legacy_cgroup_hierarchy(const char *dest, const char *controlle
 
         /* The superblock mount options of the mount point need to be
          * identical to the hosts', and hence writable... */
-        if (mount("cgroup", to, "cgroup", MS_NOSUID|MS_NOEXEC|MS_NODEV, controller) < 0)
-                return log_error_errno(errno, "Failed to mount to %s: %m", to);
+        if (streq(controller, SYSTEMD_CGROUP_CONTROLLER)) {
+                if (unified_requested >= CGROUP_UNIFIED_SYSTEMD) {
+                        fstype = "cgroup2";
+                        opts = NULL;
+                } else {
+                        fstype = "cgroup";
+                        opts = "none,name=systemd,xattr";
+                }
+        } else {
+                fstype = "cgroup";
+                opts = controller;
+        }
 
-        /* ... hence let's only make the bind mount read-only, not the
-         * superblock. */
+        r = mount_verbose(LOG_ERR, "cgroup", to, fstype, MS_NOSUID|MS_NOEXEC|MS_NODEV, opts);
+        if (r < 0)
+                return r;
+
+        /* ... hence let's only make the bind mount read-only, not the superblock. */
         if (read_only) {
-                if (mount(NULL, to, NULL, MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL) < 0)
-                        return log_error_errno(errno, "Failed to remount %s read-only: %m", to);
+                r = mount_verbose(LOG_ERR, NULL, to, NULL,
+                                  MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL);
+                if (r < 0)
+                        return r;
         }
+
         return 1;
 }
 
-static int mount_legacy_cgroups(
+/* Set up the legacy cgroup tree below /sys/fs/cgroup; mount_cgroups() picks this variant when use_cgns is set. */
+static int mount_legacy_cgns_supported(
+                CGroupUnified unified_requested, bool userns, uid_t uid_shift,
+                uid_t uid_range, const char *selinux_apifs_context) {
+        _cleanup_set_free_free_ Set *controllers = NULL;
+        const char *cgroup_root = "/sys/fs/cgroup", *c;
+        int r;
+
+        (void) mkdir_p(cgroup_root, 0755);
+
+        /* Mount a tmpfs to /sys/fs/cgroup unless it already is a mount point. */
+        r = path_is_mount_point(cgroup_root, AT_SYMLINK_FOLLOW);
+        if (r < 0)
+                return log_error_errno(r, "Failed to determine if /sys/fs/cgroup is already mounted: %m");
+        if (r == 0) {
+                _cleanup_free_ char *options = NULL;
+
+                /* When cgroup namespaces are enabled and user namespaces are
+                 * used then the mount of the cgroupfs is done *inside* the new
+                 * user namespace. We're root in the new user namespace and the
+                 * kernel will happily translate our uid/gid to the correct
+                 * uid/gid as seen from e.g. /proc/1/mountinfo. So we simply
+                 * pass uid 0 and not uid_shift to tmpfs_patch_options().
+                 */
+                r = tmpfs_patch_options("mode=755", userns, 0, uid_range, true, selinux_apifs_context, &options);
+                if (r < 0)
+                        return log_oom();
+
+                r = mount_verbose(LOG_ERR, "tmpfs", cgroup_root, "tmpfs",
+                                  MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME, options);
+                if (r < 0)
+                        return r;
+        }
+
+        if (cg_all_unified() > 0)
+                goto skip_controllers;
+
+        controllers = set_new(&string_hash_ops);
+        if (!controllers)
+                return log_oom();
+
+        r = get_controllers(controllers);
+        if (r < 0)
+                return log_error_errno(r, "Failed to determine cgroup controllers: %m");
+
+        for (;;) {
+                _cleanup_free_ const char *controller = NULL;
+
+                controller = set_steal_first(controllers);
+                if (!controller)
+                        break;
+
+                r = mount_legacy_cgroup_hierarchy("", controller, controller, unified_requested, !userns);
+                if (r < 0)
+                        return r;
+
+                /* A co-mount is named after all of its controllers, separated by
+                 * commas. Give every individual controller a symlink pointing to
+                 * the co-mount; a single-controller name needs none.
+                 */
+                c = controller;
+                for (;;) {
+                        _cleanup_free_ char *target = NULL, *tok = NULL;
+
+                        r = extract_first_word(&c, &tok, ",", 0);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to extract co-mounted cgroup controller: %m");
+                        if (r == 0)
+                                break;
+
+                        target = prefix_root("/sys/fs/cgroup", tok);
+                        if (!target)
+                                return log_oom();
+
+                        if (streq(controller, tok))
+                                break;
+
+                        r = symlink_idempotent(controller, target);
+                        if (r == -EINVAL)
+                                return log_error_errno(r, "Invalid existing symlink for combined hierarchy: %m");
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to create symlink for combined hierarchy: %m");
+                }
+        }
+
+skip_controllers:
+        r = mount_legacy_cgroup_hierarchy("", SYSTEMD_CGROUP_CONTROLLER, "systemd", unified_requested, false);
+        if (r < 0)
+                return r;
+
+        if (!userns)
+                return mount_verbose(LOG_ERR, NULL, cgroup_root, NULL,
+                                     MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME|MS_RDONLY, "mode=755");
+
+        return 0;
+}
+
+/* Mount legacy cgroup hierarchy when cgroup namespaces are unsupported. */
+static int mount_legacy_cgns_unsupported(
                 const char *dest,
-                bool userns, uid_t uid_shift, uid_t uid_range,
+                CGroupUnified unified_requested, bool userns, uid_t uid_shift, uid_t uid_range,
                 const char *selinux_apifs_context) {
-
         _cleanup_set_free_free_ Set *controllers = NULL;
         const char *cgroup_root;
         int r;
@@ -651,15 +878,17 @@ static int mount_legacy_cgroups(
         if (r == 0) {
                 _cleanup_free_ char *options = NULL;
 
-                r = tmpfs_patch_options("mode=755", userns, uid_shift, uid_range, selinux_apifs_context, &options);
+                r = tmpfs_patch_options("mode=755", userns, uid_shift, uid_range, false, selinux_apifs_context, &options);
                 if (r < 0)
                         return log_oom();
 
-                if (mount("tmpfs", cgroup_root, "tmpfs", MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME, options) < 0)
-                        return log_error_errno(errno, "Failed to mount /sys/fs/cgroup: %m");
+                r = mount_verbose(LOG_ERR, "tmpfs", cgroup_root, "tmpfs",
+                                  MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME, options);
+                if (r < 0)
+                        return r;
         }
 
-        if (cg_unified() > 0)
+        if (cg_all_unified() > 0)
                 goto skip_controllers;
 
         controllers = set_new(&string_hash_ops);
@@ -685,7 +914,7 @@ static int mount_legacy_cgroups(
                 if (r == -EINVAL) {
                         /* Not a symbolic link, but directly a single cgroup hierarchy */
 
-                        r = mount_legacy_cgroup_hierarchy(dest, controller, controller, true);
+                        r = mount_legacy_cgroup_hierarchy(dest, controller, controller, unified_requested, true);
                         if (r < 0)
                                 return r;
 
@@ -705,29 +934,25 @@ static int mount_legacy_cgroups(
                                 continue;
                         }
 
-                        r = mount_legacy_cgroup_hierarchy(dest, combined, combined, true);
+                        r = mount_legacy_cgroup_hierarchy(dest, combined, combined, unified_requested, true);
                         if (r < 0)
                                 return r;
 
                         r = symlink_idempotent(combined, target);
-                        if (r == -EINVAL) {
-                                log_error("Invalid existing symlink for combined hierarchy");
-                                return r;
-                        }
+                        if (r == -EINVAL)
+                                return log_error_errno(r, "Invalid existing symlink for combined hierarchy: %m");
                         if (r < 0)
                                 return log_error_errno(r, "Failed to create symlink for combined hierarchy: %m");
                 }
         }
 
 skip_controllers:
-        r = mount_legacy_cgroup_hierarchy(dest, "none,name=systemd,xattr", "systemd", false);
+        r = mount_legacy_cgroup_hierarchy(dest, SYSTEMD_CGROUP_CONTROLLER, "systemd", unified_requested, false);
         if (r < 0)
                 return r;
 
-        if (mount(NULL, cgroup_root, NULL, MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME|MS_RDONLY, "mode=755") < 0)
-                return log_error_errno(errno, "Failed to remount %s read-only: %m", cgroup_root);
-
-        return 0;
+        return mount_verbose(LOG_ERR, NULL, cgroup_root, NULL,
+                             MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME|MS_RDONLY, "mode=755");
 }
 
 static int mount_unified_cgroups(const char *dest) {
@@ -754,27 +979,27 @@ static int mount_unified_cgroups(const char *dest) {
                 return -EINVAL;
         }
 
-        if (mount("cgroup", p, "cgroup2", MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL) < 0)
-                return log_error_errno(errno, "Failed to mount unified cgroup hierarchy to %s: %m", p);
-
-        return 0;
+        return mount_verbose(LOG_ERR, "cgroup", p, "cgroup2", MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL);
 }
 
 int mount_cgroups(
                 const char *dest,
-                bool unified_requested,
+                CGroupUnified unified_requested,
                 bool userns, uid_t uid_shift, uid_t uid_range,
-                const char *selinux_apifs_context) {
+                const char *selinux_apifs_context,
+                bool use_cgns) {
 
-        if (unified_requested)
+        if (unified_requested >= CGROUP_UNIFIED_ALL)
                 return mount_unified_cgroups(dest);
-        else
-                return mount_legacy_cgroups(dest, userns, uid_shift, uid_range, selinux_apifs_context);
+        else if (use_cgns)
+                return mount_legacy_cgns_supported(unified_requested, userns, uid_shift, uid_range, selinux_apifs_context);
+
+        return mount_legacy_cgns_unsupported(dest, unified_requested, userns, uid_shift, uid_range, selinux_apifs_context);
 }
 
 int mount_systemd_cgroup_writable(
                 const char *dest,
-                bool unified_requested) {
+                CGroupUnified unified_requested) {
 
         _cleanup_free_ char *own_cgroup_path = NULL;
         const char *systemd_root, *systemd_own;
@@ -790,7 +1015,7 @@ int mount_systemd_cgroup_writable(
         if (path_equal(own_cgroup_path, "/"))
                 return 0;
 
-        if (unified_requested) {
+        if (unified_requested >= CGROUP_UNIFIED_ALL) {
                 systemd_own = strjoina(dest, "/sys/fs/cgroup", own_cgroup_path);
                 systemd_root = prefix_roota(dest, "/sys/fs/cgroup");
         } else {
@@ -799,14 +1024,13 @@ int mount_systemd_cgroup_writable(
         }
 
         /* Make our own cgroup a (writable) bind mount */
-        if (mount(systemd_own, systemd_own,  NULL, MS_BIND, NULL) < 0)
-                return log_error_errno(errno, "Failed to turn %s into a bind mount: %m", own_cgroup_path);
+        r = mount_verbose(LOG_ERR, systemd_own, systemd_own,  NULL, MS_BIND, NULL);
+        if (r < 0)
+                return r;
 
         /* And then remount the systemd cgroup root read-only */
-        if (mount(NULL, systemd_root, NULL, MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL) < 0)
-                return log_error_errno(errno, "Failed to mount cgroup root read-only: %m");
-
-        return 0;
+        return mount_verbose(LOG_ERR, NULL, systemd_root, NULL,
+                             MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL);
 }
 
 int setup_volatile_state(
@@ -827,7 +1051,7 @@ int setup_volatile_state(
         /* --volatile=state means we simply overmount /var
            with a tmpfs, and the rest read-only. */
 
-        r = bind_remount_recursive(directory, true);
+        r = bind_remount_recursive(directory, true, NULL);
         if (r < 0)
                 return log_error_errno(r, "Failed to remount %s read-only: %m", directory);
 
@@ -837,16 +1061,13 @@ int setup_volatile_state(
                 return log_error_errno(errno, "Failed to create %s: %m", directory);
 
         options = "mode=755";
-        r = tmpfs_patch_options(options, userns, uid_shift, uid_range, selinux_apifs_context, &buf);
+        r = tmpfs_patch_options(options, userns, uid_shift, uid_range, false, selinux_apifs_context, &buf);
         if (r < 0)
                 return log_oom();
         if (r > 0)
                 options = buf;
 
-        if (mount("tmpfs", p, "tmpfs", MS_STRICTATIME, options) < 0)
-                return log_error_errno(errno, "Failed to mount tmpfs to /var: %m");
-
-        return 0;
+        return mount_verbose(LOG_ERR, "tmpfs", p, "tmpfs", MS_STRICTATIME, options);
 }
 
 int setup_volatile(
@@ -873,16 +1094,15 @@ int setup_volatile(
                 return log_error_errno(errno, "Failed to create temporary directory: %m");
 
         options = "mode=755";
-        r = tmpfs_patch_options(options, userns, uid_shift, uid_range, selinux_apifs_context, &buf);
+        r = tmpfs_patch_options(options, userns, uid_shift, uid_range, false, selinux_apifs_context, &buf);
         if (r < 0)
                 return log_oom();
         if (r > 0)
                 options = buf;
 
-        if (mount("tmpfs", template, "tmpfs", MS_STRICTATIME, options) < 0) {
-                r = log_error_errno(errno, "Failed to mount tmpfs for root directory: %m");
+        r = mount_verbose(LOG_ERR, "tmpfs", template, "tmpfs", MS_STRICTATIME, options);
+        if (r < 0)
                 goto fail;
-        }
 
         tmpfs_mounted = true;
 
@@ -895,23 +1115,21 @@ int setup_volatile(
                 goto fail;
         }
 
-        if (mount(f, t, NULL, MS_BIND|MS_REC, NULL) < 0) {
-                r = log_error_errno(errno, "Failed to create /usr bind mount: %m");
+        r = mount_verbose(LOG_ERR, f, t, NULL, MS_BIND|MS_REC, NULL);
+        if (r < 0)
                 goto fail;
-        }
 
         bind_mounted = true;
 
-        r = bind_remount_recursive(t, true);
+        r = bind_remount_recursive(t, true, NULL);
         if (r < 0) {
                 log_error_errno(r, "Failed to remount %s read-only: %m", t);
                 goto fail;
         }
 
-        if (mount(template, directory, NULL, MS_MOVE, NULL) < 0) {
-                r = log_error_errno(errno, "Failed to move root mount: %m");
+        r = mount_verbose(LOG_ERR, template, directory, NULL, MS_MOVE, NULL);
+        if (r < 0)
                 goto fail;
-        }
 
         (void) rmdir(template);
 
@@ -919,10 +1137,10 @@ int setup_volatile(
 
 fail:
         if (bind_mounted)
-                (void) umount(t);
+                (void) umount_verbose(t);
 
         if (tmpfs_mounted)
-                (void) umount(template);
+                (void) umount_verbose(template);
         (void) rmdir(template);
         return r;
 }
diff --git a/src/systemd-nspawn/nspawn-mount.h b/src/systemd-nspawn/nspawn-mount.h
index 0b3f3fbefa..8601dfdad3 100644
--- a/src/systemd-nspawn/nspawn-mount.h
+++ b/src/systemd-nspawn/nspawn-mount.h
@@ -22,6 +22,8 @@
 #include <stdbool.h>
 #include <sys/types.h>
 
+#include "systemd-basic/cgroup-util.h"
+
 typedef enum VolatileMode {
         VOLATILE_NO,
         VOLATILE_YES,
@@ -59,8 +61,8 @@ int custom_mount_compare(const void *a, const void *b);
 int mount_all(const char *dest, bool use_userns, bool in_userns, bool use_netns, uid_t uid_shift, uid_t uid_range, const char *selinux_apifs_context);
 int mount_sysfs(const char *dest);
 
-int mount_cgroups(const char *dest, bool unified_requested, bool userns, uid_t uid_shift, uid_t uid_range, const char *selinux_apifs_context);
-int mount_systemd_cgroup_writable(const char *dest, bool unified_requested);
+int mount_cgroups(const char *dest, CGroupUnified unified_requested, bool userns, uid_t uid_shift, uid_t uid_range, const char *selinux_apifs_context, bool use_cgns);
+int mount_systemd_cgroup_writable(const char *dest, CGroupUnified unified_requested);
 
 int mount_custom(const char *dest, CustomMount *mounts, unsigned n, bool userns, uid_t uid_shift, uid_t uid_range, const char *selinux_apifs_context);
 
diff --git a/src/systemd-nspawn/nspawn-register.c b/src/systemd-nspawn/nspawn-register.c
index 3889445ca5..1e45b8e58f 100644
--- a/src/systemd-nspawn/nspawn-register.c
+++ b/src/systemd-nspawn/nspawn-register.c
@@ -69,7 +69,6 @@ int register_machine(
                                 local_ifindex > 0 ? 1 : 0, local_ifindex);
         } else {
                 _cleanup_(sd_bus_message_unrefp) sd_bus_message *m = NULL;
-                char **i;
                 unsigned j;
 
                 r = sd_bus_message_new_method_call(
@@ -158,11 +157,9 @@ int register_machine(
                                 return bus_log_create_error(r);
                 }
 
-                STRV_FOREACH(i, properties) {
-                        r = bus_append_unit_property_assignment(m, *i);
-                        if (r < 0)
-                                return r;
-                }
+                r = bus_append_unit_property_assignment_many(m, properties);
+                if (r < 0)
+                        return r;
 
                 r = sd_bus_message_close_container(m);
                 if (r < 0)
diff --git a/src/systemd-nspawn/nspawn-seccomp.c b/src/systemd-nspawn/nspawn-seccomp.c
index 4a62f15bc7..e5cfe789a1 100644
--- a/src/systemd-nspawn/nspawn-seccomp.c
+++ b/src/systemd-nspawn/nspawn-seccomp.c
@@ -131,16 +131,15 @@ int setup_seccomp(uint64_t cap_list_retain) {
         scmp_filter_ctx seccomp;
         int r;
 
-        seccomp = seccomp_init(SCMP_ACT_ALLOW);
-        if (!seccomp)
-                return log_oom();
-
-        r = seccomp_add_secondary_archs(seccomp);
-        if (r < 0) {
-                log_error_errno(r, "Failed to add secondary archs to seccomp filter: %m");
-                goto finish;
+        if (!is_seccomp_available()) {
+                log_debug("SECCOMP features not detected in the kernel, disabling SECCOMP audit filter");
+                return 0;
         }
 
+        r = seccomp_init_conservative(&seccomp, SCMP_ACT_ALLOW);
+        if (r < 0)
+                return log_error_errno(r, "Failed to allocate seccomp object: %m");
+
         r = seccomp_add_default_syscall_filter(seccomp, cap_list_retain);
         if (r < 0)
                 goto finish;
@@ -167,18 +166,7 @@ int setup_seccomp(uint64_t cap_list_retain) {
                 goto finish;
         }
 
-        r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
-        if (r < 0) {
-                log_error_errno(r, "Failed to unset NO_NEW_PRIVS: %m");
-                goto finish;
-        }
-
         r = seccomp_load(seccomp);
-        if (r == -EINVAL) {
-                log_debug_errno(r, "Kernel is probably not configured with CONFIG_SECCOMP. Disabling seccomp audit filter: %m");
-                r = 0;
-                goto finish;
-        }
         if (r < 0) {
                 log_error_errno(r, "Failed to install seccomp audit filter: %m");
                 goto finish;
diff --git a/src/systemd-nspawn/nspawn-settings.c b/src/systemd-nspawn/nspawn-settings.c
index 0886451eb0..6c1614b276 100644
--- a/src/systemd-nspawn/nspawn-settings.c
+++ b/src/systemd-nspawn/nspawn-settings.c
@@ -102,9 +102,7 @@ Settings* settings_free(Settings *s) {
         expose_port_free_all(s->expose_ports);
 
         custom_mount_free_all(s->custom_mounts, s->n_custom_mounts);
-        free(s);
-
-        return NULL;
+        return mfree(s);
 }
 
 bool settings_private_network(Settings *s) {
diff --git a/src/systemd-nspawn/nspawn.c b/src/systemd-nspawn/nspawn.c
index f2cbae2ddb..9514152b5b 100644
--- a/src/systemd-nspawn/nspawn.c
+++ b/src/systemd-nspawn/nspawn.c
@@ -170,7 +170,6 @@ static CustomMount *arg_custom_mounts = NULL;
 static unsigned arg_n_custom_mounts = 0;
 static char **arg_setenv = NULL;
 static bool arg_quiet = false;
-static bool arg_share_system = false;
 static bool arg_register = true;
 static bool arg_keep_unit = false;
 static char **arg_network_interfaces = NULL;
@@ -189,12 +188,14 @@ static UserNamespaceMode arg_userns_mode = USER_NAMESPACE_NO;
 static uid_t arg_uid_shift = UID_INVALID, arg_uid_range = 0x10000U;
 static bool arg_userns_chown = false;
 static int arg_kill_signal = 0;
-static bool arg_unified_cgroup_hierarchy = false;
+static CGroupUnified arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_UNKNOWN;
 static SettingsMask arg_settings_mask = 0;
 static int arg_settings_trusted = -1;
 static char **arg_parameters = NULL;
 static const char *arg_container_service_name = "systemd-nspawn";
 static bool arg_notify_ready = false;
+static bool arg_use_cgns = true;
+static unsigned long arg_clone_ns_flags = CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS;
 
 static void help(void) {
         printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
@@ -216,10 +217,10 @@ static void help(void) {
                "     --uuid=UUID            Set a specific machine UUID for the container\n"
                "  -S --slice=SLICE          Place the container in the specified slice\n"
                "     --property=NAME=VALUE  Set scope unit property\n"
-               "  -U --private-users=pick   Run within user namespace, pick UID/GID range automatically\n"
+               "  -U --private-users=pick   Run within user namespace, autoselect UID/GID range\n"
                "     --private-users[=UIDBASE[:NUIDS]]\n"
-               "                            Run within user namespace, user configured UID/GID range\n"
-               "     --private-user-chown   Adjust OS tree file ownership for private UID/GID range\n"
+               "                            Similar, but with user configured UID/GID range\n"
+               "     --private-users-chown  Adjust OS tree ownership to private UID/GID range\n"
                "     --private-network      Disable network in container\n"
                "     --network-interface=INTERFACE\n"
                "                            Assign an existing network interface to the\n"
@@ -236,11 +237,10 @@ static void help(void) {
                "                            Add an additional virtual Ethernet link between\n"
                "                            host and container\n"
                "     --network-bridge=INTERFACE\n"
-               "                            Add a virtual Ethernet connection between host\n"
-               "                            and container and add it to an existing bridge on\n"
-               "                            the host\n"
-               "     --network-zone=NAME    Add a virtual Ethernet connection to the container,\n"
-               "                            and add it to an automatically managed bridge interface\n"
+               "                            Add a virtual Ethernet connection to the container\n"
+               "                            and attach it to an existing bridge on the host\n"
+               "     --network-zone=NAME    Similar, but attach the new interface to an\n"
+               "                            automatically managed bridge interface\n"
                "  -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
                "                            Expose a container IP port on the host\n"
                "  -Z --selinux-context=SECLABEL\n"
@@ -269,14 +269,12 @@ static void help(void) {
                "     --overlay-ro=PATH[:PATH...]:PATH\n"
                "                            Similar, but creates a read-only overlay mount\n"
                "  -E --setenv=NAME=VALUE    Pass an environment variable to PID 1\n"
-               "     --share-system         Share system namespaces with host\n"
                "     --register=BOOLEAN     Register container as machine\n"
                "     --keep-unit            Do not register a scope for the machine, reuse\n"
                "                            the service unit nspawn is running in\n"
                "     --volatile[=MODE]      Run the system in volatile mode\n"
                "     --settings=BOOLEAN     Load additional settings from .nspawn file\n"
-               "     --notify-ready=BOOLEAN Receive notifications from the container's init process,\n"
-               "                            accepted values: yes and no\n"
+               "     --notify-ready=BOOLEAN Receive notifications from the child init process\n"
                , program_invocation_short_name);
 }
 
@@ -319,9 +317,9 @@ static int custom_mounts_prepare(void) {
         return 0;
 }
 
-static int detect_unified_cgroup_hierarchy(void) {
+static int detect_unified_cgroup_hierarchy(const char *directory) {
         const char *e;
-        int r;
+        int r, all_unified, systemd_unified;
 
         /* Allow the user to control whether the unified hierarchy is used */
         e = getenv("UNIFIED_CGROUP_HIERARCHY");
@@ -329,20 +327,58 @@ static int detect_unified_cgroup_hierarchy(void) {
                 r = parse_boolean(e);
                 if (r < 0)
                         return log_error_errno(r, "Failed to parse $UNIFIED_CGROUP_HIERARCHY.");
+                if (r > 0)
+                        arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_ALL;
+                else
+                        arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
 
-                arg_unified_cgroup_hierarchy = r;
                 return 0;
         }
 
+        all_unified = cg_all_unified();
+        systemd_unified = cg_unified(SYSTEMD_CGROUP_CONTROLLER);
+
+        if (all_unified < 0 || systemd_unified < 0)
+                return log_error_errno(all_unified < 0 ? all_unified : systemd_unified,
+                                       "Failed to determine whether the unified cgroups hierarchy is used: %m");
+
         /* Otherwise inherit the default from the host system */
-        r = cg_unified();
-        if (r < 0)
-                return log_error_errno(r, "Failed to determine whether the unified cgroups hierarchy is used: %m");
+        if (all_unified > 0) {
+                /* Unified cgroup hierarchy support was added in 230. Unfortunately the detection
+                 * routine only detects 231, so we'll have a false negative here for 230. */
+                r = systemd_installation_has_version(directory, 230);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to determine systemd version in container: %m");
+                if (r > 0)
+                        arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_ALL;
+                else
+                        arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
+        } else if (systemd_unified > 0) {
+                /* Mixed cgroup hierarchy support was added in 232 */
+                r = systemd_installation_has_version(directory, 232);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to determine systemd version in container: %m");
+                if (r > 0)
+                        arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_SYSTEMD;
+                else
+                        arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
+        } else
+                arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
 
-        arg_unified_cgroup_hierarchy = r;
         return 0;
 }
 
+static void parse_share_ns_env(const char *name, unsigned long ns_flag) {
+        int r;
+
+        r = getenv_bool(name);
+        if (r == -ENXIO)
+                return;
+        if (r < 0)
+                log_warning_errno(r, "Failed to parse %s from environment, defaulting to false.", name);
+        arg_clone_ns_flags = (arg_clone_ns_flags & ~ns_flag) | (r > 0 ? 0 : ns_flag);
+}
+
 static int parse_argv(int argc, char *argv[]) {
 
         enum {
@@ -380,52 +416,52 @@ static int parse_argv(int argc, char *argv[]) {
         };
 
         static const struct option options[] = {
-                { "help",                  no_argument,       NULL, 'h'                   },
-                { "version",               no_argument,       NULL, ARG_VERSION           },
-                { "directory",             required_argument, NULL, 'D'                   },
-                { "template",              required_argument, NULL, ARG_TEMPLATE          },
-                { "ephemeral",             no_argument,       NULL, 'x'                   },
-                { "user",                  required_argument, NULL, 'u'                   },
-                { "private-network",       no_argument,       NULL, ARG_PRIVATE_NETWORK   },
-                { "as-pid2",               no_argument,       NULL, 'a'                   },
-                { "boot",                  no_argument,       NULL, 'b'                   },
-                { "uuid",                  required_argument, NULL, ARG_UUID              },
-                { "read-only",             no_argument,       NULL, ARG_READ_ONLY         },
-                { "capability",            required_argument, NULL, ARG_CAPABILITY        },
-                { "drop-capability",       required_argument, NULL, ARG_DROP_CAPABILITY   },
-                { "link-journal",          required_argument, NULL, ARG_LINK_JOURNAL      },
-                { "bind",                  required_argument, NULL, ARG_BIND              },
-                { "bind-ro",               required_argument, NULL, ARG_BIND_RO           },
-                { "tmpfs",                 required_argument, NULL, ARG_TMPFS             },
-                { "overlay",               required_argument, NULL, ARG_OVERLAY           },
-                { "overlay-ro",            required_argument, NULL, ARG_OVERLAY_RO        },
-                { "machine",               required_argument, NULL, 'M'                   },
-                { "slice",                 required_argument, NULL, 'S'                   },
-                { "setenv",                required_argument, NULL, 'E'                   },
-                { "selinux-context",       required_argument, NULL, 'Z'                   },
-                { "selinux-apifs-context", required_argument, NULL, 'L'                   },
-                { "quiet",                 no_argument,       NULL, 'q'                   },
-                { "share-system",          no_argument,       NULL, ARG_SHARE_SYSTEM      },
-                { "register",              required_argument, NULL, ARG_REGISTER          },
-                { "keep-unit",             no_argument,       NULL, ARG_KEEP_UNIT         },
-                { "network-interface",     required_argument, NULL, ARG_NETWORK_INTERFACE },
-                { "network-macvlan",       required_argument, NULL, ARG_NETWORK_MACVLAN   },
-                { "network-ipvlan",        required_argument, NULL, ARG_NETWORK_IPVLAN    },
-                { "network-veth",          no_argument,       NULL, 'n'                   },
-                { "network-veth-extra",    required_argument, NULL, ARG_NETWORK_VETH_EXTRA},
-                { "network-bridge",        required_argument, NULL, ARG_NETWORK_BRIDGE    },
-                { "network-zone",          required_argument, NULL, ARG_NETWORK_ZONE      },
-                { "personality",           required_argument, NULL, ARG_PERSONALITY       },
-                { "image",                 required_argument, NULL, 'i'                   },
-                { "volatile",              optional_argument, NULL, ARG_VOLATILE          },
-                { "port",                  required_argument, NULL, 'p'                   },
-                { "property",              required_argument, NULL, ARG_PROPERTY          },
-                { "private-users",         optional_argument, NULL, ARG_PRIVATE_USERS     },
-                { "private-users-chown",   optional_argument, NULL, ARG_PRIVATE_USERS_CHOWN},
-                { "kill-signal",           required_argument, NULL, ARG_KILL_SIGNAL       },
-                { "settings",              required_argument, NULL, ARG_SETTINGS          },
-                { "chdir",                 required_argument, NULL, ARG_CHDIR             },
-                { "notify-ready",          required_argument, NULL, ARG_NOTIFY_READY      },
+                { "help",                  no_argument,       NULL, 'h'                     },
+                { "version",               no_argument,       NULL, ARG_VERSION             },
+                { "directory",             required_argument, NULL, 'D'                     },
+                { "template",              required_argument, NULL, ARG_TEMPLATE            },
+                { "ephemeral",             no_argument,       NULL, 'x'                     },
+                { "user",                  required_argument, NULL, 'u'                     },
+                { "private-network",       no_argument,       NULL, ARG_PRIVATE_NETWORK     },
+                { "as-pid2",               no_argument,       NULL, 'a'                     },
+                { "boot",                  no_argument,       NULL, 'b'                     },
+                { "uuid",                  required_argument, NULL, ARG_UUID                },
+                { "read-only",             no_argument,       NULL, ARG_READ_ONLY           },
+                { "capability",            required_argument, NULL, ARG_CAPABILITY          },
+                { "drop-capability",       required_argument, NULL, ARG_DROP_CAPABILITY     },
+                { "link-journal",          required_argument, NULL, ARG_LINK_JOURNAL        },
+                { "bind",                  required_argument, NULL, ARG_BIND                },
+                { "bind-ro",               required_argument, NULL, ARG_BIND_RO             },
+                { "tmpfs",                 required_argument, NULL, ARG_TMPFS               },
+                { "overlay",               required_argument, NULL, ARG_OVERLAY             },
+                { "overlay-ro",            required_argument, NULL, ARG_OVERLAY_RO          },
+                { "machine",               required_argument, NULL, 'M'                     },
+                { "slice",                 required_argument, NULL, 'S'                     },
+                { "setenv",                required_argument, NULL, 'E'                     },
+                { "selinux-context",       required_argument, NULL, 'Z'                     },
+                { "selinux-apifs-context", required_argument, NULL, 'L'                     },
+                { "quiet",                 no_argument,       NULL, 'q'                     },
+                { "share-system",          no_argument,       NULL, ARG_SHARE_SYSTEM        }, /* not documented */
+                { "register",              required_argument, NULL, ARG_REGISTER            },
+                { "keep-unit",             no_argument,       NULL, ARG_KEEP_UNIT           },
+                { "network-interface",     required_argument, NULL, ARG_NETWORK_INTERFACE   },
+                { "network-macvlan",       required_argument, NULL, ARG_NETWORK_MACVLAN     },
+                { "network-ipvlan",        required_argument, NULL, ARG_NETWORK_IPVLAN      },
+                { "network-veth",          no_argument,       NULL, 'n'                     },
+                { "network-veth-extra",    required_argument, NULL, ARG_NETWORK_VETH_EXTRA  },
+                { "network-bridge",        required_argument, NULL, ARG_NETWORK_BRIDGE      },
+                { "network-zone",          required_argument, NULL, ARG_NETWORK_ZONE        },
+                { "personality",           required_argument, NULL, ARG_PERSONALITY         },
+                { "image",                 required_argument, NULL, 'i'                     },
+                { "volatile",              optional_argument, NULL, ARG_VOLATILE            },
+                { "port",                  required_argument, NULL, 'p'                     },
+                { "property",              required_argument, NULL, ARG_PROPERTY            },
+                { "private-users",         optional_argument, NULL, ARG_PRIVATE_USERS       },
+                { "private-users-chown",   optional_argument, NULL, ARG_PRIVATE_USERS_CHOWN },
+                { "kill-signal",           required_argument, NULL, ARG_KILL_SIGNAL         },
+                { "settings",              required_argument, NULL, ARG_SETTINGS            },
+                { "chdir",                 required_argument, NULL, ARG_CHDIR               },
+                { "notify-ready",          required_argument, NULL, ARG_NOTIFY_READY        },
                 {}
         };
 
@@ -814,7 +850,9 @@ static int parse_argv(int argc, char *argv[]) {
                         break;
 
                 case ARG_SHARE_SYSTEM:
-                        arg_share_system = true;
+                        /* We don't officially support this anymore, except for compat reasons. People should use the
+                         * $SYSTEMD_NSPAWN_SHARE_* environment variables instead. */
+                        arg_clone_ns_flags = 0;
                         break;
 
                 case ARG_REGISTER:
@@ -876,15 +914,21 @@ static int parse_argv(int argc, char *argv[]) {
 
                         break;
 
-                case ARG_PRIVATE_USERS:
+                case ARG_PRIVATE_USERS: {
+                        int boolean = -1;
 
-                        r = optarg ? parse_boolean(optarg) : 1;
-                        if (r == 0) {
+                        if (!optarg)
+                                boolean = true;
+                        else if (!in_charset(optarg, DIGITS))
+                                /* do *not* parse numbers as booleans */
+                                boolean = parse_boolean(optarg);
+
+                        if (boolean == false) {
                                 /* no: User namespacing off */
                                 arg_userns_mode = USER_NAMESPACE_NO;
                                 arg_uid_shift = UID_INVALID;
                                 arg_uid_range = UINT32_C(0x10000);
-                        } else if (r > 0) {
+                        } else if (boolean == true) {
                                 /* yes: User namespacing on, UID range is read from root dir */
                                 arg_userns_mode = USER_NAMESPACE_FIXED;
                                 arg_uid_shift = UID_INVALID;
@@ -908,23 +952,27 @@ static int parse_argv(int argc, char *argv[]) {
                                         shift = buffer;
 
                                         range++;
-                                        if (safe_atou32(range, &arg_uid_range) < 0 || arg_uid_range <= 0) {
-                                                log_error("Failed to parse UID range: %s", range);
-                                                return -EINVAL;
-                                        }
+                                        r = safe_atou32(range, &arg_uid_range);
+                                        if (r < 0)
+                                                return log_error_errno(r, "Failed to parse UID range \"%s\": %m", range);
                                 } else
                                         shift = optarg;
 
-                                if (parse_uid(shift, &arg_uid_shift) < 0) {
-                                        log_error("Failed to parse UID: %s", optarg);
-                                        return -EINVAL;
-                                }
+                                r = parse_uid(shift, &arg_uid_shift);
+                                if (r < 0)
+                                        return log_error_errno(r, "Failed to parse UID \"%s\": %m", optarg);
 
                                 arg_userns_mode = USER_NAMESPACE_FIXED;
                         }
 
+                        if (arg_uid_range <= 0) {
+                                log_error("UID range cannot be 0.");
+                                return -EINVAL;
+                        }
+
                         arg_settings_mask |= SETTING_USERNS;
                         break;
+                }
 
                 case 'U':
                         if (userns_supported()) {
@@ -1018,17 +1066,23 @@ static int parse_argv(int argc, char *argv[]) {
                         assert_not_reached("Unhandled option");
                 }
 
-        if (arg_share_system)
+        parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_IPC", CLONE_NEWIPC);
+        parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_PID", CLONE_NEWPID);
+        parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_UTS", CLONE_NEWUTS);
+        parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_SYSTEM", CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS);
+
+        if (!(arg_clone_ns_flags & CLONE_NEWPID) ||
+            !(arg_clone_ns_flags & CLONE_NEWUTS)) {
                 arg_register = false;
+                if (arg_start_mode != START_PID1) {
+                        log_error("--boot cannot be used without namespacing.");
+                        return -EINVAL;
+                }
+        }
 
         if (arg_userns_mode == USER_NAMESPACE_PICK)
                 arg_userns_chown = true;
 
-        if (arg_start_mode != START_PID1 && arg_share_system) {
-                log_error("--boot and --share-system may not be combined.");
-                return -EINVAL;
-        }
-
         if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
                 log_error("--keep-unit may not be used when invoked from a user session.");
                 return -EINVAL;
@@ -1097,14 +1151,16 @@ static int parse_argv(int argc, char *argv[]) {
 
         arg_caps_retain = (arg_caps_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
 
-        r = detect_unified_cgroup_hierarchy();
-        if (r < 0)
-                return r;
-
         e = getenv("SYSTEMD_NSPAWN_CONTAINER_SERVICE");
         if (e)
                 arg_container_service_name = e;
 
+        r = getenv_bool("SYSTEMD_NSPAWN_USE_CGNS");
+        if (r < 0)
+                arg_use_cgns = cg_ns_supported();
+        else
+                arg_use_cgns = r;
+
         return 1;
 }
 
@@ -1186,7 +1242,13 @@ static int setup_timezone(const char *dest) {
         /* Fix the timezone, if possible */
         r = readlink_malloc("/etc/localtime", &p);
         if (r < 0) {
-                log_warning("/etc/localtime is not a symlink, not updating container timezone.");
+                log_warning("host's /etc/localtime is not a symlink, not updating container timezone.");
+                /* To resolve this warning, delete the host's /etc/localtime and
+                 * replace it with a symbolic link to a time zone data file.
+                 *
+                 * Example:
+                 * ln -s /usr/share/zoneinfo/UTC /etc/localtime
+                 */
                 return 0;
         }
 
@@ -1275,9 +1337,6 @@ static int setup_boot_id(const char *dest) {
         const char *from, *to;
         int r;
 
-        if (arg_share_system)
-                return 0;
-
         /* Generate a new randomized boot ID, so that each boot-up of
          * the container gets a new one */
 
@@ -1292,10 +1351,10 @@ static int setup_boot_id(const char *dest) {
         if (r < 0)
                 return log_error_errno(r, "Failed to write boot id: %m");
 
-        if (mount(from, to, NULL, MS_BIND, NULL) < 0)
-                r = log_error_errno(errno, "Failed to bind mount boot id: %m");
-        else if (mount(NULL, to, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL) < 0)
-                log_warning_errno(errno, "Failed to make boot id read-only, ignoring: %m");
+        r = mount_verbose(LOG_ERR, from, to, NULL, MS_BIND, NULL);
+        if (r >= 0)
+                r = mount_verbose(LOG_ERR, NULL, to, NULL,
+                                  MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL);
 
         (void) unlink(from);
         return r;
@@ -1343,6 +1402,12 @@ static int copy_devnodes(const char *dest) {
 
                 } else {
                         if (mknod(to, st.st_mode, st.st_rdev) < 0) {
+                                /*
+                                 * This is some sort of protection too against
+                                 * recursive userns chown on shared /dev/
+                                 */
+                                if (errno == EEXIST)
+                                        log_notice("%s/dev/ should be an empty directory", dest);
                                 if (errno != EPERM)
                                         return log_error_errno(errno, "mknod(%s) failed: %m", to);
 
@@ -1351,8 +1416,9 @@ static int copy_devnodes(const char *dest) {
                                 r = touch(to);
                                 if (r < 0)
                                         return log_error_errno(r, "touch (%s) failed: %m", to);
-                                if (mount(from, to, NULL, MS_BIND, NULL) < 0)
-                                        return log_error_errno(errno, "Both mknod and bind mount (%s) failed: %m", to);
+                                r = mount_verbose(LOG_DEBUG, from, to, NULL, MS_BIND, NULL);
+                                if (r < 0)
+                                        return log_error_errno(r, "Both mknod and bind mount (%s) failed: %m", to);
                         }
 
                         r = userns_lchown(to, 0, 0);
@@ -1388,8 +1454,9 @@ static int setup_pts(const char *dest) {
         p = prefix_roota(dest, "/dev/pts");
         if (mkdir(p, 0755) < 0)
                 return log_error_errno(errno, "Failed to create /dev/pts: %m");
-        if (mount("devpts", p, "devpts", MS_NOSUID|MS_NOEXEC, options) < 0)
-                return log_error_errno(errno, "Failed to mount /dev/pts: %m");
+        r = mount_verbose(LOG_ERR, "devpts", p, "devpts", MS_NOSUID|MS_NOEXEC, options);
+        if (r < 0)
+                return r;
         r = userns_lchown(p, 0, 0);
         if (r < 0)
                 return log_error_errno(r, "Failed to chown /dev/pts: %m");
@@ -1434,10 +1501,7 @@ static int setup_dev_console(const char *dest, const char *console) {
         if (r < 0)
                 return log_error_errno(r, "touch() for /dev/console failed: %m");
 
-        if (mount(console, to, NULL, MS_BIND, NULL) < 0)
-                return log_error_errno(errno, "Bind mount for /dev/console failed: %m");
-
-        return 0;
+        return mount_verbose(LOG_ERR, console, to, NULL, MS_BIND, NULL);
 }
 
 static int setup_kmsg(const char *dest, int kmsg_socket) {
@@ -1461,8 +1525,9 @@ static int setup_kmsg(const char *dest, int kmsg_socket) {
 
         if (mkfifo(from, 0600) < 0)
                 return log_error_errno(errno, "mkfifo() for /run/kmsg failed: %m");
-        if (mount(from, to, NULL, MS_BIND, NULL) < 0)
-                return log_error_errno(errno, "Bind mount for /proc/kmsg failed: %m");
+        r = mount_verbose(LOG_ERR, from, to, NULL, MS_BIND, NULL);
+        if (r < 0)
+                return r;
 
         fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
         if (fd < 0)
@@ -1495,7 +1560,7 @@ static int on_address_change(sd_netlink *rtnl, sd_netlink_message *m, void *user
 
 static int setup_hostname(void) {
 
-        if (arg_share_system)
+        if ((arg_clone_ns_flags & CLONE_NEWUTS) == 0)
                 return 0;
 
         if (sethostname_idempotent(arg_machine) < 0)
@@ -1632,7 +1697,8 @@ static int setup_journal(const char *directory) {
         if (r < 0)
                 return log_error_errno(r, "Failed to create %s: %m", q);
 
-        if (mount(p, q, NULL, MS_BIND, NULL) < 0)
-                return log_error_errno(errno, "Failed to bind mount journal from host into guest: %m");
+        r = mount_verbose(LOG_DEBUG, p, q, NULL, MS_BIND, NULL);
+        if (r < 0)
+                return log_error_errno(r, "Failed to bind mount journal from host into guest: %m");
 
         return 0;
@@ -1646,7 +1712,7 @@ static int reset_audit_loginuid(void) {
         _cleanup_free_ char *p = NULL;
         int r;
 
-        if (arg_share_system)
+        if ((arg_clone_ns_flags & CLONE_NEWPID) == 0)
                 return 0;
 
         r = read_one_line_file("/proc/self/loginuid", &p);
@@ -1697,13 +1763,17 @@ static int setup_propagate(const char *root) {
                 return log_error_errno(r, "Failed to create /run/systemd/nspawn/incoming: %m");
 
         q = prefix_roota(root, "/run/systemd/nspawn/incoming");
-        if (mount(p, q, NULL, MS_BIND, NULL) < 0)
-                return log_error_errno(errno, "Failed to install propagation bind mount.");
+        r = mount_verbose(LOG_ERR, p, q, NULL, MS_BIND, NULL);
+        if (r < 0)
+                return r;
 
-        if (mount(NULL, q, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL) < 0)
-                return log_error_errno(errno, "Failed to make propagation mount read-only");
+        r = mount_verbose(LOG_ERR, NULL, q, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
+        if (r < 0)
+                return r;
 
-        return 0;
+        /* machined will MS_MOVE into that directory, and that's only
+         * supported for non-shared mounts. */
+        return mount_verbose(LOG_ERR, NULL, q, NULL, MS_SLAVE, NULL);
 }
 
 static int setup_image(char **device_path, int *loop_nr) {
@@ -1795,17 +1865,18 @@ static int dissect_image(
                 char **root_device, bool *root_device_rw,
                 char **home_device, bool *home_device_rw,
                 char **srv_device, bool *srv_device_rw,
+                char **esp_device,
                 bool *secondary) {
 
 #ifdef HAVE_BLKID
-        int home_nr = -1, srv_nr = -1;
+        int home_nr = -1, srv_nr = -1, esp_nr = -1;
 #ifdef GPT_ROOT_NATIVE
         int root_nr = -1;
 #endif
 #ifdef GPT_ROOT_SECONDARY
         int secondary_root_nr = -1;
 #endif
-        _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL, *generic = NULL;
+        _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL, *esp = NULL, *generic = NULL;
         _cleanup_udev_enumerate_unref_ struct udev_enumerate *e = NULL;
         _cleanup_udev_device_unref_ struct udev_device *d = NULL;
         _cleanup_blkid_free_probe_ blkid_probe b = NULL;
@@ -1823,6 +1894,7 @@ static int dissect_image(
         assert(root_device);
         assert(home_device);
         assert(srv_device);
+        assert(esp_device);
         assert(secondary);
         assert(arg_image);
 
@@ -2036,6 +2108,16 @@ static int dissect_image(
                                 r = free_and_strdup(&srv, node);
                                 if (r < 0)
                                         return log_oom();
+                        } else if (sd_id128_equal(type_id, GPT_ESP)) {
+
+                                if (esp && nr >= esp_nr)
+                                        continue;
+
+                                esp_nr = nr;
+
+                                r = free_and_strdup(&esp, node);
+                                if (r < 0)
+                                        return log_oom();
                         }
 #ifdef GPT_ROOT_NATIVE
                         else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE)) {
@@ -2153,6 +2235,11 @@ static int dissect_image(
                 *srv_device_rw = srv_rw;
         }
 
+        if (esp) {
+                *esp_device = esp;
+                esp = NULL;
+        }
+
         return 0;
 #else
         log_error("--image= is not supported, compiled without blkid support.");
@@ -2163,7 +2250,7 @@ static int dissect_image(
 static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
 #ifdef HAVE_BLKID
         _cleanup_blkid_free_probe_ blkid_probe b = NULL;
-        const char *fstype, *p;
+        const char *fstype, *p, *options = NULL;
         int r;
 
         assert(what);
@@ -2212,10 +2299,17 @@ static int mount_device(const char *what, const char *where, const char *directo
                 return -EOPNOTSUPP;
         }
 
-        if (mount(what, p, fstype, MS_NODEV|(rw ? 0 : MS_RDONLY), NULL) < 0)
-                return log_error_errno(errno, "Failed to mount %s: %m", what);
+        /* If this is a loopback device then let's mount the image with discard, so that the underlying file remains
+         * sparse when possible. */
+        if (STR_IN_SET(fstype, "btrfs", "ext4", "vfat", "xfs")) {
+                const char *l;
 
-        return 0;
+                l = path_startswith(what, "/dev");
+                if (l && startswith(l, "loop"))
+                        options = "discard";
+        }
+
+        return mount_verbose(LOG_ERR, what, p, fstype, MS_NODEV|(rw ? 0 : MS_RDONLY), options);
 #else
         log_error("--image= is not supported, compiled without blkid support.");
         return -EOPNOTSUPP;
@@ -2285,7 +2379,8 @@ static int mount_devices(
                 const char *where,
                 const char *root_device, bool root_device_rw,
                 const char *home_device, bool home_device_rw,
-                const char *srv_device, bool srv_device_rw) {
+                const char *srv_device, bool srv_device_rw,
+                const char *esp_device) {
         int r;
 
         assert(where);
@@ -2308,6 +2403,27 @@ static int mount_devices(
                         return log_error_errno(r, "Failed to mount server data directory: %m");
         }
 
+        if (esp_device) {
+                const char *mp, *x;
+
+                /* Mount the ESP to /efi if it exists and is empty. If it doesn't exist, use /boot instead. */
+
+                mp = "/efi";
+                x = strjoina(arg_directory, mp);
+                r = dir_is_empty(x);
+                if (r == -ENOENT) {
+                        mp = "/boot";
+                        x = strjoina(arg_directory, mp);
+                        r = dir_is_empty(x);
+                }
+
+                if (r > 0) {
+                        r = mount_device(esp_device, arg_directory, mp, true);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to  mount ESP: %m");
+                }
+        }
+
         return 0;
 }
 
@@ -2568,6 +2684,10 @@ static int inner_child(
                 }
         }
 
+        r = reset_uid_gid();
+        if (r < 0)
+                return log_error_errno(r, "Couldn't become new root: %m");
+
         r = mount_all(NULL,
                       arg_userns_mode != USER_NAMESPACE_NO,
                       true,
@@ -2590,13 +2710,25 @@ static int inner_child(
                 return -ESRCH;
         }
 
-        r = mount_systemd_cgroup_writable("", arg_unified_cgroup_hierarchy);
-        if (r < 0)
-                return r;
-
-        r = reset_uid_gid();
-        if (r < 0)
-                return log_error_errno(r, "Couldn't become new root: %m");
+        if (arg_use_cgns && cg_ns_supported()) {
+                r = unshare(CLONE_NEWCGROUP);
+                if (r < 0)
+                        return log_error_errno(errno, "Failed to unshare cgroup namespace: %m");
+                r = mount_cgroups(
+                                "",
+                                arg_unified_cgroup_hierarchy,
+                                arg_userns_mode != USER_NAMESPACE_NO,
+                                arg_uid_shift,
+                                arg_uid_range,
+                                arg_selinux_apifs_context,
+                                true);
+                if (r < 0)
+                        return r;
+        } else {
+                r = mount_systemd_cgroup_writable("", arg_unified_cgroup_hierarchy);
+                if (r < 0)
+                        return r;
+        }
 
         r = setup_boot_id(NULL);
         if (r < 0)
@@ -2781,6 +2913,7 @@ static int outer_child(
                 const char *root_device, bool root_device_rw,
                 const char *home_device, bool home_device_rw,
                 const char *srv_device, bool srv_device_rw,
+                const char *esp_device,
                 bool interactive,
                 bool secondary,
                 int pid_socket,
@@ -2836,13 +2969,15 @@ static int outer_child(
         /* Mark everything as slave, so that we still
          * receive mounts from the real root, but don't
          * propagate mounts to the real root. */
-        if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0)
-                return log_error_errno(errno, "MS_SLAVE|MS_REC failed: %m");
+        r = mount_verbose(LOG_ERR, NULL, "/", NULL, MS_SLAVE|MS_REC, NULL);
+        if (r < 0)
+                return r;
 
         r = mount_devices(directory,
                           root_device, root_device_rw,
                           home_device, home_device_rw,
-                          srv_device, srv_device_rw);
+                          srv_device, srv_device_rw,
+                          esp_device);
         if (r < 0)
                 return r;
 
@@ -2850,6 +2985,10 @@ static int outer_child(
         if (r < 0)
                 return r;
 
+        r = detect_unified_cgroup_hierarchy(directory);
+        if (r < 0)
+                return r;
+
         if (arg_userns_mode != USER_NAMESPACE_NO) {
                 /* Let the parent know which UID shift we read from the image */
                 l = send(uid_shift_socket, &arg_uid_shift, sizeof(arg_uid_shift), MSG_NOSIGNAL);
@@ -2878,8 +3017,19 @@ static int outer_child(
         }
 
         /* Turn directory into bind mount */
-        if (mount(directory, directory, NULL, MS_BIND|MS_REC, NULL) < 0)
-                return log_error_errno(errno, "Failed to make bind mount: %m");
+        r = mount_verbose(LOG_ERR, directory, directory, NULL, MS_BIND|MS_REC, NULL);
+        if (r < 0)
+                return r;
+
+        /* Mark everything as shared so our mounts get propagated down. This is
+         * required to make new bind mounts available in systemd services
+         * inside the container that create a new mount namespace.
+         * See https://github.com/systemd/systemd/issues/3860
+         * Further submounts (such as /dev) done after this will inherit the
+         * shared propagation mode. */
+        r = mount_verbose(LOG_ERR, NULL, directory, NULL, MS_SHARED|MS_REC, NULL);
+        if (r < 0)
+                return r;
 
         r = recursive_chown(directory, arg_uid_shift, arg_uid_range);
         if (r < 0)
@@ -2910,7 +3060,7 @@ static int outer_child(
                 return r;
 
         if (arg_read_only) {
-                r = bind_remount_recursive(directory, true);
+                r = bind_remount_recursive(directory, true, NULL);
                 if (r < 0)
                         return log_error_errno(r, "Failed to make tree read-only: %m");
         }
@@ -2974,15 +3124,18 @@ static int outer_child(
         if (r < 0)
                 return r;
 
-        r = mount_cgroups(
-                        directory,
-                        arg_unified_cgroup_hierarchy,
-                        arg_userns_mode != USER_NAMESPACE_NO,
-                        arg_uid_shift,
-                        arg_uid_range,
-                        arg_selinux_apifs_context);
-        if (r < 0)
-                return r;
+        if (!arg_use_cgns || !cg_ns_supported()) {
+                r = mount_cgroups(
+                                directory,
+                                arg_unified_cgroup_hierarchy,
+                                arg_userns_mode != USER_NAMESPACE_NO,
+                                arg_uid_shift,
+                                arg_uid_range,
+                                arg_selinux_apifs_context,
+                                false);
+                if (r < 0)
+                        return r;
+        }
 
         r = mount_move_root(directory);
         if (r < 0)
@@ -2993,7 +3146,7 @@ static int outer_child(
                 return fd;
 
         pid = raw_clone(SIGCHLD|CLONE_NEWNS|
-                        (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS) |
+                        arg_clone_ns_flags |
                         (arg_private_network ? CLONE_NEWNET : 0) |
                         (arg_userns_mode != USER_NAMESPACE_NO ? CLONE_NEWUSER : 0));
         if (pid < 0)
@@ -3443,18 +3596,437 @@ static int load_settings(void) {
         return 0;
 }
 
+static int run(int master,
+               const char* console,
+               const char *root_device, bool root_device_rw,
+               const char *home_device, bool home_device_rw,
+               const char *srv_device, bool srv_device_rw,
+               const char *esp_device,
+               bool interactive,
+               bool secondary,
+               FDSet *fds,
+               char veth_name[IFNAMSIZ], bool *veth_created,
+               union in_addr_union *exposed,
+               pid_t *pid, int *ret) {
+
+        static const struct sigaction sa = {
+                .sa_handler = nop_signal_handler,
+                .sa_flags = SA_NOCLDSTOP,
+        };
+
+        _cleanup_release_lock_file_ LockFile uid_shift_lock = LOCK_FILE_INIT;
+        _cleanup_close_ int etc_passwd_lock = -1;
+        _cleanup_close_pair_ int
+                kmsg_socket_pair[2] = { -1, -1 },
+                rtnl_socket_pair[2] = { -1, -1 },
+                pid_socket_pair[2] = { -1, -1 },
+                uuid_socket_pair[2] = { -1, -1 },
+                notify_socket_pair[2] = { -1, -1 },
+                uid_shift_socket_pair[2] = { -1, -1 };
+        _cleanup_close_ int notify_socket= -1;
+        _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
+        _cleanup_(sd_event_unrefp) sd_event *event = NULL;
+        _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
+        _cleanup_(sd_netlink_unrefp) sd_netlink *rtnl = NULL;
+        ContainerStatus container_status = 0;
+        char last_char = 0;
+        int ifi = 0, r;
+        ssize_t l;
+        sigset_t mask_chld;
+
+        assert_se(sigemptyset(&mask_chld) == 0);
+        assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
+
+        if (arg_userns_mode == USER_NAMESPACE_PICK) {
+                /* When we shall pick the UID/GID range, let's first lock /etc/passwd, so that we can safely
+                 * check with getpwuid() if the specific user already exists. Note that /etc might be
+                 * read-only, in which case this will fail with EROFS. But that's really OK, as in that case we
+                 * can be reasonably sure that no users are going to be added. Note that getpwuid() checks are
+                 * really just an extra safety net. We kinda assume that the UID range we allocate from is
+                 * really ours. */
+
+                etc_passwd_lock = take_etc_passwd_lock(NULL);
+                if (etc_passwd_lock < 0 && etc_passwd_lock != -EROFS)
+                        return log_error_errno(etc_passwd_lock, "Failed to take /etc/passwd lock: %m");
+        }
+
+        r = barrier_create(&barrier);
+        if (r < 0)
+                return log_error_errno(r, "Cannot initialize IPC barrier: %m");
+
+        if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0)
+                return log_error_errno(errno, "Failed to create kmsg socket pair: %m");
+
+        if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, rtnl_socket_pair) < 0)
+                return log_error_errno(errno, "Failed to create rtnl socket pair: %m");
+
+        if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, pid_socket_pair) < 0)
+                return log_error_errno(errno, "Failed to create pid socket pair: %m");
+
+        if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, uuid_socket_pair) < 0)
+                return log_error_errno(errno, "Failed to create id socket pair: %m");
+
+        if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, notify_socket_pair) < 0)
+                return log_error_errno(errno, "Failed to create notify socket pair: %m");
+
+        if (arg_userns_mode != USER_NAMESPACE_NO)
+                if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, uid_shift_socket_pair) < 0)
+                        return log_error_errno(errno, "Failed to create uid shift socket pair: %m");
+
+        /* Child can be killed before execv(), so handle SIGCHLD in order to interrupt
+         * parent's blocking calls and give it a chance to call wait() and terminate. */
+        r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
+        if (r < 0)
+                return log_error_errno(errno, "Failed to change the signal mask: %m");
+
+        r = sigaction(SIGCHLD, &sa, NULL);
+        if (r < 0)
+                return log_error_errno(errno, "Failed to install SIGCHLD handler: %m");
+
+        *pid = raw_clone(SIGCHLD|CLONE_NEWNS);
+        if (*pid < 0)
+                return log_error_errno(errno, "clone() failed%s: %m",
+                                       errno == EINVAL ?
+                                       ", do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in)" : "");
+
+        if (*pid == 0) {
+                /* The outer child only has a file system namespace. */
+                barrier_set_role(&barrier, BARRIER_CHILD);
+
+                master = safe_close(master);
+
+                kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
+                rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
+                pid_socket_pair[0] = safe_close(pid_socket_pair[0]);
+                uuid_socket_pair[0] = safe_close(uuid_socket_pair[0]);
+                notify_socket_pair[0] = safe_close(notify_socket_pair[0]);
+                uid_shift_socket_pair[0] = safe_close(uid_shift_socket_pair[0]);
+
+                (void) reset_all_signal_handlers();
+                (void) reset_signal_mask();
+
+                r = outer_child(&barrier,
+                                arg_directory,
+                                console,
+                                root_device, root_device_rw,
+                                home_device, home_device_rw,
+                                srv_device, srv_device_rw,
+                                esp_device,
+                                interactive,
+                                secondary,
+                                pid_socket_pair[1],
+                                uuid_socket_pair[1],
+                                notify_socket_pair[1],
+                                kmsg_socket_pair[1],
+                                rtnl_socket_pair[1],
+                                uid_shift_socket_pair[1],
+                                fds);
+                if (r < 0)
+                        _exit(EXIT_FAILURE);
+
+                _exit(EXIT_SUCCESS);
+        }
+
+        barrier_set_role(&barrier, BARRIER_PARENT);
+
+        fds = fdset_free(fds);
+
+        kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
+        rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
+        pid_socket_pair[1] = safe_close(pid_socket_pair[1]);
+        uuid_socket_pair[1] = safe_close(uuid_socket_pair[1]);
+        notify_socket_pair[1] = safe_close(notify_socket_pair[1]);
+        uid_shift_socket_pair[1] = safe_close(uid_shift_socket_pair[1]);
+
+        if (arg_userns_mode != USER_NAMESPACE_NO) {
+                /* The child just let us know the UID shift it might have read from the image. */
+                l = recv(uid_shift_socket_pair[0], &arg_uid_shift, sizeof arg_uid_shift, 0);
+                if (l < 0)
+                        return log_error_errno(errno, "Failed to read UID shift: %m");
+
+                if (l != sizeof arg_uid_shift) {
+                        log_error("Short read while reading UID shift.");
+                        return -EIO;
+                }
+
+                if (arg_userns_mode == USER_NAMESPACE_PICK) {
+                        /* If we are supposed to pick the UID shift, let's try to use the shift read from the
+                         * image, but if that's already in use, pick a new one, and report back to the child,
+                         * which one we now picked. */
+
+                        r = uid_shift_pick(&arg_uid_shift, &uid_shift_lock);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to pick suitable UID/GID range: %m");
+
+                        l = send(uid_shift_socket_pair[0], &arg_uid_shift, sizeof arg_uid_shift, MSG_NOSIGNAL);
+                        if (l < 0)
+                                return log_error_errno(errno, "Failed to send UID shift: %m");
+                        if (l != sizeof arg_uid_shift) {
+                                log_error("Short write while writing UID shift.");
+                                return -EIO;
+                        }
+                }
+        }
+
+        /* Wait for the outer child. */
+        r = wait_for_terminate_and_warn("namespace helper", *pid, NULL);
+        if (r != 0)
+                return r < 0 ? r : -EIO;
+
+        /* And now retrieve the PID of the inner child. */
+        l = recv(pid_socket_pair[0], pid, sizeof *pid, 0);
+        if (l < 0)
+                return log_error_errno(errno, "Failed to read inner child PID: %m");
+        if (l != sizeof *pid) {
+                log_error("Short read while reading inner child PID.");
+                return -EIO;
+        }
+
+        /* We also retrieve container UUID in case it was generated by outer child */
+        l = recv(uuid_socket_pair[0], &arg_uuid, sizeof arg_uuid, 0);
+        if (l < 0)
+                return log_error_errno(errno, "Failed to read container machine ID: %m");
+        if (l != sizeof(arg_uuid)) {
+                log_error("Short read while reading container machined ID.");
+                return -EIO;
+        }
+
+        /* We also retrieve the socket used for notifications generated by outer child */
+        notify_socket = receive_one_fd(notify_socket_pair[0], 0);
+        if (notify_socket < 0)
+                return log_error_errno(notify_socket,
+                                       "Failed to receive notification socket from the outer child: %m");
+
+        log_debug("Init process invoked as PID "PID_FMT, *pid);
+
+        if (arg_userns_mode != USER_NAMESPACE_NO) {
+                if (!barrier_place_and_sync(&barrier)) { /* #1 */
+                        log_error("Child died too early.");
+                        return -ESRCH;
+                }
+
+                r = setup_uid_map(*pid);
+                if (r < 0)
+                        return r;
+
+                (void) barrier_place(&barrier); /* #2 */
+        }
+
+        if (arg_private_network) {
+
+                r = move_network_interfaces(*pid, arg_network_interfaces);
+                if (r < 0)
+                        return r;
+
+                if (arg_network_veth) {
+                        r = setup_veth(arg_machine, *pid, veth_name,
+                                       arg_network_bridge || arg_network_zone);
+                        if (r < 0)
+                                return r;
+                        else if (r > 0)
+                                ifi = r;
+
+                        if (arg_network_bridge) {
+                                /* Add the interface to a bridge */
+                                r = setup_bridge(veth_name, arg_network_bridge, false);
+                                if (r < 0)
+                                        return r;
+                                if (r > 0)
+                                        ifi = r;
+                        } else if (arg_network_zone) {
+                                /* Add the interface to a bridge, possibly creating it */
+                                r = setup_bridge(veth_name, arg_network_zone, true);
+                                if (r < 0)
+                                        return r;
+                                if (r > 0)
+                                        ifi = r;
+                        }
+                }
+
+                r = setup_veth_extra(arg_machine, *pid, arg_network_veth_extra);
+                if (r < 0)
+                        return r;
+
+                /* We created the primary and extra veth links now; let's remember this, so that we know to
+                   remove them later on. Note that we don't bother with removing veth links that were created
+                   here when their setup failed half-way, because in that case the kernel should be able to
+                   remove them on its own, since they cannot be referenced by anything yet. */
+                *veth_created = true;
+
+                r = setup_macvlan(arg_machine, *pid, arg_network_macvlan);
+                if (r < 0)
+                        return r;
+
+                r = setup_ipvlan(arg_machine, *pid, arg_network_ipvlan);
+                if (r < 0)
+                        return r;
+        }
+
+        if (arg_register) {
+                r = register_machine(
+                                arg_machine,
+                                *pid,
+                                arg_directory,
+                                arg_uuid,
+                                ifi,
+                                arg_slice,
+                                arg_custom_mounts, arg_n_custom_mounts,
+                                arg_kill_signal,
+                                arg_property,
+                                arg_keep_unit,
+                                arg_container_service_name);
+                if (r < 0)
+                        return r;
+        }
+
+        r = sync_cgroup(*pid, arg_unified_cgroup_hierarchy, arg_uid_shift);
+        if (r < 0)
+                return r;
+
+        if (arg_keep_unit) {
+                r = create_subcgroup(*pid, arg_unified_cgroup_hierarchy);
+                if (r < 0)
+                        return r;
+        }
+
+        r = chown_cgroup(*pid, arg_uid_shift);
+        if (r < 0)
+                return r;
+
+        /* Notify the child that the parent is ready with all
+         * its setup (including cgroup-ification), and that
+         * the child can now hand over control to the code to
+         * run inside the container. */
+        (void) barrier_place(&barrier); /* #3 */
+
+        /* Block SIGCHLD here, before notifying child.
+         * process_pty() will handle it with the other signals. */
+        assert_se(sigprocmask(SIG_BLOCK, &mask_chld, NULL) >= 0);
+
+        /* Reset signal to default */
+        r = default_signals(SIGCHLD, -1);
+        if (r < 0)
+                return log_error_errno(r, "Failed to reset SIGCHLD: %m");
+
+        r = sd_event_new(&event);
+        if (r < 0)
+                return log_error_errno(r, "Failed to get default event source: %m");
+
+        r = setup_sd_notify_parent(event, notify_socket, PID_TO_PTR(*pid));
+        if (r < 0)
+                return r;
+
+        /* Let the child know that we are ready, and wait until the child is completely ready, too. */
+        if (!barrier_place_and_sync(&barrier)) { /* #4 */
+                log_error("Child died too early.");
+                return -ESRCH;
+        }
+
+        /* At this point we have made use of the UID we picked, and thus nss-mymachines
+         * will make them appear in getpwuid(), thus we can release the /etc/passwd lock. */
+        etc_passwd_lock = safe_close(etc_passwd_lock);
+
+        sd_notifyf(false,
+                   "STATUS=Container running.\n"
+                   "X_NSPAWN_LEADER_PID=" PID_FMT, *pid);
+        if (!arg_notify_ready)
+                sd_notify(false, "READY=1\n");
+
+        if (arg_kill_signal > 0) {
+                /* Try to kill the init system on SIGINT or SIGTERM */
+                sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, PID_TO_PTR(*pid));
+                sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, PID_TO_PTR(*pid));
+        } else {
+                /* Immediately exit */
+                sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
+                sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
+        }
+
+        /* simply exit on sigchld */
+        sd_event_add_signal(event, NULL, SIGCHLD, NULL, NULL);
+
+        if (arg_expose_ports) {
+                r = expose_port_watch_rtnl(event, rtnl_socket_pair[0], on_address_change, exposed, &rtnl);
+                if (r < 0)
+                        return r;
+
+                (void) expose_port_execute(rtnl, arg_expose_ports, exposed);
+        }
+
+        rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
+
+        r = pty_forward_new(event, master,
+                            PTY_FORWARD_IGNORE_VHANGUP | (interactive ? 0 : PTY_FORWARD_READ_ONLY),
+                            &forward);
+        if (r < 0)
+                return log_error_errno(r, "Failed to create PTY forwarder: %m");
+
+        r = sd_event_loop(event);
+        if (r < 0)
+                return log_error_errno(r, "Failed to run event loop: %m");
+
+        pty_forward_get_last_char(forward, &last_char);
+
+        forward = pty_forward_free(forward);
+
+        if (!arg_quiet && last_char != '\n')
+                putc('\n', stdout);
+
+        /* Kill if it is not dead yet anyway */
+        if (arg_register && !arg_keep_unit)
+                terminate_machine(*pid);
+
+        /* Normally redundant, but better safe than sorry */
+        kill(*pid, SIGKILL);
+
+        r = wait_for_container(*pid, &container_status);
+        *pid = 0;
+
+        if (r < 0)
+                /* We failed to wait for the container, or the container exited abnormally. */
+                return r;
+        if (r > 0 || container_status == CONTAINER_TERMINATED) {
+                /* r > 0 → The container exited with a non-zero status.
+                 *         As a special case, we need to replace 133 with a different value,
+                 *         because 133 is special-cased in the service file to reboot the container.
+                 * otherwise → The container exited with zero status and a reboot was not requested.
+                 */
+                if (r == 133)
+                        r = EXIT_FAILURE; /* replace 133 with the general failure code */
+                *ret = r;
+                return 0; /* finito */
+        }
+
+        /* CONTAINER_REBOOTED, loop again */
+
+        if (arg_keep_unit) {
+                /* Special handling if we are running as a service: instead of simply restarting the
+                 * machine we want to restart the entire service, so let's inform systemd about this
+                 * with the special exit code 133. The service file uses RestartForceExitStatus=133 so
+                 * that this results in a full nspawn restart. This is necessary since we might have
+                 * cgroup parameters set we want to have flushed out.
+                 * NOTE(review): main() only stops looping on r <= 0; confirm *ret/return aren't swapped. */
+                *ret = 0;
+                return 133;
+        }
+
+        expose_port_flush(arg_expose_ports, exposed);
+
+        (void) remove_veth_links(veth_name, arg_network_veth_extra);
+        *veth_created = false;
+        return 1; /* loop again */
+}
+
 int main(int argc, char *argv[]) {
 
-        _cleanup_free_ char *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL, *console = NULL;
+        _cleanup_free_ char *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL, *esp_device = NULL, *console = NULL;
         bool root_device_rw = true, home_device_rw = true, srv_device_rw = true;
         _cleanup_close_ int master = -1, image_fd = -1;
         _cleanup_fdset_free_ FDSet *fds = NULL;
-        int r, n_fd_passed, loop_nr = -1;
+        int r, n_fd_passed, loop_nr = -1, ret = EXIT_SUCCESS;
         char veth_name[IFNAMSIZ] = "";
         bool secondary = false, remove_subvol = false;
-        sigset_t mask_chld;
         pid_t pid = 0;
-        int ret = EXIT_SUCCESS;
         union in_addr_union exposed = {};
         _cleanup_release_lock_file_ LockFile tree_global_lock = LOCK_FILE_INIT, tree_local_lock = LOCK_FILE_INIT;
         bool interactive, veth_created = false;
@@ -3627,6 +4199,7 @@ int main(int argc, char *argv[]) {
                                   &root_device, &root_device_rw,
                                   &home_device, &home_device_rw,
                                   &srv_device, &srv_device_rw,
+                                  &esp_device,
                                   &secondary);
                 if (r < 0)
                         goto finish;
@@ -3669,469 +4242,25 @@ int main(int argc, char *argv[]) {
 
         assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1) >= 0);
 
-        assert_se(sigemptyset(&mask_chld) == 0);
-        assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
-
         if (prctl(PR_SET_CHILD_SUBREAPER, 1) < 0) {
                 r = log_error_errno(errno, "Failed to become subreaper: %m");
                 goto finish;
         }
 
         for (;;) {
-                static const struct sigaction sa = {
-                        .sa_handler = nop_signal_handler,
-                        .sa_flags = SA_NOCLDSTOP,
-                };
-
-                _cleanup_release_lock_file_ LockFile uid_shift_lock = LOCK_FILE_INIT;
-                _cleanup_close_ int etc_passwd_lock = -1;
-                _cleanup_close_pair_ int
-                        kmsg_socket_pair[2] = { -1, -1 },
-                        rtnl_socket_pair[2] = { -1, -1 },
-                        pid_socket_pair[2] = { -1, -1 },
-                        uuid_socket_pair[2] = { -1, -1 },
-                        notify_socket_pair[2] = { -1, -1 },
-                        uid_shift_socket_pair[2] = { -1, -1 };
-                _cleanup_close_ int notify_socket= -1;
-                _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
-                _cleanup_(sd_event_unrefp) sd_event *event = NULL;
-                _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
-                _cleanup_(sd_netlink_unrefp) sd_netlink *rtnl = NULL;
-                ContainerStatus container_status;
-                char last_char = 0;
-                int ifi = 0;
-                ssize_t l;
-
-                if (arg_userns_mode == USER_NAMESPACE_PICK) {
-                        /* When we shall pick the UID/GID range, let's first lock /etc/passwd, so that we can safely
-                         * check with getpwuid() if the specific user already exists. Note that /etc might be
-                         * read-only, in which case this will fail with EROFS. But that's really OK, as in that case we
-                         * can be reasonably sure that no users are going to be added. Note that getpwuid() checks are
-                         * really just an extra safety net. We kinda assume that the UID range we allocate from is
-                         * really ours. */
-
-                        etc_passwd_lock = take_etc_passwd_lock(NULL);
-                        if (etc_passwd_lock < 0 && etc_passwd_lock != -EROFS) {
-                                log_error_errno(r, "Failed to take /etc/passwd lock: %m");
-                                goto finish;
-                        }
-                }
-
-                r = barrier_create(&barrier);
-                if (r < 0) {
-                        log_error_errno(r, "Cannot initialize IPC barrier: %m");
-                        goto finish;
-                }
-
-                if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
-                        r = log_error_errno(errno, "Failed to create kmsg socket pair: %m");
-                        goto finish;
-                }
-
-                if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, rtnl_socket_pair) < 0) {
-                        r = log_error_errno(errno, "Failed to create rtnl socket pair: %m");
-                        goto finish;
-                }
-
-                if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, pid_socket_pair) < 0) {
-                        r = log_error_errno(errno, "Failed to create pid socket pair: %m");
-                        goto finish;
-                }
-
-                if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, uuid_socket_pair) < 0) {
-                        r = log_error_errno(errno, "Failed to create id socket pair: %m");
-                        goto finish;
-                }
-
-                if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, notify_socket_pair) < 0) {
-                        r = log_error_errno(errno, "Failed to create notify socket pair: %m");
-                        goto finish;
-                }
-
-                if (arg_userns_mode != USER_NAMESPACE_NO)
-                        if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, uid_shift_socket_pair) < 0) {
-                                r = log_error_errno(errno, "Failed to create uid shift socket pair: %m");
-                                goto finish;
-                        }
-
-                /* Child can be killed before execv(), so handle SIGCHLD
-                 * in order to interrupt parent's blocking calls and
-                 * give it a chance to call wait() and terminate. */
-                r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
-                if (r < 0) {
-                        r = log_error_errno(errno, "Failed to change the signal mask: %m");
-                        goto finish;
-                }
-
-                r = sigaction(SIGCHLD, &sa, NULL);
-                if (r < 0) {
-                        r = log_error_errno(errno, "Failed to install SIGCHLD handler: %m");
-                        goto finish;
-                }
-
-                pid = raw_clone(SIGCHLD|CLONE_NEWNS);
-                if (pid < 0) {
-                        if (errno == EINVAL)
-                                r = log_error_errno(errno, "clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
-                        else
-                                r = log_error_errno(errno, "clone() failed: %m");
-
-                        goto finish;
-                }
-
-                if (pid == 0) {
-                        /* The outer child only has a file system namespace. */
-                        barrier_set_role(&barrier, BARRIER_CHILD);
-
-                        master = safe_close(master);
-
-                        kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
-                        rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
-                        pid_socket_pair[0] = safe_close(pid_socket_pair[0]);
-                        uuid_socket_pair[0] = safe_close(uuid_socket_pair[0]);
-                        notify_socket_pair[0] = safe_close(notify_socket_pair[0]);
-                        uid_shift_socket_pair[0] = safe_close(uid_shift_socket_pair[0]);
-
-                        (void) reset_all_signal_handlers();
-                        (void) reset_signal_mask();
-
-                        r = outer_child(&barrier,
-                                        arg_directory,
-                                        console,
-                                        root_device, root_device_rw,
-                                        home_device, home_device_rw,
-                                        srv_device, srv_device_rw,
-                                        interactive,
-                                        secondary,
-                                        pid_socket_pair[1],
-                                        uuid_socket_pair[1],
-                                        notify_socket_pair[1],
-                                        kmsg_socket_pair[1],
-                                        rtnl_socket_pair[1],
-                                        uid_shift_socket_pair[1],
-                                        fds);
-                        if (r < 0)
-                                _exit(EXIT_FAILURE);
-
-                        _exit(EXIT_SUCCESS);
-                }
-
-                barrier_set_role(&barrier, BARRIER_PARENT);
-
-                fds = fdset_free(fds);
-
-                kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
-                rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
-                pid_socket_pair[1] = safe_close(pid_socket_pair[1]);
-                uuid_socket_pair[1] = safe_close(uuid_socket_pair[1]);
-                notify_socket_pair[1] = safe_close(notify_socket_pair[1]);
-                uid_shift_socket_pair[1] = safe_close(uid_shift_socket_pair[1]);
-
-                if (arg_userns_mode != USER_NAMESPACE_NO) {
-                        /* The child just let us know the UID shift it might have read from the image. */
-                        l = recv(uid_shift_socket_pair[0], &arg_uid_shift, sizeof(arg_uid_shift), 0);
-                        if (l < 0) {
-                                r = log_error_errno(errno, "Failed to read UID shift: %m");
-                                goto finish;
-                        }
-                        if (l != sizeof(arg_uid_shift)) {
-                                log_error("Short read while reading UID shift.");
-                                r = EIO;
-                                goto finish;
-                        }
-
-                        if (arg_userns_mode == USER_NAMESPACE_PICK) {
-                                /* If we are supposed to pick the UID shift, let's try to use the shift read from the
-                                 * image, but if that's already in use, pick a new one, and report back to the child,
-                                 * which one we now picked. */
-
-                                r = uid_shift_pick(&arg_uid_shift, &uid_shift_lock);
-                                if (r < 0) {
-                                        log_error_errno(r, "Failed to pick suitable UID/GID range: %m");
-                                        goto finish;
-                                }
-
-                                l = send(uid_shift_socket_pair[0], &arg_uid_shift, sizeof(arg_uid_shift), MSG_NOSIGNAL);
-                                if (l < 0) {
-                                        r = log_error_errno(errno, "Failed to send UID shift: %m");
-                                        goto finish;
-                                }
-                                if (l != sizeof(arg_uid_shift)) {
-                                        log_error("Short write while writing UID shift.");
-                                        r = -EIO;
-                                        goto finish;
-                                }
-                        }
-                }
-
-                /* Wait for the outer child. */
-                r = wait_for_terminate_and_warn("namespace helper", pid, NULL);
-                if (r < 0)
-                        goto finish;
-                if (r != 0) {
-                        r = -EIO;
-                        goto finish;
-                }
-                pid = 0;
-
-                /* And now retrieve the PID of the inner child. */
-                l = recv(pid_socket_pair[0], &pid, sizeof(pid), 0);
-                if (l < 0) {
-                        r = log_error_errno(errno, "Failed to read inner child PID: %m");
-                        goto finish;
-                }
-                if (l != sizeof(pid)) {
-                        log_error("Short read while reading inner child PID.");
-                        r = EIO;
-                        goto finish;
-                }
-
-                /* We also retrieve container UUID in case it was generated by outer child */
-                l = recv(uuid_socket_pair[0], &arg_uuid, sizeof(arg_uuid), 0);
-                if (l < 0) {
-                        r = log_error_errno(errno, "Failed to read container machine ID: %m");
-                        goto finish;
-                }
-                if (l != sizeof(arg_uuid)) {
-                        log_error("Short read while reading container machined ID.");
-                        r = EIO;
-                        goto finish;
-                }
-
-                /* We also retrieve the socket used for notifications generated by outer child */
-                notify_socket = receive_one_fd(notify_socket_pair[0], 0);
-                if (notify_socket < 0) {
-                        r = log_error_errno(errno, "Failed to receive notification socket from the outer child: %m");
-                        goto finish;
-                }
-
-                log_debug("Init process invoked as PID " PID_FMT, pid);
-
-                if (arg_userns_mode != USER_NAMESPACE_NO) {
-                        if (!barrier_place_and_sync(&barrier)) { /* #1 */
-                                log_error("Child died too early.");
-                                r = -ESRCH;
-                                goto finish;
-                        }
-
-                        r = setup_uid_map(pid);
-                        if (r < 0)
-                                goto finish;
-
-                        (void) barrier_place(&barrier); /* #2 */
-                }
-
-                if (arg_private_network) {
-
-                        r = move_network_interfaces(pid, arg_network_interfaces);
-                        if (r < 0)
-                                goto finish;
-
-                        if (arg_network_veth) {
-                                r = setup_veth(arg_machine, pid, veth_name,
-                                               arg_network_bridge || arg_network_zone);
-                                if (r < 0)
-                                        goto finish;
-                                else if (r > 0)
-                                        ifi = r;
-
-                                if (arg_network_bridge) {
-                                        /* Add the interface to a bridge */
-                                        r = setup_bridge(veth_name, arg_network_bridge, false);
-                                        if (r < 0)
-                                                goto finish;
-                                        if (r > 0)
-                                                ifi = r;
-                                } else if (arg_network_zone) {
-                                        /* Add the interface to a bridge, possibly creating it */
-                                        r = setup_bridge(veth_name, arg_network_zone, true);
-                                        if (r < 0)
-                                                goto finish;
-                                        if (r > 0)
-                                                ifi = r;
-                                }
-                        }
-
-                        r = setup_veth_extra(arg_machine, pid, arg_network_veth_extra);
-                        if (r < 0)
-                                goto finish;
-
-                        /* We created the primary and extra veth links now; let's remember this, so that we know to
-                           remove them later on. Note that we don't bother with removing veth links that were created
-                           here when their setup failed half-way, because in that case the kernel should be able to
-                           remove them on its own, since they cannot be referenced by anything yet. */
-                        veth_created = true;
-
-                        r = setup_macvlan(arg_machine, pid, arg_network_macvlan);
-                        if (r < 0)
-                                goto finish;
-
-                        r = setup_ipvlan(arg_machine, pid, arg_network_ipvlan);
-                        if (r < 0)
-                                goto finish;
-                }
-
-                if (arg_register) {
-                        r = register_machine(
-                                        arg_machine,
-                                        pid,
-                                        arg_directory,
-                                        arg_uuid,
-                                        ifi,
-                                        arg_slice,
-                                        arg_custom_mounts, arg_n_custom_mounts,
-                                        arg_kill_signal,
-                                        arg_property,
-                                        arg_keep_unit,
-                                        arg_container_service_name);
-                        if (r < 0)
-                                goto finish;
-                }
-
-                r = sync_cgroup(pid, arg_unified_cgroup_hierarchy);
-                if (r < 0)
-                        goto finish;
-
-                if (arg_keep_unit) {
-                        r = create_subcgroup(pid, arg_unified_cgroup_hierarchy);
-                        if (r < 0)
-                                goto finish;
-                }
-
-                r = chown_cgroup(pid, arg_uid_shift);
-                if (r < 0)
-                        goto finish;
-
-                /* Notify the child that the parent is ready with all
-                 * its setup (including cgroup-ification), and that
-                 * the child can now hand over control to the code to
-                 * run inside the container. */
-                (void) barrier_place(&barrier); /* #3 */
-
-                /* Block SIGCHLD here, before notifying child.
-                 * process_pty() will handle it with the other signals. */
-                assert_se(sigprocmask(SIG_BLOCK, &mask_chld, NULL) >= 0);
-
-                /* Reset signal to default */
-                r = default_signals(SIGCHLD, -1);
-                if (r < 0) {
-                        log_error_errno(r, "Failed to reset SIGCHLD: %m");
-                        goto finish;
-                }
-
-                r = sd_event_new(&event);
-                if (r < 0) {
-                        log_error_errno(r, "Failed to get default event source: %m");
-                        goto finish;
-                }
-
-                r = setup_sd_notify_parent(event, notify_socket, PID_TO_PTR(pid));
-                if (r < 0)
-                        goto finish;
-
-                /* Let the child know that we are ready and wait that the child is completely ready now. */
-                if (!barrier_place_and_sync(&barrier)) { /* #4 */
-                        log_error("Child died too early.");
-                        r = -ESRCH;
-                        goto finish;
-                }
-
-                /* At this point we have made use of the UID we picked, and thus nss-mymachines will make them appear
-                 * in getpwuid(), thus we can release the /etc/passwd lock. */
-                etc_passwd_lock = safe_close(etc_passwd_lock);
-
-                sd_notifyf(false,
-                           "STATUS=Container running.\n"
-                           "X_NSPAWN_LEADER_PID=" PID_FMT, pid);
-                if (!arg_notify_ready)
-                        sd_notify(false, "READY=1\n");
-
-                if (arg_kill_signal > 0) {
-                        /* Try to kill the init system on SIGINT or SIGTERM */
-                        sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, PID_TO_PTR(pid));
-                        sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, PID_TO_PTR(pid));
-                } else {
-                        /* Immediately exit */
-                        sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
-                        sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
-                }
-
-                /* simply exit on sigchld */
-                sd_event_add_signal(event, NULL, SIGCHLD, NULL, NULL);
-
-                if (arg_expose_ports) {
-                        r = expose_port_watch_rtnl(event, rtnl_socket_pair[0], on_address_change, &exposed, &rtnl);
-                        if (r < 0)
-                                goto finish;
-
-                        (void) expose_port_execute(rtnl, arg_expose_ports, &exposed);
-                }
-
-                rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
-
-                r = pty_forward_new(event, master, PTY_FORWARD_IGNORE_VHANGUP | (interactive ? 0 : PTY_FORWARD_READ_ONLY), &forward);
-                if (r < 0) {
-                        log_error_errno(r, "Failed to create PTY forwarder: %m");
-                        goto finish;
-                }
-
-                r = sd_event_loop(event);
-                if (r < 0) {
-                        log_error_errno(r, "Failed to run event loop: %m");
-                        goto finish;
-                }
-
-                pty_forward_get_last_char(forward, &last_char);
-
-                forward = pty_forward_free(forward);
-
-                if (!arg_quiet && last_char != '\n')
-                        putc('\n', stdout);
-
-                /* Kill if it is not dead yet anyway */
-                if (arg_register && !arg_keep_unit)
-                        terminate_machine(pid);
-
-                /* Normally redundant, but better safe than sorry */
-                kill(pid, SIGKILL);
-
-                r = wait_for_container(pid, &container_status);
-                pid = 0;
-
-                if (r < 0)
-                        /* We failed to wait for the container, or the
-                         * container exited abnormally */
-                        goto finish;
-                else if (r > 0 || container_status == CONTAINER_TERMINATED) {
-                        /* The container exited with a non-zero
-                         * status, or with zero status and no reboot
-                         * was requested. */
-                        ret = r;
+                r = run(master,
+                        console,
+                        root_device, root_device_rw,
+                        home_device, home_device_rw,
+                        srv_device, srv_device_rw,
+                        esp_device,
+                        interactive, secondary,
+                        fds,
+                        veth_name, &veth_created,
+                        &exposed,
+                        &pid, &ret);
+                if (r <= 0)
                         break;
-                }
-
-                /* CONTAINER_REBOOTED, loop again */
-
-                if (arg_keep_unit) {
-                        /* Special handling if we are running as a
-                         * service: instead of simply restarting the
-                         * machine we want to restart the entire
-                         * service, so let's inform systemd about this
-                         * with the special exit code 133. The service
-                         * file uses RestartForceExitStatus=133 so
-                         * that this results in a full nspawn
-                         * restart. This is necessary since we might
-                         * have cgroup parameters set we want to have
-                         * flushed out. */
-                        ret = 133;
-                        r = 0;
-                        break;
-                }
-
-                expose_port_flush(arg_expose_ports, &exposed);
-
-                (void) remove_veth_links(veth_name, arg_network_veth_extra);
-                veth_created = false;
         }
 
 finish:
diff --git a/src/systemd-nspawn/systemd-nspawn.xml b/src/systemd-nspawn/systemd-nspawn.xml
index 69d2f6ff7d..c449edee89 100644
--- a/src/systemd-nspawn/systemd-nspawn.xml
+++ b/src/systemd-nspawn/systemd-nspawn.xml
@@ -274,8 +274,7 @@
         signals. It is recommended to use this mode to invoke arbitrary commands in containers, unless they have been
         modified to run correctly as PID 1. Or in other words: this switch should be used for pretty much all commands,
         except when the command refers to an init or shell implementation, as these are generally capable of running
-        correctly as PID 1. This option may not be combined with <option>--boot</option> or
-        <option>--share-system</option>.</para>
+        correctly as PID 1. This option may not be combined with <option>--boot</option>.</para>
         </listitem>
       </varlistentry>
 
@@ -285,8 +284,7 @@
 
         <listitem><para>Automatically search for an init binary and invoke it as PID 1, instead of a shell or a user
         supplied program. If this option is used, arguments specified on the command line are used as arguments for the
-        init binary. This option may not be combined with <option>--as-pid2</option> or
-        <option>--share-system</option>.</para>
+        init binary. This option may not be combined with <option>--as-pid2</option>.</para>
 
         <para>The following table explains the different modes of invocation and relationship to
         <option>--as-pid2</option> (see above):</para>
@@ -407,41 +405,42 @@
         purposes (usually in the range beyond the host's UID/GID 65536). The parameter may be specified as follows:</para>
 
         <orderedlist>
-          <listitem><para>The value <literal>no</literal> turns off user namespacing. This is the default.</para></listitem>
-
-          <listitem><para>The value <literal>yes</literal> (or the omission of a parameter) turns on user
-          namespacing. The UID/GID range to use is determined automatically from the file ownership of the root
-          directory of the container's directory tree. To use this option, make sure to prepare the directory tree in
-          advance, and ensure that all files and directories in it are owned by UIDs/GIDs in the range you'd like to
-          use. Also, make sure that used file ACLs exclusively reference UIDs/GIDs in the appropriate range. If this
-          mode is used the number of UIDs/GIDs assigned to the container for use is 65536, and the UID/GID of the
-          root directory must be a multiple of 65536.</para></listitem>
-
-          <listitem><para>The value "pick" turns on user namespacing. In this case the UID/GID range is automatically
-          chosen. As first step, the file owner of the root directory of the container's directory tree is read, and it
-          is checked that it is currently not used by the system otherwise (in particular, that no other container is
-          using it). If this check is successful, the UID/GID range determined this way is used, similar to the
-          behaviour if "yes" is specified. If the check is not successful (and thus the UID/GID range indicated in the
-          root directory's file owner is already used elsewhere) a new – currently unused – UID/GID range of 65536
-          UIDs/GIDs is randomly chosen between the host UID/GIDs of 524288 and 1878982656, always starting at a
-          multiple of 65536. This setting implies <option>--private-users-chown</option> (see below), which has the
-          effect that the files and directories in the container's directory tree will be owned by the appropriate
-          users of the range picked. Using this option makes user namespace behaviour fully automatic. Note that the
-          first invocation of a previously unused container image might result in picking a new UID/GID range for it,
-          and thus in the (possibly expensive) file ownership adjustment operation. However, subsequent invocations of
-          the container will be cheap (unless of course the picked UID/GID range is assigned to a different use by
-          then).</para></listitem>
-
-          <listitem><para>Finally if one or two colon-separated numeric parameters are specified, user namespacing is
-          turned on, too. The first parameter specifies the first host UID/GID to assign to the container, the second
-          parameter specifies the number of host UIDs/GIDs to assign to the container. If the second parameter is
-          omitted, 65536 UIDs/GIDs are assigned.</para></listitem>
+          <listitem><para>If one or two colon-separated numbers are specified, user namespacing is turned on. The first
+          parameter specifies the first host UID/GID to assign to the container, the second parameter specifies the
+          number of host UIDs/GIDs to assign to the container. If the second parameter is omitted, 65536 UIDs/GIDs are
+          assigned.</para></listitem>
+
+          <listitem><para>If the parameter is omitted, or true, user namespacing is turned on. The UID/GID range to
+          use is determined automatically from the file ownership of the root directory of the container's directory
+          tree. To use this option, make sure to prepare the directory tree in advance, and ensure that all files and
+          directories in it are owned by UIDs/GIDs in the range you'd like to use. Also, make sure that used file ACLs
+          exclusively reference UIDs/GIDs in the appropriate range. If this mode is used the number of UIDs/GIDs
+          assigned to the container for use is 65536, and the UID/GID of the root directory must be a multiple of
+          65536.</para></listitem>
+
+          <listitem><para>If the parameter is false, user namespacing is turned off. This is the default.</para>
+          </listitem>
+
+          <listitem><para>The special value <literal>pick</literal> turns on user namespacing. In this case the UID/GID
+          range is automatically chosen. As a first step, the file owner of the root directory of the container's
+          directory tree is read, and it is checked that it is currently not used by the system otherwise (in
+          particular, that no other container is using it). If this check is successful, the UID/GID range determined
+          this way is used, similar to the behavior if <literal>yes</literal> is specified. If the check is not successful (and thus
+          the UID/GID range indicated in the root directory's file owner is already used elsewhere) a new – currently
+          unused – UID/GID range of 65536 UIDs/GIDs is randomly chosen between the host UID/GIDs of 524288 and
+          1878982656, always starting at a multiple of 65536. This setting implies
+          <option>--private-users-chown</option> (see below), which has the effect that the files and directories in
+          the container's directory tree will be owned by the appropriate users of the range picked. Using this option
+          makes user namespace behavior fully automatic. Note that the first invocation of a previously unused
+          container image might result in picking a new UID/GID range for it, and thus in the (possibly expensive) file
+          ownership adjustment operation. However, subsequent invocations of the container will be cheap (unless of
+          course the picked UID/GID range is assigned to a different use by then).</para></listitem>
         </orderedlist>
 
         <para>It is recommended to assign at least 65536 UIDs/GIDs to each container, so that the usable UID/GID range in the
         container covers 16 bit. For best security, do not assign overlapping UID/GID ranges to multiple containers. It is
         hence a good idea to use the upper 16 bit of the host 32-bit UIDs/GIDs as container identifier, while the lower 16
-        bit encode the container UID/GID used. This is in fact the behaviour enforced by the
+        bit encode the container UID/GID used. This is in fact the behavior enforced by the
         <option>--private-users=pick</option> option.</para>
 
         <para>When user namespaces are used, the GID range assigned to each container is always chosen identical to the
@@ -456,17 +455,6 @@
       </varlistentry>
 
       <varlistentry>
-        <term><option>-U</option></term>
-
-        <listitem><para>If the kernel supports the user namespaces feature, equivalent to
-        <option>--private-users=pick</option>, otherwise equivalent to
-        <option>--private-users=no</option>.</para>
-
-        <para>Note that <option>-U</option> is the default if the <filename>systemd-nspawn@.service</filename> template unit
-        file is used.</para></listitem>
-      </varlistentry>
-
-      <varlistentry>
         <term><option>--private-users-chown</option></term>
 
         <listitem><para>If specified, all files and directories in the container's directory tree will adjusted so that
@@ -479,6 +467,23 @@
       </varlistentry>
 
       <varlistentry>
+        <term><option>-U</option></term>
+
+        <listitem><para>If the kernel supports the user namespaces feature, equivalent to
+        <option>--private-users=pick --private-users-chown</option>, otherwise equivalent to
+        <option>--private-users=no</option>.</para>
+
+        <para>Note that <option>-U</option> is the default if the
+        <filename>systemd-nspawn@.service</filename> template unit file is used.</para>
+
+        <para>Note: it is possible to undo the effect of <option>--private-users-chown</option> (or
+        <option>-U</option>) on the file system by redoing the operation with the first UID of 0:</para>
+
+        <programlisting>systemd-nspawn … --private-users=0 --private-users-chown</programlisting>
+        </listitem>
+      </varlistentry>
+
+      <varlistentry>
         <term><option>--private-network</option></term>
 
         <listitem><para>Disconnect networking of the container from
@@ -717,7 +722,7 @@
         and the subdirectory is symlinked into the host at the same
         location. <literal>try-host</literal> and
         <literal>try-guest</literal> do the same but do not fail if
-        the host does not have persistent journalling enabled. If
+        the host does not have persistent journaling enabled. If
         <literal>auto</literal> (the default), and the right
         subdirectory of <filename>/var/log/journal</filename> exists,
         it will be bind mounted into the container. If the
@@ -847,23 +852,6 @@
       </varlistentry>
 
       <varlistentry>
-        <term><option>--share-system</option></term>
-
-        <listitem><para>Allows the container to share certain system
-        facilities with the host. More specifically, this turns off
-        PID namespacing, UTS namespacing and IPC namespacing, and thus
-        allows the guest to see and interact more easily with
-        processes outside of the container. Note that using this
-        option makes it impossible to start up a full Operating System
-        in the container, as an init system cannot operate in this
-        mode. It is only useful to run specific programs or
-        applications this way, without involving an init system in the
-        container. This option implies <option>--register=no</option>.
-        This option may not be combined with
-        <option>--boot</option>.</para></listitem>
-      </varlistentry>
-
-      <varlistentry>
         <term><option>--register=</option></term>
 
         <listitem><para>Controls whether the container is registered
@@ -877,9 +865,7 @@
         and shown by tools such as
         <citerefentry project='man-pages'><refentrytitle>ps</refentrytitle><manvolnum>1</manvolnum></citerefentry>.
         If the container does not run an init system, it is
-        recommended to set this option to <literal>no</literal>. Note
-        that <option>--share-system</option> implies
-        <option>--register=no</option>. </para></listitem>
+        recommended to set this option to <literal>no</literal>.</para></listitem>
       </varlistentry>
 
       <varlistentry>
@@ -1037,9 +1023,9 @@
     </example>
 
     <example>
-      <title>Spawn a shell in a container of a minimal gNewSense unstable distribution</title>
+      <title>Spawn a shell in a container of a minimal gNewSense Ucclia distribution</title>
 
-      <programlisting># debootstrap --arch=amd64 unstable ~/gnewsense-tree/
+      <programlisting># debootstrap --arch=amd64 ucclia ~/gnewsense-tree/
 # systemd-nspawn -D ~/gnewsense-tree/</programlisting>
 
       <para>This installs a minimal gNewSense unstable distribution into
@@ -1048,12 +1034,12 @@
     </example>
 
     <example>
-      <title>Boot a minimal Parabola GNU/Linux-libre distribution in a container</title>
+      <title>Boot a minimal Parabola distribution in a container</title>
 
       <programlisting># pacstrap -c -d ~/parabola-tree/ base
 # systemd-nspawn -bD ~/parabola-tree/</programlisting>
 
-      <para>This installs a minimal Parabola GNU/Linux-libre distribution into the
+      <para>This installs a minimal Parabola distribution into the
       directory <filename>~/parabola-tree/</filename> and then boots an OS
       in a namespace container in it.</para>
     </example>