Merge branch 'systemd/parabola' into notsystemd/premove

# Conflicts: # Makefile.amp
author: Luke Shumaker <lukeshu@sbcglobal.net> 2016-12-17 02:47:02 -0500
committer: Luke Shumaker <lukeshu@sbcglobal.net> 2016-12-17 02:47:02 -0500
commit: a4d083550a7273b895b44aac8d2ff7e2fdb1f7d5 (patch)
tree: 6f148433641f8c92d6f1eddcb2199a78dbd111a0 /src/nspawn
parent: b6d071f1df46eb841ba3f88cdb2b248eaf5f35f8 (diff)
parent: 86e9bb69ae74bd960e1fd427258f41d54240d6d1 (diff)
8 files changed, 896 insertions, 783 deletions
diff --git a/src/nspawn/nspawn-cgroup.c b/src/nspawn/nspawn-cgroup.c
index b1580236c9..5274767b96 100644
--- a/src/nspawn/nspawn-cgroup.c
+++ b/src/nspawn/nspawn-cgroup.c
@@ -20,32 +20,23 @@
 #include <sys/mount.h>
 
 #include "alloc-util.h"
-#include "cgroup-util.h"
 #include "fd-util.h"
 #include "fileio.h"
 #include "mkdir.h"
+#include "mount-util.h"
 #include "nspawn-cgroup.h"
+#include "rm-rf.h"
 #include "string-util.h"
 #include "strv.h"
 #include "util.h"
 
-int chown_cgroup(pid_t pid, uid_t uid_shift) {
-        _cleanup_free_ char *path = NULL, *fs = NULL;
+static int chown_cgroup_path(const char *path, uid_t uid_shift) {
         _cleanup_close_ int fd = -1;
         const char *fn;
-        int r;
-
-        r = cg_pid_get_path(NULL, pid, &path);
-        if (r < 0)
-                return log_error_errno(r, "Failed to get container cgroup path: %m");
 
-        r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, path, NULL, &fs);
-        if (r < 0)
-                return log_error_errno(r, "Failed to get file system path for container cgroup: %m");
-
-        fd = open(fs, O_RDONLY|O_CLOEXEC|O_DIRECTORY);
+        fd = open(path, O_RDONLY|O_CLOEXEC|O_DIRECTORY);
         if (fd < 0)
-                return log_error_errno(errno, "Failed to open %s: %m", fs);
+                return -errno;
 
         FOREACH_STRING(fn,
                        ".",
@@ -63,18 +54,37 @@ int chown_cgroup(pid_t pid, uid_t uid_shift) {
         return 0;
 }
 
-int sync_cgroup(pid_t pid, bool unified_requested) {
+int chown_cgroup(pid_t pid, uid_t uid_shift) {
+        _cleanup_free_ char *path = NULL, *fs = NULL;
+        int r;
+
+        r = cg_pid_get_path(NULL, pid, &path);
+        if (r < 0)
+                return log_error_errno(r, "Failed to get container cgroup path: %m");
+
+        r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, path, NULL, &fs);
+        if (r < 0)
+                return log_error_errno(r, "Failed to get file system path for container cgroup: %m");
+
+        r = chown_cgroup_path(fs, uid_shift);
+        if (r < 0)
+                return log_error_errno(r, "Failed to chown() cgroup %s: %m", fs);
+
+        return 0;
+}
+
+int sync_cgroup(pid_t pid, CGroupUnified unified_requested, uid_t arg_uid_shift) {
         _cleanup_free_ char *cgroup = NULL;
         char tree[] = "/tmp/unifiedXXXXXX", pid_string[DECIMAL_STR_MAX(pid) + 1];
         bool undo_mount = false;
         const char *fn;
         int unified, r;
 
-        unified = cg_unified();
+        unified = cg_unified(SYSTEMD_CGROUP_CONTROLLER);
         if (unified < 0)
                 return log_error_errno(unified, "Failed to determine whether the unified hierarchy is used: %m");
 
-        if ((unified > 0) == unified_requested)
+        if ((unified > 0) == (unified_requested >= CGROUP_UNIFIED_SYSTEMD))
                 return 0;
 
         /* When the host uses the legacy cgroup setup, but the
@@ -91,33 +101,45 @@ int sync_cgroup(pid_t pid, bool unified_requested) {
                 return log_error_errno(errno, "Failed to generate temporary mount point for unified hierarchy: %m");
 
         if (unified)
-                r = mount("cgroup", tree, "cgroup", MS_NOSUID|MS_NOEXEC|MS_NODEV, "none,name=systemd,xattr");
+                r = mount_verbose(LOG_ERR, "cgroup", tree, "cgroup",
+                                  MS_NOSUID|MS_NOEXEC|MS_NODEV, "none,name=systemd,xattr");
         else
-                r = mount("cgroup", tree, "cgroup2", MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL);
-        if (r < 0) {
-                r = log_error_errno(errno, "Failed to mount unified hierarchy: %m");
+                r = mount_verbose(LOG_ERR, "cgroup", tree, "cgroup2",
+                                  MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL);
+        if (r < 0)
                 goto finish;
-        }
 
         undo_mount = true;
 
+        /* If nspawn dies abruptly the cgroup hierarchy created below
+         * its unit isn't cleaned up. So, let's remove it
+         * https://github.com/systemd/systemd/pull/4223#issuecomment-252519810 */
+        fn = strjoina(tree, cgroup);
+        (void) rm_rf(fn, REMOVE_ROOT|REMOVE_ONLY_DIRECTORIES);
+
         fn = strjoina(tree, cgroup, "/cgroup.procs");
         (void) mkdir_parents(fn, 0755);
 
         sprintf(pid_string, PID_FMT, pid);
         r = write_string_file(fn, pid_string, 0);
-        if (r < 0)
+        if (r < 0) {
                 log_error_errno(r, "Failed to move process: %m");
+                goto finish;
+        }
 
+        fn = strjoina(tree, cgroup);
+        r = chown_cgroup_path(fn, arg_uid_shift);
+        if (r < 0)
+                log_error_errno(r, "Failed to chown() cgroup %s: %m", fn);
 finish:
         if (undo_mount)
-                (void) umount(tree);
+                (void) umount_verbose(tree);
 
         (void) rmdir(tree);
         return r;
 }
 
-int create_subcgroup(pid_t pid, bool unified_requested) {
+int create_subcgroup(pid_t pid, CGroupUnified unified_requested) {
         _cleanup_free_ char *cgroup = NULL;
         const char *child;
         int unified, r;
@@ -129,10 +151,10 @@ int create_subcgroup(pid_t pid, bool unified_requested) {
          * did not create a scope unit for the container move us and
          * the container into two separate subcgroups. */
 
-        if (!unified_requested)
+        if (unified_requested == CGROUP_UNIFIED_NONE)
                 return 0;
 
-        unified = cg_unified();
+        unified = cg_unified(SYSTEMD_CGROUP_CONTROLLER);
         if (unified < 0)
                 return log_error_errno(unified, "Failed to determine whether the unified hierarchy is used: %m");
         if (unified == 0)
diff --git a/src/nspawn/nspawn-cgroup.h b/src/nspawn/nspawn-cgroup.h
index 1ff35a299a..fa4321ab43 100644
--- a/src/nspawn/nspawn-cgroup.h
+++ b/src/nspawn/nspawn-cgroup.h
@@ -22,6 +22,8 @@
 #include <stdbool.h>
 #include <sys/types.h>
 
+#include "cgroup-util.h"
+
 int chown_cgroup(pid_t pid, uid_t uid_shift);
-int sync_cgroup(pid_t pid, bool unified_requested);
-int create_subcgroup(pid_t pid, bool unified_requested);
+int sync_cgroup(pid_t pid, CGroupUnified unified_requested, uid_t uid_shift);
+int create_subcgroup(pid_t pid, CGroupUnified unified_requested);
diff --git a/src/nspawn/nspawn-mount.c b/src/nspawn/nspawn-mount.c
index 803caef3dd..2dabe2ae5b 100644
--- a/src/nspawn/nspawn-mount.c
+++ b/src/nspawn/nspawn-mount.c
@@ -21,7 +21,6 @@
 #include <linux/magic.h>
 
 #include "alloc-util.h"
-#include "cgroup-util.h"
 #include "escape.h"
 #include "fd-util.h"
 #include "fileio.h"
@@ -251,8 +250,10 @@ int mount_sysfs(const char *dest) {
 
         (void) mkdir(full, 0755);
 
-        if (mount("sysfs", full, "sysfs", MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL) < 0)
-                return log_error_errno(errno, "Failed to mount sysfs to %s: %m", full);
+        r = mount_verbose(LOG_ERR, "sysfs", full, "sysfs",
+                          MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL);
+        if (r < 0)
+                return r;
 
         FOREACH_STRING(x, "block", "bus", "class", "dev", "devices", "kernel") {
                 _cleanup_free_ char *from = NULL, *to = NULL;
@@ -267,15 +268,19 @@ int mount_sysfs(const char *dest) {
 
                 (void) mkdir(to, 0755);
 
-                if (mount(from, to, NULL, MS_BIND, NULL) < 0)
-                        return log_error_errno(errno, "Failed to mount /sys/%s into place: %m", x);
+                r = mount_verbose(LOG_ERR, from, to, NULL, MS_BIND, NULL);
+                if (r < 0)
+                        return r;
 
-                if (mount(NULL, to, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, NULL) < 0)
-                        return log_error_errno(errno, "Failed to mount /sys/%s read-only: %m", x);
+                r = mount_verbose(LOG_ERR, NULL, to, NULL,
+                                  MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, NULL);
+                if (r < 0)
+                        return r;
         }
 
-        if (umount(full) < 0)
-                return log_error_errno(errno, "Failed to unmount %s: %m", full);
+        r = umount_verbose(full);
+        if (r < 0)
+                return r;
 
         if (rmdir(full) < 0)
                 return log_error_errno(errno, "Failed to remove %s: %m", full);
@@ -291,12 +296,63 @@ int mount_sysfs(const char *dest) {
                 (void) mkdir_p(x, 0755);
         }
 
-        if (mount(NULL, top, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, NULL) < 0)
-                return log_error_errno(errno, "Failed to make %s read-only: %m", top);
+        return mount_verbose(LOG_ERR, NULL, top, NULL,
+                             MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, NULL);
+}
+
+static int mkdir_userns(const char *path, mode_t mode, bool in_userns, uid_t uid_shift) {
+        int r;
+
+        assert(path);
+
+        r = mkdir(path, mode);
+        if (r < 0 && errno != EEXIST)
+                return -errno;
+
+        if (!in_userns) {
+                r = lchown(path, uid_shift, uid_shift);
+                if (r < 0)
+                        return -errno;
+        }
 
         return 0;
 }
 
+static int mkdir_userns_p(const char *prefix, const char *path, mode_t mode, bool in_userns, uid_t uid_shift) {
+        const char *p, *e;
+        int r;
+
+        assert(path);
+
+        if (prefix && !path_startswith(path, prefix))
+                return -ENOTDIR;
+
+        /* create every parent directory in the path, except the last component */
+        p = path + strspn(path, "/");
+        for (;;) {
+                char t[strlen(path) + 1];
+
+                e = p + strcspn(p, "/");
+                p = e + strspn(e, "/");
+
+                /* Is this the last component? If so, then we're done */
+                if (*p == 0)
+                        break;
+
+                memcpy(t, path, e - path);
+                t[e-path] = 0;
+
+                if (prefix && path_startswith(prefix, t))
+                        continue;
+
+                r = mkdir_userns(t, mode, in_userns, uid_shift);
+                if (r < 0)
+                        return r;
+        }
+
+        return mkdir_userns(path, mode, in_userns, uid_shift);
+}
+
 int mount_all(const char *dest,
               bool use_userns, bool in_userns,
               bool use_netns,
@@ -315,19 +371,21 @@ int mount_all(const char *dest,
         } MountPoint;
 
         static const MountPoint mount_table[] = {
-                { "proc",            "/proc",           "proc",  NULL,        MS_NOSUID|MS_NOEXEC|MS_NODEV,                              true,  true,  false },
-                { "/proc/sys",       "/proc/sys",       NULL,    NULL,        MS_BIND,                                                   true,  true,  false },   /* Bind mount first ...*/
-                { "/proc/sys/net",   "/proc/sys/net",   NULL,    NULL,        MS_BIND,                                                   true,  true,  true  },   /* (except for this) */
-                { NULL,              "/proc/sys",       NULL,    NULL,        MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, true,  true,  false },   /* ... then, make it r/o */
-                { "tmpfs",           "/sys",            "tmpfs", "mode=755",  MS_NOSUID|MS_NOEXEC|MS_NODEV,                              true,  false, true  },
-                { "sysfs",           "/sys",            "sysfs", NULL,        MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV,                    true,  false, false },
-                { "tmpfs",           "/dev",            "tmpfs", "mode=755",  MS_NOSUID|MS_STRICTATIME,                                  true,  false, false },
-                { "tmpfs",           "/dev/shm",        "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME,                         true,  false, false },
-                { "tmpfs",           "/run",            "tmpfs", "mode=755",  MS_NOSUID|MS_NODEV|MS_STRICTATIME,                         true,  false, false },
-                { "tmpfs",           "/tmp",            "tmpfs", "mode=1777", MS_STRICTATIME,                                            true,  false, false },
+                { "proc",                "/proc",               "proc",  NULL,        MS_NOSUID|MS_NOEXEC|MS_NODEV,                              true,  true,  false },
+                { "/proc/sys",           "/proc/sys",           NULL,    NULL,        MS_BIND,                                                   true,  true,  false },   /* Bind mount first ...*/
+                { "/proc/sys/net",       "/proc/sys/net",       NULL,    NULL,        MS_BIND,                                                   true,  true,  true  },   /* (except for this) */
+                { NULL,                  "/proc/sys",           NULL,    NULL,        MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, true,  true,  false },   /* ... then, make it r/o */
+                { "/proc/sysrq-trigger", "/proc/sysrq-trigger", NULL,    NULL,        MS_BIND,                                                   false, true,  false },   /* Bind mount first ...*/
+                { NULL,                  "/proc/sysrq-trigger", NULL,    NULL,        MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, false, true,  false },   /* ... then, make it r/o */
+                { "tmpfs",               "/sys",                "tmpfs", "mode=755",  MS_NOSUID|MS_NOEXEC|MS_NODEV,                              true,  false, true  },
+                { "sysfs",               "/sys",                "sysfs", NULL,        MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV,                    true,  false, false },
+                { "tmpfs",               "/dev",                "tmpfs", "mode=755",  MS_NOSUID|MS_STRICTATIME,                                  true,  false, false },
+                { "tmpfs",               "/dev/shm",            "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME,                         true,  false, false },
+                { "tmpfs",               "/run",                "tmpfs", "mode=755",  MS_NOSUID|MS_NODEV|MS_STRICTATIME,                         true,  false, false },
+                { "tmpfs",               "/tmp",                "tmpfs", "mode=1777", MS_STRICTATIME,                                            true,  false,  false },
 #ifdef HAVE_SELINUX
-                { "/sys/fs/selinux", "/sys/fs/selinux", NULL,     NULL,       MS_BIND,                                                   false, false, false },  /* Bind mount first */
-                { NULL,              "/sys/fs/selinux", NULL,     NULL,       MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, false, false, false },  /* Then, make it r/o */
+                { "/sys/fs/selinux",     "/sys/fs/selinux",     NULL,     NULL,       MS_BIND,                                                   false, false, false },  /* Bind mount first */
+                { NULL,                  "/sys/fs/selinux",     NULL,     NULL,       MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, false, false, false },  /* Then, make it r/o */
 #endif
         };
 
@@ -356,8 +414,8 @@ int mount_all(const char *dest,
                 if (mount_table[k].what && r > 0)
                         continue;
 
-                r = mkdir_p(where, 0755);
-                if (r < 0) {
+                r = mkdir_userns_p(dest, where, 0755, in_userns, uid_shift);
+                if (r < 0 && r != -EEXIST) {
                         if (mount_table[k].fatal)
                                 return log_error_errno(r, "Failed to create directory %s: %m", where);
 
@@ -367,24 +425,24 @@ int mount_all(const char *dest,
 
                 o = mount_table[k].options;
                 if (streq_ptr(mount_table[k].type, "tmpfs")) {
-                        r = tmpfs_patch_options(o, use_userns, uid_shift, uid_range, false, selinux_apifs_context, &options);
+                        if (in_userns)
+                                r = tmpfs_patch_options(o, use_userns, 0, uid_range, true, selinux_apifs_context, &options);
+                        else
+                                r = tmpfs_patch_options(o, use_userns, uid_shift, uid_range, false, selinux_apifs_context, &options);
                         if (r < 0)
                                 return log_oom();
                         if (r > 0)
                                 o = options;
                 }
 
-                if (mount(mount_table[k].what,
-                          where,
-                          mount_table[k].type,
-                          mount_table[k].flags,
-                          o) < 0) {
-
-                        if (mount_table[k].fatal)
-                                return log_error_errno(errno, "mount(%s) failed: %m", where);
-
-                        log_warning_errno(errno, "mount(%s) failed, ignoring: %m", where);
-                }
+                r = mount_verbose(mount_table[k].fatal ? LOG_ERR : LOG_WARNING,
+                                  mount_table[k].what,
+                                  where,
+                                  mount_table[k].type,
+                                  mount_table[k].flags,
+                                  o);
+                if (r < 0 && mount_table[k].fatal)
+                        return r;
         }
 
         return 0;
@@ -469,15 +527,15 @@ static int mount_bind(const char *dest, CustomMount *m) {
                 if (r < 0)
                         return log_error_errno(r, "Failed to create mount point %s: %m", where);
 
-        } else {
+        } else
                 return log_error_errno(errno, "Failed to stat %s: %m", where);
-        }
 
-        if (mount(m->source, where, NULL, mount_flags, mount_opts) < 0)
-                return log_error_errno(errno, "mount(%s) failed: %m", where);
+        r = mount_verbose(LOG_ERR, m->source, where, NULL, mount_flags, mount_opts);
+        if (r < 0)
+                return r;
 
         if (m->read_only) {
-                r = bind_remount_recursive(where, true);
+                r = bind_remount_recursive(where, true, NULL);
                 if (r < 0)
                         return log_error_errno(r, "Read-only bind mount failed: %m");
         }
@@ -509,10 +567,7 @@ static int mount_tmpfs(
                 return log_oom();
         options = r > 0 ? buf : m->options;
 
-        if (mount("tmpfs", where, "tmpfs", MS_NODEV|MS_STRICTATIME, options) < 0)
-                return log_error_errno(errno, "tmpfs mount to %s failed: %m", where);
-
-        return 0;
+        return mount_verbose(LOG_ERR, "tmpfs", where, "tmpfs", MS_NODEV|MS_STRICTATIME, options);
 }
 
 static char *joined_and_escaped_lower_dirs(char * const *lower) {
@@ -574,10 +629,7 @@ static int mount_overlay(const char *dest, CustomMount *m) {
                 options = strjoina("lowerdir=", lower, ",upperdir=", escaped_source, ",workdir=", escaped_work_dir);
         }
 
-        if (mount("overlay", where, "overlay", m->read_only ? MS_RDONLY : 0, options) < 0)
-                return log_error_errno(errno, "overlay mount to %s failed: %m", where);
-
-        return 0;
+        return mount_verbose(LOG_ERR, "overlay", where, "overlay", m->read_only ? MS_RDONLY : 0, options);
 }
 
 int mount_custom(
@@ -636,8 +688,6 @@ static int get_controllers(Set *subsystems) {
                 int r;
                 char *e, *l, *p;
 
-                truncate_nl(line);
-
                 l = strchr(line, ':');
                 if (!l)
                         continue;
@@ -649,10 +699,13 @@ static int get_controllers(Set *subsystems) {
 
                 *e = 0;
 
-                if (streq(l, "") || streq(l, "name=systemd"))
+                if (STR_IN_SET(l, "", "name=systemd"))
                         continue;
 
                 p = strdup(l);
+                if (!p)
+                        return -ENOMEM;
+
                 r = set_consume(subsystems, p);
                 if (r < 0)
                         return r;
@@ -661,8 +714,9 @@ static int get_controllers(Set *subsystems) {
         return 0;
 }
 
-static int mount_legacy_cgroup_hierarchy(const char *dest, const char *controller, const char *hierarchy, bool read_only) {
-        char *to;
+static int mount_legacy_cgroup_hierarchy(const char *dest, const char *controller, const char *hierarchy,
+                                         CGroupUnified unified_requested, bool read_only) {
+        const char *to, *fstype, *opts;
         int r;
 
         to = strjoina(strempty(dest), "/sys/fs/cgroup/", hierarchy);
@@ -677,22 +731,38 @@ static int mount_legacy_cgroup_hierarchy(const char *dest, const char *controlle
 
         /* The superblock mount options of the mount point need to be
          * identical to the hosts', and hence writable... */
-        if (mount("cgroup", to, "cgroup", MS_NOSUID|MS_NOEXEC|MS_NODEV, controller) < 0)
-                return log_error_errno(errno, "Failed to mount to %s: %m", to);
+        if (streq(controller, SYSTEMD_CGROUP_CONTROLLER)) {
+                if (unified_requested >= CGROUP_UNIFIED_SYSTEMD) {
+                        fstype = "cgroup2";
+                        opts = NULL;
+                } else {
+                        fstype = "cgroup";
+                        opts = "none,name=systemd,xattr";
+                }
+        } else {
+                fstype = "cgroup";
+                opts = controller;
+        }
 
-        /* ... hence let's only make the bind mount read-only, not the
-         * superblock. */
+        r = mount_verbose(LOG_ERR, "cgroup", to, fstype, MS_NOSUID|MS_NOEXEC|MS_NODEV, opts);
+        if (r < 0)
+                return r;
+
+        /* ... hence let's only make the bind mount read-only, not the superblock. */
         if (read_only) {
-                if (mount(NULL, to, NULL, MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL) < 0)
-                        return log_error_errno(errno, "Failed to remount %s read-only: %m", to);
+                r = mount_verbose(LOG_ERR, NULL, to, NULL,
+                                  MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL);
+                if (r < 0)
+                        return r;
         }
+
         return 1;
 }
 
 /* Mount a legacy cgroup hierarchy when cgroup namespaces are supported. */
 static int mount_legacy_cgns_supported(
-                bool userns, uid_t uid_shift, uid_t uid_range,
-                const char *selinux_apifs_context) {
+                CGroupUnified unified_requested, bool userns, uid_t uid_shift,
+                uid_t uid_range, const char *selinux_apifs_context) {
         _cleanup_set_free_free_ Set *controllers = NULL;
         const char *cgroup_root = "/sys/fs/cgroup", *c;
         int r;
@@ -717,11 +787,13 @@ static int mount_legacy_cgns_supported(
                 if (r < 0)
                         return log_oom();
 
-                if (mount("tmpfs", cgroup_root, "tmpfs", MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME, options) < 0)
-                        return log_error_errno(errno, "Failed to mount /sys/fs/cgroup: %m");
+                r = mount_verbose(LOG_ERR, "tmpfs", cgroup_root, "tmpfs",
+                                  MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME, options);
+                if (r < 0)
+                        return r;
         }
 
-        if (cg_unified() > 0)
+        if (cg_all_unified() > 0)
                 goto skip_controllers;
 
         controllers = set_new(&string_hash_ops);
@@ -739,7 +811,7 @@ static int mount_legacy_cgns_supported(
                 if (!controller)
                         break;
 
-                r = mount_legacy_cgroup_hierarchy("", controller, controller, !userns);
+                r = mount_legacy_cgroup_hierarchy("", controller, controller, unified_requested, !userns);
                 if (r < 0)
                         return r;
 
@@ -773,14 +845,13 @@ static int mount_legacy_cgns_supported(
         }
 
 skip_controllers:
-        r = mount_legacy_cgroup_hierarchy("", "none,name=systemd,xattr", "systemd", false);
+        r = mount_legacy_cgroup_hierarchy("", SYSTEMD_CGROUP_CONTROLLER, "systemd", unified_requested, false);
         if (r < 0)
                 return r;
 
-        if (!userns) {
-                if (mount(NULL, cgroup_root, NULL, MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME|MS_RDONLY, "mode=755") < 0)
-                        return log_error_errno(errno, "Failed to remount %s read-only: %m", cgroup_root);
-        }
+        if (!userns)
+                return mount_verbose(LOG_ERR, NULL, cgroup_root, NULL,
+                                     MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME|MS_RDONLY, "mode=755");
 
         return 0;
 }
@@ -788,7 +859,7 @@ skip_controllers:
 /* Mount legacy cgroup hierarchy when cgroup namespaces are unsupported. */
 static int mount_legacy_cgns_unsupported(
                 const char *dest,
-                bool userns, uid_t uid_shift, uid_t uid_range,
+                CGroupUnified unified_requested, bool userns, uid_t uid_shift, uid_t uid_range,
                 const char *selinux_apifs_context) {
         _cleanup_set_free_free_ Set *controllers = NULL;
         const char *cgroup_root;
@@ -809,11 +880,13 @@ static int mount_legacy_cgns_unsupported(
                 if (r < 0)
                         return log_oom();
 
-                if (mount("tmpfs", cgroup_root, "tmpfs", MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME, options) < 0)
-                        return log_error_errno(errno, "Failed to mount /sys/fs/cgroup: %m");
+                r = mount_verbose(LOG_ERR, "tmpfs", cgroup_root, "tmpfs",
+                                  MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME, options);
+                if (r < 0)
+                        return r;
         }
 
-        if (cg_unified() > 0)
+        if (cg_all_unified() > 0)
                 goto skip_controllers;
 
         controllers = set_new(&string_hash_ops);
@@ -839,7 +912,7 @@ static int mount_legacy_cgns_unsupported(
                 if (r == -EINVAL) {
                         /* Not a symbolic link, but directly a single cgroup hierarchy */
 
-                        r = mount_legacy_cgroup_hierarchy(dest, controller, controller, true);
+                        r = mount_legacy_cgroup_hierarchy(dest, controller, controller, unified_requested, true);
                         if (r < 0)
                                 return r;
 
@@ -859,7 +932,7 @@ static int mount_legacy_cgns_unsupported(
                                 continue;
                         }
 
-                        r = mount_legacy_cgroup_hierarchy(dest, combined, combined, true);
+                        r = mount_legacy_cgroup_hierarchy(dest, combined, combined, unified_requested, true);
                         if (r < 0)
                                 return r;
 
@@ -872,14 +945,12 @@ static int mount_legacy_cgns_unsupported(
         }
 
 skip_controllers:
-        r = mount_legacy_cgroup_hierarchy(dest, "none,name=systemd,xattr", "systemd", false);
+        r = mount_legacy_cgroup_hierarchy(dest, SYSTEMD_CGROUP_CONTROLLER, "systemd", unified_requested, false);
         if (r < 0)
                 return r;
 
-        if (mount(NULL, cgroup_root, NULL, MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME|MS_RDONLY, "mode=755") < 0)
-                return log_error_errno(errno, "Failed to remount %s read-only: %m", cgroup_root);
-
-        return 0;
+        return mount_verbose(LOG_ERR, NULL, cgroup_root, NULL,
+                             MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME|MS_RDONLY, "mode=755");
 }
 
 static int mount_unified_cgroups(const char *dest) {
@@ -906,30 +977,27 @@ static int mount_unified_cgroups(const char *dest) {
                 return -EINVAL;
         }
 
-        if (mount("cgroup", p, "cgroup2", MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL) < 0)
-                return log_error_errno(errno, "Failed to mount unified cgroup hierarchy to %s: %m", p);
-
-        return 0;
+        return mount_verbose(LOG_ERR, "cgroup", p, "cgroup2", MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL);
 }
 
 int mount_cgroups(
                 const char *dest,
-                bool unified_requested,
+                CGroupUnified unified_requested,
                 bool userns, uid_t uid_shift, uid_t uid_range,
                 const char *selinux_apifs_context,
                 bool use_cgns) {
 
-        if (unified_requested)
+        if (unified_requested >= CGROUP_UNIFIED_ALL)
                 return mount_unified_cgroups(dest);
-        else if (use_cgns && cg_ns_supported())
-                return mount_legacy_cgns_supported(userns, uid_shift, uid_range, selinux_apifs_context);
+        else if (use_cgns)
+                return mount_legacy_cgns_supported(unified_requested, userns, uid_shift, uid_range, selinux_apifs_context);
 
-        return mount_legacy_cgns_unsupported(dest, userns, uid_shift, uid_range, selinux_apifs_context);
+        return mount_legacy_cgns_unsupported(dest, unified_requested, userns, uid_shift, uid_range, selinux_apifs_context);
 }
 
 int mount_systemd_cgroup_writable(
                 const char *dest,
-                bool unified_requested) {
+                CGroupUnified unified_requested) {
 
         _cleanup_free_ char *own_cgroup_path = NULL;
         const char *systemd_root, *systemd_own;
@@ -945,7 +1013,7 @@ int mount_systemd_cgroup_writable(
         if (path_equal(own_cgroup_path, "/"))
                 return 0;
 
-        if (unified_requested) {
+        if (unified_requested >= CGROUP_UNIFIED_ALL) {
                 systemd_own = strjoina(dest, "/sys/fs/cgroup", own_cgroup_path);
                 systemd_root = prefix_roota(dest, "/sys/fs/cgroup");
         } else {
@@ -954,14 +1022,13 @@ int mount_systemd_cgroup_writable(
         }
 
         /* Make our own cgroup a (writable) bind mount */
-        if (mount(systemd_own, systemd_own,  NULL, MS_BIND, NULL) < 0)
-                return log_error_errno(errno, "Failed to turn %s into a bind mount: %m", own_cgroup_path);
+        r = mount_verbose(LOG_ERR, systemd_own, systemd_own,  NULL, MS_BIND, NULL);
+        if (r < 0)
+                return r;
 
         /* And then remount the systemd cgroup root read-only */
-        if (mount(NULL, systemd_root, NULL, MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL) < 0)
-                return log_error_errno(errno, "Failed to mount cgroup root read-only: %m");
-
-        return 0;
+        return mount_verbose(LOG_ERR, NULL, systemd_root, NULL,
+                             MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL);
 }
 
 int setup_volatile_state(
@@ -982,7 +1049,7 @@ int setup_volatile_state(
         /* --volatile=state means we simply overmount /var
            with a tmpfs, and the rest read-only. */
 
-        r = bind_remount_recursive(directory, true);
+        r = bind_remount_recursive(directory, true, NULL);
         if (r < 0)
                 return log_error_errno(r, "Failed to remount %s read-only: %m", directory);
 
@@ -998,10 +1065,7 @@ int setup_volatile_state(
         if (r > 0)
                 options = buf;
 
-        if (mount("tmpfs", p, "tmpfs", MS_STRICTATIME, options) < 0)
-                return log_error_errno(errno, "Failed to mount tmpfs to /var: %m");
-
-        return 0;
+        return mount_verbose(LOG_ERR, "tmpfs", p, "tmpfs", MS_STRICTATIME, options);
 }
 
 int setup_volatile(
@@ -1034,10 +1098,9 @@ int setup_volatile(
         if (r > 0)
                 options = buf;
 
-        if (mount("tmpfs", template, "tmpfs", MS_STRICTATIME, options) < 0) {
-                r = log_error_errno(errno, "Failed to mount tmpfs for root directory: %m");
+        r = mount_verbose(LOG_ERR, "tmpfs", template, "tmpfs", MS_STRICTATIME, options);
+        if (r < 0)
                 goto fail;
-        }
 
         tmpfs_mounted = true;
 
@@ -1050,23 +1113,21 @@ int setup_volatile(
                 goto fail;
         }
 
-        if (mount(f, t, NULL, MS_BIND|MS_REC, NULL) < 0) {
-                r = log_error_errno(errno, "Failed to create /usr bind mount: %m");
+        r = mount_verbose(LOG_ERR, f, t, NULL, MS_BIND|MS_REC, NULL);
+        if (r < 0)
                 goto fail;
-        }
 
         bind_mounted = true;
 
-        r = bind_remount_recursive(t, true);
+        r = bind_remount_recursive(t, true, NULL);
         if (r < 0) {
                 log_error_errno(r, "Failed to remount %s read-only: %m", t);
                 goto fail;
         }
 
-        if (mount(template, directory, NULL, MS_MOVE, NULL) < 0) {
-                r = log_error_errno(errno, "Failed to move root mount: %m");
+        r = mount_verbose(LOG_ERR, template, directory, NULL, MS_MOVE, NULL);
+        if (r < 0)
                 goto fail;
-        }
 
         (void) rmdir(template);
 
@@ -1074,10 +1135,10 @@ int setup_volatile(
 
 fail:
         if (bind_mounted)
-                (void) umount(t);
+                (void) umount_verbose(t);
 
         if (tmpfs_mounted)
-                (void) umount(template);
+                (void) umount_verbose(template);
         (void) rmdir(template);
         return r;
 }
diff --git a/src/nspawn/nspawn-mount.h b/src/nspawn/nspawn-mount.h
index 0eff8e1006..7307a838a5 100644
--- a/src/nspawn/nspawn-mount.h
+++ b/src/nspawn/nspawn-mount.h
@@ -21,6 +21,8 @@
 
 #include <stdbool.h>
 
+#include "cgroup-util.h"
+
 typedef enum VolatileMode {
         VOLATILE_NO,
         VOLATILE_YES,
@@ -58,8 +60,8 @@ int custom_mount_compare(const void *a, const void *b);
 int mount_all(const char *dest, bool use_userns, bool in_userns, bool use_netns, uid_t uid_shift, uid_t uid_range, const char *selinux_apifs_context);
 int mount_sysfs(const char *dest);
 
-int mount_cgroups(const char *dest, bool unified_requested, bool userns, uid_t uid_shift, uid_t uid_range, const char *selinux_apifs_context, bool use_cgns);
-int mount_systemd_cgroup_writable(const char *dest, bool unified_requested);
+int mount_cgroups(const char *dest, CGroupUnified unified_requested, bool userns, uid_t uid_shift, uid_t uid_range, const char *selinux_apifs_context, bool use_cgns);
+int mount_systemd_cgroup_writable(const char *dest, CGroupUnified unified_requested);
 
 int mount_custom(const char *dest, CustomMount *mounts, unsigned n, bool userns, uid_t uid_shift, uid_t uid_range, const char *selinux_apifs_context);
 
diff --git a/src/nspawn/nspawn-register.c b/src/nspawn/nspawn-register.c
index e5b76a0c5d..06c56d9ec8 100644
--- a/src/nspawn/nspawn-register.c
+++ b/src/nspawn/nspawn-register.c
@@ -68,7 +68,6 @@ int register_machine(
                                 local_ifindex > 0 ? 1 : 0, local_ifindex);
         } else {
                 _cleanup_(sd_bus_message_unrefp) sd_bus_message *m = NULL;
-                char **i;
                 unsigned j;
 
                 r = sd_bus_message_new_method_call(
@@ -157,11 +156,9 @@ int register_machine(
                                 return bus_log_create_error(r);
                 }
 
-                STRV_FOREACH(i, properties) {
-                        r = bus_append_unit_property_assignment(m, *i);
-                        if (r < 0)
-                                return r;
-                }
+                r = bus_append_unit_property_assignment_many(m, properties);
+                if (r < 0)
+                        return r;
 
                 r = sd_bus_message_close_container(m);
                 if (r < 0)
diff --git a/src/nspawn/nspawn-seccomp.c b/src/nspawn/nspawn-seccomp.c
index 3ab7160ebe..03a397d30c 100644
--- a/src/nspawn/nspawn-seccomp.c
+++ b/src/nspawn/nspawn-seccomp.c
@@ -130,16 +130,15 @@ int setup_seccomp(uint64_t cap_list_retain) {
         scmp_filter_ctx seccomp;
         int r;
 
-        seccomp = seccomp_init(SCMP_ACT_ALLOW);
-        if (!seccomp)
-                return log_oom();
-
-        r = seccomp_add_secondary_archs(seccomp);
-        if (r < 0) {
-                log_error_errno(r, "Failed to add secondary archs to seccomp filter: %m");
-                goto finish;
+        if (!is_seccomp_available()) {
+                log_debug("SECCOMP features not detected in the kernel, disabling SECCOMP audit filter");
+                return 0;
         }
 
+        r = seccomp_init_conservative(&seccomp, SCMP_ACT_ALLOW);
+        if (r < 0)
+                return log_error_errno(r, "Failed to allocate seccomp object: %m");
+
         r = seccomp_add_default_syscall_filter(seccomp, cap_list_retain);
         if (r < 0)
                 goto finish;
@@ -166,18 +165,7 @@ int setup_seccomp(uint64_t cap_list_retain) {
                 goto finish;
         }
 
-        r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
-        if (r < 0) {
-                log_error_errno(r, "Failed to unset NO_NEW_PRIVS: %m");
-                goto finish;
-        }
-
         r = seccomp_load(seccomp);
-        if (r == -EINVAL) {
-                log_debug_errno(r, "Kernel is probably not configured with CONFIG_SECCOMP. Disabling seccomp audit filter: %m");
-                r = 0;
-                goto finish;
-        }
         if (r < 0) {
                 log_error_errno(r, "Failed to install seccomp audit filter: %m");
                 goto finish;
diff --git a/src/nspawn/nspawn-settings.c b/src/nspawn/nspawn-settings.c
index 5f1522cfb6..09c8f070ba 100644
--- a/src/nspawn/nspawn-settings.c
+++ b/src/nspawn/nspawn-settings.c
@@ -101,9 +101,7 @@ Settings* settings_free(Settings *s) {
         expose_port_free_all(s->expose_ports);
 
         custom_mount_free_all(s->custom_mounts, s->n_custom_mounts);
-        free(s);
-
-        return NULL;
+        return mfree(s);
 }
 
 bool settings_private_network(Settings *s) {
diff --git a/src/nspawn/nspawn.c b/src/nspawn/nspawn.c
index fcf14bba4c..3300c00373 100644
--- a/src/nspawn/nspawn.c
+++ b/src/nspawn/nspawn.c
@@ -169,7 +169,6 @@ static CustomMount *arg_custom_mounts = NULL;
 static unsigned arg_n_custom_mounts = 0;
 static char **arg_setenv = NULL;
 static bool arg_quiet = false;
-static bool arg_share_system = false;
 static bool arg_register = true;
 static bool arg_keep_unit = false;
 static char **arg_network_interfaces = NULL;
@@ -188,13 +187,14 @@ static UserNamespaceMode arg_userns_mode = USER_NAMESPACE_NO;
 static uid_t arg_uid_shift = UID_INVALID, arg_uid_range = 0x10000U;
 static bool arg_userns_chown = false;
 static int arg_kill_signal = 0;
-static bool arg_unified_cgroup_hierarchy = false;
+static CGroupUnified arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_UNKNOWN;
 static SettingsMask arg_settings_mask = 0;
 static int arg_settings_trusted = -1;
 static char **arg_parameters = NULL;
 static const char *arg_container_service_name = "systemd-nspawn";
 static bool arg_notify_ready = false;
 static bool arg_use_cgns = true;
+static unsigned long arg_clone_ns_flags = CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS;
 
 static void help(void) {
         printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
@@ -219,7 +219,7 @@ static void help(void) {
                "  -U --private-users=pick   Run within user namespace, autoselect UID/GID range\n"
                "     --private-users[=UIDBASE[:NUIDS]]\n"
                "                            Similar, but with user configured UID/GID range\n"
-               "     --private-user-chown   Adjust OS tree ownership to private UID/GID range\n"
+               "     --private-users-chown  Adjust OS tree ownership to private UID/GID range\n"
                "     --private-network      Disable network in container\n"
                "     --network-interface=INTERFACE\n"
                "                            Assign an existing network interface to the\n"
@@ -316,9 +316,9 @@ static int custom_mounts_prepare(void) {
         return 0;
 }
 
-static int detect_unified_cgroup_hierarchy(void) {
+static int detect_unified_cgroup_hierarchy(const char *directory) {
         const char *e;
-        int r;
+        int r, all_unified, systemd_unified;
 
         /* Allow the user to control whether the unified hierarchy is used */
         e = getenv("UNIFIED_CGROUP_HIERARCHY");
@@ -326,20 +326,58 @@ static int detect_unified_cgroup_hierarchy(void) {
                 r = parse_boolean(e);
                 if (r < 0)
                         return log_error_errno(r, "Failed to parse $UNIFIED_CGROUP_HIERARCHY.");
+                if (r > 0)
+                        arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_ALL;
+                else
+                        arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
 
-                arg_unified_cgroup_hierarchy = r;
                 return 0;
         }
 
+        all_unified = cg_all_unified();
+        systemd_unified = cg_unified(SYSTEMD_CGROUP_CONTROLLER);
+
+        if (all_unified < 0 || systemd_unified < 0)
+                return log_error_errno(all_unified < 0 ? all_unified : systemd_unified,
+                                       "Failed to determine whether the unified cgroups hierarchy is used: %m");
+
         /* Otherwise inherit the default from the host system */
-        r = cg_unified();
-        if (r < 0)
-                return log_error_errno(r, "Failed to determine whether the unified cgroups hierarchy is used: %m");
+        if (all_unified > 0) {
+                /* Unified cgroup hierarchy support was added in 230. Unfortunately the detection
+                 * routine only detects 231, so we'll have a false negative here for 230. */
+                r = systemd_installation_has_version(directory, 230);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to determine systemd version in container: %m");
+                if (r > 0)
+                        arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_ALL;
+                else
+                        arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
+        } else if (systemd_unified > 0) {
+                /* Mixed cgroup hierarchy support was added in 232 */
+                r = systemd_installation_has_version(directory, 232);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to determine systemd version in container: %m");
+                if (r > 0)
+                        arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_SYSTEMD;
+                else
+                        arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
+        } else
+                arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
 
-        arg_unified_cgroup_hierarchy = r;
         return 0;
 }
 
+static void parse_share_ns_env(const char *name, unsigned long ns_flag) {
+        int r;
+
+        r = getenv_bool(name);
+        if (r == -ENXIO)
+                return;
+        if (r < 0)
+                log_warning_errno(r, "Failed to parse %s from environment, defaulting to false.", name);
+        arg_clone_ns_flags = (arg_clone_ns_flags & ~ns_flag) | (r > 0 ? 0 : ns_flag);
+}
+
 static int parse_argv(int argc, char *argv[]) {
 
         enum {
@@ -377,52 +415,52 @@ static int parse_argv(int argc, char *argv[]) {
         };
 
         static const struct option options[] = {
-                { "help",                  no_argument,       NULL, 'h'                   },
-                { "version",               no_argument,       NULL, ARG_VERSION           },
-                { "directory",             required_argument, NULL, 'D'                   },
-                { "template",              required_argument, NULL, ARG_TEMPLATE          },
-                { "ephemeral",             no_argument,       NULL, 'x'                   },
-                { "user",                  required_argument, NULL, 'u'                   },
-                { "private-network",       no_argument,       NULL, ARG_PRIVATE_NETWORK   },
-                { "as-pid2",               no_argument,       NULL, 'a'                   },
-                { "boot",                  no_argument,       NULL, 'b'                   },
-                { "uuid",                  required_argument, NULL, ARG_UUID              },
-                { "read-only",             no_argument,       NULL, ARG_READ_ONLY         },
-                { "capability",            required_argument, NULL, ARG_CAPABILITY        },
-                { "drop-capability",       required_argument, NULL, ARG_DROP_CAPABILITY   },
-                { "link-journal",          required_argument, NULL, ARG_LINK_JOURNAL      },
-                { "bind",                  required_argument, NULL, ARG_BIND              },
-                { "bind-ro",               required_argument, NULL, ARG_BIND_RO           },
-                { "tmpfs",                 required_argument, NULL, ARG_TMPFS             },
-                { "overlay",               required_argument, NULL, ARG_OVERLAY           },
-                { "overlay-ro",            required_argument, NULL, ARG_OVERLAY_RO        },
-                { "machine",               required_argument, NULL, 'M'                   },
-                { "slice",                 required_argument, NULL, 'S'                   },
-                { "setenv",                required_argument, NULL, 'E'                   },
-                { "selinux-context",       required_argument, NULL, 'Z'                   },
-                { "selinux-apifs-context", required_argument, NULL, 'L'                   },
-                { "quiet",                 no_argument,       NULL, 'q'                   },
-                { "share-system",          no_argument,       NULL, ARG_SHARE_SYSTEM      }, /* not documented */
-                { "register",              required_argument, NULL, ARG_REGISTER          },
-                { "keep-unit",             no_argument,       NULL, ARG_KEEP_UNIT         },
-                { "network-interface",     required_argument, NULL, ARG_NETWORK_INTERFACE },
-                { "network-macvlan",       required_argument, NULL, ARG_NETWORK_MACVLAN   },
-                { "network-ipvlan",        required_argument, NULL, ARG_NETWORK_IPVLAN    },
-                { "network-veth",          no_argument,       NULL, 'n'                   },
-                { "network-veth-extra",    required_argument, NULL, ARG_NETWORK_VETH_EXTRA},
-                { "network-bridge",        required_argument, NULL, ARG_NETWORK_BRIDGE    },
-                { "network-zone",          required_argument, NULL, ARG_NETWORK_ZONE      },
-                { "personality",           required_argument, NULL, ARG_PERSONALITY       },
-                { "image",                 required_argument, NULL, 'i'                   },
-                { "volatile",              optional_argument, NULL, ARG_VOLATILE          },
-                { "port",                  required_argument, NULL, 'p'                   },
-                { "property",              required_argument, NULL, ARG_PROPERTY          },
-                { "private-users",         optional_argument, NULL, ARG_PRIVATE_USERS     },
-                { "private-users-chown",   optional_argument, NULL, ARG_PRIVATE_USERS_CHOWN},
-                { "kill-signal",           required_argument, NULL, ARG_KILL_SIGNAL       },
-                { "settings",              required_argument, NULL, ARG_SETTINGS          },
-                { "chdir",                 required_argument, NULL, ARG_CHDIR             },
-                { "notify-ready",          required_argument, NULL, ARG_NOTIFY_READY      },
+                { "help",                  no_argument,       NULL, 'h'                     },
+                { "version",               no_argument,       NULL, ARG_VERSION             },
+                { "directory",             required_argument, NULL, 'D'                     },
+                { "template",              required_argument, NULL, ARG_TEMPLATE            },
+                { "ephemeral",             no_argument,       NULL, 'x'                     },
+                { "user",                  required_argument, NULL, 'u'                     },
+                { "private-network",       no_argument,       NULL, ARG_PRIVATE_NETWORK     },
+                { "as-pid2",               no_argument,       NULL, 'a'                     },
+                { "boot",                  no_argument,       NULL, 'b'                     },
+                { "uuid",                  required_argument, NULL, ARG_UUID                },
+                { "read-only",             no_argument,       NULL, ARG_READ_ONLY           },
+                { "capability",            required_argument, NULL, ARG_CAPABILITY          },
+                { "drop-capability",       required_argument, NULL, ARG_DROP_CAPABILITY     },
+                { "link-journal",          required_argument, NULL, ARG_LINK_JOURNAL        },
+                { "bind",                  required_argument, NULL, ARG_BIND                },
+                { "bind-ro",               required_argument, NULL, ARG_BIND_RO             },
+                { "tmpfs",                 required_argument, NULL, ARG_TMPFS               },
+                { "overlay",               required_argument, NULL, ARG_OVERLAY             },
+                { "overlay-ro",            required_argument, NULL, ARG_OVERLAY_RO          },
+                { "machine",               required_argument, NULL, 'M'                     },
+                { "slice",                 required_argument, NULL, 'S'                     },
+                { "setenv",                required_argument, NULL, 'E'                     },
+                { "selinux-context",       required_argument, NULL, 'Z'                     },
+                { "selinux-apifs-context", required_argument, NULL, 'L'                     },
+                { "quiet",                 no_argument,       NULL, 'q'                     },
+                { "share-system",          no_argument,       NULL, ARG_SHARE_SYSTEM        }, /* not documented */
+                { "register",              required_argument, NULL, ARG_REGISTER            },
+                { "keep-unit",             no_argument,       NULL, ARG_KEEP_UNIT           },
+                { "network-interface",     required_argument, NULL, ARG_NETWORK_INTERFACE   },
+                { "network-macvlan",       required_argument, NULL, ARG_NETWORK_MACVLAN     },
+                { "network-ipvlan",        required_argument, NULL, ARG_NETWORK_IPVLAN      },
+                { "network-veth",          no_argument,       NULL, 'n'                     },
+                { "network-veth-extra",    required_argument, NULL, ARG_NETWORK_VETH_EXTRA  },
+                { "network-bridge",        required_argument, NULL, ARG_NETWORK_BRIDGE      },
+                { "network-zone",          required_argument, NULL, ARG_NETWORK_ZONE        },
+                { "personality",           required_argument, NULL, ARG_PERSONALITY         },
+                { "image",                 required_argument, NULL, 'i'                     },
+                { "volatile",              optional_argument, NULL, ARG_VOLATILE            },
+                { "port",                  required_argument, NULL, 'p'                     },
+                { "property",              required_argument, NULL, ARG_PROPERTY            },
+                { "private-users",         optional_argument, NULL, ARG_PRIVATE_USERS       },
+                { "private-users-chown",   optional_argument, NULL, ARG_PRIVATE_USERS_CHOWN },
+                { "kill-signal",           required_argument, NULL, ARG_KILL_SIGNAL         },
+                { "settings",              required_argument, NULL, ARG_SETTINGS            },
+                { "chdir",                 required_argument, NULL, ARG_CHDIR               },
+                { "notify-ready",          required_argument, NULL, ARG_NOTIFY_READY        },
                 {}
         };
 
@@ -812,8 +850,8 @@ static int parse_argv(int argc, char *argv[]) {
 
                 case ARG_SHARE_SYSTEM:
                         /* We don't officially support this anymore, except for compat reasons. People should use the
-                         * $SYSTEMD_NSPAWN_SHARE_SYSTEM environment variable instead. */
-                        arg_share_system = true;
+                         * $SYSTEMD_NSPAWN_SHARE_* environment variables instead. */
+                        arg_clone_ns_flags = 0;
                         break;
 
                 case ARG_REGISTER:
@@ -875,15 +913,21 @@ static int parse_argv(int argc, char *argv[]) {
 
                         break;
 
-                case ARG_PRIVATE_USERS:
+                case ARG_PRIVATE_USERS: {
+                        int boolean = -1;
+
+                        if (!optarg)
+                                boolean = true;
+                        else if (!in_charset(optarg, DIGITS))
+                                /* do *not* parse numbers as booleans */
+                                boolean = parse_boolean(optarg);
 
-                        r = optarg ? parse_boolean(optarg) : 1;
-                        if (r == 0) {
+                        if (boolean == false) {
                                 /* no: User namespacing off */
                                 arg_userns_mode = USER_NAMESPACE_NO;
                                 arg_uid_shift = UID_INVALID;
                                 arg_uid_range = UINT32_C(0x10000);
-                        } else if (r > 0) {
+                        } else if (boolean == true) {
                                 /* yes: User namespacing on, UID range is read from root dir */
                                 arg_userns_mode = USER_NAMESPACE_FIXED;
                                 arg_uid_shift = UID_INVALID;
@@ -907,23 +951,27 @@ static int parse_argv(int argc, char *argv[]) {
                                         shift = buffer;
 
                                         range++;
-                                        if (safe_atou32(range, &arg_uid_range) < 0 || arg_uid_range <= 0) {
-                                                log_error("Failed to parse UID range: %s", range);
-                                                return -EINVAL;
-                                        }
+                                        r = safe_atou32(range, &arg_uid_range);
+                                        if (r < 0)
+                                                return log_error_errno(r, "Failed to parse UID range \"%s\": %m", range);
                                 } else
                                         shift = optarg;
 
-                                if (parse_uid(shift, &arg_uid_shift) < 0) {
-                                        log_error("Failed to parse UID: %s", optarg);
-                                        return -EINVAL;
-                                }
+                                r = parse_uid(shift, &arg_uid_shift);
+                                if (r < 0)
+                                        return log_error_errno(r, "Failed to parse UID \"%s\": %m", optarg);
 
                                 arg_userns_mode = USER_NAMESPACE_FIXED;
                         }
 
+                        if (arg_uid_range <= 0) {
+                                log_error("UID range cannot be 0.");
+                                return -EINVAL;
+                        }
+
                         arg_settings_mask |= SETTING_USERNS;
                         break;
+                }
 
                 case 'U':
                         if (userns_supported()) {
@@ -1017,20 +1065,23 @@ static int parse_argv(int argc, char *argv[]) {
                         assert_not_reached("Unhandled option");
                 }
 
-        if (getenv_bool("SYSTEMD_NSPAWN_SHARE_SYSTEM") > 0)
-                arg_share_system = true;
+        parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_IPC", CLONE_NEWIPC);
+        parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_PID", CLONE_NEWPID);
+        parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_UTS", CLONE_NEWUTS);
+        parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_SYSTEM", CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS);
 
-        if (arg_share_system)
+        if (!(arg_clone_ns_flags & CLONE_NEWPID) ||
+            !(arg_clone_ns_flags & CLONE_NEWUTS)) {
                 arg_register = false;
+                if (arg_start_mode != START_PID1) {
+                        log_error("--boot cannot be used without namespacing.");
+                        return -EINVAL;
+                }
+        }
 
         if (arg_userns_mode == USER_NAMESPACE_PICK)
                 arg_userns_chown = true;
 
-        if (arg_start_mode != START_PID1 && arg_share_system) {
-                log_error("--boot and SYSTEMD_NSPAWN_SHARE_SYSTEM=1 may not be combined.");
-                return -EINVAL;
-        }
-
         if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
                 log_error("--keep-unit may not be used when invoked from a user session.");
                 return -EINVAL;
@@ -1099,10 +1150,6 @@ static int parse_argv(int argc, char *argv[]) {
 
         arg_caps_retain = (arg_caps_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
 
-        r = detect_unified_cgroup_hierarchy();
-        if (r < 0)
-                return r;
-
         e = getenv("SYSTEMD_NSPAWN_CONTAINER_SERVICE");
         if (e)
                 arg_container_service_name = e;
@@ -1194,7 +1241,13 @@ static int setup_timezone(const char *dest) {
         /* Fix the timezone, if possible */
         r = readlink_malloc("/etc/localtime", &p);
         if (r < 0) {
-                log_warning("/etc/localtime is not a symlink, not updating container timezone.");
+                log_warning("host's /etc/localtime is not a symlink, not updating container timezone.");
+                /* to handle warning, delete /etc/localtime and replace it
+                 * with a symbolic link to a time zone data file.
+                 *
+                 * Example:
+                 * ln -s /usr/share/zoneinfo/UTC /etc/localtime
+                 */
                 return 0;
         }
 
@@ -1256,39 +1309,24 @@ static int setup_resolv_conf(const char *dest) {
         /* Fix resolv.conf, if possible */
         where = prefix_roota(dest, "/etc/resolv.conf");
 
-        if (access("/usr/lib/systemd/resolv.conf", F_OK) >= 0) {
-                /* resolved is enabled on the host. In this, case bind mount its static resolv.conf file into the
-                 * container, so that the container can use the host's resolver. Given that network namespacing is
-                 * disabled it's only natural of the container also uses the host's resolver. It also has the big
-                 * advantage that the container will be able to follow the host's DNS server configuration changes
-                 * transparently. */
-
-                if (mount("/usr/lib/systemd/resolv.conf", where, NULL, MS_BIND, NULL) < 0)
-                        log_warning_errno(errno, "Failed to mount /etc/resolv.conf in the container, ignoring: %m");
-                else {
-                        if (mount(NULL, where, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL) < 0)
-                                return log_error_errno(errno, "Failed to remount /etc/resolv.conf read-only: %m");
-
-                        return 0;
-                }
-        }
-
-        /* If that didn't work, let's copy the file */
         r = copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW, 0644, 0);
         if (r < 0) {
-                /* If the file already exists as symlink, let's suppress the warning, under the assumption that
-                 * resolved or something similar runs inside and the symlink points there.
+                /* If the file already exists as symlink, let's
+                 * suppress the warning, under the assumption that
+                 * resolved or something similar runs inside and the
+                 * symlink points there.
                  *
-                 * If the disk image is read-only, there's also no point in complaining.
+                 * If the disk image is read-only, there's also no
+                 * point in complaining.
                  */
                 log_full_errno(IN_SET(r, -ELOOP, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
-                               "Failed to copy /etc/resolv.conf to %s, ignoring: %m", where);
+                               "Failed to copy /etc/resolv.conf to %s: %m", where);
                 return 0;
         }
 
         r = userns_lchown(where, 0, 0);
         if (r < 0)
-                log_warning_errno(r, "Failed to chown /etc/resolv.conf, ignoring: %m");
+                log_warning_errno(r, "Failed to chown /etc/resolv.conf: %m");
 
         return 0;
 }
@@ -1298,9 +1336,6 @@ static int setup_boot_id(const char *dest) {
         const char *from, *to;
         int r;
 
-        if (arg_share_system)
-                return 0;
-
         /* Generate a new randomized boot ID, so that each boot-up of
          * the container gets a new one */
 
@@ -1315,10 +1350,10 @@ static int setup_boot_id(const char *dest) {
         if (r < 0)
                 return log_error_errno(r, "Failed to write boot id: %m");
 
-        if (mount(from, to, NULL, MS_BIND, NULL) < 0)
-                r = log_error_errno(errno, "Failed to bind mount boot id: %m");
-        else if (mount(NULL, to, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL) < 0)
-                r = log_error_errno(errno, "Failed to make boot id read-only: %m");
+        r = mount_verbose(LOG_ERR, from, to, NULL, MS_BIND, NULL);
+        if (r >= 0)
+                r = mount_verbose(LOG_ERR, NULL, to, NULL,
+                                  MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL);
 
         (void) unlink(from);
         return r;
@@ -1366,6 +1401,12 @@ static int copy_devnodes(const char *dest) {
 
                 } else {
                         if (mknod(to, st.st_mode, st.st_rdev) < 0) {
+                                /*
+                                 * This is some sort of protection too against
+                                 * recursive userns chown on shared /dev/
+                                 */
+                                if (errno == EEXIST)
+                                        log_notice("%s/dev/ should be an empty directory", dest);
                                 if (errno != EPERM)
                                         return log_error_errno(errno, "mknod(%s) failed: %m", to);
 
@@ -1374,8 +1415,9 @@ static int copy_devnodes(const char *dest) {
                                 r = touch(to);
                                 if (r < 0)
                                         return log_error_errno(r, "touch (%s) failed: %m", to);
-                                if (mount(from, to, NULL, MS_BIND, NULL) < 0)
-                                        return log_error_errno(errno, "Both mknod and bind mount (%s) failed: %m", to);
+                                r = mount_verbose(LOG_DEBUG, from, to, NULL, MS_BIND, NULL);
+                                if (r < 0)
+                                        return log_error_errno(r, "Both mknod and bind mount (%s) failed: %m", to);
                         }
 
                         r = userns_lchown(to, 0, 0);
@@ -1411,8 +1453,9 @@ static int setup_pts(const char *dest) {
         p = prefix_roota(dest, "/dev/pts");
         if (mkdir(p, 0755) < 0)
                 return log_error_errno(errno, "Failed to create /dev/pts: %m");
-        if (mount("devpts", p, "devpts", MS_NOSUID|MS_NOEXEC, options) < 0)
-                return log_error_errno(errno, "Failed to mount /dev/pts: %m");
+        r = mount_verbose(LOG_ERR, "devpts", p, "devpts", MS_NOSUID|MS_NOEXEC, options);
+        if (r < 0)
+                return r;
         r = userns_lchown(p, 0, 0);
         if (r < 0)
                 return log_error_errno(r, "Failed to chown /dev/pts: %m");
@@ -1457,10 +1500,7 @@ static int setup_dev_console(const char *dest, const char *console) {
         if (r < 0)
                 return log_error_errno(r, "touch() for /dev/console failed: %m");
 
-        if (mount(console, to, NULL, MS_BIND, NULL) < 0)
-                return log_error_errno(errno, "Bind mount for /dev/console failed: %m");
-
-        return 0;
+        return mount_verbose(LOG_ERR, console, to, NULL, MS_BIND, NULL);
 }
 
 static int setup_kmsg(const char *dest, int kmsg_socket) {
@@ -1484,8 +1524,9 @@ static int setup_kmsg(const char *dest, int kmsg_socket) {
 
         if (mkfifo(from, 0600) < 0)
                 return log_error_errno(errno, "mkfifo() for /run/kmsg failed: %m");
-        if (mount(from, to, NULL, MS_BIND, NULL) < 0)
-                return log_error_errno(errno, "Bind mount for /proc/kmsg failed: %m");
+        r = mount_verbose(LOG_ERR, from, to, NULL, MS_BIND, NULL);
+        if (r < 0)
+                return r;
 
         fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
         if (fd < 0)
@@ -1518,7 +1559,7 @@ static int on_address_change(sd_netlink *rtnl, sd_netlink_message *m, void *user
 
 static int setup_hostname(void) {
 
-        if (arg_share_system)
+        if ((arg_clone_ns_flags & CLONE_NEWUTS) == 0)
                 return 0;
 
         if (sethostname_idempotent(arg_machine) < 0)
@@ -1655,7 +1696,8 @@ static int setup_journal(const char *directory) {
         if (r < 0)
                 return log_error_errno(r, "Failed to create %s: %m", q);
 
-        if (mount(p, q, NULL, MS_BIND, NULL) < 0)
+        r = mount_verbose(LOG_DEBUG, p, q, NULL, MS_BIND, NULL);
+        if (r < 0)
                 return log_error_errno(errno, "Failed to bind mount journal from host into guest: %m");
 
         return 0;
@@ -1669,7 +1711,7 @@ static int reset_audit_loginuid(void) {
         _cleanup_free_ char *p = NULL;
         int r;
 
-        if (arg_share_system)
+        if ((arg_clone_ns_flags & CLONE_NEWPID) == 0)
                 return 0;
 
         r = read_one_line_file("/proc/self/loginuid", &p);
@@ -1720,13 +1762,17 @@ static int setup_propagate(const char *root) {
                 return log_error_errno(r, "Failed to create /run/systemd/nspawn/incoming: %m");
 
         q = prefix_roota(root, "/run/systemd/nspawn/incoming");
-        if (mount(p, q, NULL, MS_BIND, NULL) < 0)
-                return log_error_errno(errno, "Failed to install propagation bind mount.");
+        r = mount_verbose(LOG_ERR, p, q, NULL, MS_BIND, NULL);
+        if (r < 0)
+                return r;
 
-        if (mount(NULL, q, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL) < 0)
-                return log_error_errno(errno, "Failed to make propagation mount read-only");
+        r = mount_verbose(LOG_ERR, NULL, q, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
+        if (r < 0)
+                return r;
 
-        return 0;
+        /* machined will MS_MOVE into that directory, and that's only
+         * supported for non-shared mounts. */
+        return mount_verbose(LOG_ERR, NULL, q, NULL, MS_SLAVE, NULL);
 }
 
 static int setup_image(char **device_path, int *loop_nr) {
@@ -2203,7 +2249,7 @@ static int dissect_image(
 static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
 #ifdef HAVE_BLKID
         _cleanup_blkid_free_probe_ blkid_probe b = NULL;
-        const char *fstype, *p;
+        const char *fstype, *p, *options;
         int r;
 
         assert(what);
@@ -2252,10 +2298,17 @@ static int mount_device(const char *what, const char *where, const char *directo
                 return -EOPNOTSUPP;
         }
 
-        if (mount(what, p, fstype, MS_NODEV|(rw ? 0 : MS_RDONLY), NULL) < 0)
-                return log_error_errno(errno, "Failed to mount %s: %m", what);
+        /* If this is a loopback device then let's mount the image with discard, so that the underlying file remains
+         * sparse when possible. */
+        if (STR_IN_SET(fstype, "btrfs", "ext4", "vfat", "xfs")) {
+                const char *l;
 
-        return 0;
+                l = path_startswith(what, "/dev");
+                if (l && startswith(l, "loop"))
+                        options = "discard";
+        }
+
+        return mount_verbose(LOG_ERR, what, p, fstype, MS_NODEV|(rw ? 0 : MS_RDONLY), options);
 #else
         log_error("--image= is not supported, compiled without blkid support.");
         return -EOPNOTSUPP;
@@ -2630,6 +2683,10 @@ static int inner_child(
                 }
         }
 
+        r = reset_uid_gid();
+        if (r < 0)
+                return log_error_errno(r, "Couldn't become new root: %m");
+
         r = mount_all(NULL,
                       arg_userns_mode != USER_NAMESPACE_NO,
                       true,
@@ -2663,7 +2720,7 @@ static int inner_child(
                                 arg_uid_shift,
                                 arg_uid_range,
                                 arg_selinux_apifs_context,
-                                arg_use_cgns);
+                                true);
                 if (r < 0)
                         return r;
         } else {
@@ -2672,10 +2729,6 @@ static int inner_child(
                         return r;
         }
 
-        r = reset_uid_gid();
-        if (r < 0)
-                return log_error_errno(r, "Couldn't become new root: %m");
-
         r = setup_boot_id(NULL);
         if (r < 0)
                 return r;
@@ -2915,8 +2968,9 @@ static int outer_child(
         /* Mark everything as slave, so that we still
          * receive mounts from the real root, but don't
          * propagate mounts to the real root. */
-        if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0)
-                return log_error_errno(errno, "MS_SLAVE|MS_REC failed: %m");
+        r = mount_verbose(LOG_ERR, NULL, "/", NULL, MS_SLAVE|MS_REC, NULL);
+        if (r < 0)
+                return r;
 
         r = mount_devices(directory,
                           root_device, root_device_rw,
@@ -2930,6 +2984,10 @@ static int outer_child(
         if (r < 0)
                 return r;
 
+        r = detect_unified_cgroup_hierarchy(directory);
+        if (r < 0)
+                return r;
+
         if (arg_userns_mode != USER_NAMESPACE_NO) {
                 /* Let the parent know which UID shift we read from the image */
                 l = send(uid_shift_socket, &arg_uid_shift, sizeof(arg_uid_shift), MSG_NOSIGNAL);
@@ -2958,8 +3016,19 @@ static int outer_child(
         }
 
         /* Turn directory into bind mount */
-        if (mount(directory, directory, NULL, MS_BIND|MS_REC, NULL) < 0)
-                return log_error_errno(errno, "Failed to make bind mount: %m");
+        r = mount_verbose(LOG_ERR, directory, directory, NULL, MS_BIND|MS_REC, NULL);
+        if (r < 0)
+                return r;
+
+        /* Mark everything as shared so our mounts get propagated down. This is
+         * required to make new bind mounts available in systemd services
+         * inside the containter that create a new mount namespace.
+         * See https://github.com/systemd/systemd/issues/3860
+         * Further submounts (such as /dev) done after this will inherit the
+         * shared propagation mode.*/
+        r = mount_verbose(LOG_ERR, NULL, directory, NULL, MS_SHARED|MS_REC, NULL);
+        if (r < 0)
+                return r;
 
         r = recursive_chown(directory, arg_uid_shift, arg_uid_range);
         if (r < 0)
@@ -2990,7 +3059,7 @@ static int outer_child(
                 return r;
 
         if (arg_read_only) {
-                r = bind_remount_recursive(directory, true);
+                r = bind_remount_recursive(directory, true, NULL);
                 if (r < 0)
                         return log_error_errno(r, "Failed to make tree read-only: %m");
         }
@@ -3062,7 +3131,7 @@ static int outer_child(
                                 arg_uid_shift,
                                 arg_uid_range,
                                 arg_selinux_apifs_context,
-                                arg_use_cgns);
+                                false);
                 if (r < 0)
                         return r;
         }
@@ -3076,7 +3145,7 @@ static int outer_child(
                 return fd;
 
         pid = raw_clone(SIGCHLD|CLONE_NEWNS|
-                        (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS) |
+                        arg_clone_ns_flags |
                         (arg_private_network ? CLONE_NEWNET : 0) |
                         (arg_userns_mode != USER_NAMESPACE_NO ? CLONE_NEWUSER : 0));
         if (pid < 0)
@@ -3526,18 +3595,437 @@ static int load_settings(void) {
         return 0;
 }
 
+static int run(int master,
+               const char* console,
+               const char *root_device, bool root_device_rw,
+               const char *home_device, bool home_device_rw,
+               const char *srv_device, bool srv_device_rw,
+               const char *esp_device,
+               bool interactive,
+               bool secondary,
+               FDSet *fds,
+               char veth_name[IFNAMSIZ], bool *veth_created,
+               union in_addr_union *exposed,
+               pid_t *pid, int *ret) {
+
+        static const struct sigaction sa = {
+                .sa_handler = nop_signal_handler,
+                .sa_flags = SA_NOCLDSTOP,
+        };
+
+        _cleanup_release_lock_file_ LockFile uid_shift_lock = LOCK_FILE_INIT;
+        _cleanup_close_ int etc_passwd_lock = -1;
+        _cleanup_close_pair_ int
+                kmsg_socket_pair[2] = { -1, -1 },
+                rtnl_socket_pair[2] = { -1, -1 },
+                pid_socket_pair[2] = { -1, -1 },
+                uuid_socket_pair[2] = { -1, -1 },
+                notify_socket_pair[2] = { -1, -1 },
+                uid_shift_socket_pair[2] = { -1, -1 };
+        _cleanup_close_ int notify_socket= -1;
+        _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
+        _cleanup_(sd_event_unrefp) sd_event *event = NULL;
+        _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
+        _cleanup_(sd_netlink_unrefp) sd_netlink *rtnl = NULL;
+        ContainerStatus container_status = 0;
+        char last_char = 0;
+        int ifi = 0, r;
+        ssize_t l;
+        sigset_t mask_chld;
+
+        assert_se(sigemptyset(&mask_chld) == 0);
+        assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
+
+        if (arg_userns_mode == USER_NAMESPACE_PICK) {
+                /* When we shall pick the UID/GID range, let's first lock /etc/passwd, so that we can safely
+                 * check with getpwuid() if the specific user already exists. Note that /etc might be
+                 * read-only, in which case this will fail with EROFS. But that's really OK, as in that case we
+                 * can be reasonably sure that no users are going to be added. Note that getpwuid() checks are
+                 * really just an extra safety net. We kinda assume that the UID range we allocate from is
+                 * really ours. */
+
+                etc_passwd_lock = take_etc_passwd_lock(NULL);
+                if (etc_passwd_lock < 0 && etc_passwd_lock != -EROFS)
+                        return log_error_errno(etc_passwd_lock, "Failed to take /etc/passwd lock: %m");
+        }
+
+        r = barrier_create(&barrier);
+        if (r < 0)
+                return log_error_errno(r, "Cannot initialize IPC barrier: %m");
+
+        if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0)
+                return log_error_errno(errno, "Failed to create kmsg socket pair: %m");
+
+        if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, rtnl_socket_pair) < 0)
+                return log_error_errno(errno, "Failed to create rtnl socket pair: %m");
+
+        if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, pid_socket_pair) < 0)
+                return log_error_errno(errno, "Failed to create pid socket pair: %m");
+
+        if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, uuid_socket_pair) < 0)
+                return log_error_errno(errno, "Failed to create id socket pair: %m");
+
+        if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, notify_socket_pair) < 0)
+                return log_error_errno(errno, "Failed to create notify socket pair: %m");
+
+        if (arg_userns_mode != USER_NAMESPACE_NO)
+                if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, uid_shift_socket_pair) < 0)
+                        return log_error_errno(errno, "Failed to create uid shift socket pair: %m");
+
+        /* Child can be killed before execv(), so handle SIGCHLD in order to interrupt
+         * parent's blocking calls and give it a chance to call wait() and terminate. */
+        r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
+        if (r < 0)
+                return log_error_errno(errno, "Failed to change the signal mask: %m");
+
+        r = sigaction(SIGCHLD, &sa, NULL);
+        if (r < 0)
+                return log_error_errno(errno, "Failed to install SIGCHLD handler: %m");
+
+        *pid = raw_clone(SIGCHLD|CLONE_NEWNS);
+        if (*pid < 0)
+                return log_error_errno(errno, "clone() failed%s: %m",
+                                       errno == EINVAL ?
+                                       ", do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in)" : "");
+
+        if (*pid == 0) {
+                /* The outer child only has a file system namespace. */
+                barrier_set_role(&barrier, BARRIER_CHILD);
+
+                master = safe_close(master);
+
+                kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
+                rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
+                pid_socket_pair[0] = safe_close(pid_socket_pair[0]);
+                uuid_socket_pair[0] = safe_close(uuid_socket_pair[0]);
+                notify_socket_pair[0] = safe_close(notify_socket_pair[0]);
+                uid_shift_socket_pair[0] = safe_close(uid_shift_socket_pair[0]);
+
+                (void) reset_all_signal_handlers();
+                (void) reset_signal_mask();
+
+                r = outer_child(&barrier,
+                                arg_directory,
+                                console,
+                                root_device, root_device_rw,
+                                home_device, home_device_rw,
+                                srv_device, srv_device_rw,
+                                esp_device,
+                                interactive,
+                                secondary,
+                                pid_socket_pair[1],
+                                uuid_socket_pair[1],
+                                notify_socket_pair[1],
+                                kmsg_socket_pair[1],
+                                rtnl_socket_pair[1],
+                                uid_shift_socket_pair[1],
+                                fds);
+                if (r < 0)
+                        _exit(EXIT_FAILURE);
+
+                _exit(EXIT_SUCCESS);
+        }
+
+        barrier_set_role(&barrier, BARRIER_PARENT);
+
+        fds = fdset_free(fds);
+
+        kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
+        rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
+        pid_socket_pair[1] = safe_close(pid_socket_pair[1]);
+        uuid_socket_pair[1] = safe_close(uuid_socket_pair[1]);
+        notify_socket_pair[1] = safe_close(notify_socket_pair[1]);
+        uid_shift_socket_pair[1] = safe_close(uid_shift_socket_pair[1]);
+
+        if (arg_userns_mode != USER_NAMESPACE_NO) {
+                /* The child just let us know the UID shift it might have read from the image. */
+                l = recv(uid_shift_socket_pair[0], &arg_uid_shift, sizeof arg_uid_shift, 0);
+                if (l < 0)
+                        return log_error_errno(errno, "Failed to read UID shift: %m");
+
+                if (l != sizeof arg_uid_shift) {
+                        log_error("Short read while reading UID shift.");
+                        return -EIO;
+                }
+
+                if (arg_userns_mode == USER_NAMESPACE_PICK) {
+                        /* If we are supposed to pick the UID shift, let's try to use the shift read from the
+                         * image, but if that's already in use, pick a new one, and report back to the child,
+                         * which one we now picked. */
+
+                        r = uid_shift_pick(&arg_uid_shift, &uid_shift_lock);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to pick suitable UID/GID range: %m");
+
+                        l = send(uid_shift_socket_pair[0], &arg_uid_shift, sizeof arg_uid_shift, MSG_NOSIGNAL);
+                        if (l < 0)
+                                return log_error_errno(errno, "Failed to send UID shift: %m");
+                        if (l != sizeof arg_uid_shift) {
+                                log_error("Short write while writing UID shift.");
+                                return -EIO;
+                        }
+                }
+        }
+
+        /* Wait for the outer child. */
+        r = wait_for_terminate_and_warn("namespace helper", *pid, NULL);
+        if (r != 0)
+                return r < 0 ? r : -EIO;
+
+        /* And now retrieve the PID of the inner child. */
+        l = recv(pid_socket_pair[0], pid, sizeof *pid, 0);
+        if (l < 0)
+                return log_error_errno(errno, "Failed to read inner child PID: %m");
+        if (l != sizeof *pid) {
+                log_error("Short read while reading inner child PID.");
+                return -EIO;
+        }
+
+        /* We also retrieve container UUID in case it was generated by outer child */
+        l = recv(uuid_socket_pair[0], &arg_uuid, sizeof arg_uuid, 0);
+        if (l < 0)
+                return log_error_errno(errno, "Failed to read container machine ID: %m");
+        if (l != sizeof(arg_uuid)) {
+                log_error("Short read while reading container machined ID.");
+                return -EIO;
+        }
+
+        /* We also retrieve the socket used for notifications generated by outer child */
+        notify_socket = receive_one_fd(notify_socket_pair[0], 0);
+        if (notify_socket < 0)
+                return log_error_errno(notify_socket,
+                                       "Failed to receive notification socket from the outer child: %m");
+
+        log_debug("Init process invoked as PID "PID_FMT, *pid);
+
+        if (arg_userns_mode != USER_NAMESPACE_NO) {
+                if (!barrier_place_and_sync(&barrier)) { /* #1 */
+                        log_error("Child died too early.");
+                        return -ESRCH;
+                }
+
+                r = setup_uid_map(*pid);
+                if (r < 0)
+                        return r;
+
+                (void) barrier_place(&barrier); /* #2 */
+        }
+
+        if (arg_private_network) {
+
+                r = move_network_interfaces(*pid, arg_network_interfaces);
+                if (r < 0)
+                        return r;
+
+                if (arg_network_veth) {
+                        r = setup_veth(arg_machine, *pid, veth_name,
+                                       arg_network_bridge || arg_network_zone);
+                        if (r < 0)
+                                return r;
+                        else if (r > 0)
+                                ifi = r;
+
+                        if (arg_network_bridge) {
+                                /* Add the interface to a bridge */
+                                r = setup_bridge(veth_name, arg_network_bridge, false);
+                                if (r < 0)
+                                        return r;
+                                if (r > 0)
+                                        ifi = r;
+                        } else if (arg_network_zone) {
+                                /* Add the interface to a bridge, possibly creating it */
+                                r = setup_bridge(veth_name, arg_network_zone, true);
+                                if (r < 0)
+                                        return r;
+                                if (r > 0)
+                                        ifi = r;
+                        }
+                }
+
+                r = setup_veth_extra(arg_machine, *pid, arg_network_veth_extra);
+                if (r < 0)
+                        return r;
+
+                /* We created the primary and extra veth links now; let's remember this, so that we know to
+                   remove them later on. Note that we don't bother with removing veth links that were created
+                   here when their setup failed half-way, because in that case the kernel should be able to
+                   remove them on its own, since they cannot be referenced by anything yet. */
+                *veth_created = true;
+
+                r = setup_macvlan(arg_machine, *pid, arg_network_macvlan);
+                if (r < 0)
+                        return r;
+
+                r = setup_ipvlan(arg_machine, *pid, arg_network_ipvlan);
+                if (r < 0)
+                        return r;
+        }
+
+        if (arg_register) {
+                r = register_machine(
+                                arg_machine,
+                                *pid,
+                                arg_directory,
+                                arg_uuid,
+                                ifi,
+                                arg_slice,
+                                arg_custom_mounts, arg_n_custom_mounts,
+                                arg_kill_signal,
+                                arg_property,
+                                arg_keep_unit,
+                                arg_container_service_name);
+                if (r < 0)
+                        return r;
+        }
+
+        r = sync_cgroup(*pid, arg_unified_cgroup_hierarchy, arg_uid_shift);
+        if (r < 0)
+                return r;
+
+        if (arg_keep_unit) {
+                r = create_subcgroup(*pid, arg_unified_cgroup_hierarchy);
+                if (r < 0)
+                        return r;
+        }
+
+        r = chown_cgroup(*pid, arg_uid_shift);
+        if (r < 0)
+                return r;
+
+        /* Notify the child that the parent is ready with all
+         * its setup (including cgroup-ification), and that
+         * the child can now hand over control to the code to
+         * run inside the container. */
+        (void) barrier_place(&barrier); /* #3 */
+
+        /* Block SIGCHLD here, before notifying child.
+         * process_pty() will handle it with the other signals. */
+        assert_se(sigprocmask(SIG_BLOCK, &mask_chld, NULL) >= 0);
+
+        /* Reset signal to default */
+        r = default_signals(SIGCHLD, -1);
+        if (r < 0)
+                return log_error_errno(r, "Failed to reset SIGCHLD: %m");
+
+        r = sd_event_new(&event);
+        if (r < 0)
+                return log_error_errno(r, "Failed to get default event source: %m");
+
+        r = setup_sd_notify_parent(event, notify_socket, PID_TO_PTR(*pid));
+        if (r < 0)
+                return r;
+
+        /* Let the child know that we are ready and wait that the child is completely ready now. */
+        if (!barrier_place_and_sync(&barrier)) { /* #4 */
+                log_error("Child died too early.");
+                return -ESRCH;
+        }
+
+        /* At this point we have made use of the UID we picked, and thus nss-mymachines
+         * will make them appear in getpwuid(), thus we can release the /etc/passwd lock. */
+        etc_passwd_lock = safe_close(etc_passwd_lock);
+
+        sd_notifyf(false,
+                   "STATUS=Container running.\n"
+                   "X_NSPAWN_LEADER_PID=" PID_FMT, *pid);
+        if (!arg_notify_ready)
+                sd_notify(false, "READY=1\n");
+
+        if (arg_kill_signal > 0) {
+                /* Try to kill the init system on SIGINT or SIGTERM */
+                sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, PID_TO_PTR(*pid));
+                sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, PID_TO_PTR(*pid));
+        } else {
+                /* Immediately exit */
+                sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
+                sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
+        }
+
+        /* simply exit on sigchld */
+        sd_event_add_signal(event, NULL, SIGCHLD, NULL, NULL);
+
+        if (arg_expose_ports) {
+                r = expose_port_watch_rtnl(event, rtnl_socket_pair[0], on_address_change, exposed, &rtnl);
+                if (r < 0)
+                        return r;
+
+                (void) expose_port_execute(rtnl, arg_expose_ports, exposed);
+        }
+
+        rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
+
+        r = pty_forward_new(event, master,
+                            PTY_FORWARD_IGNORE_VHANGUP | (interactive ? 0 : PTY_FORWARD_READ_ONLY),
+                            &forward);
+        if (r < 0)
+                return log_error_errno(r, "Failed to create PTY forwarder: %m");
+
+        r = sd_event_loop(event);
+        if (r < 0)
+                return log_error_errno(r, "Failed to run event loop: %m");
+
+        pty_forward_get_last_char(forward, &last_char);
+
+        forward = pty_forward_free(forward);
+
+        if (!arg_quiet && last_char != '\n')
+                putc('\n', stdout);
+
+        /* Kill if it is not dead yet anyway */
+        if (arg_register && !arg_keep_unit)
+                terminate_machine(*pid);
+
+        /* Normally redundant, but better safe than sorry */
+        kill(*pid, SIGKILL);
+
+        r = wait_for_container(*pid, &container_status);
+        *pid = 0;
+
+        if (r < 0)
+                /* We failed to wait for the container, or the container exited abnormally. */
+                return r;
+        if (r > 0 || container_status == CONTAINER_TERMINATED) {
+                /* r > 0 → The container exited with a non-zero status.
+                 *         As a special case, we need to replace 133 with a different value,
+                 *         because 133 is special-cased in the service file to reboot the container.
+                 * otherwise → The container exited with zero status and a reboot was not requested.
+                 */
+                if (r == 133)
+                        r = EXIT_FAILURE; /* replace 133 with the general failure code */
+                *ret = r;
+                return 0; /* finito */
+        }
+
+        /* CONTAINER_REBOOTED, loop again */
+
+        if (arg_keep_unit) {
+                /* Special handling if we are running as a service: instead of simply
+                 * restarting the machine we want to restart the entire service, so let's
+                 * inform systemd about this with the special exit code 133. The service
+                 * file uses RestartForceExitStatus=133 so that this results in a full
+                 * nspawn restart. This is necessary since we might have cgroup parameters
+                 * set we want to have flushed out. */
+                *ret = 0;
+                return 133;
+        }
+
+        expose_port_flush(arg_expose_ports, exposed);
+
+        (void) remove_veth_links(veth_name, arg_network_veth_extra);
+        *veth_created = false;
+        return 1; /* loop again */
+}
+
 int main(int argc, char *argv[]) {
 
         _cleanup_free_ char *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL, *esp_device = NULL, *console = NULL;
         bool root_device_rw = true, home_device_rw = true, srv_device_rw = true;
         _cleanup_close_ int master = -1, image_fd = -1;
         _cleanup_fdset_free_ FDSet *fds = NULL;
-        int r, n_fd_passed, loop_nr = -1;
+        int r, n_fd_passed, loop_nr = -1, ret = EXIT_SUCCESS;
         char veth_name[IFNAMSIZ] = "";
         bool secondary = false, remove_subvol = false;
-        sigset_t mask_chld;
         pid_t pid = 0;
-        int ret = EXIT_SUCCESS;
         union in_addr_union exposed = {};
         _cleanup_release_lock_file_ LockFile tree_global_lock = LOCK_FILE_INIT, tree_local_lock = LOCK_FILE_INIT;
         bool interactive, veth_created = false;
@@ -3753,470 +4241,25 @@ int main(int argc, char *argv[]) {
 
         assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1) >= 0);
 
-        assert_se(sigemptyset(&mask_chld) == 0);
-        assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
-
         if (prctl(PR_SET_CHILD_SUBREAPER, 1) < 0) {
                 r = log_error_errno(errno, "Failed to become subreaper: %m");
                 goto finish;
         }
 
         for (;;) {
-                static const struct sigaction sa = {
-                        .sa_handler = nop_signal_handler,
-                        .sa_flags = SA_NOCLDSTOP,
-                };
-
-                _cleanup_release_lock_file_ LockFile uid_shift_lock = LOCK_FILE_INIT;
-                _cleanup_close_ int etc_passwd_lock = -1;
-                _cleanup_close_pair_ int
-                        kmsg_socket_pair[2] = { -1, -1 },
-                        rtnl_socket_pair[2] = { -1, -1 },
-                        pid_socket_pair[2] = { -1, -1 },
-                        uuid_socket_pair[2] = { -1, -1 },
-                        notify_socket_pair[2] = { -1, -1 },
-                        uid_shift_socket_pair[2] = { -1, -1 };
-                _cleanup_close_ int notify_socket= -1;
-                _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
-                _cleanup_(sd_event_unrefp) sd_event *event = NULL;
-                _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
-                _cleanup_(sd_netlink_unrefp) sd_netlink *rtnl = NULL;
-                ContainerStatus container_status;
-                char last_char = 0;
-                int ifi = 0;
-                ssize_t l;
-
-                if (arg_userns_mode == USER_NAMESPACE_PICK) {
-                        /* When we shall pick the UID/GID range, let's first lock /etc/passwd, so that we can safely
-                         * check with getpwuid() if the specific user already exists. Note that /etc might be
-                         * read-only, in which case this will fail with EROFS. But that's really OK, as in that case we
-                         * can be reasonably sure that no users are going to be added. Note that getpwuid() checks are
-                         * really just an extra safety net. We kinda assume that the UID range we allocate from is
-                         * really ours. */
-
-                        etc_passwd_lock = take_etc_passwd_lock(NULL);
-                        if (etc_passwd_lock < 0 && etc_passwd_lock != -EROFS) {
-                                log_error_errno(r, "Failed to take /etc/passwd lock: %m");
-                                goto finish;
-                        }
-                }
-
-                r = barrier_create(&barrier);
-                if (r < 0) {
-                        log_error_errno(r, "Cannot initialize IPC barrier: %m");
-                        goto finish;
-                }
-
-                if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
-                        r = log_error_errno(errno, "Failed to create kmsg socket pair: %m");
-                        goto finish;
-                }
-
-                if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, rtnl_socket_pair) < 0) {
-                        r = log_error_errno(errno, "Failed to create rtnl socket pair: %m");
-                        goto finish;
-                }
-
-                if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, pid_socket_pair) < 0) {
-                        r = log_error_errno(errno, "Failed to create pid socket pair: %m");
-                        goto finish;
-                }
-
-                if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, uuid_socket_pair) < 0) {
-                        r = log_error_errno(errno, "Failed to create id socket pair: %m");
-                        goto finish;
-                }
-
-                if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, notify_socket_pair) < 0) {
-                        r = log_error_errno(errno, "Failed to create notify socket pair: %m");
-                        goto finish;
-                }
-
-                if (arg_userns_mode != USER_NAMESPACE_NO)
-                        if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, uid_shift_socket_pair) < 0) {
-                                r = log_error_errno(errno, "Failed to create uid shift socket pair: %m");
-                                goto finish;
-                        }
-
-                /* Child can be killed before execv(), so handle SIGCHLD
-                 * in order to interrupt parent's blocking calls and
-                 * give it a chance to call wait() and terminate. */
-                r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
-                if (r < 0) {
-                        r = log_error_errno(errno, "Failed to change the signal mask: %m");
-                        goto finish;
-                }
-
-                r = sigaction(SIGCHLD, &sa, NULL);
-                if (r < 0) {
-                        r = log_error_errno(errno, "Failed to install SIGCHLD handler: %m");
-                        goto finish;
-                }
-
-                pid = raw_clone(SIGCHLD|CLONE_NEWNS);
-                if (pid < 0) {
-                        if (errno == EINVAL)
-                                r = log_error_errno(errno, "clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
-                        else
-                                r = log_error_errno(errno, "clone() failed: %m");
-
-                        goto finish;
-                }
-
-                if (pid == 0) {
-                        /* The outer child only has a file system namespace. */
-                        barrier_set_role(&barrier, BARRIER_CHILD);
-
-                        master = safe_close(master);
-
-                        kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
-                        rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
-                        pid_socket_pair[0] = safe_close(pid_socket_pair[0]);
-                        uuid_socket_pair[0] = safe_close(uuid_socket_pair[0]);
-                        notify_socket_pair[0] = safe_close(notify_socket_pair[0]);
-                        uid_shift_socket_pair[0] = safe_close(uid_shift_socket_pair[0]);
-
-                        (void) reset_all_signal_handlers();
-                        (void) reset_signal_mask();
-
-                        r = outer_child(&barrier,
-                                        arg_directory,
-                                        console,
-                                        root_device, root_device_rw,
-                                        home_device, home_device_rw,
-                                        srv_device, srv_device_rw,
-                                        esp_device,
-                                        interactive,
-                                        secondary,
-                                        pid_socket_pair[1],
-                                        uuid_socket_pair[1],
-                                        notify_socket_pair[1],
-                                        kmsg_socket_pair[1],
-                                        rtnl_socket_pair[1],
-                                        uid_shift_socket_pair[1],
-                                        fds);
-                        if (r < 0)
-                                _exit(EXIT_FAILURE);
-
-                        _exit(EXIT_SUCCESS);
-                }
-
-                barrier_set_role(&barrier, BARRIER_PARENT);
-
-                fds = fdset_free(fds);
-
-                kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
-                rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
-                pid_socket_pair[1] = safe_close(pid_socket_pair[1]);
-                uuid_socket_pair[1] = safe_close(uuid_socket_pair[1]);
-                notify_socket_pair[1] = safe_close(notify_socket_pair[1]);
-                uid_shift_socket_pair[1] = safe_close(uid_shift_socket_pair[1]);
-
-                if (arg_userns_mode != USER_NAMESPACE_NO) {
-                        /* The child just let us know the UID shift it might have read from the image. */
-                        l = recv(uid_shift_socket_pair[0], &arg_uid_shift, sizeof(arg_uid_shift), 0);
-                        if (l < 0) {
-                                r = log_error_errno(errno, "Failed to read UID shift: %m");
-                                goto finish;
-                        }
-                        if (l != sizeof(arg_uid_shift)) {
-                                log_error("Short read while reading UID shift.");
-                                r = EIO;
-                                goto finish;
-                        }
-
-                        if (arg_userns_mode == USER_NAMESPACE_PICK) {
-                                /* If we are supposed to pick the UID shift, let's try to use the shift read from the
-                                 * image, but if that's already in use, pick a new one, and report back to the child,
-                                 * which one we now picked. */
-
-                                r = uid_shift_pick(&arg_uid_shift, &uid_shift_lock);
-                                if (r < 0) {
-                                        log_error_errno(r, "Failed to pick suitable UID/GID range: %m");
-                                        goto finish;
-                                }
-
-                                l = send(uid_shift_socket_pair[0], &arg_uid_shift, sizeof(arg_uid_shift), MSG_NOSIGNAL);
-                                if (l < 0) {
-                                        r = log_error_errno(errno, "Failed to send UID shift: %m");
-                                        goto finish;
-                                }
-                                if (l != sizeof(arg_uid_shift)) {
-                                        log_error("Short write while writing UID shift.");
-                                        r = -EIO;
-                                        goto finish;
-                                }
-                        }
-                }
-
-                /* Wait for the outer child. */
-                r = wait_for_terminate_and_warn("namespace helper", pid, NULL);
-                if (r < 0)
-                        goto finish;
-                if (r != 0) {
-                        r = -EIO;
-                        goto finish;
-                }
-                pid = 0;
-
-                /* And now retrieve the PID of the inner child. */
-                l = recv(pid_socket_pair[0], &pid, sizeof(pid), 0);
-                if (l < 0) {
-                        r = log_error_errno(errno, "Failed to read inner child PID: %m");
-                        goto finish;
-                }
-                if (l != sizeof(pid)) {
-                        log_error("Short read while reading inner child PID.");
-                        r = EIO;
-                        goto finish;
-                }
-
-                /* We also retrieve container UUID in case it was generated by outer child */
-                l = recv(uuid_socket_pair[0], &arg_uuid, sizeof(arg_uuid), 0);
-                if (l < 0) {
-                        r = log_error_errno(errno, "Failed to read container machine ID: %m");
-                        goto finish;
-                }
-                if (l != sizeof(arg_uuid)) {
-                        log_error("Short read while reading container machined ID.");
-                        r = EIO;
-                        goto finish;
-                }
-
-                /* We also retrieve the socket used for notifications generated by outer child */
-                notify_socket = receive_one_fd(notify_socket_pair[0], 0);
-                if (notify_socket < 0) {
-                        r = log_error_errno(errno, "Failed to receive notification socket from the outer child: %m");
-                        goto finish;
-                }
-
-                log_debug("Init process invoked as PID " PID_FMT, pid);
-
-                if (arg_userns_mode != USER_NAMESPACE_NO) {
-                        if (!barrier_place_and_sync(&barrier)) { /* #1 */
-                                log_error("Child died too early.");
-                                r = -ESRCH;
-                                goto finish;
-                        }
-
-                        r = setup_uid_map(pid);
-                        if (r < 0)
-                                goto finish;
-
-                        (void) barrier_place(&barrier); /* #2 */
-                }
-
-                if (arg_private_network) {
-
-                        r = move_network_interfaces(pid, arg_network_interfaces);
-                        if (r < 0)
-                                goto finish;
-
-                        if (arg_network_veth) {
-                                r = setup_veth(arg_machine, pid, veth_name,
-                                               arg_network_bridge || arg_network_zone);
-                                if (r < 0)
-                                        goto finish;
-                                else if (r > 0)
-                                        ifi = r;
-
-                                if (arg_network_bridge) {
-                                        /* Add the interface to a bridge */
-                                        r = setup_bridge(veth_name, arg_network_bridge, false);
-                                        if (r < 0)
-                                                goto finish;
-                                        if (r > 0)
-                                                ifi = r;
-                                } else if (arg_network_zone) {
-                                        /* Add the interface to a bridge, possibly creating it */
-                                        r = setup_bridge(veth_name, arg_network_zone, true);
-                                        if (r < 0)
-                                                goto finish;
-                                        if (r > 0)
-                                                ifi = r;
-                                }
-                        }
-
-                        r = setup_veth_extra(arg_machine, pid, arg_network_veth_extra);
-                        if (r < 0)
-                                goto finish;
-
-                        /* We created the primary and extra veth links now; let's remember this, so that we know to
-                           remove them later on. Note that we don't bother with removing veth links that were created
-                           here when their setup failed half-way, because in that case the kernel should be able to
-                           remove them on its own, since they cannot be referenced by anything yet. */
-                        veth_created = true;
-
-                        r = setup_macvlan(arg_machine, pid, arg_network_macvlan);
-                        if (r < 0)
-                                goto finish;
-
-                        r = setup_ipvlan(arg_machine, pid, arg_network_ipvlan);
-                        if (r < 0)
-                                goto finish;
-                }
-
-                if (arg_register) {
-                        r = register_machine(
-                                        arg_machine,
-                                        pid,
-                                        arg_directory,
-                                        arg_uuid,
-                                        ifi,
-                                        arg_slice,
-                                        arg_custom_mounts, arg_n_custom_mounts,
-                                        arg_kill_signal,
-                                        arg_property,
-                                        arg_keep_unit,
-                                        arg_container_service_name);
-                        if (r < 0)
-                                goto finish;
-                }
-
-                r = sync_cgroup(pid, arg_unified_cgroup_hierarchy);
-                if (r < 0)
-                        goto finish;
-
-                if (arg_keep_unit) {
-                        r = create_subcgroup(pid, arg_unified_cgroup_hierarchy);
-                        if (r < 0)
-                                goto finish;
-                }
-
-                r = chown_cgroup(pid, arg_uid_shift);
-                if (r < 0)
-                        goto finish;
-
-                /* Notify the child that the parent is ready with all
-                 * its setup (including cgroup-ification), and that
-                 * the child can now hand over control to the code to
-                 * run inside the container. */
-                (void) barrier_place(&barrier); /* #3 */
-
-                /* Block SIGCHLD here, before notifying child.
-                 * process_pty() will handle it with the other signals. */
-                assert_se(sigprocmask(SIG_BLOCK, &mask_chld, NULL) >= 0);
-
-                /* Reset signal to default */
-                r = default_signals(SIGCHLD, -1);
-                if (r < 0) {
-                        log_error_errno(r, "Failed to reset SIGCHLD: %m");
-                        goto finish;
-                }
-
-                r = sd_event_new(&event);
-                if (r < 0) {
-                        log_error_errno(r, "Failed to get default event source: %m");
-                        goto finish;
-                }
-
-                r = setup_sd_notify_parent(event, notify_socket, PID_TO_PTR(pid));
-                if (r < 0)
-                        goto finish;
-
-                /* Let the child know that we are ready and wait that the child is completely ready now. */
-                if (!barrier_place_and_sync(&barrier)) { /* #4 */
-                        log_error("Child died too early.");
-                        r = -ESRCH;
-                        goto finish;
-                }
-
-                /* At this point we have made use of the UID we picked, and thus nss-mymachines will make them appear
-                 * in getpwuid(), thus we can release the /etc/passwd lock. */
-                etc_passwd_lock = safe_close(etc_passwd_lock);
-
-                sd_notifyf(false,
-                           "STATUS=Container running.\n"
-                           "X_NSPAWN_LEADER_PID=" PID_FMT, pid);
-                if (!arg_notify_ready)
-                        sd_notify(false, "READY=1\n");
-
-                if (arg_kill_signal > 0) {
-                        /* Try to kill the init system on SIGINT or SIGTERM */
-                        sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, PID_TO_PTR(pid));
-                        sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, PID_TO_PTR(pid));
-                } else {
-                        /* Immediately exit */
-                        sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
-                        sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
-                }
-
-                /* simply exit on sigchld */
-                sd_event_add_signal(event, NULL, SIGCHLD, NULL, NULL);
-
-                if (arg_expose_ports) {
-                        r = expose_port_watch_rtnl(event, rtnl_socket_pair[0], on_address_change, &exposed, &rtnl);
-                        if (r < 0)
-                                goto finish;
-
-                        (void) expose_port_execute(rtnl, arg_expose_ports, &exposed);
-                }
-
-                rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
-
-                r = pty_forward_new(event, master, PTY_FORWARD_IGNORE_VHANGUP | (interactive ? 0 : PTY_FORWARD_READ_ONLY), &forward);
-                if (r < 0) {
-                        log_error_errno(r, "Failed to create PTY forwarder: %m");
-                        goto finish;
-                }
-
-                r = sd_event_loop(event);
-                if (r < 0) {
-                        log_error_errno(r, "Failed to run event loop: %m");
-                        goto finish;
-                }
-
-                pty_forward_get_last_char(forward, &last_char);
-
-                forward = pty_forward_free(forward);
-
-                if (!arg_quiet && last_char != '\n')
-                        putc('\n', stdout);
-
-                /* Kill if it is not dead yet anyway */
-                if (arg_register && !arg_keep_unit)
-                        terminate_machine(pid);
-
-                /* Normally redundant, but better safe than sorry */
-                kill(pid, SIGKILL);
-
-                r = wait_for_container(pid, &container_status);
-                pid = 0;
-
-                if (r < 0)
-                        /* We failed to wait for the container, or the
-                         * container exited abnormally */
-                        goto finish;
-                else if (r > 0 || container_status == CONTAINER_TERMINATED) {
-                        /* The container exited with a non-zero
-                         * status, or with zero status and no reboot
-                         * was requested. */
-                        ret = r;
+                r = run(master,
+                        console,
+                        root_device, root_device_rw,
+                        home_device, home_device_rw,
+                        srv_device, srv_device_rw,
+                        esp_device,
+                        interactive, secondary,
+                        fds,
+                        veth_name, &veth_created,
+                        &exposed,
+                        &pid, &ret);
+                if (r <= 0)
                         break;
-                }
-
-                /* CONTAINER_REBOOTED, loop again */
-
-                if (arg_keep_unit) {
-                        /* Special handling if we are running as a
-                         * service: instead of simply restarting the
-                         * machine we want to restart the entire
-                         * service, so let's inform systemd about this
-                         * with the special exit code 133. The service
-                         * file uses RestartForceExitStatus=133 so
-                         * that this results in a full nspawn
-                         * restart. This is necessary since we might
-                         * have cgroup parameters set we want to have
-                         * flushed out. */
-                        ret = 133;
-                        r = 0;
-                        break;
-                }
-
-                expose_port_flush(arg_expose_ports, &exposed);
-
-                (void) remove_veth_links(veth_name, arg_network_veth_extra);
-                veth_created = false;
         }
 
 finish:
author	Luke Shumaker <lukeshu@sbcglobal.net>	2016-12-17 02:47:02 -0500
committer	Luke Shumaker <lukeshu@sbcglobal.net>	2016-12-17 02:47:02 -0500
commit	a4d083550a7273b895b44aac8d2ff7e2fdb1f7d5 (patch)
tree	6f148433641f8c92d6f1eddcb2199a78dbd111a0 /src/nspawn
parent	b6d071f1df46eb841ba3f88cdb2b248eaf5f35f8 (diff)
parent	86e9bb69ae74bd960e1fd427258f41d54240d6d1 (diff)