summaryrefslogtreecommitdiff
path: root/src/systemd-nspawn
diff options
context:
space:
mode:
authorLuke Shumaker <lukeshu@sbcglobal.net>2016-12-17 03:11:52 -0500
committerLuke Shumaker <lukeshu@sbcglobal.net>2016-12-17 03:11:52 -0500
commitb849891b5dde5ee14ab8b7b7db74e65a4a38d993 (patch)
tree29bb0e6fda9b4b170041913de495da057bbe3621 /src/systemd-nspawn
parent004efebf9cc559ea131bb9460ee0ee198e2d5da7 (diff)
parent881228ff72434a0e3401a16bd87f179ef0ab1619 (diff)
Merge branch 'notsystemd/postmove' into notsystemd/master
# Conflicts: # src/grp-journal/libjournal-core/.gitignore # src/grp-system/libcore/include/core/mount.h
Diffstat (limited to 'src/systemd-nspawn')
-rw-r--r--src/systemd-nspawn/nspawn-cgroup.c76
-rw-r--r--src/systemd-nspawn/nspawn-cgroup.h6
-rw-r--r--src/systemd-nspawn/nspawn-mount.c450
-rw-r--r--src/systemd-nspawn/nspawn-mount.h6
-rw-r--r--src/systemd-nspawn/nspawn-register.c9
-rw-r--r--src/systemd-nspawn/nspawn-seccomp.c26
-rw-r--r--src/systemd-nspawn/nspawn-settings.c4
-rw-r--r--src/systemd-nspawn/nspawn.c1339
-rw-r--r--src/systemd-nspawn/systemd-nspawn.xml126
9 files changed, 1192 insertions, 850 deletions
diff --git a/src/systemd-nspawn/nspawn-cgroup.c b/src/systemd-nspawn/nspawn-cgroup.c
index c43d747dc3..9e793d85f1 100644
--- a/src/systemd-nspawn/nspawn-cgroup.c
+++ b/src/systemd-nspawn/nspawn-cgroup.c
@@ -20,33 +20,24 @@
#include <sys/mount.h>
#include "systemd-basic/alloc-util.h"
-#include "systemd-basic/cgroup-util.h"
#include "systemd-basic/fd-util.h"
#include "systemd-basic/fileio.h"
#include "systemd-basic/mkdir.h"
+#include "systemd-basic/mount-util.h"
+#include "systemd-basic/rm-rf.h"
#include "systemd-basic/string-util.h"
#include "systemd-basic/strv.h"
#include "systemd-basic/util.h"
#include "nspawn-cgroup.h"
-int chown_cgroup(pid_t pid, uid_t uid_shift) {
- _cleanup_free_ char *path = NULL, *fs = NULL;
+static int chown_cgroup_path(const char *path, uid_t uid_shift) {
_cleanup_close_ int fd = -1;
const char *fn;
- int r;
-
- r = cg_pid_get_path(NULL, pid, &path);
- if (r < 0)
- return log_error_errno(r, "Failed to get container cgroup path: %m");
- r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, path, NULL, &fs);
- if (r < 0)
- return log_error_errno(r, "Failed to get file system path for container cgroup: %m");
-
- fd = open(fs, O_RDONLY|O_CLOEXEC|O_DIRECTORY);
+ fd = open(path, O_RDONLY|O_CLOEXEC|O_DIRECTORY);
if (fd < 0)
- return log_error_errno(errno, "Failed to open %s: %m", fs);
+ return -errno;
FOREACH_STRING(fn,
".",
@@ -64,18 +55,37 @@ int chown_cgroup(pid_t pid, uid_t uid_shift) {
return 0;
}
-int sync_cgroup(pid_t pid, bool unified_requested) {
+int chown_cgroup(pid_t pid, uid_t uid_shift) {
+ _cleanup_free_ char *path = NULL, *fs = NULL;
+ int r;
+
+ r = cg_pid_get_path(NULL, pid, &path);
+ if (r < 0)
+ return log_error_errno(r, "Failed to get container cgroup path: %m");
+
+ r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, path, NULL, &fs);
+ if (r < 0)
+ return log_error_errno(r, "Failed to get file system path for container cgroup: %m");
+
+ r = chown_cgroup_path(fs, uid_shift);
+ if (r < 0)
+ return log_error_errno(r, "Failed to chown() cgroup %s: %m", fs);
+
+ return 0;
+}
+
+int sync_cgroup(pid_t pid, CGroupUnified unified_requested, uid_t arg_uid_shift) {
_cleanup_free_ char *cgroup = NULL;
char tree[] = "/tmp/unifiedXXXXXX", pid_string[DECIMAL_STR_MAX(pid) + 1];
bool undo_mount = false;
const char *fn;
int unified, r;
- unified = cg_unified();
+ unified = cg_unified(SYSTEMD_CGROUP_CONTROLLER);
if (unified < 0)
return log_error_errno(unified, "Failed to determine whether the unified hierarchy is used: %m");
- if ((unified > 0) == unified_requested)
+ if ((unified > 0) == (unified_requested >= CGROUP_UNIFIED_SYSTEMD))
return 0;
/* When the host uses the legacy cgroup setup, but the
@@ -92,33 +102,45 @@ int sync_cgroup(pid_t pid, bool unified_requested) {
return log_error_errno(errno, "Failed to generate temporary mount point for unified hierarchy: %m");
if (unified)
- r = mount("cgroup", tree, "cgroup", MS_NOSUID|MS_NOEXEC|MS_NODEV, "none,name=systemd,xattr");
+ r = mount_verbose(LOG_ERR, "cgroup", tree, "cgroup",
+ MS_NOSUID|MS_NOEXEC|MS_NODEV, "none,name=systemd,xattr");
else
- r = mount("cgroup", tree, "cgroup2", MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL);
- if (r < 0) {
- r = log_error_errno(errno, "Failed to mount unified hierarchy: %m");
+ r = mount_verbose(LOG_ERR, "cgroup", tree, "cgroup2",
+ MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL);
+ if (r < 0)
goto finish;
- }
undo_mount = true;
+ /* If nspawn dies abruptly the cgroup hierarchy created below
+ * its unit isn't cleaned up. So, let's remove it
+ * https://github.com/systemd/systemd/pull/4223#issuecomment-252519810 */
+ fn = strjoina(tree, cgroup);
+ (void) rm_rf(fn, REMOVE_ROOT|REMOVE_ONLY_DIRECTORIES);
+
fn = strjoina(tree, cgroup, "/cgroup.procs");
(void) mkdir_parents(fn, 0755);
sprintf(pid_string, PID_FMT, pid);
r = write_string_file(fn, pid_string, 0);
- if (r < 0)
+ if (r < 0) {
log_error_errno(r, "Failed to move process: %m");
+ goto finish;
+ }
+ fn = strjoina(tree, cgroup);
+ r = chown_cgroup_path(fn, arg_uid_shift);
+ if (r < 0)
+ log_error_errno(r, "Failed to chown() cgroup %s: %m", fn);
finish:
if (undo_mount)
- (void) umount(tree);
+ (void) umount_verbose(tree);
(void) rmdir(tree);
return r;
}
-int create_subcgroup(pid_t pid, bool unified_requested) {
+int create_subcgroup(pid_t pid, CGroupUnified unified_requested) {
_cleanup_free_ char *cgroup = NULL;
const char *child;
int unified, r;
@@ -130,10 +152,10 @@ int create_subcgroup(pid_t pid, bool unified_requested) {
* did not create a scope unit for the container move us and
* the container into two separate subcgroups. */
- if (!unified_requested)
+ if (unified_requested == CGROUP_UNIFIED_NONE)
return 0;
- unified = cg_unified();
+ unified = cg_unified(SYSTEMD_CGROUP_CONTROLLER);
if (unified < 0)
return log_error_errno(unified, "Failed to determine whether the unified hierarchy is used: %m");
if (unified == 0)
diff --git a/src/systemd-nspawn/nspawn-cgroup.h b/src/systemd-nspawn/nspawn-cgroup.h
index 1ff35a299a..6c0ddfc7de 100644
--- a/src/systemd-nspawn/nspawn-cgroup.h
+++ b/src/systemd-nspawn/nspawn-cgroup.h
@@ -22,6 +22,8 @@
#include <stdbool.h>
#include <sys/types.h>
+#include "systemd-basic/cgroup-util.h"
+
int chown_cgroup(pid_t pid, uid_t uid_shift);
-int sync_cgroup(pid_t pid, bool unified_requested);
-int create_subcgroup(pid_t pid, bool unified_requested);
+int sync_cgroup(pid_t pid, CGroupUnified unified_requested, uid_t uid_shift);
+int create_subcgroup(pid_t pid, CGroupUnified unified_requested);
diff --git a/src/systemd-nspawn/nspawn-mount.c b/src/systemd-nspawn/nspawn-mount.c
index 10a28ff29b..aac04efd4b 100644
--- a/src/systemd-nspawn/nspawn-mount.c
+++ b/src/systemd-nspawn/nspawn-mount.c
@@ -22,8 +22,9 @@
#include <linux/magic.h>
#include "systemd-basic/alloc-util.h"
-#include "systemd-basic/cgroup-util.h"
#include "systemd-basic/escape.h"
+#include "systemd-basic/fd-util.h"
+#include "systemd-basic/fileio.h"
#include "systemd-basic/fs-util.h"
#include "systemd-basic/label.h"
#include "systemd-basic/mkdir.h"
@@ -183,13 +184,15 @@ int tmpfs_mount_parse(CustomMount **l, unsigned *n, const char *s) {
static int tmpfs_patch_options(
const char *options,
- bool userns, uid_t uid_shift, uid_t uid_range,
+ bool userns,
+ uid_t uid_shift, uid_t uid_range,
+ bool patch_ids,
const char *selinux_apifs_context,
char **ret) {
char *buf = NULL;
- if (userns && uid_shift != 0) {
+ if ((userns && uid_shift != 0) || patch_ids) {
assert(uid_shift != UID_INVALID);
if (options)
@@ -220,7 +223,13 @@ static int tmpfs_patch_options(
}
#endif
+ if (!buf && options) {
+ buf = strdup(options);
+ if (!buf)
+ return -ENOMEM;
+ }
*ret = buf;
+
return !!buf;
}
@@ -243,8 +252,10 @@ int mount_sysfs(const char *dest) {
(void) mkdir(full, 0755);
- if (mount("sysfs", full, "sysfs", MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL) < 0)
- return log_error_errno(errno, "Failed to mount sysfs to %s: %m", full);
+ r = mount_verbose(LOG_ERR, "sysfs", full, "sysfs",
+ MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL);
+ if (r < 0)
+ return r;
FOREACH_STRING(x, "block", "bus", "class", "dev", "devices", "kernel") {
_cleanup_free_ char *from = NULL, *to = NULL;
@@ -259,28 +270,91 @@ int mount_sysfs(const char *dest) {
(void) mkdir(to, 0755);
- if (mount(from, to, NULL, MS_BIND, NULL) < 0)
- return log_error_errno(errno, "Failed to mount /sys/%s into place: %m", x);
+ r = mount_verbose(LOG_ERR, from, to, NULL, MS_BIND, NULL);
+ if (r < 0)
+ return r;
- if (mount(NULL, to, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, NULL) < 0)
- return log_error_errno(errno, "Failed to mount /sys/%s read-only: %m", x);
+ r = mount_verbose(LOG_ERR, NULL, to, NULL,
+ MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, NULL);
+ if (r < 0)
+ return r;
}
- if (umount(full) < 0)
- return log_error_errno(errno, "Failed to unmount %s: %m", full);
+ r = umount_verbose(full);
+ if (r < 0)
+ return r;
if (rmdir(full) < 0)
return log_error_errno(errno, "Failed to remove %s: %m", full);
x = prefix_roota(top, "/fs/kdbus");
- (void) mkdir(x, 0755);
+ (void) mkdir_p(x, 0755);
+
+ /* Create mountpoint for cgroups. Otherwise we are not allowed since we
+ * remount /sys read-only.
+ */
+ if (cg_ns_supported()) {
+ x = prefix_roota(top, "/fs/cgroup");
+ (void) mkdir_p(x, 0755);
+ }
+
+ return mount_verbose(LOG_ERR, NULL, top, NULL,
+ MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, NULL);
+}
+
+static int mkdir_userns(const char *path, mode_t mode, bool in_userns, uid_t uid_shift) {
+ int r;
+
+ assert(path);
- if (mount(NULL, top, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, NULL) < 0)
- return log_error_errno(errno, "Failed to make %s read-only: %m", top);
+ r = mkdir(path, mode);
+ if (r < 0 && errno != EEXIST)
+ return -errno;
+
+ if (!in_userns) {
+ r = lchown(path, uid_shift, uid_shift);
+ if (r < 0)
+ return -errno;
+ }
return 0;
}
+static int mkdir_userns_p(const char *prefix, const char *path, mode_t mode, bool in_userns, uid_t uid_shift) {
+ const char *p, *e;
+ int r;
+
+ assert(path);
+
+ if (prefix && !path_startswith(path, prefix))
+ return -ENOTDIR;
+
+ /* create every parent directory in the path, except the last component */
+ p = path + strspn(path, "/");
+ for (;;) {
+ char t[strlen(path) + 1];
+
+ e = p + strcspn(p, "/");
+ p = e + strspn(e, "/");
+
+ /* Is this the last component? If so, then we're done */
+ if (*p == 0)
+ break;
+
+ memcpy(t, path, e - path);
+ t[e-path] = 0;
+
+ if (prefix && path_startswith(prefix, t))
+ continue;
+
+ r = mkdir_userns(t, mode, in_userns, uid_shift);
+ if (r < 0)
+ return r;
+ }
+
+ return mkdir_userns(path, mode, in_userns, uid_shift);
+}
+
int mount_all(const char *dest,
bool use_userns, bool in_userns,
bool use_netns,
@@ -299,19 +373,21 @@ int mount_all(const char *dest,
} MountPoint;
static const MountPoint mount_table[] = {
- { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true, true, false },
- { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, true, true, false }, /* Bind mount first ...*/
- { "/proc/sys/net", "/proc/sys/net", NULL, NULL, MS_BIND, true, true, true }, /* (except for this) */
- { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, true, true, false }, /* ... then, make it r/o */
- { "tmpfs", "/sys", "tmpfs", "mode=755", MS_NOSUID|MS_NOEXEC|MS_NODEV, true, false, true },
- { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true, false, false },
- { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true, false, false },
- { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true, false, false },
- { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true, false, false },
- { "tmpfs", "/tmp", "tmpfs", "mode=1777", MS_STRICTATIME, true, false, false },
+ { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true, true, false },
+ { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, true, true, false }, /* Bind mount first ...*/
+ { "/proc/sys/net", "/proc/sys/net", NULL, NULL, MS_BIND, true, true, true }, /* (except for this) */
+ { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, true, true, false }, /* ... then, make it r/o */
+ { "/proc/sysrq-trigger", "/proc/sysrq-trigger", NULL, NULL, MS_BIND, false, true, false }, /* Bind mount first ...*/
+ { NULL, "/proc/sysrq-trigger", NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, false, true, false }, /* ... then, make it r/o */
+ { "tmpfs", "/sys", "tmpfs", "mode=755", MS_NOSUID|MS_NOEXEC|MS_NODEV, true, false, true },
+ { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true, false, false },
+ { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true, false, false },
+ { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true, false, false },
+ { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true, false, false },
+ { "tmpfs", "/tmp", "tmpfs", "mode=1777", MS_STRICTATIME, true, false, false },
#ifdef HAVE_SELINUX
- { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, false, false, false }, /* Bind mount first */
- { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, false, false, false }, /* Then, make it r/o */
+ { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, false, false, false }, /* Bind mount first */
+ { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, false, false, false }, /* Then, make it r/o */
#endif
};
@@ -340,8 +416,8 @@ int mount_all(const char *dest,
if (mount_table[k].what && r > 0)
continue;
- r = mkdir_p(where, 0755);
- if (r < 0) {
+ r = mkdir_userns_p(dest, where, 0755, in_userns, uid_shift);
+ if (r < 0 && r != -EEXIST) {
if (mount_table[k].fatal)
return log_error_errno(r, "Failed to create directory %s: %m", where);
@@ -351,24 +427,24 @@ int mount_all(const char *dest,
o = mount_table[k].options;
if (streq_ptr(mount_table[k].type, "tmpfs")) {
- r = tmpfs_patch_options(o, use_userns, uid_shift, uid_range, selinux_apifs_context, &options);
+ if (in_userns)
+ r = tmpfs_patch_options(o, use_userns, 0, uid_range, true, selinux_apifs_context, &options);
+ else
+ r = tmpfs_patch_options(o, use_userns, uid_shift, uid_range, false, selinux_apifs_context, &options);
if (r < 0)
return log_oom();
if (r > 0)
o = options;
}
- if (mount(mount_table[k].what,
- where,
- mount_table[k].type,
- mount_table[k].flags,
- o) < 0) {
-
- if (mount_table[k].fatal)
- return log_error_errno(errno, "mount(%s) failed: %m", where);
-
- log_warning_errno(errno, "mount(%s) failed, ignoring: %m", where);
- }
+ r = mount_verbose(mount_table[k].fatal ? LOG_ERR : LOG_WARNING,
+ mount_table[k].what,
+ where,
+ mount_table[k].type,
+ mount_table[k].flags,
+ o);
+ if (r < 0 && mount_table[k].fatal)
+ return r;
}
return 0;
@@ -453,15 +529,15 @@ static int mount_bind(const char *dest, CustomMount *m) {
if (r < 0)
return log_error_errno(r, "Failed to create mount point %s: %m", where);
- } else {
+ } else
return log_error_errno(errno, "Failed to stat %s: %m", where);
- }
- if (mount(m->source, where, NULL, mount_flags, mount_opts) < 0)
- return log_error_errno(errno, "mount(%s) failed: %m", where);
+ r = mount_verbose(LOG_ERR, m->source, where, NULL, mount_flags, mount_opts);
+ if (r < 0)
+ return r;
if (m->read_only) {
- r = bind_remount_recursive(where, true);
+ r = bind_remount_recursive(where, true, NULL);
if (r < 0)
return log_error_errno(r, "Read-only bind mount failed: %m");
}
@@ -488,15 +564,12 @@ static int mount_tmpfs(
if (r < 0 && r != -EEXIST)
return log_error_errno(r, "Creating mount point for tmpfs %s failed: %m", where);
- r = tmpfs_patch_options(m->options, userns, uid_shift, uid_range, selinux_apifs_context, &buf);
+ r = tmpfs_patch_options(m->options, userns, uid_shift, uid_range, false, selinux_apifs_context, &buf);
if (r < 0)
return log_oom();
options = r > 0 ? buf : m->options;
- if (mount("tmpfs", where, "tmpfs", MS_NODEV|MS_STRICTATIME, options) < 0)
- return log_error_errno(errno, "tmpfs mount to %s failed: %m", where);
-
- return 0;
+ return mount_verbose(LOG_ERR, "tmpfs", where, "tmpfs", MS_NODEV|MS_STRICTATIME, options);
}
static char *joined_and_escaped_lower_dirs(char * const *lower) {
@@ -558,10 +631,7 @@ static int mount_overlay(const char *dest, CustomMount *m) {
options = strjoina("lowerdir=", lower, ",upperdir=", escaped_source, ",workdir=", escaped_work_dir);
}
- if (mount("overlay", where, "overlay", m->read_only ? MS_RDONLY : 0, options) < 0)
- return log_error_errno(errno, "overlay mount to %s failed: %m", where);
-
- return 0;
+ return mount_verbose(LOG_ERR, "overlay", where, "overlay", m->read_only ? MS_RDONLY : 0, options);
}
int mount_custom(
@@ -603,8 +673,52 @@ int mount_custom(
return 0;
}
-static int mount_legacy_cgroup_hierarchy(const char *dest, const char *controller, const char *hierarchy, bool read_only) {
- char *to;
+/* Retrieve existing subsystems. This function is called in a new cgroup
+ * namespace.
+ */
+static int get_controllers(Set *subsystems) {
+ _cleanup_fclose_ FILE *f = NULL;
+ char line[LINE_MAX];
+
+ assert(subsystems);
+
+ f = fopen("/proc/self/cgroup", "re");
+ if (!f)
+ return errno == ENOENT ? -ESRCH : -errno;
+
+ FOREACH_LINE(line, f, return -errno) {
+ int r;
+ char *e, *l, *p;
+
+ l = strchr(line, ':');
+ if (!l)
+ continue;
+
+ l++;
+ e = strchr(l, ':');
+ if (!e)
+ continue;
+
+ *e = 0;
+
+ if (STR_IN_SET(l, "", "name=systemd"))
+ continue;
+
+ p = strdup(l);
+ if (!p)
+ return -ENOMEM;
+
+ r = set_consume(subsystems, p);
+ if (r < 0)
+ return r;
+ }
+
+ return 0;
+}
+
+static int mount_legacy_cgroup_hierarchy(const char *dest, const char *controller, const char *hierarchy,
+ CGroupUnified unified_requested, bool read_only) {
+ const char *to, *fstype, *opts;
int r;
to = strjoina(strempty(dest), "/sys/fs/cgroup/", hierarchy);
@@ -619,23 +733,136 @@ static int mount_legacy_cgroup_hierarchy(const char *dest, const char *controlle
/* The superblock mount options of the mount point need to be
* identical to the hosts', and hence writable... */
- if (mount("cgroup", to, "cgroup", MS_NOSUID|MS_NOEXEC|MS_NODEV, controller) < 0)
- return log_error_errno(errno, "Failed to mount to %s: %m", to);
+ if (streq(controller, SYSTEMD_CGROUP_CONTROLLER)) {
+ if (unified_requested >= CGROUP_UNIFIED_SYSTEMD) {
+ fstype = "cgroup2";
+ opts = NULL;
+ } else {
+ fstype = "cgroup";
+ opts = "none,name=systemd,xattr";
+ }
+ } else {
+ fstype = "cgroup";
+ opts = controller;
+ }
- /* ... hence let's only make the bind mount read-only, not the
- * superblock. */
+ r = mount_verbose(LOG_ERR, "cgroup", to, fstype, MS_NOSUID|MS_NOEXEC|MS_NODEV, opts);
+ if (r < 0)
+ return r;
+
+ /* ... hence let's only make the bind mount read-only, not the superblock. */
if (read_only) {
- if (mount(NULL, to, NULL, MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL) < 0)
- return log_error_errno(errno, "Failed to remount %s read-only: %m", to);
+ r = mount_verbose(LOG_ERR, NULL, to, NULL,
+ MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL);
+ if (r < 0)
+ return r;
}
+
return 1;
}
-static int mount_legacy_cgroups(
+/* Mount a legacy cgroup hierarchy when cgroup namespaces are supported. */
+static int mount_legacy_cgns_supported(
+ CGroupUnified unified_requested, bool userns, uid_t uid_shift,
+ uid_t uid_range, const char *selinux_apifs_context) {
+ _cleanup_set_free_free_ Set *controllers = NULL;
+ const char *cgroup_root = "/sys/fs/cgroup", *c;
+ int r;
+
+ (void) mkdir_p(cgroup_root, 0755);
+
+ /* Mount a tmpfs to /sys/fs/cgroup if it's not mounted there yet. */
+ r = path_is_mount_point(cgroup_root, AT_SYMLINK_FOLLOW);
+ if (r < 0)
+ return log_error_errno(r, "Failed to determine if /sys/fs/cgroup is already mounted: %m");
+ if (r == 0) {
+ _cleanup_free_ char *options = NULL;
+
+ /* When cgroup namespaces are enabled and user namespaces are
+ * used then the mount of the cgroupfs is done *inside* the new
+ * user namespace. We're root in the new user namespace and the
+ * kernel will happily translate our uid/gid to the correct
+ * uid/gid as seen from e.g. /proc/1/mountinfo. So we simply
+ * pass uid 0 and not uid_shift to tmpfs_patch_options().
+ */
+ r = tmpfs_patch_options("mode=755", userns, 0, uid_range, true, selinux_apifs_context, &options);
+ if (r < 0)
+ return log_oom();
+
+ r = mount_verbose(LOG_ERR, "tmpfs", cgroup_root, "tmpfs",
+ MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME, options);
+ if (r < 0)
+ return r;
+ }
+
+ if (cg_all_unified() > 0)
+ goto skip_controllers;
+
+ controllers = set_new(&string_hash_ops);
+ if (!controllers)
+ return log_oom();
+
+ r = get_controllers(controllers);
+ if (r < 0)
+ return log_error_errno(r, "Failed to determine cgroup controllers: %m");
+
+ for (;;) {
+ _cleanup_free_ const char *controller = NULL;
+
+ controller = set_steal_first(controllers);
+ if (!controller)
+ break;
+
+ r = mount_legacy_cgroup_hierarchy("", controller, controller, unified_requested, !userns);
+ if (r < 0)
+ return r;
+
+ /* When multiple hierarchies are co-mounted, make their
+ * constituting individual hierarchies a symlink to the
+ * co-mount.
+ */
+ c = controller;
+ for (;;) {
+ _cleanup_free_ char *target = NULL, *tok = NULL;
+
+ r = extract_first_word(&c, &tok, ",", 0);
+ if (r < 0)
+ return log_error_errno(r, "Failed to extract co-mounted cgroup controller: %m");
+ if (r == 0)
+ break;
+
+ target = prefix_root("/sys/fs/cgroup", tok);
+ if (!target)
+ return log_oom();
+
+ if (streq(controller, tok))
+ break;
+
+ r = symlink_idempotent(controller, target);
+ if (r == -EINVAL)
+ return log_error_errno(r, "Invalid existing symlink for combined hierarchy: %m");
+ if (r < 0)
+ return log_error_errno(r, "Failed to create symlink for combined hierarchy: %m");
+ }
+ }
+
+skip_controllers:
+ r = mount_legacy_cgroup_hierarchy("", SYSTEMD_CGROUP_CONTROLLER, "systemd", unified_requested, false);
+ if (r < 0)
+ return r;
+
+ if (!userns)
+ return mount_verbose(LOG_ERR, NULL, cgroup_root, NULL,
+ MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME|MS_RDONLY, "mode=755");
+
+ return 0;
+}
+
+/* Mount legacy cgroup hierarchy when cgroup namespaces are unsupported. */
+static int mount_legacy_cgns_unsupported(
const char *dest,
- bool userns, uid_t uid_shift, uid_t uid_range,
+ CGroupUnified unified_requested, bool userns, uid_t uid_shift, uid_t uid_range,
const char *selinux_apifs_context) {
-
_cleanup_set_free_free_ Set *controllers = NULL;
const char *cgroup_root;
int r;
@@ -651,15 +878,17 @@ static int mount_legacy_cgroups(
if (r == 0) {
_cleanup_free_ char *options = NULL;
- r = tmpfs_patch_options("mode=755", userns, uid_shift, uid_range, selinux_apifs_context, &options);
+ r = tmpfs_patch_options("mode=755", userns, uid_shift, uid_range, false, selinux_apifs_context, &options);
if (r < 0)
return log_oom();
- if (mount("tmpfs", cgroup_root, "tmpfs", MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME, options) < 0)
- return log_error_errno(errno, "Failed to mount /sys/fs/cgroup: %m");
+ r = mount_verbose(LOG_ERR, "tmpfs", cgroup_root, "tmpfs",
+ MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME, options);
+ if (r < 0)
+ return r;
}
- if (cg_unified() > 0)
+ if (cg_all_unified() > 0)
goto skip_controllers;
controllers = set_new(&string_hash_ops);
@@ -685,7 +914,7 @@ static int mount_legacy_cgroups(
if (r == -EINVAL) {
/* Not a symbolic link, but directly a single cgroup hierarchy */
- r = mount_legacy_cgroup_hierarchy(dest, controller, controller, true);
+ r = mount_legacy_cgroup_hierarchy(dest, controller, controller, unified_requested, true);
if (r < 0)
return r;
@@ -705,29 +934,25 @@ static int mount_legacy_cgroups(
continue;
}
- r = mount_legacy_cgroup_hierarchy(dest, combined, combined, true);
+ r = mount_legacy_cgroup_hierarchy(dest, combined, combined, unified_requested, true);
if (r < 0)
return r;
r = symlink_idempotent(combined, target);
- if (r == -EINVAL) {
- log_error("Invalid existing symlink for combined hierarchy");
- return r;
- }
+ if (r == -EINVAL)
+ return log_error_errno(r, "Invalid existing symlink for combined hierarchy: %m");
if (r < 0)
return log_error_errno(r, "Failed to create symlink for combined hierarchy: %m");
}
}
skip_controllers:
- r = mount_legacy_cgroup_hierarchy(dest, "none,name=systemd,xattr", "systemd", false);
+ r = mount_legacy_cgroup_hierarchy(dest, SYSTEMD_CGROUP_CONTROLLER, "systemd", unified_requested, false);
if (r < 0)
return r;
- if (mount(NULL, cgroup_root, NULL, MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME|MS_RDONLY, "mode=755") < 0)
- return log_error_errno(errno, "Failed to remount %s read-only: %m", cgroup_root);
-
- return 0;
+ return mount_verbose(LOG_ERR, NULL, cgroup_root, NULL,
+ MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME|MS_RDONLY, "mode=755");
}
static int mount_unified_cgroups(const char *dest) {
@@ -754,27 +979,27 @@ static int mount_unified_cgroups(const char *dest) {
return -EINVAL;
}
- if (mount("cgroup", p, "cgroup2", MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL) < 0)
- return log_error_errno(errno, "Failed to mount unified cgroup hierarchy to %s: %m", p);
-
- return 0;
+ return mount_verbose(LOG_ERR, "cgroup", p, "cgroup2", MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL);
}
int mount_cgroups(
const char *dest,
- bool unified_requested,
+ CGroupUnified unified_requested,
bool userns, uid_t uid_shift, uid_t uid_range,
- const char *selinux_apifs_context) {
+ const char *selinux_apifs_context,
+ bool use_cgns) {
- if (unified_requested)
+ if (unified_requested >= CGROUP_UNIFIED_ALL)
return mount_unified_cgroups(dest);
- else
- return mount_legacy_cgroups(dest, userns, uid_shift, uid_range, selinux_apifs_context);
+ else if (use_cgns)
+ return mount_legacy_cgns_supported(unified_requested, userns, uid_shift, uid_range, selinux_apifs_context);
+
+ return mount_legacy_cgns_unsupported(dest, unified_requested, userns, uid_shift, uid_range, selinux_apifs_context);
}
int mount_systemd_cgroup_writable(
const char *dest,
- bool unified_requested) {
+ CGroupUnified unified_requested) {
_cleanup_free_ char *own_cgroup_path = NULL;
const char *systemd_root, *systemd_own;
@@ -790,7 +1015,7 @@ int mount_systemd_cgroup_writable(
if (path_equal(own_cgroup_path, "/"))
return 0;
- if (unified_requested) {
+ if (unified_requested >= CGROUP_UNIFIED_ALL) {
systemd_own = strjoina(dest, "/sys/fs/cgroup", own_cgroup_path);
systemd_root = prefix_roota(dest, "/sys/fs/cgroup");
} else {
@@ -799,14 +1024,13 @@ int mount_systemd_cgroup_writable(
}
/* Make our own cgroup a (writable) bind mount */
- if (mount(systemd_own, systemd_own, NULL, MS_BIND, NULL) < 0)
- return log_error_errno(errno, "Failed to turn %s into a bind mount: %m", own_cgroup_path);
+ r = mount_verbose(LOG_ERR, systemd_own, systemd_own, NULL, MS_BIND, NULL);
+ if (r < 0)
+ return r;
/* And then remount the systemd cgroup root read-only */
- if (mount(NULL, systemd_root, NULL, MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL) < 0)
- return log_error_errno(errno, "Failed to mount cgroup root read-only: %m");
-
- return 0;
+ return mount_verbose(LOG_ERR, NULL, systemd_root, NULL,
+ MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL);
}
int setup_volatile_state(
@@ -827,7 +1051,7 @@ int setup_volatile_state(
/* --volatile=state means we simply overmount /var
with a tmpfs, and the rest read-only. */
- r = bind_remount_recursive(directory, true);
+ r = bind_remount_recursive(directory, true, NULL);
if (r < 0)
return log_error_errno(r, "Failed to remount %s read-only: %m", directory);
@@ -837,16 +1061,13 @@ int setup_volatile_state(
return log_error_errno(errno, "Failed to create %s: %m", directory);
options = "mode=755";
- r = tmpfs_patch_options(options, userns, uid_shift, uid_range, selinux_apifs_context, &buf);
+ r = tmpfs_patch_options(options, userns, uid_shift, uid_range, false, selinux_apifs_context, &buf);
if (r < 0)
return log_oom();
if (r > 0)
options = buf;
- if (mount("tmpfs", p, "tmpfs", MS_STRICTATIME, options) < 0)
- return log_error_errno(errno, "Failed to mount tmpfs to /var: %m");
-
- return 0;
+ return mount_verbose(LOG_ERR, "tmpfs", p, "tmpfs", MS_STRICTATIME, options);
}
int setup_volatile(
@@ -873,16 +1094,15 @@ int setup_volatile(
return log_error_errno(errno, "Failed to create temporary directory: %m");
options = "mode=755";
- r = tmpfs_patch_options(options, userns, uid_shift, uid_range, selinux_apifs_context, &buf);
+ r = tmpfs_patch_options(options, userns, uid_shift, uid_range, false, selinux_apifs_context, &buf);
if (r < 0)
return log_oom();
if (r > 0)
options = buf;
- if (mount("tmpfs", template, "tmpfs", MS_STRICTATIME, options) < 0) {
- r = log_error_errno(errno, "Failed to mount tmpfs for root directory: %m");
+ r = mount_verbose(LOG_ERR, "tmpfs", template, "tmpfs", MS_STRICTATIME, options);
+ if (r < 0)
goto fail;
- }
tmpfs_mounted = true;
@@ -895,23 +1115,21 @@ int setup_volatile(
goto fail;
}
- if (mount(f, t, NULL, MS_BIND|MS_REC, NULL) < 0) {
- r = log_error_errno(errno, "Failed to create /usr bind mount: %m");
+ r = mount_verbose(LOG_ERR, f, t, NULL, MS_BIND|MS_REC, NULL);
+ if (r < 0)
goto fail;
- }
bind_mounted = true;
- r = bind_remount_recursive(t, true);
+ r = bind_remount_recursive(t, true, NULL);
if (r < 0) {
log_error_errno(r, "Failed to remount %s read-only: %m", t);
goto fail;
}
- if (mount(template, directory, NULL, MS_MOVE, NULL) < 0) {
- r = log_error_errno(errno, "Failed to move root mount: %m");
+ r = mount_verbose(LOG_ERR, template, directory, NULL, MS_MOVE, NULL);
+ if (r < 0)
goto fail;
- }
(void) rmdir(template);
@@ -919,10 +1137,10 @@ int setup_volatile(
fail:
if (bind_mounted)
- (void) umount(t);
+ (void) umount_verbose(t);
if (tmpfs_mounted)
- (void) umount(template);
+ (void) umount_verbose(template);
(void) rmdir(template);
return r;
}
diff --git a/src/systemd-nspawn/nspawn-mount.h b/src/systemd-nspawn/nspawn-mount.h
index 0b3f3fbefa..8601dfdad3 100644
--- a/src/systemd-nspawn/nspawn-mount.h
+++ b/src/systemd-nspawn/nspawn-mount.h
@@ -22,6 +22,8 @@
#include <stdbool.h>
#include <sys/types.h>
+#include "systemd-basic/cgroup-util.h"
+
typedef enum VolatileMode {
VOLATILE_NO,
VOLATILE_YES,
@@ -59,8 +61,8 @@ int custom_mount_compare(const void *a, const void *b);
int mount_all(const char *dest, bool use_userns, bool in_userns, bool use_netns, uid_t uid_shift, uid_t uid_range, const char *selinux_apifs_context);
int mount_sysfs(const char *dest);
-int mount_cgroups(const char *dest, bool unified_requested, bool userns, uid_t uid_shift, uid_t uid_range, const char *selinux_apifs_context);
-int mount_systemd_cgroup_writable(const char *dest, bool unified_requested);
+int mount_cgroups(const char *dest, CGroupUnified unified_requested, bool userns, uid_t uid_shift, uid_t uid_range, const char *selinux_apifs_context, bool use_cgns);
+int mount_systemd_cgroup_writable(const char *dest, CGroupUnified unified_requested);
int mount_custom(const char *dest, CustomMount *mounts, unsigned n, bool userns, uid_t uid_shift, uid_t uid_range, const char *selinux_apifs_context);
diff --git a/src/systemd-nspawn/nspawn-register.c b/src/systemd-nspawn/nspawn-register.c
index 3889445ca5..1e45b8e58f 100644
--- a/src/systemd-nspawn/nspawn-register.c
+++ b/src/systemd-nspawn/nspawn-register.c
@@ -69,7 +69,6 @@ int register_machine(
local_ifindex > 0 ? 1 : 0, local_ifindex);
} else {
_cleanup_(sd_bus_message_unrefp) sd_bus_message *m = NULL;
- char **i;
unsigned j;
r = sd_bus_message_new_method_call(
@@ -158,11 +157,9 @@ int register_machine(
return bus_log_create_error(r);
}
- STRV_FOREACH(i, properties) {
- r = bus_append_unit_property_assignment(m, *i);
- if (r < 0)
- return r;
- }
+ r = bus_append_unit_property_assignment_many(m, properties);
+ if (r < 0)
+ return r;
r = sd_bus_message_close_container(m);
if (r < 0)
diff --git a/src/systemd-nspawn/nspawn-seccomp.c b/src/systemd-nspawn/nspawn-seccomp.c
index 4a62f15bc7..e5cfe789a1 100644
--- a/src/systemd-nspawn/nspawn-seccomp.c
+++ b/src/systemd-nspawn/nspawn-seccomp.c
@@ -131,16 +131,15 @@ int setup_seccomp(uint64_t cap_list_retain) {
scmp_filter_ctx seccomp;
int r;
- seccomp = seccomp_init(SCMP_ACT_ALLOW);
- if (!seccomp)
- return log_oom();
-
- r = seccomp_add_secondary_archs(seccomp);
- if (r < 0) {
- log_error_errno(r, "Failed to add secondary archs to seccomp filter: %m");
- goto finish;
+ if (!is_seccomp_available()) {
+ log_debug("SECCOMP features not detected in the kernel, disabling SECCOMP audit filter");
+ return 0;
}
+ r = seccomp_init_conservative(&seccomp, SCMP_ACT_ALLOW);
+ if (r < 0)
+ return log_error_errno(r, "Failed to allocate seccomp object: %m");
+
r = seccomp_add_default_syscall_filter(seccomp, cap_list_retain);
if (r < 0)
goto finish;
@@ -167,18 +166,7 @@ int setup_seccomp(uint64_t cap_list_retain) {
goto finish;
}
- r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
- if (r < 0) {
- log_error_errno(r, "Failed to unset NO_NEW_PRIVS: %m");
- goto finish;
- }
-
r = seccomp_load(seccomp);
- if (r == -EINVAL) {
- log_debug_errno(r, "Kernel is probably not configured with CONFIG_SECCOMP. Disabling seccomp audit filter: %m");
- r = 0;
- goto finish;
- }
if (r < 0) {
log_error_errno(r, "Failed to install seccomp audit filter: %m");
goto finish;
diff --git a/src/systemd-nspawn/nspawn-settings.c b/src/systemd-nspawn/nspawn-settings.c
index 0886451eb0..6c1614b276 100644
--- a/src/systemd-nspawn/nspawn-settings.c
+++ b/src/systemd-nspawn/nspawn-settings.c
@@ -102,9 +102,7 @@ Settings* settings_free(Settings *s) {
expose_port_free_all(s->expose_ports);
custom_mount_free_all(s->custom_mounts, s->n_custom_mounts);
- free(s);
-
- return NULL;
+ return mfree(s);
}
bool settings_private_network(Settings *s) {
diff --git a/src/systemd-nspawn/nspawn.c b/src/systemd-nspawn/nspawn.c
index f2cbae2ddb..9514152b5b 100644
--- a/src/systemd-nspawn/nspawn.c
+++ b/src/systemd-nspawn/nspawn.c
@@ -170,7 +170,6 @@ static CustomMount *arg_custom_mounts = NULL;
static unsigned arg_n_custom_mounts = 0;
static char **arg_setenv = NULL;
static bool arg_quiet = false;
-static bool arg_share_system = false;
static bool arg_register = true;
static bool arg_keep_unit = false;
static char **arg_network_interfaces = NULL;
@@ -189,12 +188,14 @@ static UserNamespaceMode arg_userns_mode = USER_NAMESPACE_NO;
static uid_t arg_uid_shift = UID_INVALID, arg_uid_range = 0x10000U;
static bool arg_userns_chown = false;
static int arg_kill_signal = 0;
-static bool arg_unified_cgroup_hierarchy = false;
+static CGroupUnified arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_UNKNOWN;
static SettingsMask arg_settings_mask = 0;
static int arg_settings_trusted = -1;
static char **arg_parameters = NULL;
static const char *arg_container_service_name = "systemd-nspawn";
static bool arg_notify_ready = false;
+static bool arg_use_cgns = true;
+static unsigned long arg_clone_ns_flags = CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS;
static void help(void) {
printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
@@ -216,10 +217,10 @@ static void help(void) {
" --uuid=UUID Set a specific machine UUID for the container\n"
" -S --slice=SLICE Place the container in the specified slice\n"
" --property=NAME=VALUE Set scope unit property\n"
- " -U --private-users=pick Run within user namespace, pick UID/GID range automatically\n"
+ " -U --private-users=pick Run within user namespace, autoselect UID/GID range\n"
" --private-users[=UIDBASE[:NUIDS]]\n"
- " Run within user namespace, user configured UID/GID range\n"
- " --private-user-chown Adjust OS tree file ownership for private UID/GID range\n"
+ " Similar, but with user configured UID/GID range\n"
+ " --private-users-chown Adjust OS tree ownership to private UID/GID range\n"
" --private-network Disable network in container\n"
" --network-interface=INTERFACE\n"
" Assign an existing network interface to the\n"
@@ -236,11 +237,10 @@ static void help(void) {
" Add an additional virtual Ethernet link between\n"
" host and container\n"
" --network-bridge=INTERFACE\n"
- " Add a virtual Ethernet connection between host\n"
- " and container and add it to an existing bridge on\n"
- " the host\n"
- " --network-zone=NAME Add a virtual Ethernet connection to the container,\n"
- " and add it to an automatically managed bridge interface\n"
+ " Add a virtual Ethernet connection to the container\n"
+ " and attach it to an existing bridge on the host\n"
+ " --network-zone=NAME Similar, but attach the new interface to an\n"
+ " an automatically managed bridge interface\n"
" -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
" Expose a container IP port on the host\n"
" -Z --selinux-context=SECLABEL\n"
@@ -269,14 +269,12 @@ static void help(void) {
" --overlay-ro=PATH[:PATH...]:PATH\n"
" Similar, but creates a read-only overlay mount\n"
" -E --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
- " --share-system Share system namespaces with host\n"
" --register=BOOLEAN Register container as machine\n"
" --keep-unit Do not register a scope for the machine, reuse\n"
" the service unit nspawn is running in\n"
" --volatile[=MODE] Run the system in volatile mode\n"
" --settings=BOOLEAN Load additional settings from .nspawn file\n"
- " --notify-ready=BOOLEAN Receive notifications from the container's init process,\n"
- " accepted values: yes and no\n"
+ " --notify-ready=BOOLEAN Receive notifications from the child init process\n"
, program_invocation_short_name);
}
@@ -319,9 +317,9 @@ static int custom_mounts_prepare(void) {
return 0;
}
-static int detect_unified_cgroup_hierarchy(void) {
+static int detect_unified_cgroup_hierarchy(const char *directory) {
const char *e;
- int r;
+ int r, all_unified, systemd_unified;
/* Allow the user to control whether the unified hierarchy is used */
e = getenv("UNIFIED_CGROUP_HIERARCHY");
@@ -329,20 +327,58 @@ static int detect_unified_cgroup_hierarchy(void) {
r = parse_boolean(e);
if (r < 0)
return log_error_errno(r, "Failed to parse $UNIFIED_CGROUP_HIERARCHY.");
+ if (r > 0)
+ arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_ALL;
+ else
+ arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
- arg_unified_cgroup_hierarchy = r;
return 0;
}
+ all_unified = cg_all_unified();
+ systemd_unified = cg_unified(SYSTEMD_CGROUP_CONTROLLER);
+
+ if (all_unified < 0 || systemd_unified < 0)
+ return log_error_errno(all_unified < 0 ? all_unified : systemd_unified,
+ "Failed to determine whether the unified cgroups hierarchy is used: %m");
+
/* Otherwise inherit the default from the host system */
- r = cg_unified();
- if (r < 0)
- return log_error_errno(r, "Failed to determine whether the unified cgroups hierarchy is used: %m");
+ if (all_unified > 0) {
+ /* Unified cgroup hierarchy support was added in 230. Unfortunately the detection
+ * routine only detects 231, so we'll have a false negative here for 230. */
+ r = systemd_installation_has_version(directory, 230);
+ if (r < 0)
+ return log_error_errno(r, "Failed to determine systemd version in container: %m");
+ if (r > 0)
+ arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_ALL;
+ else
+ arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
+ } else if (systemd_unified > 0) {
+ /* Mixed cgroup hierarchy support was added in 232 */
+ r = systemd_installation_has_version(directory, 232);
+ if (r < 0)
+ return log_error_errno(r, "Failed to determine systemd version in container: %m");
+ if (r > 0)
+ arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_SYSTEMD;
+ else
+ arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
+ } else
+ arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
- arg_unified_cgroup_hierarchy = r;
return 0;
}
+static void parse_share_ns_env(const char *name, unsigned long ns_flag) {
+ int r;
+
+ r = getenv_bool(name);
+ if (r == -ENXIO)
+ return;
+ if (r < 0)
+ log_warning_errno(r, "Failed to parse %s from environment, defaulting to false.", name);
+ arg_clone_ns_flags = (arg_clone_ns_flags & ~ns_flag) | (r > 0 ? 0 : ns_flag);
+}
+
static int parse_argv(int argc, char *argv[]) {
enum {
@@ -380,52 +416,52 @@ static int parse_argv(int argc, char *argv[]) {
};
static const struct option options[] = {
- { "help", no_argument, NULL, 'h' },
- { "version", no_argument, NULL, ARG_VERSION },
- { "directory", required_argument, NULL, 'D' },
- { "template", required_argument, NULL, ARG_TEMPLATE },
- { "ephemeral", no_argument, NULL, 'x' },
- { "user", required_argument, NULL, 'u' },
- { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
- { "as-pid2", no_argument, NULL, 'a' },
- { "boot", no_argument, NULL, 'b' },
- { "uuid", required_argument, NULL, ARG_UUID },
- { "read-only", no_argument, NULL, ARG_READ_ONLY },
- { "capability", required_argument, NULL, ARG_CAPABILITY },
- { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
- { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
- { "bind", required_argument, NULL, ARG_BIND },
- { "bind-ro", required_argument, NULL, ARG_BIND_RO },
- { "tmpfs", required_argument, NULL, ARG_TMPFS },
- { "overlay", required_argument, NULL, ARG_OVERLAY },
- { "overlay-ro", required_argument, NULL, ARG_OVERLAY_RO },
- { "machine", required_argument, NULL, 'M' },
- { "slice", required_argument, NULL, 'S' },
- { "setenv", required_argument, NULL, 'E' },
- { "selinux-context", required_argument, NULL, 'Z' },
- { "selinux-apifs-context", required_argument, NULL, 'L' },
- { "quiet", no_argument, NULL, 'q' },
- { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM },
- { "register", required_argument, NULL, ARG_REGISTER },
- { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT },
- { "network-interface", required_argument, NULL, ARG_NETWORK_INTERFACE },
- { "network-macvlan", required_argument, NULL, ARG_NETWORK_MACVLAN },
- { "network-ipvlan", required_argument, NULL, ARG_NETWORK_IPVLAN },
- { "network-veth", no_argument, NULL, 'n' },
- { "network-veth-extra", required_argument, NULL, ARG_NETWORK_VETH_EXTRA},
- { "network-bridge", required_argument, NULL, ARG_NETWORK_BRIDGE },
- { "network-zone", required_argument, NULL, ARG_NETWORK_ZONE },
- { "personality", required_argument, NULL, ARG_PERSONALITY },
- { "image", required_argument, NULL, 'i' },
- { "volatile", optional_argument, NULL, ARG_VOLATILE },
- { "port", required_argument, NULL, 'p' },
- { "property", required_argument, NULL, ARG_PROPERTY },
- { "private-users", optional_argument, NULL, ARG_PRIVATE_USERS },
- { "private-users-chown", optional_argument, NULL, ARG_PRIVATE_USERS_CHOWN},
- { "kill-signal", required_argument, NULL, ARG_KILL_SIGNAL },
- { "settings", required_argument, NULL, ARG_SETTINGS },
- { "chdir", required_argument, NULL, ARG_CHDIR },
- { "notify-ready", required_argument, NULL, ARG_NOTIFY_READY },
+ { "help", no_argument, NULL, 'h' },
+ { "version", no_argument, NULL, ARG_VERSION },
+ { "directory", required_argument, NULL, 'D' },
+ { "template", required_argument, NULL, ARG_TEMPLATE },
+ { "ephemeral", no_argument, NULL, 'x' },
+ { "user", required_argument, NULL, 'u' },
+ { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
+ { "as-pid2", no_argument, NULL, 'a' },
+ { "boot", no_argument, NULL, 'b' },
+ { "uuid", required_argument, NULL, ARG_UUID },
+ { "read-only", no_argument, NULL, ARG_READ_ONLY },
+ { "capability", required_argument, NULL, ARG_CAPABILITY },
+ { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
+ { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
+ { "bind", required_argument, NULL, ARG_BIND },
+ { "bind-ro", required_argument, NULL, ARG_BIND_RO },
+ { "tmpfs", required_argument, NULL, ARG_TMPFS },
+ { "overlay", required_argument, NULL, ARG_OVERLAY },
+ { "overlay-ro", required_argument, NULL, ARG_OVERLAY_RO },
+ { "machine", required_argument, NULL, 'M' },
+ { "slice", required_argument, NULL, 'S' },
+ { "setenv", required_argument, NULL, 'E' },
+ { "selinux-context", required_argument, NULL, 'Z' },
+ { "selinux-apifs-context", required_argument, NULL, 'L' },
+ { "quiet", no_argument, NULL, 'q' },
+ { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM }, /* not documented */
+ { "register", required_argument, NULL, ARG_REGISTER },
+ { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT },
+ { "network-interface", required_argument, NULL, ARG_NETWORK_INTERFACE },
+ { "network-macvlan", required_argument, NULL, ARG_NETWORK_MACVLAN },
+ { "network-ipvlan", required_argument, NULL, ARG_NETWORK_IPVLAN },
+ { "network-veth", no_argument, NULL, 'n' },
+ { "network-veth-extra", required_argument, NULL, ARG_NETWORK_VETH_EXTRA },
+ { "network-bridge", required_argument, NULL, ARG_NETWORK_BRIDGE },
+ { "network-zone", required_argument, NULL, ARG_NETWORK_ZONE },
+ { "personality", required_argument, NULL, ARG_PERSONALITY },
+ { "image", required_argument, NULL, 'i' },
+ { "volatile", optional_argument, NULL, ARG_VOLATILE },
+ { "port", required_argument, NULL, 'p' },
+ { "property", required_argument, NULL, ARG_PROPERTY },
+ { "private-users", optional_argument, NULL, ARG_PRIVATE_USERS },
+ { "private-users-chown", optional_argument, NULL, ARG_PRIVATE_USERS_CHOWN },
+ { "kill-signal", required_argument, NULL, ARG_KILL_SIGNAL },
+ { "settings", required_argument, NULL, ARG_SETTINGS },
+ { "chdir", required_argument, NULL, ARG_CHDIR },
+ { "notify-ready", required_argument, NULL, ARG_NOTIFY_READY },
{}
};
@@ -814,7 +850,9 @@ static int parse_argv(int argc, char *argv[]) {
break;
case ARG_SHARE_SYSTEM:
- arg_share_system = true;
+ /* We don't officially support this anymore, except for compat reasons. People should use the
+ * $SYSTEMD_NSPAWN_SHARE_* environment variables instead. */
+ arg_clone_ns_flags = 0;
break;
case ARG_REGISTER:
@@ -876,15 +914,21 @@ static int parse_argv(int argc, char *argv[]) {
break;
- case ARG_PRIVATE_USERS:
+ case ARG_PRIVATE_USERS: {
+ int boolean = -1;
- r = optarg ? parse_boolean(optarg) : 1;
- if (r == 0) {
+ if (!optarg)
+ boolean = true;
+ else if (!in_charset(optarg, DIGITS))
+ /* do *not* parse numbers as booleans */
+ boolean = parse_boolean(optarg);
+
+ if (boolean == false) {
/* no: User namespacing off */
arg_userns_mode = USER_NAMESPACE_NO;
arg_uid_shift = UID_INVALID;
arg_uid_range = UINT32_C(0x10000);
- } else if (r > 0) {
+ } else if (boolean == true) {
/* yes: User namespacing on, UID range is read from root dir */
arg_userns_mode = USER_NAMESPACE_FIXED;
arg_uid_shift = UID_INVALID;
@@ -908,23 +952,27 @@ static int parse_argv(int argc, char *argv[]) {
shift = buffer;
range++;
- if (safe_atou32(range, &arg_uid_range) < 0 || arg_uid_range <= 0) {
- log_error("Failed to parse UID range: %s", range);
- return -EINVAL;
- }
+ r = safe_atou32(range, &arg_uid_range);
+ if (r < 0)
+ return log_error_errno(r, "Failed to parse UID range \"%s\": %m", range);
} else
shift = optarg;
- if (parse_uid(shift, &arg_uid_shift) < 0) {
- log_error("Failed to parse UID: %s", optarg);
- return -EINVAL;
- }
+ r = parse_uid(shift, &arg_uid_shift);
+ if (r < 0)
+ return log_error_errno(r, "Failed to parse UID \"%s\": %m", optarg);
arg_userns_mode = USER_NAMESPACE_FIXED;
}
+ if (arg_uid_range <= 0) {
+ log_error("UID range cannot be 0.");
+ return -EINVAL;
+ }
+
arg_settings_mask |= SETTING_USERNS;
break;
+ }
case 'U':
if (userns_supported()) {
@@ -1018,17 +1066,23 @@ static int parse_argv(int argc, char *argv[]) {
assert_not_reached("Unhandled option");
}
- if (arg_share_system)
+ parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_IPC", CLONE_NEWIPC);
+ parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_PID", CLONE_NEWPID);
+ parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_UTS", CLONE_NEWUTS);
+ parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_SYSTEM", CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS);
+
+ if (!(arg_clone_ns_flags & CLONE_NEWPID) ||
+ !(arg_clone_ns_flags & CLONE_NEWUTS)) {
arg_register = false;
+ if (arg_start_mode != START_PID1) {
+ log_error("--boot cannot be used without namespacing.");
+ return -EINVAL;
+ }
+ }
if (arg_userns_mode == USER_NAMESPACE_PICK)
arg_userns_chown = true;
- if (arg_start_mode != START_PID1 && arg_share_system) {
- log_error("--boot and --share-system may not be combined.");
- return -EINVAL;
- }
-
if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
log_error("--keep-unit may not be used when invoked from a user session.");
return -EINVAL;
@@ -1097,14 +1151,16 @@ static int parse_argv(int argc, char *argv[]) {
arg_caps_retain = (arg_caps_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
- r = detect_unified_cgroup_hierarchy();
- if (r < 0)
- return r;
-
e = getenv("SYSTEMD_NSPAWN_CONTAINER_SERVICE");
if (e)
arg_container_service_name = e;
+ r = getenv_bool("SYSTEMD_NSPAWN_USE_CGNS");
+ if (r < 0)
+ arg_use_cgns = cg_ns_supported();
+ else
+ arg_use_cgns = r;
+
return 1;
}
@@ -1186,7 +1242,13 @@ static int setup_timezone(const char *dest) {
/* Fix the timezone, if possible */
r = readlink_malloc("/etc/localtime", &p);
if (r < 0) {
- log_warning("/etc/localtime is not a symlink, not updating container timezone.");
+ log_warning("host's /etc/localtime is not a symlink, not updating container timezone.");
+ /* to handle warning, delete /etc/localtime and replace it
+ * with a symbolic link to a time zone data file.
+ *
+ * Example:
+ * ln -s /usr/share/zoneinfo/UTC /etc/localtime
+ */
return 0;
}
@@ -1275,9 +1337,6 @@ static int setup_boot_id(const char *dest) {
const char *from, *to;
int r;
- if (arg_share_system)
- return 0;
-
/* Generate a new randomized boot ID, so that each boot-up of
* the container gets a new one */
@@ -1292,10 +1351,10 @@ static int setup_boot_id(const char *dest) {
if (r < 0)
return log_error_errno(r, "Failed to write boot id: %m");
- if (mount(from, to, NULL, MS_BIND, NULL) < 0)
- r = log_error_errno(errno, "Failed to bind mount boot id: %m");
- else if (mount(NULL, to, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL) < 0)
- log_warning_errno(errno, "Failed to make boot id read-only, ignoring: %m");
+ r = mount_verbose(LOG_ERR, from, to, NULL, MS_BIND, NULL);
+ if (r >= 0)
+ r = mount_verbose(LOG_ERR, NULL, to, NULL,
+ MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL);
(void) unlink(from);
return r;
@@ -1343,6 +1402,12 @@ static int copy_devnodes(const char *dest) {
} else {
if (mknod(to, st.st_mode, st.st_rdev) < 0) {
+ /*
+ * This is some sort of protection too against
+ * recursive userns chown on shared /dev/
+ */
+ if (errno == EEXIST)
+ log_notice("%s/dev/ should be an empty directory", dest);
if (errno != EPERM)
return log_error_errno(errno, "mknod(%s) failed: %m", to);
@@ -1351,8 +1416,9 @@ static int copy_devnodes(const char *dest) {
r = touch(to);
if (r < 0)
return log_error_errno(r, "touch (%s) failed: %m", to);
- if (mount(from, to, NULL, MS_BIND, NULL) < 0)
- return log_error_errno(errno, "Both mknod and bind mount (%s) failed: %m", to);
+ r = mount_verbose(LOG_DEBUG, from, to, NULL, MS_BIND, NULL);
+ if (r < 0)
+ return log_error_errno(r, "Both mknod and bind mount (%s) failed: %m", to);
}
r = userns_lchown(to, 0, 0);
@@ -1388,8 +1454,9 @@ static int setup_pts(const char *dest) {
p = prefix_roota(dest, "/dev/pts");
if (mkdir(p, 0755) < 0)
return log_error_errno(errno, "Failed to create /dev/pts: %m");
- if (mount("devpts", p, "devpts", MS_NOSUID|MS_NOEXEC, options) < 0)
- return log_error_errno(errno, "Failed to mount /dev/pts: %m");
+ r = mount_verbose(LOG_ERR, "devpts", p, "devpts", MS_NOSUID|MS_NOEXEC, options);
+ if (r < 0)
+ return r;
r = userns_lchown(p, 0, 0);
if (r < 0)
return log_error_errno(r, "Failed to chown /dev/pts: %m");
@@ -1434,10 +1501,7 @@ static int setup_dev_console(const char *dest, const char *console) {
if (r < 0)
return log_error_errno(r, "touch() for /dev/console failed: %m");
- if (mount(console, to, NULL, MS_BIND, NULL) < 0)
- return log_error_errno(errno, "Bind mount for /dev/console failed: %m");
-
- return 0;
+ return mount_verbose(LOG_ERR, console, to, NULL, MS_BIND, NULL);
}
static int setup_kmsg(const char *dest, int kmsg_socket) {
@@ -1461,8 +1525,9 @@ static int setup_kmsg(const char *dest, int kmsg_socket) {
if (mkfifo(from, 0600) < 0)
return log_error_errno(errno, "mkfifo() for /run/kmsg failed: %m");
- if (mount(from, to, NULL, MS_BIND, NULL) < 0)
- return log_error_errno(errno, "Bind mount for /proc/kmsg failed: %m");
+ r = mount_verbose(LOG_ERR, from, to, NULL, MS_BIND, NULL);
+ if (r < 0)
+ return r;
fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
if (fd < 0)
@@ -1495,7 +1560,7 @@ static int on_address_change(sd_netlink *rtnl, sd_netlink_message *m, void *user
static int setup_hostname(void) {
- if (arg_share_system)
+ if ((arg_clone_ns_flags & CLONE_NEWUTS) == 0)
return 0;
if (sethostname_idempotent(arg_machine) < 0)
@@ -1632,7 +1697,8 @@ static int setup_journal(const char *directory) {
if (r < 0)
return log_error_errno(r, "Failed to create %s: %m", q);
- if (mount(p, q, NULL, MS_BIND, NULL) < 0)
+ r = mount_verbose(LOG_DEBUG, p, q, NULL, MS_BIND, NULL);
+ if (r < 0)
return log_error_errno(errno, "Failed to bind mount journal from host into guest: %m");
return 0;
@@ -1646,7 +1712,7 @@ static int reset_audit_loginuid(void) {
_cleanup_free_ char *p = NULL;
int r;
- if (arg_share_system)
+ if ((arg_clone_ns_flags & CLONE_NEWPID) == 0)
return 0;
r = read_one_line_file("/proc/self/loginuid", &p);
@@ -1697,13 +1763,17 @@ static int setup_propagate(const char *root) {
return log_error_errno(r, "Failed to create /run/systemd/nspawn/incoming: %m");
q = prefix_roota(root, "/run/systemd/nspawn/incoming");
- if (mount(p, q, NULL, MS_BIND, NULL) < 0)
- return log_error_errno(errno, "Failed to install propagation bind mount.");
+ r = mount_verbose(LOG_ERR, p, q, NULL, MS_BIND, NULL);
+ if (r < 0)
+ return r;
- if (mount(NULL, q, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL) < 0)
- return log_error_errno(errno, "Failed to make propagation mount read-only");
+ r = mount_verbose(LOG_ERR, NULL, q, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
+ if (r < 0)
+ return r;
- return 0;
+ /* machined will MS_MOVE into that directory, and that's only
+ * supported for non-shared mounts. */
+ return mount_verbose(LOG_ERR, NULL, q, NULL, MS_SLAVE, NULL);
}
static int setup_image(char **device_path, int *loop_nr) {
@@ -1795,17 +1865,18 @@ static int dissect_image(
char **root_device, bool *root_device_rw,
char **home_device, bool *home_device_rw,
char **srv_device, bool *srv_device_rw,
+ char **esp_device,
bool *secondary) {
#ifdef HAVE_BLKID
- int home_nr = -1, srv_nr = -1;
+ int home_nr = -1, srv_nr = -1, esp_nr = -1;
#ifdef GPT_ROOT_NATIVE
int root_nr = -1;
#endif
#ifdef GPT_ROOT_SECONDARY
int secondary_root_nr = -1;
#endif
- _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL, *generic = NULL;
+ _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL, *esp = NULL, *generic = NULL;
_cleanup_udev_enumerate_unref_ struct udev_enumerate *e = NULL;
_cleanup_udev_device_unref_ struct udev_device *d = NULL;
_cleanup_blkid_free_probe_ blkid_probe b = NULL;
@@ -1823,6 +1894,7 @@ static int dissect_image(
assert(root_device);
assert(home_device);
assert(srv_device);
+ assert(esp_device);
assert(secondary);
assert(arg_image);
@@ -2036,6 +2108,16 @@ static int dissect_image(
r = free_and_strdup(&srv, node);
if (r < 0)
return log_oom();
+ } else if (sd_id128_equal(type_id, GPT_ESP)) {
+
+ if (esp && nr >= esp_nr)
+ continue;
+
+ esp_nr = nr;
+
+ r = free_and_strdup(&esp, node);
+ if (r < 0)
+ return log_oom();
}
#ifdef GPT_ROOT_NATIVE
else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE)) {
@@ -2153,6 +2235,11 @@ static int dissect_image(
*srv_device_rw = srv_rw;
}
+ if (esp) {
+ *esp_device = esp;
+ esp = NULL;
+ }
+
return 0;
#else
log_error("--image= is not supported, compiled without blkid support.");
@@ -2163,7 +2250,7 @@ static int dissect_image(
static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
#ifdef HAVE_BLKID
_cleanup_blkid_free_probe_ blkid_probe b = NULL;
- const char *fstype, *p;
+ const char *fstype, *p, *options;
int r;
assert(what);
@@ -2212,10 +2299,17 @@ static int mount_device(const char *what, const char *where, const char *directo
return -EOPNOTSUPP;
}
- if (mount(what, p, fstype, MS_NODEV|(rw ? 0 : MS_RDONLY), NULL) < 0)
- return log_error_errno(errno, "Failed to mount %s: %m", what);
+ /* If this is a loopback device then let's mount the image with discard, so that the underlying file remains
+ * sparse when possible. */
+ if (STR_IN_SET(fstype, "btrfs", "ext4", "vfat", "xfs")) {
+ const char *l;
- return 0;
+ l = path_startswith(what, "/dev");
+ if (l && startswith(l, "loop"))
+ options = "discard";
+ }
+
+ return mount_verbose(LOG_ERR, what, p, fstype, MS_NODEV|(rw ? 0 : MS_RDONLY), options);
#else
log_error("--image= is not supported, compiled without blkid support.");
return -EOPNOTSUPP;
@@ -2285,7 +2379,8 @@ static int mount_devices(
const char *where,
const char *root_device, bool root_device_rw,
const char *home_device, bool home_device_rw,
- const char *srv_device, bool srv_device_rw) {
+ const char *srv_device, bool srv_device_rw,
+ const char *esp_device) {
int r;
assert(where);
@@ -2308,6 +2403,27 @@ static int mount_devices(
return log_error_errno(r, "Failed to mount server data directory: %m");
}
+ if (esp_device) {
+ const char *mp, *x;
+
+ /* Mount the ESP to /efi if it exists and is empty. If it doesn't exist, use /boot instead. */
+
+ mp = "/efi";
+ x = strjoina(arg_directory, mp);
+ r = dir_is_empty(x);
+ if (r == -ENOENT) {
+ mp = "/boot";
+ x = strjoina(arg_directory, mp);
+ r = dir_is_empty(x);
+ }
+
+ if (r > 0) {
+ r = mount_device(esp_device, arg_directory, mp, true);
+ if (r < 0)
+ return log_error_errno(r, "Failed to mount ESP: %m");
+ }
+ }
+
return 0;
}
@@ -2568,6 +2684,10 @@ static int inner_child(
}
}
+ r = reset_uid_gid();
+ if (r < 0)
+ return log_error_errno(r, "Couldn't become new root: %m");
+
r = mount_all(NULL,
arg_userns_mode != USER_NAMESPACE_NO,
true,
@@ -2590,13 +2710,25 @@ static int inner_child(
return -ESRCH;
}
- r = mount_systemd_cgroup_writable("", arg_unified_cgroup_hierarchy);
- if (r < 0)
- return r;
-
- r = reset_uid_gid();
- if (r < 0)
- return log_error_errno(r, "Couldn't become new root: %m");
+ if (arg_use_cgns && cg_ns_supported()) {
+ r = unshare(CLONE_NEWCGROUP);
+ if (r < 0)
+ return log_error_errno(errno, "Failed to unshare cgroup namespace");
+ r = mount_cgroups(
+ "",
+ arg_unified_cgroup_hierarchy,
+ arg_userns_mode != USER_NAMESPACE_NO,
+ arg_uid_shift,
+ arg_uid_range,
+ arg_selinux_apifs_context,
+ true);
+ if (r < 0)
+ return r;
+ } else {
+ r = mount_systemd_cgroup_writable("", arg_unified_cgroup_hierarchy);
+ if (r < 0)
+ return r;
+ }
r = setup_boot_id(NULL);
if (r < 0)
@@ -2781,6 +2913,7 @@ static int outer_child(
const char *root_device, bool root_device_rw,
const char *home_device, bool home_device_rw,
const char *srv_device, bool srv_device_rw,
+ const char *esp_device,
bool interactive,
bool secondary,
int pid_socket,
@@ -2836,13 +2969,15 @@ static int outer_child(
/* Mark everything as slave, so that we still
* receive mounts from the real root, but don't
* propagate mounts to the real root. */
- if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0)
- return log_error_errno(errno, "MS_SLAVE|MS_REC failed: %m");
+ r = mount_verbose(LOG_ERR, NULL, "/", NULL, MS_SLAVE|MS_REC, NULL);
+ if (r < 0)
+ return r;
r = mount_devices(directory,
root_device, root_device_rw,
home_device, home_device_rw,
- srv_device, srv_device_rw);
+ srv_device, srv_device_rw,
+ esp_device);
if (r < 0)
return r;
@@ -2850,6 +2985,10 @@ static int outer_child(
if (r < 0)
return r;
+ r = detect_unified_cgroup_hierarchy(directory);
+ if (r < 0)
+ return r;
+
if (arg_userns_mode != USER_NAMESPACE_NO) {
/* Let the parent know which UID shift we read from the image */
l = send(uid_shift_socket, &arg_uid_shift, sizeof(arg_uid_shift), MSG_NOSIGNAL);
@@ -2878,8 +3017,19 @@ static int outer_child(
}
/* Turn directory into bind mount */
- if (mount(directory, directory, NULL, MS_BIND|MS_REC, NULL) < 0)
- return log_error_errno(errno, "Failed to make bind mount: %m");
+ r = mount_verbose(LOG_ERR, directory, directory, NULL, MS_BIND|MS_REC, NULL);
+ if (r < 0)
+ return r;
+
+ /* Mark everything as shared so our mounts get propagated down. This is
+ * required to make new bind mounts available in systemd services
+ * inside the containter that create a new mount namespace.
+ * See https://github.com/systemd/systemd/issues/3860
+ * Further submounts (such as /dev) done after this will inherit the
+ * shared propagation mode.*/
+ r = mount_verbose(LOG_ERR, NULL, directory, NULL, MS_SHARED|MS_REC, NULL);
+ if (r < 0)
+ return r;
r = recursive_chown(directory, arg_uid_shift, arg_uid_range);
if (r < 0)
@@ -2910,7 +3060,7 @@ static int outer_child(
return r;
if (arg_read_only) {
- r = bind_remount_recursive(directory, true);
+ r = bind_remount_recursive(directory, true, NULL);
if (r < 0)
return log_error_errno(r, "Failed to make tree read-only: %m");
}
@@ -2974,15 +3124,18 @@ static int outer_child(
if (r < 0)
return r;
- r = mount_cgroups(
- directory,
- arg_unified_cgroup_hierarchy,
- arg_userns_mode != USER_NAMESPACE_NO,
- arg_uid_shift,
- arg_uid_range,
- arg_selinux_apifs_context);
- if (r < 0)
- return r;
+ if (!arg_use_cgns || !cg_ns_supported()) {
+ r = mount_cgroups(
+ directory,
+ arg_unified_cgroup_hierarchy,
+ arg_userns_mode != USER_NAMESPACE_NO,
+ arg_uid_shift,
+ arg_uid_range,
+ arg_selinux_apifs_context,
+ false);
+ if (r < 0)
+ return r;
+ }
r = mount_move_root(directory);
if (r < 0)
@@ -2993,7 +3146,7 @@ static int outer_child(
return fd;
pid = raw_clone(SIGCHLD|CLONE_NEWNS|
- (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS) |
+ arg_clone_ns_flags |
(arg_private_network ? CLONE_NEWNET : 0) |
(arg_userns_mode != USER_NAMESPACE_NO ? CLONE_NEWUSER : 0));
if (pid < 0)
@@ -3443,18 +3596,437 @@ static int load_settings(void) {
return 0;
}
+static int run(int master,
+ const char* console,
+ const char *root_device, bool root_device_rw,
+ const char *home_device, bool home_device_rw,
+ const char *srv_device, bool srv_device_rw,
+ const char *esp_device,
+ bool interactive,
+ bool secondary,
+ FDSet *fds,
+ char veth_name[IFNAMSIZ], bool *veth_created,
+ union in_addr_union *exposed,
+ pid_t *pid, int *ret) {
+
+ static const struct sigaction sa = {
+ .sa_handler = nop_signal_handler,
+ .sa_flags = SA_NOCLDSTOP,
+ };
+
+ _cleanup_release_lock_file_ LockFile uid_shift_lock = LOCK_FILE_INIT;
+ _cleanup_close_ int etc_passwd_lock = -1;
+ _cleanup_close_pair_ int
+ kmsg_socket_pair[2] = { -1, -1 },
+ rtnl_socket_pair[2] = { -1, -1 },
+ pid_socket_pair[2] = { -1, -1 },
+ uuid_socket_pair[2] = { -1, -1 },
+ notify_socket_pair[2] = { -1, -1 },
+ uid_shift_socket_pair[2] = { -1, -1 };
+ _cleanup_close_ int notify_socket= -1;
+ _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
+ _cleanup_(sd_event_unrefp) sd_event *event = NULL;
+ _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
+ _cleanup_(sd_netlink_unrefp) sd_netlink *rtnl = NULL;
+ ContainerStatus container_status = 0;
+ char last_char = 0;
+ int ifi = 0, r;
+ ssize_t l;
+ sigset_t mask_chld;
+
+ assert_se(sigemptyset(&mask_chld) == 0);
+ assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
+
+ if (arg_userns_mode == USER_NAMESPACE_PICK) {
+ /* When we shall pick the UID/GID range, let's first lock /etc/passwd, so that we can safely
+ * check with getpwuid() if the specific user already exists. Note that /etc might be
+ * read-only, in which case this will fail with EROFS. But that's really OK, as in that case we
+ * can be reasonably sure that no users are going to be added. Note that getpwuid() checks are
+ * really just an extra safety net. We kinda assume that the UID range we allocate from is
+ * really ours. */
+
+ etc_passwd_lock = take_etc_passwd_lock(NULL);
+ if (etc_passwd_lock < 0 && etc_passwd_lock != -EROFS)
+ return log_error_errno(etc_passwd_lock, "Failed to take /etc/passwd lock: %m");
+ }
+
+ r = barrier_create(&barrier);
+ if (r < 0)
+ return log_error_errno(r, "Cannot initialize IPC barrier: %m");
+
+ if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0)
+ return log_error_errno(errno, "Failed to create kmsg socket pair: %m");
+
+ if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, rtnl_socket_pair) < 0)
+ return log_error_errno(errno, "Failed to create rtnl socket pair: %m");
+
+ if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, pid_socket_pair) < 0)
+ return log_error_errno(errno, "Failed to create pid socket pair: %m");
+
+ if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, uuid_socket_pair) < 0)
+ return log_error_errno(errno, "Failed to create id socket pair: %m");
+
+ if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, notify_socket_pair) < 0)
+ return log_error_errno(errno, "Failed to create notify socket pair: %m");
+
+ if (arg_userns_mode != USER_NAMESPACE_NO)
+ if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, uid_shift_socket_pair) < 0)
+ return log_error_errno(errno, "Failed to create uid shift socket pair: %m");
+
+ /* Child can be killed before execv(), so handle SIGCHLD in order to interrupt
+ * parent's blocking calls and give it a chance to call wait() and terminate. */
+ r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
+ if (r < 0)
+ return log_error_errno(errno, "Failed to change the signal mask: %m");
+
+ r = sigaction(SIGCHLD, &sa, NULL);
+ if (r < 0)
+ return log_error_errno(errno, "Failed to install SIGCHLD handler: %m");
+
+ *pid = raw_clone(SIGCHLD|CLONE_NEWNS);
+ if (*pid < 0)
+ return log_error_errno(errno, "clone() failed%s: %m",
+ errno == EINVAL ?
+ ", do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in)" : "");
+
+ if (*pid == 0) {
+ /* The outer child only has a file system namespace. */
+ barrier_set_role(&barrier, BARRIER_CHILD);
+
+ master = safe_close(master);
+
+ kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
+ rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
+ pid_socket_pair[0] = safe_close(pid_socket_pair[0]);
+ uuid_socket_pair[0] = safe_close(uuid_socket_pair[0]);
+ notify_socket_pair[0] = safe_close(notify_socket_pair[0]);
+ uid_shift_socket_pair[0] = safe_close(uid_shift_socket_pair[0]);
+
+ (void) reset_all_signal_handlers();
+ (void) reset_signal_mask();
+
+ r = outer_child(&barrier,
+ arg_directory,
+ console,
+ root_device, root_device_rw,
+ home_device, home_device_rw,
+ srv_device, srv_device_rw,
+ esp_device,
+ interactive,
+ secondary,
+ pid_socket_pair[1],
+ uuid_socket_pair[1],
+ notify_socket_pair[1],
+ kmsg_socket_pair[1],
+ rtnl_socket_pair[1],
+ uid_shift_socket_pair[1],
+ fds);
+ if (r < 0)
+ _exit(EXIT_FAILURE);
+
+ _exit(EXIT_SUCCESS);
+ }
+
+ barrier_set_role(&barrier, BARRIER_PARENT);
+
+ fds = fdset_free(fds);
+
+ kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
+ rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
+ pid_socket_pair[1] = safe_close(pid_socket_pair[1]);
+ uuid_socket_pair[1] = safe_close(uuid_socket_pair[1]);
+ notify_socket_pair[1] = safe_close(notify_socket_pair[1]);
+ uid_shift_socket_pair[1] = safe_close(uid_shift_socket_pair[1]);
+
+ if (arg_userns_mode != USER_NAMESPACE_NO) {
+ /* The child just let us know the UID shift it might have read from the image. */
+ l = recv(uid_shift_socket_pair[0], &arg_uid_shift, sizeof arg_uid_shift, 0);
+ if (l < 0)
+ return log_error_errno(errno, "Failed to read UID shift: %m");
+
+ if (l != sizeof arg_uid_shift) {
+ log_error("Short read while reading UID shift.");
+ return -EIO;
+ }
+
+ if (arg_userns_mode == USER_NAMESPACE_PICK) {
+ /* If we are supposed to pick the UID shift, let's try to use the shift read from the
+ * image, but if that's already in use, pick a new one, and report back to the child,
+ * which one we now picked. */
+
+ r = uid_shift_pick(&arg_uid_shift, &uid_shift_lock);
+ if (r < 0)
+ return log_error_errno(r, "Failed to pick suitable UID/GID range: %m");
+
+ l = send(uid_shift_socket_pair[0], &arg_uid_shift, sizeof arg_uid_shift, MSG_NOSIGNAL);
+ if (l < 0)
+ return log_error_errno(errno, "Failed to send UID shift: %m");
+ if (l != sizeof arg_uid_shift) {
+ log_error("Short write while writing UID shift.");
+ return -EIO;
+ }
+ }
+ }
+
+ /* Wait for the outer child. */
+ r = wait_for_terminate_and_warn("namespace helper", *pid, NULL);
+ if (r != 0)
+ return r < 0 ? r : -EIO;
+
+ /* And now retrieve the PID of the inner child. */
+ l = recv(pid_socket_pair[0], pid, sizeof *pid, 0);
+ if (l < 0)
+ return log_error_errno(errno, "Failed to read inner child PID: %m");
+ if (l != sizeof *pid) {
+ log_error("Short read while reading inner child PID.");
+ return -EIO;
+ }
+
+ /* We also retrieve container UUID in case it was generated by outer child */
+ l = recv(uuid_socket_pair[0], &arg_uuid, sizeof arg_uuid, 0);
+ if (l < 0)
+ return log_error_errno(errno, "Failed to read container machine ID: %m");
+ if (l != sizeof(arg_uuid)) {
+ log_error("Short read while reading container machined ID.");
+ return -EIO;
+ }
+
+ /* We also retrieve the socket used for notifications generated by outer child */
+ notify_socket = receive_one_fd(notify_socket_pair[0], 0);
+ if (notify_socket < 0)
+ return log_error_errno(notify_socket,
+ "Failed to receive notification socket from the outer child: %m");
+
+ log_debug("Init process invoked as PID "PID_FMT, *pid);
+
+ if (arg_userns_mode != USER_NAMESPACE_NO) {
+ if (!barrier_place_and_sync(&barrier)) { /* #1 */
+ log_error("Child died too early.");
+ return -ESRCH;
+ }
+
+ r = setup_uid_map(*pid);
+ if (r < 0)
+ return r;
+
+ (void) barrier_place(&barrier); /* #2 */
+ }
+
+ if (arg_private_network) {
+
+ r = move_network_interfaces(*pid, arg_network_interfaces);
+ if (r < 0)
+ return r;
+
+ if (arg_network_veth) {
+ r = setup_veth(arg_machine, *pid, veth_name,
+ arg_network_bridge || arg_network_zone);
+ if (r < 0)
+ return r;
+ else if (r > 0)
+ ifi = r;
+
+ if (arg_network_bridge) {
+ /* Add the interface to a bridge */
+ r = setup_bridge(veth_name, arg_network_bridge, false);
+ if (r < 0)
+ return r;
+ if (r > 0)
+ ifi = r;
+ } else if (arg_network_zone) {
+ /* Add the interface to a bridge, possibly creating it */
+ r = setup_bridge(veth_name, arg_network_zone, true);
+ if (r < 0)
+ return r;
+ if (r > 0)
+ ifi = r;
+ }
+ }
+
+ r = setup_veth_extra(arg_machine, *pid, arg_network_veth_extra);
+ if (r < 0)
+ return r;
+
+ /* We created the primary and extra veth links now; let's remember this, so that we know to
+ remove them later on. Note that we don't bother with removing veth links that were created
+ here when their setup failed half-way, because in that case the kernel should be able to
+ remove them on its own, since they cannot be referenced by anything yet. */
+ *veth_created = true;
+
+ r = setup_macvlan(arg_machine, *pid, arg_network_macvlan);
+ if (r < 0)
+ return r;
+
+ r = setup_ipvlan(arg_machine, *pid, arg_network_ipvlan);
+ if (r < 0)
+ return r;
+ }
+
+ if (arg_register) {
+ r = register_machine(
+ arg_machine,
+ *pid,
+ arg_directory,
+ arg_uuid,
+ ifi,
+ arg_slice,
+ arg_custom_mounts, arg_n_custom_mounts,
+ arg_kill_signal,
+ arg_property,
+ arg_keep_unit,
+ arg_container_service_name);
+ if (r < 0)
+ return r;
+ }
+
+ r = sync_cgroup(*pid, arg_unified_cgroup_hierarchy, arg_uid_shift);
+ if (r < 0)
+ return r;
+
+ if (arg_keep_unit) {
+ r = create_subcgroup(*pid, arg_unified_cgroup_hierarchy);
+ if (r < 0)
+ return r;
+ }
+
+ r = chown_cgroup(*pid, arg_uid_shift);
+ if (r < 0)
+ return r;
+
+ /* Notify the child that the parent is ready with all
+ * its setup (including cgroup-ification), and that
+ * the child can now hand over control to the code to
+ * run inside the container. */
+ (void) barrier_place(&barrier); /* #3 */
+
+ /* Block SIGCHLD here, before notifying child.
+ * process_pty() will handle it with the other signals. */
+ assert_se(sigprocmask(SIG_BLOCK, &mask_chld, NULL) >= 0);
+
+ /* Reset signal to default */
+ r = default_signals(SIGCHLD, -1);
+ if (r < 0)
+ return log_error_errno(r, "Failed to reset SIGCHLD: %m");
+
+ r = sd_event_new(&event);
+ if (r < 0)
+ return log_error_errno(r, "Failed to get default event source: %m");
+
+ r = setup_sd_notify_parent(event, notify_socket, PID_TO_PTR(*pid));
+ if (r < 0)
+ return r;
+
+ /* Let the child know that we are ready and wait that the child is completely ready now. */
+ if (!barrier_place_and_sync(&barrier)) { /* #4 */
+ log_error("Child died too early.");
+ return -ESRCH;
+ }
+
+ /* At this point we have made use of the UID we picked, and thus nss-mymachines
+ * will make them appear in getpwuid(), thus we can release the /etc/passwd lock. */
+ etc_passwd_lock = safe_close(etc_passwd_lock);
+
+ sd_notifyf(false,
+ "STATUS=Container running.\n"
+ "X_NSPAWN_LEADER_PID=" PID_FMT, *pid);
+ if (!arg_notify_ready)
+ sd_notify(false, "READY=1\n");
+
+ if (arg_kill_signal > 0) {
+ /* Try to kill the init system on SIGINT or SIGTERM */
+ sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, PID_TO_PTR(*pid));
+ sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, PID_TO_PTR(*pid));
+ } else {
+ /* Immediately exit */
+ sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
+ sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
+ }
+
+ /* simply exit on sigchld */
+ sd_event_add_signal(event, NULL, SIGCHLD, NULL, NULL);
+
+ if (arg_expose_ports) {
+ r = expose_port_watch_rtnl(event, rtnl_socket_pair[0], on_address_change, exposed, &rtnl);
+ if (r < 0)
+ return r;
+
+ (void) expose_port_execute(rtnl, arg_expose_ports, exposed);
+ }
+
+ rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
+
+ r = pty_forward_new(event, master,
+ PTY_FORWARD_IGNORE_VHANGUP | (interactive ? 0 : PTY_FORWARD_READ_ONLY),
+ &forward);
+ if (r < 0)
+ return log_error_errno(r, "Failed to create PTY forwarder: %m");
+
+ r = sd_event_loop(event);
+ if (r < 0)
+ return log_error_errno(r, "Failed to run event loop: %m");
+
+ pty_forward_get_last_char(forward, &last_char);
+
+ forward = pty_forward_free(forward);
+
+ if (!arg_quiet && last_char != '\n')
+ putc('\n', stdout);
+
+ /* Kill if it is not dead yet anyway */
+ if (arg_register && !arg_keep_unit)
+ terminate_machine(*pid);
+
+ /* Normally redundant, but better safe than sorry */
+ kill(*pid, SIGKILL);
+
+ r = wait_for_container(*pid, &container_status);
+ *pid = 0;
+
+ if (r < 0)
+ /* We failed to wait for the container, or the container exited abnormally. */
+ return r;
+ if (r > 0 || container_status == CONTAINER_TERMINATED) {
+ /* r > 0 → The container exited with a non-zero status.
+ * As a special case, we need to replace 133 with a different value,
+ * because 133 is special-cased in the service file to reboot the container.
+ * otherwise → The container exited with zero status and a reboot was not requested.
+ */
+ if (r == 133)
+ r = EXIT_FAILURE; /* replace 133 with the general failure code */
+ *ret = r;
+ return 0; /* finito */
+ }
+
+ /* CONTAINER_REBOOTED, loop again */
+
+ if (arg_keep_unit) {
+ /* Special handling if we are running as a service: instead of simply
+ * restarting the machine we want to restart the entire service, so let's
+ * inform systemd about this with the special exit code 133. The service
+ * file uses RestartForceExitStatus=133 so that this results in a full
+ * nspawn restart. This is necessary since we might have cgroup parameters
+ * set we want to have flushed out. */
+ *ret = 0;
+ return 133;
+ }
+
+ expose_port_flush(arg_expose_ports, exposed);
+
+ (void) remove_veth_links(veth_name, arg_network_veth_extra);
+ *veth_created = false;
+ return 1; /* loop again */
+}
+
int main(int argc, char *argv[]) {
- _cleanup_free_ char *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL, *console = NULL;
+ _cleanup_free_ char *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL, *esp_device = NULL, *console = NULL;
bool root_device_rw = true, home_device_rw = true, srv_device_rw = true;
_cleanup_close_ int master = -1, image_fd = -1;
_cleanup_fdset_free_ FDSet *fds = NULL;
- int r, n_fd_passed, loop_nr = -1;
+ int r, n_fd_passed, loop_nr = -1, ret = EXIT_SUCCESS;
char veth_name[IFNAMSIZ] = "";
bool secondary = false, remove_subvol = false;
- sigset_t mask_chld;
pid_t pid = 0;
- int ret = EXIT_SUCCESS;
union in_addr_union exposed = {};
_cleanup_release_lock_file_ LockFile tree_global_lock = LOCK_FILE_INIT, tree_local_lock = LOCK_FILE_INIT;
bool interactive, veth_created = false;
@@ -3627,6 +4199,7 @@ int main(int argc, char *argv[]) {
&root_device, &root_device_rw,
&home_device, &home_device_rw,
&srv_device, &srv_device_rw,
+ &esp_device,
&secondary);
if (r < 0)
goto finish;
@@ -3669,469 +4242,25 @@ int main(int argc, char *argv[]) {
assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1) >= 0);
- assert_se(sigemptyset(&mask_chld) == 0);
- assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
-
if (prctl(PR_SET_CHILD_SUBREAPER, 1) < 0) {
r = log_error_errno(errno, "Failed to become subreaper: %m");
goto finish;
}
for (;;) {
- static const struct sigaction sa = {
- .sa_handler = nop_signal_handler,
- .sa_flags = SA_NOCLDSTOP,
- };
-
- _cleanup_release_lock_file_ LockFile uid_shift_lock = LOCK_FILE_INIT;
- _cleanup_close_ int etc_passwd_lock = -1;
- _cleanup_close_pair_ int
- kmsg_socket_pair[2] = { -1, -1 },
- rtnl_socket_pair[2] = { -1, -1 },
- pid_socket_pair[2] = { -1, -1 },
- uuid_socket_pair[2] = { -1, -1 },
- notify_socket_pair[2] = { -1, -1 },
- uid_shift_socket_pair[2] = { -1, -1 };
- _cleanup_close_ int notify_socket= -1;
- _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
- _cleanup_(sd_event_unrefp) sd_event *event = NULL;
- _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
- _cleanup_(sd_netlink_unrefp) sd_netlink *rtnl = NULL;
- ContainerStatus container_status;
- char last_char = 0;
- int ifi = 0;
- ssize_t l;
-
- if (arg_userns_mode == USER_NAMESPACE_PICK) {
- /* When we shall pick the UID/GID range, let's first lock /etc/passwd, so that we can safely
- * check with getpwuid() if the specific user already exists. Note that /etc might be
- * read-only, in which case this will fail with EROFS. But that's really OK, as in that case we
- * can be reasonably sure that no users are going to be added. Note that getpwuid() checks are
- * really just an extra safety net. We kinda assume that the UID range we allocate from is
- * really ours. */
-
- etc_passwd_lock = take_etc_passwd_lock(NULL);
- if (etc_passwd_lock < 0 && etc_passwd_lock != -EROFS) {
- log_error_errno(r, "Failed to take /etc/passwd lock: %m");
- goto finish;
- }
- }
-
- r = barrier_create(&barrier);
- if (r < 0) {
- log_error_errno(r, "Cannot initialize IPC barrier: %m");
- goto finish;
- }
-
- if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
- r = log_error_errno(errno, "Failed to create kmsg socket pair: %m");
- goto finish;
- }
-
- if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, rtnl_socket_pair) < 0) {
- r = log_error_errno(errno, "Failed to create rtnl socket pair: %m");
- goto finish;
- }
-
- if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, pid_socket_pair) < 0) {
- r = log_error_errno(errno, "Failed to create pid socket pair: %m");
- goto finish;
- }
-
- if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, uuid_socket_pair) < 0) {
- r = log_error_errno(errno, "Failed to create id socket pair: %m");
- goto finish;
- }
-
- if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, notify_socket_pair) < 0) {
- r = log_error_errno(errno, "Failed to create notify socket pair: %m");
- goto finish;
- }
-
- if (arg_userns_mode != USER_NAMESPACE_NO)
- if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, uid_shift_socket_pair) < 0) {
- r = log_error_errno(errno, "Failed to create uid shift socket pair: %m");
- goto finish;
- }
-
- /* Child can be killed before execv(), so handle SIGCHLD
- * in order to interrupt parent's blocking calls and
- * give it a chance to call wait() and terminate. */
- r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
- if (r < 0) {
- r = log_error_errno(errno, "Failed to change the signal mask: %m");
- goto finish;
- }
-
- r = sigaction(SIGCHLD, &sa, NULL);
- if (r < 0) {
- r = log_error_errno(errno, "Failed to install SIGCHLD handler: %m");
- goto finish;
- }
-
- pid = raw_clone(SIGCHLD|CLONE_NEWNS);
- if (pid < 0) {
- if (errno == EINVAL)
- r = log_error_errno(errno, "clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
- else
- r = log_error_errno(errno, "clone() failed: %m");
-
- goto finish;
- }
-
- if (pid == 0) {
- /* The outer child only has a file system namespace. */
- barrier_set_role(&barrier, BARRIER_CHILD);
-
- master = safe_close(master);
-
- kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
- rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
- pid_socket_pair[0] = safe_close(pid_socket_pair[0]);
- uuid_socket_pair[0] = safe_close(uuid_socket_pair[0]);
- notify_socket_pair[0] = safe_close(notify_socket_pair[0]);
- uid_shift_socket_pair[0] = safe_close(uid_shift_socket_pair[0]);
-
- (void) reset_all_signal_handlers();
- (void) reset_signal_mask();
-
- r = outer_child(&barrier,
- arg_directory,
- console,
- root_device, root_device_rw,
- home_device, home_device_rw,
- srv_device, srv_device_rw,
- interactive,
- secondary,
- pid_socket_pair[1],
- uuid_socket_pair[1],
- notify_socket_pair[1],
- kmsg_socket_pair[1],
- rtnl_socket_pair[1],
- uid_shift_socket_pair[1],
- fds);
- if (r < 0)
- _exit(EXIT_FAILURE);
-
- _exit(EXIT_SUCCESS);
- }
-
- barrier_set_role(&barrier, BARRIER_PARENT);
-
- fds = fdset_free(fds);
-
- kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
- rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
- pid_socket_pair[1] = safe_close(pid_socket_pair[1]);
- uuid_socket_pair[1] = safe_close(uuid_socket_pair[1]);
- notify_socket_pair[1] = safe_close(notify_socket_pair[1]);
- uid_shift_socket_pair[1] = safe_close(uid_shift_socket_pair[1]);
-
- if (arg_userns_mode != USER_NAMESPACE_NO) {
- /* The child just let us know the UID shift it might have read from the image. */
- l = recv(uid_shift_socket_pair[0], &arg_uid_shift, sizeof(arg_uid_shift), 0);
- if (l < 0) {
- r = log_error_errno(errno, "Failed to read UID shift: %m");
- goto finish;
- }
- if (l != sizeof(arg_uid_shift)) {
- log_error("Short read while reading UID shift.");
- r = EIO;
- goto finish;
- }
-
- if (arg_userns_mode == USER_NAMESPACE_PICK) {
- /* If we are supposed to pick the UID shift, let's try to use the shift read from the
- * image, but if that's already in use, pick a new one, and report back to the child,
- * which one we now picked. */
-
- r = uid_shift_pick(&arg_uid_shift, &uid_shift_lock);
- if (r < 0) {
- log_error_errno(r, "Failed to pick suitable UID/GID range: %m");
- goto finish;
- }
-
- l = send(uid_shift_socket_pair[0], &arg_uid_shift, sizeof(arg_uid_shift), MSG_NOSIGNAL);
- if (l < 0) {
- r = log_error_errno(errno, "Failed to send UID shift: %m");
- goto finish;
- }
- if (l != sizeof(arg_uid_shift)) {
- log_error("Short write while writing UID shift.");
- r = -EIO;
- goto finish;
- }
- }
- }
-
- /* Wait for the outer child. */
- r = wait_for_terminate_and_warn("namespace helper", pid, NULL);
- if (r < 0)
- goto finish;
- if (r != 0) {
- r = -EIO;
- goto finish;
- }
- pid = 0;
-
- /* And now retrieve the PID of the inner child. */
- l = recv(pid_socket_pair[0], &pid, sizeof(pid), 0);
- if (l < 0) {
- r = log_error_errno(errno, "Failed to read inner child PID: %m");
- goto finish;
- }
- if (l != sizeof(pid)) {
- log_error("Short read while reading inner child PID.");
- r = EIO;
- goto finish;
- }
-
- /* We also retrieve container UUID in case it was generated by outer child */
- l = recv(uuid_socket_pair[0], &arg_uuid, sizeof(arg_uuid), 0);
- if (l < 0) {
- r = log_error_errno(errno, "Failed to read container machine ID: %m");
- goto finish;
- }
- if (l != sizeof(arg_uuid)) {
- log_error("Short read while reading container machined ID.");
- r = EIO;
- goto finish;
- }
-
- /* We also retrieve the socket used for notifications generated by outer child */
- notify_socket = receive_one_fd(notify_socket_pair[0], 0);
- if (notify_socket < 0) {
- r = log_error_errno(errno, "Failed to receive notification socket from the outer child: %m");
- goto finish;
- }
-
- log_debug("Init process invoked as PID " PID_FMT, pid);
-
- if (arg_userns_mode != USER_NAMESPACE_NO) {
- if (!barrier_place_and_sync(&barrier)) { /* #1 */
- log_error("Child died too early.");
- r = -ESRCH;
- goto finish;
- }
-
- r = setup_uid_map(pid);
- if (r < 0)
- goto finish;
-
- (void) barrier_place(&barrier); /* #2 */
- }
-
- if (arg_private_network) {
-
- r = move_network_interfaces(pid, arg_network_interfaces);
- if (r < 0)
- goto finish;
-
- if (arg_network_veth) {
- r = setup_veth(arg_machine, pid, veth_name,
- arg_network_bridge || arg_network_zone);
- if (r < 0)
- goto finish;
- else if (r > 0)
- ifi = r;
-
- if (arg_network_bridge) {
- /* Add the interface to a bridge */
- r = setup_bridge(veth_name, arg_network_bridge, false);
- if (r < 0)
- goto finish;
- if (r > 0)
- ifi = r;
- } else if (arg_network_zone) {
- /* Add the interface to a bridge, possibly creating it */
- r = setup_bridge(veth_name, arg_network_zone, true);
- if (r < 0)
- goto finish;
- if (r > 0)
- ifi = r;
- }
- }
-
- r = setup_veth_extra(arg_machine, pid, arg_network_veth_extra);
- if (r < 0)
- goto finish;
-
- /* We created the primary and extra veth links now; let's remember this, so that we know to
- remove them later on. Note that we don't bother with removing veth links that were created
- here when their setup failed half-way, because in that case the kernel should be able to
- remove them on its own, since they cannot be referenced by anything yet. */
- veth_created = true;
-
- r = setup_macvlan(arg_machine, pid, arg_network_macvlan);
- if (r < 0)
- goto finish;
-
- r = setup_ipvlan(arg_machine, pid, arg_network_ipvlan);
- if (r < 0)
- goto finish;
- }
-
- if (arg_register) {
- r = register_machine(
- arg_machine,
- pid,
- arg_directory,
- arg_uuid,
- ifi,
- arg_slice,
- arg_custom_mounts, arg_n_custom_mounts,
- arg_kill_signal,
- arg_property,
- arg_keep_unit,
- arg_container_service_name);
- if (r < 0)
- goto finish;
- }
-
- r = sync_cgroup(pid, arg_unified_cgroup_hierarchy);
- if (r < 0)
- goto finish;
-
- if (arg_keep_unit) {
- r = create_subcgroup(pid, arg_unified_cgroup_hierarchy);
- if (r < 0)
- goto finish;
- }
-
- r = chown_cgroup(pid, arg_uid_shift);
- if (r < 0)
- goto finish;
-
- /* Notify the child that the parent is ready with all
- * its setup (including cgroup-ification), and that
- * the child can now hand over control to the code to
- * run inside the container. */
- (void) barrier_place(&barrier); /* #3 */
-
- /* Block SIGCHLD here, before notifying child.
- * process_pty() will handle it with the other signals. */
- assert_se(sigprocmask(SIG_BLOCK, &mask_chld, NULL) >= 0);
-
- /* Reset signal to default */
- r = default_signals(SIGCHLD, -1);
- if (r < 0) {
- log_error_errno(r, "Failed to reset SIGCHLD: %m");
- goto finish;
- }
-
- r = sd_event_new(&event);
- if (r < 0) {
- log_error_errno(r, "Failed to get default event source: %m");
- goto finish;
- }
-
- r = setup_sd_notify_parent(event, notify_socket, PID_TO_PTR(pid));
- if (r < 0)
- goto finish;
-
- /* Let the child know that we are ready and wait that the child is completely ready now. */
- if (!barrier_place_and_sync(&barrier)) { /* #4 */
- log_error("Child died too early.");
- r = -ESRCH;
- goto finish;
- }
-
- /* At this point we have made use of the UID we picked, and thus nss-mymachines will make them appear
- * in getpwuid(), thus we can release the /etc/passwd lock. */
- etc_passwd_lock = safe_close(etc_passwd_lock);
-
- sd_notifyf(false,
- "STATUS=Container running.\n"
- "X_NSPAWN_LEADER_PID=" PID_FMT, pid);
- if (!arg_notify_ready)
- sd_notify(false, "READY=1\n");
-
- if (arg_kill_signal > 0) {
- /* Try to kill the init system on SIGINT or SIGTERM */
- sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, PID_TO_PTR(pid));
- sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, PID_TO_PTR(pid));
- } else {
- /* Immediately exit */
- sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
- sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
- }
-
- /* simply exit on sigchld */
- sd_event_add_signal(event, NULL, SIGCHLD, NULL, NULL);
-
- if (arg_expose_ports) {
- r = expose_port_watch_rtnl(event, rtnl_socket_pair[0], on_address_change, &exposed, &rtnl);
- if (r < 0)
- goto finish;
-
- (void) expose_port_execute(rtnl, arg_expose_ports, &exposed);
- }
-
- rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
-
- r = pty_forward_new(event, master, PTY_FORWARD_IGNORE_VHANGUP | (interactive ? 0 : PTY_FORWARD_READ_ONLY), &forward);
- if (r < 0) {
- log_error_errno(r, "Failed to create PTY forwarder: %m");
- goto finish;
- }
-
- r = sd_event_loop(event);
- if (r < 0) {
- log_error_errno(r, "Failed to run event loop: %m");
- goto finish;
- }
-
- pty_forward_get_last_char(forward, &last_char);
-
- forward = pty_forward_free(forward);
-
- if (!arg_quiet && last_char != '\n')
- putc('\n', stdout);
-
- /* Kill if it is not dead yet anyway */
- if (arg_register && !arg_keep_unit)
- terminate_machine(pid);
-
- /* Normally redundant, but better safe than sorry */
- kill(pid, SIGKILL);
-
- r = wait_for_container(pid, &container_status);
- pid = 0;
-
- if (r < 0)
- /* We failed to wait for the container, or the
- * container exited abnormally */
- goto finish;
- else if (r > 0 || container_status == CONTAINER_TERMINATED) {
- /* The container exited with a non-zero
- * status, or with zero status and no reboot
- * was requested. */
- ret = r;
+ r = run(master,
+ console,
+ root_device, root_device_rw,
+ home_device, home_device_rw,
+ srv_device, srv_device_rw,
+ esp_device,
+ interactive, secondary,
+ fds,
+ veth_name, &veth_created,
+ &exposed,
+ &pid, &ret);
+ if (r <= 0)
break;
- }
-
- /* CONTAINER_REBOOTED, loop again */
-
- if (arg_keep_unit) {
- /* Special handling if we are running as a
- * service: instead of simply restarting the
- * machine we want to restart the entire
- * service, so let's inform systemd about this
- * with the special exit code 133. The service
- * file uses RestartForceExitStatus=133 so
- * that this results in a full nspawn
- * restart. This is necessary since we might
- * have cgroup parameters set we want to have
- * flushed out. */
- ret = 133;
- r = 0;
- break;
- }
-
- expose_port_flush(arg_expose_ports, &exposed);
-
- (void) remove_veth_links(veth_name, arg_network_veth_extra);
- veth_created = false;
}
finish:
diff --git a/src/systemd-nspawn/systemd-nspawn.xml b/src/systemd-nspawn/systemd-nspawn.xml
index 69d2f6ff7d..c449edee89 100644
--- a/src/systemd-nspawn/systemd-nspawn.xml
+++ b/src/systemd-nspawn/systemd-nspawn.xml
@@ -274,8 +274,7 @@
signals. It is recommended to use this mode to invoke arbitrary commands in containers, unless they have been
modified to run correctly as PID 1. Or in other words: this switch should be used for pretty much all commands,
except when the command refers to an init or shell implementation, as these are generally capable of running
- correctly as PID 1. This option may not be combined with <option>--boot</option> or
- <option>--share-system</option>.</para>
+ correctly as PID 1. This option may not be combined with <option>--boot</option>.</para>
</listitem>
</varlistentry>
@@ -285,8 +284,7 @@
<listitem><para>Automatically search for an init binary and invoke it as PID 1, instead of a shell or a user
supplied program. If this option is used, arguments specified on the command line are used as arguments for the
- init binary. This option may not be combined with <option>--as-pid2</option> or
- <option>--share-system</option>.</para>
+ init binary. This option may not be combined with <option>--as-pid2</option>.</para>
<para>The following table explains the different modes of invocation and relationship to
<option>--as-pid2</option> (see above):</para>
@@ -407,41 +405,42 @@
purposes (usually in the range beyond the host's UID/GID 65536). The parameter may be specified as follows:</para>
<orderedlist>
- <listitem><para>The value <literal>no</literal> turns off user namespacing. This is the default.</para></listitem>
-
- <listitem><para>The value <literal>yes</literal> (or the omission of a parameter) turns on user
- namespacing. The UID/GID range to use is determined automatically from the file ownership of the root
- directory of the container's directory tree. To use this option, make sure to prepare the directory tree in
- advance, and ensure that all files and directories in it are owned by UIDs/GIDs in the range you'd like to
- use. Also, make sure that used file ACLs exclusively reference UIDs/GIDs in the appropriate range. If this
- mode is used the number of UIDs/GIDs assigned to the container for use is 65536, and the UID/GID of the
- root directory must be a multiple of 65536.</para></listitem>
-
- <listitem><para>The value "pick" turns on user namespacing. In this case the UID/GID range is automatically
- chosen. As first step, the file owner of the root directory of the container's directory tree is read, and it
- is checked that it is currently not used by the system otherwise (in particular, that no other container is
- using it). If this check is successful, the UID/GID range determined this way is used, similar to the
- behaviour if "yes" is specified. If the check is not successful (and thus the UID/GID range indicated in the
- root directory's file owner is already used elsewhere) a new – currently unused – UID/GID range of 65536
- UIDs/GIDs is randomly chosen between the host UID/GIDs of 524288 and 1878982656, always starting at a
- multiple of 65536. This setting implies <option>--private-users-chown</option> (see below), which has the
- effect that the files and directories in the container's directory tree will be owned by the appropriate
- users of the range picked. Using this option makes user namespace behaviour fully automatic. Note that the
- first invocation of a previously unused container image might result in picking a new UID/GID range for it,
- and thus in the (possibly expensive) file ownership adjustment operation. However, subsequent invocations of
- the container will be cheap (unless of course the picked UID/GID range is assigned to a different use by
- then).</para></listitem>
-
- <listitem><para>Finally if one or two colon-separated numeric parameters are specified, user namespacing is
- turned on, too. The first parameter specifies the first host UID/GID to assign to the container, the second
- parameter specifies the number of host UIDs/GIDs to assign to the container. If the second parameter is
- omitted, 65536 UIDs/GIDs are assigned.</para></listitem>
+ <listitem><para>If one or two colon-separated numbers are specified, user namespacing is turned on. The first
+ parameter specifies the first host UID/GID to assign to the container, the second parameter specifies the
+ number of host UIDs/GIDs to assign to the container. If the second parameter is omitted, 65536 UIDs/GIDs are
+ assigned.</para></listitem>
+
+ <listitem><para>If the parameter is omitted, or true, user namespacing is turned on. The UID/GID range to
+ use is determined automatically from the file ownership of the root directory of the container's directory
+ tree. To use this option, make sure to prepare the directory tree in advance, and ensure that all files and
+ directories in it are owned by UIDs/GIDs in the range you'd like to use. Also, make sure that used file ACLs
+ exclusively reference UIDs/GIDs in the appropriate range. If this mode is used the number of UIDs/GIDs
+ assigned to the container for use is 65536, and the UID/GID of the root directory must be a multiple of
+ 65536.</para></listitem>
+
+ <listitem><para>If the parameter is false, user namespacing is turned off. This is the default.</para>
+ </listitem>
+
+ <listitem><para>The special value <literal>pick</literal> turns on user namespacing. In this case the UID/GID
+ range is automatically chosen. As first step, the file owner of the root directory of the container's
+ directory tree is read, and it is checked that it is currently not used by the system otherwise (in
+ particular, that no other container is using it). If this check is successful, the UID/GID range determined
+ this way is used, similar to the behavior if "yes" is specified. If the check is not successful (and thus
+ the UID/GID range indicated in the root directory's file owner is already used elsewhere) a new – currently
+ unused – UID/GID range of 65536 UIDs/GIDs is randomly chosen between the host UID/GIDs of 524288 and
+ 1878982656, always starting at a multiple of 65536. This setting implies
+ <option>--private-users-chown</option> (see below), which has the effect that the files and directories in
+ the container's directory tree will be owned by the appropriate users of the range picked. Using this option
+ makes user namespace behavior fully automatic. Note that the first invocation of a previously unused
+ container image might result in picking a new UID/GID range for it, and thus in the (possibly expensive) file
+ ownership adjustment operation. However, subsequent invocations of the container will be cheap (unless of
+ course the picked UID/GID range is assigned to a different use by then).</para></listitem>
</orderedlist>
<para>It is recommended to assign at least 65536 UIDs/GIDs to each container, so that the usable UID/GID range in the
container covers 16 bit. For best security, do not assign overlapping UID/GID ranges to multiple containers. It is
hence a good idea to use the upper 16 bit of the host 32-bit UIDs/GIDs as container identifier, while the lower 16
- bit encode the container UID/GID used. This is in fact the behaviour enforced by the
+ bit encode the container UID/GID used. This is in fact the behavior enforced by the
<option>--private-users=pick</option> option.</para>
<para>When user namespaces are used, the GID range assigned to each container is always chosen identical to the
@@ -456,17 +455,6 @@
</varlistentry>
<varlistentry>
- <term><option>-U</option></term>
-
- <listitem><para>If the kernel supports the user namespaces feature, equivalent to
- <option>--private-users=pick</option>, otherwise equivalent to
- <option>--private-users=no</option>.</para>
-
- <para>Note that <option>-U</option> is the default if the <filename>systemd-nspawn@.service</filename> template unit
- file is used.</para></listitem>
- </varlistentry>
-
- <varlistentry>
<term><option>--private-users-chown</option></term>
<listitem><para>If specified, all files and directories in the container's directory tree will adjusted so that
@@ -479,6 +467,23 @@
</varlistentry>
<varlistentry>
+ <term><option>-U</option></term>
+
+ <listitem><para>If the kernel supports the user namespaces feature, equivalent to
+ <option>--private-users=pick --private-users-chown</option>, otherwise equivalent to
+ <option>--private-users=no</option>.</para>
+
+ <para>Note that <option>-U</option> is the default if the
+ <filename>systemd-nspawn@.service</filename> template unit file is used.</para>
+
+ <para>Note: it is possible to undo the effect of <option>--private-users-chown</option> (or
+ <option>-U</option>) on the file system by redoing the operation with the first UID of 0:</para>
+
+ <programlisting>systemd-nspawn … --private-users=0 --private-users-chown</programlisting>
+ </listitem>
+ </varlistentry>
+
+ <varlistentry>
<term><option>--private-network</option></term>
<listitem><para>Disconnect networking of the container from
@@ -717,7 +722,7 @@
and the subdirectory is symlinked into the host at the same
location. <literal>try-host</literal> and
<literal>try-guest</literal> do the same but do not fail if
- the host does not have persistent journalling enabled. If
+ the host does not have persistent journaling enabled. If
<literal>auto</literal> (the default), and the right
subdirectory of <filename>/var/log/journal</filename> exists,
it will be bind mounted into the container. If the
@@ -847,23 +852,6 @@
</varlistentry>
<varlistentry>
- <term><option>--share-system</option></term>
-
- <listitem><para>Allows the container to share certain system
- facilities with the host. More specifically, this turns off
- PID namespacing, UTS namespacing and IPC namespacing, and thus
- allows the guest to see and interact more easily with
- processes outside of the container. Note that using this
- option makes it impossible to start up a full Operating System
- in the container, as an init system cannot operate in this
- mode. It is only useful to run specific programs or
- applications this way, without involving an init system in the
- container. This option implies <option>--register=no</option>.
- This option may not be combined with
- <option>--boot</option>.</para></listitem>
- </varlistentry>
-
- <varlistentry>
<term><option>--register=</option></term>
<listitem><para>Controls whether the container is registered
@@ -877,9 +865,7 @@
and shown by tools such as
<citerefentry project='man-pages'><refentrytitle>ps</refentrytitle><manvolnum>1</manvolnum></citerefentry>.
If the container does not run an init system, it is
- recommended to set this option to <literal>no</literal>. Note
- that <option>--share-system</option> implies
- <option>--register=no</option>. </para></listitem>
+ recommended to set this option to <literal>no</literal>.</para></listitem>
</varlistentry>
<varlistentry>
@@ -1037,9 +1023,9 @@
</example>
<example>
- <title>Spawn a shell in a container of a minimal gNewSense unstable distribution</title>
+ <title>Spawn a shell in a container of a minimal gNewSense Ucclia distribution</title>
- <programlisting># debootstrap --arch=amd64 unstable ~/gnewsense-tree/
+ <programlisting># debootstrap --arch=amd64 ucclia ~/gnewsense-tree/
# systemd-nspawn -D ~/gnewsense-tree/</programlisting>
<para>This installs a minimal gNewSense unstable distribution into
@@ -1048,12 +1034,12 @@
</example>
<example>
- <title>Boot a minimal Parabola GNU/Linux-libre distribution in a container</title>
+ <title>Boot a minimal Parabola distribution in a container</title>
<programlisting># pacstrap -c -d ~/parabola-tree/ base
# systemd-nspawn -bD ~/parabola-tree/</programlisting>
- <para>This installs a minimal Parabola GNU/Linux-libre distribution into the
+ <para>This installs a minimal Parabola distribution into the
directory <filename>~/parabola-tree/</filename> and then boots an OS
in a namespace container in it.</para>
</example>