summaryrefslogtreecommitdiff
path: root/src/nspawn/nspawn.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/nspawn/nspawn.c')
-rw-r--r--src/nspawn/nspawn.c451
1 files changed, 356 insertions, 95 deletions
diff --git a/src/nspawn/nspawn.c b/src/nspawn/nspawn.c
index d1c65e8b0b..fcf14bba4c 100644
--- a/src/nspawn/nspawn.c
+++ b/src/nspawn/nspawn.c
@@ -61,9 +61,9 @@
#include "fs-util.h"
#include "gpt.h"
#include "hostname-util.h"
+#include "id128-util.h"
#include "log.h"
#include "loopback-setup.h"
-#include "machine-id-setup.h"
#include "machine-image.h"
#include "macro.h"
#include "missing.h"
@@ -76,10 +76,10 @@
#include "nspawn-network.h"
#include "nspawn-patch-uid.h"
#include "nspawn-register.h"
+#include "nspawn-seccomp.h"
#include "nspawn-settings.h"
#include "nspawn-setuid.h"
#include "nspawn-stub-pid1.h"
-#include "nspawn-seccomp.h"
#include "parse-util.h"
#include "path-util.h"
#include "process-util.h"
@@ -101,10 +101,16 @@
#include "util.h"
/* Note that devpts's gid= parameter parses GIDs as signed values, hence we stay away from the upper half of the 32bit
- * UID range here */
+ * UID range here. We leave a bit of room at the lower end and a lot of room at the upper end, so that other subsystems
+ * may have their own allocation ranges too. */
#define UID_SHIFT_PICK_MIN ((uid_t) UINT32_C(0x00080000))
#define UID_SHIFT_PICK_MAX ((uid_t) UINT32_C(0x6FFF0000))
+/* nspawn is listening on the socket at the path in the constant nspawn_notify_socket_path
+ * nspawn_notify_socket_path is relative to the container
+ * the init process in the container pid can send messages to nspawn following the sd_notify(3) protocol */
+#define NSPAWN_NOTIFY_SOCKET_PATH "/run/systemd/nspawn/notify"
+
typedef enum ContainerStatus {
CONTAINER_TERMINATED,
CONTAINER_REBOOTED
@@ -133,6 +139,8 @@ static bool arg_ephemeral = false;
static LinkJournal arg_link_journal = LINK_AUTO;
static bool arg_link_journal_try = false;
static uint64_t arg_caps_retain =
+ (1ULL << CAP_AUDIT_CONTROL) |
+ (1ULL << CAP_AUDIT_WRITE) |
(1ULL << CAP_CHOWN) |
(1ULL << CAP_DAC_OVERRIDE) |
(1ULL << CAP_DAC_READ_SEARCH) |
@@ -142,23 +150,21 @@ static uint64_t arg_caps_retain =
(1ULL << CAP_KILL) |
(1ULL << CAP_LEASE) |
(1ULL << CAP_LINUX_IMMUTABLE) |
+ (1ULL << CAP_MKNOD) |
(1ULL << CAP_NET_BIND_SERVICE) |
(1ULL << CAP_NET_BROADCAST) |
(1ULL << CAP_NET_RAW) |
- (1ULL << CAP_SETGID) |
(1ULL << CAP_SETFCAP) |
+ (1ULL << CAP_SETGID) |
(1ULL << CAP_SETPCAP) |
(1ULL << CAP_SETUID) |
(1ULL << CAP_SYS_ADMIN) |
+ (1ULL << CAP_SYS_BOOT) |
(1ULL << CAP_SYS_CHROOT) |
(1ULL << CAP_SYS_NICE) |
(1ULL << CAP_SYS_PTRACE) |
- (1ULL << CAP_SYS_TTY_CONFIG) |
(1ULL << CAP_SYS_RESOURCE) |
- (1ULL << CAP_SYS_BOOT) |
- (1ULL << CAP_AUDIT_WRITE) |
- (1ULL << CAP_AUDIT_CONTROL) |
- (1ULL << CAP_MKNOD);
+ (1ULL << CAP_SYS_TTY_CONFIG);
static CustomMount *arg_custom_mounts = NULL;
static unsigned arg_n_custom_mounts = 0;
static char **arg_setenv = NULL;
@@ -187,6 +193,8 @@ static SettingsMask arg_settings_mask = 0;
static int arg_settings_trusted = -1;
static char **arg_parameters = NULL;
static const char *arg_container_service_name = "systemd-nspawn";
+static bool arg_notify_ready = false;
+static bool arg_use_cgns = true;
static void help(void) {
printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
@@ -208,10 +216,10 @@ static void help(void) {
" --uuid=UUID Set a specific machine UUID for the container\n"
" -S --slice=SLICE Place the container in the specified slice\n"
" --property=NAME=VALUE Set scope unit property\n"
- " -U --private-users=pick Run within user namespace, pick UID/GID range automatically\n"
+ " -U --private-users=pick Run within user namespace, autoselect UID/GID range\n"
" --private-users[=UIDBASE[:NUIDS]]\n"
- " Run within user namespace, user configured UID/GID range\n"
- " --private-user-chown Adjust OS tree file ownership for private UID/GID range\n"
+ " Similar, but with user configured UID/GID range\n"
+ " --private-user-chown Adjust OS tree ownership to private UID/GID range\n"
" --private-network Disable network in container\n"
" --network-interface=INTERFACE\n"
" Assign an existing network interface to the\n"
@@ -228,11 +236,10 @@ static void help(void) {
" Add an additional virtual Ethernet link between\n"
" host and container\n"
" --network-bridge=INTERFACE\n"
- " Add a virtual Ethernet connection between host\n"
- " and container and add it to an existing bridge on\n"
- " the host\n"
- " --network-zone=NAME Add a virtual Ethernet connection to the container,\n"
- " and add it to an automatically managed bridge interface\n"
+ " Add a virtual Ethernet connection to the container\n"
+ " and attach it to an existing bridge on the host\n"
+ " --network-zone=NAME Similar, but attach the new interface to an\n"
+ " an automatically managed bridge interface\n"
" -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
" Expose a container IP port on the host\n"
" -Z --selinux-context=SECLABEL\n"
@@ -261,16 +268,15 @@ static void help(void) {
" --overlay-ro=PATH[:PATH...]:PATH\n"
" Similar, but creates a read-only overlay mount\n"
" -E --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
- " --share-system Share system namespaces with host\n"
" --register=BOOLEAN Register container as machine\n"
" --keep-unit Do not register a scope for the machine, reuse\n"
" the service unit nspawn is running in\n"
" --volatile[=MODE] Run the system in volatile mode\n"
" --settings=BOOLEAN Load additional settings from .nspawn file\n"
+ " --notify-ready=BOOLEAN Receive notifications from the child init process\n"
, program_invocation_short_name);
}
-
static int custom_mounts_prepare(void) {
unsigned i;
int r;
@@ -367,6 +373,7 @@ static int parse_argv(int argc, char *argv[]) {
ARG_SETTINGS,
ARG_CHDIR,
ARG_PRIVATE_USERS_CHOWN,
+ ARG_NOTIFY_READY,
};
static const struct option options[] = {
@@ -395,7 +402,7 @@ static int parse_argv(int argc, char *argv[]) {
{ "selinux-context", required_argument, NULL, 'Z' },
{ "selinux-apifs-context", required_argument, NULL, 'L' },
{ "quiet", no_argument, NULL, 'q' },
- { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM },
+ { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM }, /* not documented */
{ "register", required_argument, NULL, ARG_REGISTER },
{ "keep-unit", no_argument, NULL, ARG_KEEP_UNIT },
{ "network-interface", required_argument, NULL, ARG_NETWORK_INTERFACE },
@@ -415,6 +422,7 @@ static int parse_argv(int argc, char *argv[]) {
{ "kill-signal", required_argument, NULL, ARG_KILL_SIGNAL },
{ "settings", required_argument, NULL, ARG_SETTINGS },
{ "chdir", required_argument, NULL, ARG_CHDIR },
+ { "notify-ready", required_argument, NULL, ARG_NOTIFY_READY },
{}
};
@@ -584,9 +592,12 @@ static int parse_argv(int argc, char *argv[]) {
case ARG_UUID:
r = sd_id128_from_string(optarg, &arg_uuid);
- if (r < 0) {
- log_error("Invalid UUID: %s", optarg);
- return r;
+ if (r < 0)
+ return log_error_errno(r, "Invalid UUID: %s", optarg);
+
+ if (sd_id128_is_null(arg_uuid)) {
+ log_error("Machine UUID may not be all zeroes.");
+ return -EINVAL;
}
arg_settings_mask |= SETTING_MACHINE_ID;
@@ -800,6 +811,8 @@ static int parse_argv(int argc, char *argv[]) {
break;
case ARG_SHARE_SYSTEM:
+ /* We don't officially support this anymore, except for compat reasons. People should use the
+ * $SYSTEMD_NSPAWN_SHARE_SYSTEM environment variable instead. */
arg_share_system = true;
break;
@@ -987,6 +1000,16 @@ static int parse_argv(int argc, char *argv[]) {
arg_settings_mask |= SETTING_WORKING_DIRECTORY;
break;
+ case ARG_NOTIFY_READY:
+ r = parse_boolean(optarg);
+ if (r < 0) {
+ log_error("%s is not a valid notify mode. Valid modes are: yes, no, and ready.", optarg);
+ return -EINVAL;
+ }
+ arg_notify_ready = r;
+ arg_settings_mask |= SETTING_NOTIFY_READY;
+ break;
+
case '?':
return -EINVAL;
@@ -994,6 +1017,9 @@ static int parse_argv(int argc, char *argv[]) {
assert_not_reached("Unhandled option");
}
+ if (getenv_bool("SYSTEMD_NSPAWN_SHARE_SYSTEM") > 0)
+ arg_share_system = true;
+
if (arg_share_system)
arg_register = false;
@@ -1001,7 +1027,7 @@ static int parse_argv(int argc, char *argv[]) {
arg_userns_chown = true;
if (arg_start_mode != START_PID1 && arg_share_system) {
- log_error("--boot and --share-system may not be combined.");
+ log_error("--boot and SYSTEMD_NSPAWN_SHARE_SYSTEM=1 may not be combined.");
return -EINVAL;
}
@@ -1081,6 +1107,12 @@ static int parse_argv(int argc, char *argv[]) {
if (e)
arg_container_service_name = e;
+ r = getenv_bool("SYSTEMD_NSPAWN_USE_CGNS");
+ if (r < 0)
+ arg_use_cgns = cg_ns_supported();
+ else
+ arg_use_cgns = r;
+
return 1;
}
@@ -1224,42 +1256,46 @@ static int setup_resolv_conf(const char *dest) {
/* Fix resolv.conf, if possible */
where = prefix_roota(dest, "/etc/resolv.conf");
+ if (access("/usr/lib/systemd/resolv.conf", F_OK) >= 0) {
+ /* resolved is enabled on the host. In this, case bind mount its static resolv.conf file into the
+ * container, so that the container can use the host's resolver. Given that network namespacing is
+ * disabled it's only natural of the container also uses the host's resolver. It also has the big
+ * advantage that the container will be able to follow the host's DNS server configuration changes
+ * transparently. */
+
+ if (mount("/usr/lib/systemd/resolv.conf", where, NULL, MS_BIND, NULL) < 0)
+ log_warning_errno(errno, "Failed to mount /etc/resolv.conf in the container, ignoring: %m");
+ else {
+ if (mount(NULL, where, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL) < 0)
+ return log_error_errno(errno, "Failed to remount /etc/resolv.conf read-only: %m");
+
+ return 0;
+ }
+ }
+
+ /* If that didn't work, let's copy the file */
r = copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW, 0644, 0);
if (r < 0) {
- /* If the file already exists as symlink, let's
- * suppress the warning, under the assumption that
- * resolved or something similar runs inside and the
- * symlink points there.
+ /* If the file already exists as symlink, let's suppress the warning, under the assumption that
+ * resolved or something similar runs inside and the symlink points there.
*
- * If the disk image is read-only, there's also no
- * point in complaining.
+ * If the disk image is read-only, there's also no point in complaining.
*/
log_full_errno(IN_SET(r, -ELOOP, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
- "Failed to copy /etc/resolv.conf to %s: %m", where);
+ "Failed to copy /etc/resolv.conf to %s, ignoring: %m", where);
return 0;
}
r = userns_lchown(where, 0, 0);
if (r < 0)
- log_warning_errno(r, "Failed to chown /etc/resolv.conf: %m");
+ log_warning_errno(r, "Failed to chown /etc/resolv.conf, ignoring: %m");
return 0;
}
-static char* id128_format_as_uuid(sd_id128_t id, char s[37]) {
- assert(s);
-
- snprintf(s, 37,
- "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
- SD_ID128_FORMAT_VAL(id));
-
- return s;
-}
-
static int setup_boot_id(const char *dest) {
+ sd_id128_t rnd = SD_ID128_NULL;
const char *from, *to;
- sd_id128_t rnd = {};
- char as_uuid[37];
int r;
if (arg_share_system)
@@ -1275,18 +1311,16 @@ static int setup_boot_id(const char *dest) {
if (r < 0)
return log_error_errno(r, "Failed to generate random boot id: %m");
- id128_format_as_uuid(rnd, as_uuid);
-
- r = write_string_file(from, as_uuid, WRITE_STRING_FILE_CREATE);
+ r = id128_write(from, ID128_UUID, rnd, false);
if (r < 0)
return log_error_errno(r, "Failed to write boot id: %m");
if (mount(from, to, NULL, MS_BIND, NULL) < 0)
r = log_error_errno(errno, "Failed to bind mount boot id: %m");
else if (mount(NULL, to, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL) < 0)
- log_warning_errno(errno, "Failed to make boot id read-only: %m");
+ r = log_error_errno(errno, "Failed to make boot id read-only: %m");
- unlink(from);
+ (void) unlink(from);
return r;
}
@@ -1784,17 +1818,18 @@ static int dissect_image(
char **root_device, bool *root_device_rw,
char **home_device, bool *home_device_rw,
char **srv_device, bool *srv_device_rw,
+ char **esp_device,
bool *secondary) {
#ifdef HAVE_BLKID
- int home_nr = -1, srv_nr = -1;
+ int home_nr = -1, srv_nr = -1, esp_nr = -1;
#ifdef GPT_ROOT_NATIVE
int root_nr = -1;
#endif
#ifdef GPT_ROOT_SECONDARY
int secondary_root_nr = -1;
#endif
- _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL, *generic = NULL;
+ _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL, *esp = NULL, *generic = NULL;
_cleanup_udev_enumerate_unref_ struct udev_enumerate *e = NULL;
_cleanup_udev_device_unref_ struct udev_device *d = NULL;
_cleanup_blkid_free_probe_ blkid_probe b = NULL;
@@ -1812,6 +1847,7 @@ static int dissect_image(
assert(root_device);
assert(home_device);
assert(srv_device);
+ assert(esp_device);
assert(secondary);
assert(arg_image);
@@ -2025,6 +2061,16 @@ static int dissect_image(
r = free_and_strdup(&srv, node);
if (r < 0)
return log_oom();
+ } else if (sd_id128_equal(type_id, GPT_ESP)) {
+
+ if (esp && nr >= esp_nr)
+ continue;
+
+ esp_nr = nr;
+
+ r = free_and_strdup(&esp, node);
+ if (r < 0)
+ return log_oom();
}
#ifdef GPT_ROOT_NATIVE
else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE)) {
@@ -2142,6 +2188,11 @@ static int dissect_image(
*srv_device_rw = srv_rw;
}
+ if (esp) {
+ *esp_device = esp;
+ esp = NULL;
+ }
+
return 0;
#else
log_error("--image= is not supported, compiled without blkid support.");
@@ -2212,33 +2263,37 @@ static int mount_device(const char *what, const char *where, const char *directo
}
static int setup_machine_id(const char *directory) {
+ const char *etc_machine_id;
+ sd_id128_t id;
int r;
- const char *etc_machine_id, *t;
- _cleanup_free_ char *s = NULL;
- etc_machine_id = prefix_roota(directory, "/etc/machine-id");
+ /* If the UUID in the container is already set, then that's what counts, and we use. If it isn't set, and the
+ * caller passed --uuid=, then we'll pass it in the $container_uuid env var to PID 1 of the container. The
+ * assumption is that PID 1 will then write it to /etc/machine-id to make it persistent. If --uuid= is not
+ * passed we generate a random UUID, and pass it via $container_uuid. In effect this means that /etc/machine-id
+ * in the container and our idea of the container UUID will always be in sync (at least if PID 1 in the
+ * container behaves nicely). */
- r = read_one_line_file(etc_machine_id, &s);
- if (r < 0)
- return log_error_errno(r, "Failed to read machine ID from %s: %m", etc_machine_id);
+ etc_machine_id = prefix_roota(directory, "/etc/machine-id");
- t = strstrip(s);
+ r = id128_read(etc_machine_id, ID128_PLAIN, &id);
+ if (r < 0) {
+ if (!IN_SET(r, -ENOENT, -ENOMEDIUM)) /* If the file is missing or empty, we don't mind */
+ return log_error_errno(r, "Failed to read machine ID from container image: %m");
- if (!isempty(t)) {
- r = sd_id128_from_string(t, &arg_uuid);
- if (r < 0)
- return log_error_errno(r, "Failed to parse machine ID from %s: %m", etc_machine_id);
- } else {
if (sd_id128_is_null(arg_uuid)) {
r = sd_id128_randomize(&arg_uuid);
if (r < 0)
- return log_error_errno(r, "Failed to generate random machine ID: %m");
+ return log_error_errno(r, "Failed to acquire randomized machine UUID: %m");
+ }
+ } else {
+ if (sd_id128_is_null(id)) {
+ log_error("Machine ID in container image is zero, refusing.");
+ return -EINVAL;
}
- }
- r = machine_id_setup(directory, arg_uuid);
- if (r < 0)
- return log_error_errno(r, "Failed to setup machine ID: %m");
+ arg_uuid = id;
+ }
return 0;
}
@@ -2270,7 +2325,8 @@ static int mount_devices(
const char *where,
const char *root_device, bool root_device_rw,
const char *home_device, bool home_device_rw,
- const char *srv_device, bool srv_device_rw) {
+ const char *srv_device, bool srv_device_rw,
+ const char *esp_device) {
int r;
assert(where);
@@ -2293,6 +2349,27 @@ static int mount_devices(
return log_error_errno(r, "Failed to mount server data directory: %m");
}
+ if (esp_device) {
+ const char *mp, *x;
+
+ /* Mount the ESP to /efi if it exists and is empty. If it doesn't exist, use /boot instead. */
+
+ mp = "/efi";
+ x = strjoina(arg_directory, mp);
+ r = dir_is_empty(x);
+ if (r == -ENOENT) {
+ mp = "/boot";
+ x = strjoina(arg_directory, mp);
+ r = dir_is_empty(x);
+ }
+
+ if (r > 0) {
+ r = mount_device(esp_device, arg_directory, mp, true);
+ if (r < 0)
+ return log_error_errno(r, "Failed to mount ESP: %m");
+ }
+ }
+
return 0;
}
@@ -2529,6 +2606,7 @@ static int inner_child(
NULL, /* container_uuid */
NULL, /* LISTEN_FDS */
NULL, /* LISTEN_PID */
+ NULL, /* NOTIFY_SOCKET */
NULL
};
@@ -2574,9 +2652,25 @@ static int inner_child(
return -ESRCH;
}
- r = mount_systemd_cgroup_writable("", arg_unified_cgroup_hierarchy);
- if (r < 0)
- return r;
+ if (arg_use_cgns && cg_ns_supported()) {
+ r = unshare(CLONE_NEWCGROUP);
+ if (r < 0)
+ return log_error_errno(errno, "Failed to unshare cgroup namespace");
+ r = mount_cgroups(
+ "",
+ arg_unified_cgroup_hierarchy,
+ arg_userns_mode != USER_NAMESPACE_NO,
+ arg_uid_shift,
+ arg_uid_range,
+ arg_selinux_apifs_context,
+ arg_use_cgns);
+ if (r < 0)
+ return r;
+ } else {
+ r = mount_systemd_cgroup_writable("", arg_unified_cgroup_hierarchy);
+ if (r < 0)
+ return r;
+ }
r = reset_uid_gid();
if (r < 0)
@@ -2622,7 +2716,7 @@ static int inner_child(
#ifdef HAVE_SELINUX
if (arg_selinux_context)
- if (setexeccon((security_context_t) arg_selinux_context) < 0)
+ if (setexeccon(arg_selinux_context) < 0)
return log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
#endif
@@ -2642,9 +2736,9 @@ static int inner_child(
(asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0))
return log_oom();
- assert(!sd_id128_equal(arg_uuid, SD_ID128_NULL));
+ assert(!sd_id128_is_null(arg_uuid));
- if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_format_as_uuid(arg_uuid, as_uuid)) < 0)
+ if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_to_uuid_string(arg_uuid, as_uuid)) < 0)
return log_oom();
if (fdset_size(fds) > 0) {
@@ -2656,6 +2750,8 @@ static int inner_child(
(asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0))
return log_oom();
}
+ if (asprintf((char **)(envp + n_env++), "NOTIFY_SOCKET=%s", NSPAWN_NOTIFY_SOCKET_PATH) < 0)
+ return log_oom();
env_use = strv_env_merge(2, envp, arg_setenv);
if (!env_use)
@@ -2725,6 +2821,37 @@ static int inner_child(
return log_error_errno(r, "execv() failed: %m");
}
+static int setup_sd_notify_child(void) {
+ static const int one = 1;
+ int fd = -1;
+ union sockaddr_union sa = {
+ .sa.sa_family = AF_UNIX,
+ };
+ int r;
+
+ fd = socket(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0);
+ if (fd < 0)
+ return log_error_errno(errno, "Failed to allocate notification socket: %m");
+
+ (void) mkdir_parents(NSPAWN_NOTIFY_SOCKET_PATH, 0755);
+ (void) unlink(NSPAWN_NOTIFY_SOCKET_PATH);
+
+ strncpy(sa.un.sun_path, NSPAWN_NOTIFY_SOCKET_PATH, sizeof(sa.un.sun_path)-1);
+ r = bind(fd, &sa.sa, SOCKADDR_UN_LEN(sa.un));
+ if (r < 0) {
+ safe_close(fd);
+ return log_error_errno(errno, "bind(%s) failed: %m", sa.un.sun_path);
+ }
+
+ r = setsockopt(fd, SOL_SOCKET, SO_PASSCRED, &one, sizeof(one));
+ if (r < 0) {
+ safe_close(fd);
+ return log_error_errno(errno, "SO_PASSCRED failed: %m");
+ }
+
+ return fd;
+}
+
static int outer_child(
Barrier *barrier,
const char *directory,
@@ -2732,10 +2859,12 @@ static int outer_child(
const char *root_device, bool root_device_rw,
const char *home_device, bool home_device_rw,
const char *srv_device, bool srv_device_rw,
+ const char *esp_device,
bool interactive,
bool secondary,
int pid_socket,
int uuid_socket,
+ int notify_socket,
int kmsg_socket,
int rtnl_socket,
int uid_shift_socket,
@@ -2744,12 +2873,14 @@ static int outer_child(
pid_t pid;
ssize_t l;
int r;
+ _cleanup_close_ int fd = -1;
assert(barrier);
assert(directory);
assert(console);
assert(pid_socket >= 0);
assert(uuid_socket >= 0);
+ assert(notify_socket >= 0);
assert(kmsg_socket >= 0);
cg_unified_flush();
@@ -2790,7 +2921,8 @@ static int outer_child(
r = mount_devices(directory,
root_device, root_device_rw,
home_device, home_device_rw,
- srv_device, srv_device_rw);
+ srv_device, srv_device_rw,
+ esp_device);
if (r < 0)
return r;
@@ -2817,7 +2949,7 @@ static int outer_child(
if (l < 0)
return log_error_errno(errno, "Failed to recv UID shift: %m");
if (l != sizeof(arg_uid_shift)) {
- log_error("Short read while recieving UID shift.");
+ log_error("Short read while receiving UID shift.");
return -EIO;
}
}
@@ -2922,20 +3054,27 @@ static int outer_child(
if (r < 0)
return r;
- r = mount_cgroups(
- directory,
- arg_unified_cgroup_hierarchy,
- arg_userns_mode != USER_NAMESPACE_NO,
- arg_uid_shift,
- arg_uid_range,
- arg_selinux_apifs_context);
- if (r < 0)
- return r;
+ if (!arg_use_cgns || !cg_ns_supported()) {
+ r = mount_cgroups(
+ directory,
+ arg_unified_cgroup_hierarchy,
+ arg_userns_mode != USER_NAMESPACE_NO,
+ arg_uid_shift,
+ arg_uid_range,
+ arg_selinux_apifs_context,
+ arg_use_cgns);
+ if (r < 0)
+ return r;
+ }
r = mount_move_root(directory);
if (r < 0)
return log_error_errno(r, "Failed to move root directory: %m");
+ fd = setup_sd_notify_child();
+ if (fd < 0)
+ return fd;
+
pid = raw_clone(SIGCHLD|CLONE_NEWNS|
(arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS) |
(arg_private_network ? CLONE_NEWNET : 0) |
@@ -2945,6 +3084,7 @@ static int outer_child(
if (pid == 0) {
pid_socket = safe_close(pid_socket);
uuid_socket = safe_close(uuid_socket);
+ notify_socket = safe_close(notify_socket);
uid_shift_socket = safe_close(uid_shift_socket);
/* The inner child has all namespaces that are
@@ -2974,8 +3114,13 @@ static int outer_child(
return -EIO;
}
+ l = send_one_fd(notify_socket, fd, 0);
+ if (l < 0)
+ return log_error_errno(errno, "Failed to send notify fd: %m");
+
pid_socket = safe_close(pid_socket);
uuid_socket = safe_close(uuid_socket);
+ notify_socket = safe_close(notify_socket);
kmsg_socket = safe_close(kmsg_socket);
rtnl_socket = safe_close(rtnl_socket);
@@ -3058,6 +3203,95 @@ static int setup_uid_map(pid_t pid) {
return 0;
}
+static int nspawn_dispatch_notify_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata) {
+ char buf[NOTIFY_BUFFER_MAX+1];
+ char *p = NULL;
+ struct iovec iovec = {
+ .iov_base = buf,
+ .iov_len = sizeof(buf)-1,
+ };
+ union {
+ struct cmsghdr cmsghdr;
+ uint8_t buf[CMSG_SPACE(sizeof(struct ucred)) +
+ CMSG_SPACE(sizeof(int) * NOTIFY_FD_MAX)];
+ } control = {};
+ struct msghdr msghdr = {
+ .msg_iov = &iovec,
+ .msg_iovlen = 1,
+ .msg_control = &control,
+ .msg_controllen = sizeof(control),
+ };
+ struct cmsghdr *cmsg;
+ struct ucred *ucred = NULL;
+ ssize_t n;
+ pid_t inner_child_pid;
+ _cleanup_strv_free_ char **tags = NULL;
+
+ assert(userdata);
+
+ inner_child_pid = PTR_TO_PID(userdata);
+
+ if (revents != EPOLLIN) {
+ log_warning("Got unexpected poll event for notify fd.");
+ return 0;
+ }
+
+ n = recvmsg(fd, &msghdr, MSG_DONTWAIT|MSG_CMSG_CLOEXEC);
+ if (n < 0) {
+ if (errno == EAGAIN || errno == EINTR)
+ return 0;
+
+ return log_warning_errno(errno, "Couldn't read notification socket: %m");
+ }
+ cmsg_close_all(&msghdr);
+
+ CMSG_FOREACH(cmsg, &msghdr) {
+ if (cmsg->cmsg_level == SOL_SOCKET &&
+ cmsg->cmsg_type == SCM_CREDENTIALS &&
+ cmsg->cmsg_len == CMSG_LEN(sizeof(struct ucred))) {
+
+ ucred = (struct ucred*) CMSG_DATA(cmsg);
+ }
+ }
+
+ if (!ucred || ucred->pid != inner_child_pid) {
+ log_warning("Received notify message without valid credentials. Ignoring.");
+ return 0;
+ }
+
+ if ((size_t) n >= sizeof(buf)) {
+ log_warning("Received notify message exceeded maximum size. Ignoring.");
+ return 0;
+ }
+
+ buf[n] = 0;
+ tags = strv_split(buf, "\n\r");
+ if (!tags)
+ return log_oom();
+
+ if (strv_find(tags, "READY=1"))
+ sd_notifyf(false, "READY=1\n");
+
+ p = strv_find_startswith(tags, "STATUS=");
+ if (p)
+ sd_notifyf(false, "STATUS=Container running: %s", p);
+
+ return 0;
+}
+
+static int setup_sd_notify_parent(sd_event *event, int fd, pid_t *inner_child_pid) {
+ int r;
+ sd_event_source *notify_event_source;
+
+ r = sd_event_add_io(event, &notify_event_source, fd, EPOLLIN, nspawn_dispatch_notify_fd, inner_child_pid);
+ if (r < 0)
+ return log_error_errno(r, "Failed to allocate notify event source: %m");
+
+ (void) sd_event_source_set_description(notify_event_source, "nspawn-notify");
+
+ return 0;
+}
+
static int load_settings(void) {
_cleanup_(settings_freep) Settings *settings = NULL;
_cleanup_fclose_ FILE *f = NULL;
@@ -3286,12 +3520,15 @@ static int load_settings(void) {
}
}
+ if ((arg_settings_mask & SETTING_NOTIFY_READY) == 0)
+ arg_notify_ready = settings->notify_ready;
+
return 0;
}
int main(int argc, char *argv[]) {
- _cleanup_free_ char *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL, *console = NULL;
+ _cleanup_free_ char *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL, *esp_device = NULL, *console = NULL;
bool root_device_rw = true, home_device_rw = true, srv_device_rw = true;
_cleanup_close_ int master = -1, image_fd = -1;
_cleanup_fdset_free_ FDSet *fds = NULL;
@@ -3400,7 +3637,7 @@ int main(int argc, char *argv[]) {
}
if (r < 0) {
log_error_errno(r, "Failed to lock %s: %m", arg_directory);
- return r;
+ goto finish;
}
if (arg_template) {
@@ -3473,6 +3710,7 @@ int main(int argc, char *argv[]) {
&root_device, &root_device_rw,
&home_device, &home_device_rw,
&srv_device, &srv_device_rw,
+ &esp_device,
&secondary);
if (r < 0)
goto finish;
@@ -3536,7 +3774,9 @@ int main(int argc, char *argv[]) {
rtnl_socket_pair[2] = { -1, -1 },
pid_socket_pair[2] = { -1, -1 },
uuid_socket_pair[2] = { -1, -1 },
+ notify_socket_pair[2] = { -1, -1 },
uid_shift_socket_pair[2] = { -1, -1 };
+ _cleanup_close_ int notify_socket= -1;
_cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
_cleanup_(sd_event_unrefp) sd_event *event = NULL;
_cleanup_(pty_forward_freep) PTYForward *forward = NULL;
@@ -3587,6 +3827,11 @@ int main(int argc, char *argv[]) {
goto finish;
}
+ if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, notify_socket_pair) < 0) {
+ r = log_error_errno(errno, "Failed to create notify socket pair: %m");
+ goto finish;
+ }
+
if (arg_userns_mode != USER_NAMESPACE_NO)
if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, uid_shift_socket_pair) < 0) {
r = log_error_errno(errno, "Failed to create uid shift socket pair: %m");
@@ -3628,6 +3873,7 @@ int main(int argc, char *argv[]) {
rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
pid_socket_pair[0] = safe_close(pid_socket_pair[0]);
uuid_socket_pair[0] = safe_close(uuid_socket_pair[0]);
+ notify_socket_pair[0] = safe_close(notify_socket_pair[0]);
uid_shift_socket_pair[0] = safe_close(uid_shift_socket_pair[0]);
(void) reset_all_signal_handlers();
@@ -3639,10 +3885,12 @@ int main(int argc, char *argv[]) {
root_device, root_device_rw,
home_device, home_device_rw,
srv_device, srv_device_rw,
+ esp_device,
interactive,
secondary,
pid_socket_pair[1],
uuid_socket_pair[1],
+ notify_socket_pair[1],
kmsg_socket_pair[1],
rtnl_socket_pair[1],
uid_shift_socket_pair[1],
@@ -3661,6 +3909,7 @@ int main(int argc, char *argv[]) {
rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
pid_socket_pair[1] = safe_close(pid_socket_pair[1]);
uuid_socket_pair[1] = safe_close(uuid_socket_pair[1]);
+ notify_socket_pair[1] = safe_close(notify_socket_pair[1]);
uid_shift_socket_pair[1] = safe_close(uid_shift_socket_pair[1]);
if (arg_userns_mode != USER_NAMESPACE_NO) {
@@ -3734,6 +3983,13 @@ int main(int argc, char *argv[]) {
goto finish;
}
+ /* We also retrieve the socket used for notifications generated by outer child */
+ notify_socket = receive_one_fd(notify_socket_pair[0], 0);
+ if (notify_socket < 0) {
+ r = log_error_errno(errno, "Failed to receive notification socket from the outer child: %m");
+ goto finish;
+ }
+
log_debug("Init process invoked as PID " PID_FMT, pid);
if (arg_userns_mode != USER_NAMESPACE_NO) {
@@ -3848,6 +4104,16 @@ int main(int argc, char *argv[]) {
goto finish;
}
+ r = sd_event_new(&event);
+ if (r < 0) {
+ log_error_errno(r, "Failed to get default event source: %m");
+ goto finish;
+ }
+
+ r = setup_sd_notify_parent(event, notify_socket, PID_TO_PTR(pid));
+ if (r < 0)
+ goto finish;
+
/* Let the child know that we are ready and wait that the child is completely ready now. */
if (!barrier_place_and_sync(&barrier)) { /* #4 */
log_error("Child died too early.");
@@ -3860,15 +4126,10 @@ int main(int argc, char *argv[]) {
etc_passwd_lock = safe_close(etc_passwd_lock);
sd_notifyf(false,
- "READY=1\n"
"STATUS=Container running.\n"
"X_NSPAWN_LEADER_PID=" PID_FMT, pid);
-
- r = sd_event_new(&event);
- if (r < 0) {
- log_error_errno(r, "Failed to get default event source: %m");
- goto finish;
- }
+ if (!arg_notify_ready)
+ sd_notify(false, "READY=1\n");
if (arg_kill_signal > 0) {
/* Try to kill the init system on SIGINT or SIGTERM */