summaryrefslogtreecommitdiff
path: root/src/nspawn/nspawn.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/nspawn/nspawn.c')
-rw-r--r--src/nspawn/nspawn.c186
1 files changed, 125 insertions, 61 deletions
diff --git a/src/nspawn/nspawn.c b/src/nspawn/nspawn.c
index e4be0a2251..6cc1b9177d 100644
--- a/src/nspawn/nspawn.c
+++ b/src/nspawn/nspawn.c
@@ -61,9 +61,9 @@
#include "fs-util.h"
#include "gpt.h"
#include "hostname-util.h"
+#include "id128-util.h"
#include "log.h"
#include "loopback-setup.h"
-#include "machine-id-setup.h"
#include "machine-image.h"
#include "macro.h"
#include "missing.h"
@@ -76,10 +76,10 @@
#include "nspawn-network.h"
#include "nspawn-patch-uid.h"
#include "nspawn-register.h"
+#include "nspawn-seccomp.h"
#include "nspawn-settings.h"
#include "nspawn-setuid.h"
#include "nspawn-stub-pid1.h"
-#include "nspawn-seccomp.h"
#include "parse-util.h"
#include "path-util.h"
#include "process-util.h"
@@ -101,9 +101,11 @@
#include "util.h"
/* Note that devpts's gid= parameter parses GIDs as signed values, hence we stay away from the upper half of the 32bit
- * UID range here */
+ * UID range here. We leave a bit of room at the lower end and a lot of room at the upper end, so that other subsystems
+ * may have their own allocation ranges too. */
#define UID_SHIFT_PICK_MIN ((uid_t) UINT32_C(0x00080000))
#define UID_SHIFT_PICK_MAX ((uid_t) UINT32_C(0x6FFF0000))
+
/* nspawn is listening on the socket at the path in the constant nspawn_notify_socket_path
* nspawn_notify_socket_path is relative to the container
* the init process in the container pid can send messages to nspawn following the sd_notify(3) protocol */
@@ -192,6 +194,7 @@ static int arg_settings_trusted = -1;
static char **arg_parameters = NULL;
static const char *arg_container_service_name = "systemd-nspawn";
static bool arg_notify_ready = false;
+static bool arg_use_cgns = true;
static void help(void) {
printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
@@ -277,7 +280,6 @@ static void help(void) {
, program_invocation_short_name);
}
-
static int custom_mounts_prepare(void) {
unsigned i;
int r;
@@ -593,9 +595,12 @@ static int parse_argv(int argc, char *argv[]) {
case ARG_UUID:
r = sd_id128_from_string(optarg, &arg_uuid);
- if (r < 0) {
- log_error("Invalid UUID: %s", optarg);
- return r;
+ if (r < 0)
+ return log_error_errno(r, "Invalid UUID: %s", optarg);
+
+ if (sd_id128_is_null(arg_uuid)) {
+ log_error("Machine UUID may not be all zeroes.");
+ return -EINVAL;
}
arg_settings_mask |= SETTING_MACHINE_ID;
@@ -1100,6 +1105,12 @@ static int parse_argv(int argc, char *argv[]) {
if (e)
arg_container_service_name = e;
+ r = getenv_bool("SYSTEMD_NSPAWN_USE_CGNS");
+ if (r < 0)
+ arg_use_cgns = cg_ns_supported();
+ else
+ arg_use_cgns = r;
+
return 1;
}
@@ -1265,20 +1276,9 @@ static int setup_resolv_conf(const char *dest) {
return 0;
}
-static char* id128_format_as_uuid(sd_id128_t id, char s[37]) {
- assert(s);
-
- snprintf(s, 37,
- "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
- SD_ID128_FORMAT_VAL(id));
-
- return s;
-}
-
static int setup_boot_id(const char *dest) {
+ sd_id128_t rnd = SD_ID128_NULL;
const char *from, *to;
- sd_id128_t rnd = {};
- char as_uuid[37];
int r;
if (arg_share_system)
@@ -1294,18 +1294,16 @@ static int setup_boot_id(const char *dest) {
if (r < 0)
return log_error_errno(r, "Failed to generate random boot id: %m");
- id128_format_as_uuid(rnd, as_uuid);
-
- r = write_string_file(from, as_uuid, WRITE_STRING_FILE_CREATE);
+ r = id128_write(from, ID128_UUID, rnd, false);
if (r < 0)
return log_error_errno(r, "Failed to write boot id: %m");
if (mount(from, to, NULL, MS_BIND, NULL) < 0)
r = log_error_errno(errno, "Failed to bind mount boot id: %m");
else if (mount(NULL, to, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL) < 0)
- log_warning_errno(errno, "Failed to make boot id read-only: %m");
+ log_warning_errno(errno, "Failed to make boot id read-only, ignoring: %m");
- unlink(from);
+ (void) unlink(from);
return r;
}
@@ -1803,17 +1801,18 @@ static int dissect_image(
char **root_device, bool *root_device_rw,
char **home_device, bool *home_device_rw,
char **srv_device, bool *srv_device_rw,
+ char **esp_device,
bool *secondary) {
#ifdef HAVE_BLKID
- int home_nr = -1, srv_nr = -1;
+ int home_nr = -1, srv_nr = -1, esp_nr = -1;
#ifdef GPT_ROOT_NATIVE
int root_nr = -1;
#endif
#ifdef GPT_ROOT_SECONDARY
int secondary_root_nr = -1;
#endif
- _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL, *generic = NULL;
+ _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL, *esp = NULL, *generic = NULL;
_cleanup_udev_enumerate_unref_ struct udev_enumerate *e = NULL;
_cleanup_udev_device_unref_ struct udev_device *d = NULL;
_cleanup_blkid_free_probe_ blkid_probe b = NULL;
@@ -1831,6 +1830,7 @@ static int dissect_image(
assert(root_device);
assert(home_device);
assert(srv_device);
+ assert(esp_device);
assert(secondary);
assert(arg_image);
@@ -2044,6 +2044,16 @@ static int dissect_image(
r = free_and_strdup(&srv, node);
if (r < 0)
return log_oom();
+ } else if (sd_id128_equal(type_id, GPT_ESP)) {
+
+ if (esp && nr >= esp_nr)
+ continue;
+
+ esp_nr = nr;
+
+ r = free_and_strdup(&esp, node);
+ if (r < 0)
+ return log_oom();
}
#ifdef GPT_ROOT_NATIVE
else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE)) {
@@ -2161,6 +2171,11 @@ static int dissect_image(
*srv_device_rw = srv_rw;
}
+ if (esp) {
+ *esp_device = esp;
+ esp = NULL;
+ }
+
return 0;
#else
log_error("--image= is not supported, compiled without blkid support.");
@@ -2231,33 +2246,37 @@ static int mount_device(const char *what, const char *where, const char *directo
}
static int setup_machine_id(const char *directory) {
+ const char *etc_machine_id;
+ sd_id128_t id;
int r;
- const char *etc_machine_id, *t;
- _cleanup_free_ char *s = NULL;
- etc_machine_id = prefix_roota(directory, "/etc/machine-id");
+ /* If the UUID in the container is already set, then that's what counts, and we use. If it isn't set, and the
+ * caller passed --uuid=, then we'll pass it in the $container_uuid env var to PID 1 of the container. The
+ * assumption is that PID 1 will then write it to /etc/machine-id to make it persistent. If --uuid= is not
+ * passed we generate a random UUID, and pass it via $container_uuid. In effect this means that /etc/machine-id
+ * in the container and our idea of the container UUID will always be in sync (at least if PID 1 in the
+ * container behaves nicely). */
- r = read_one_line_file(etc_machine_id, &s);
- if (r < 0)
- return log_error_errno(r, "Failed to read machine ID from %s: %m", etc_machine_id);
+ etc_machine_id = prefix_roota(directory, "/etc/machine-id");
- t = strstrip(s);
+ r = id128_read(etc_machine_id, ID128_PLAIN, &id);
+ if (r < 0) {
+ if (!IN_SET(r, -ENOENT, -ENOMEDIUM)) /* If the file is missing or empty, we don't mind */
+ return log_error_errno(r, "Failed to read machine ID from container image: %m");
- if (!isempty(t)) {
- r = sd_id128_from_string(t, &arg_uuid);
- if (r < 0)
- return log_error_errno(r, "Failed to parse machine ID from %s: %m", etc_machine_id);
- } else {
if (sd_id128_is_null(arg_uuid)) {
r = sd_id128_randomize(&arg_uuid);
if (r < 0)
- return log_error_errno(r, "Failed to generate random machine ID: %m");
+ return log_error_errno(r, "Failed to acquire randomized machine UUID: %m");
+ }
+ } else {
+ if (sd_id128_is_null(id)) {
+ log_error("Machine ID in container image is zero, refusing.");
+ return -EINVAL;
}
- }
- r = machine_id_setup(directory, arg_uuid);
- if (r < 0)
- return log_error_errno(r, "Failed to setup machine ID: %m");
+ arg_uuid = id;
+ }
return 0;
}
@@ -2289,7 +2308,8 @@ static int mount_devices(
const char *where,
const char *root_device, bool root_device_rw,
const char *home_device, bool home_device_rw,
- const char *srv_device, bool srv_device_rw) {
+ const char *srv_device, bool srv_device_rw,
+ const char *esp_device) {
int r;
assert(where);
@@ -2312,6 +2332,27 @@ static int mount_devices(
return log_error_errno(r, "Failed to mount server data directory: %m");
}
+ if (esp_device) {
+ const char *mp, *x;
+
+ /* Mount the ESP to /efi if it exists and is empty. If it doesn't exist, use /boot instead. */
+
+ mp = "/efi";
+ x = strjoina(arg_directory, mp);
+ r = dir_is_empty(x);
+ if (r == -ENOENT) {
+ mp = "/boot";
+ x = strjoina(arg_directory, mp);
+ r = dir_is_empty(x);
+ }
+
+ if (r > 0) {
+ r = mount_device(esp_device, arg_directory, mp, true);
+ if (r < 0)
+ return log_error_errno(r, "Failed to mount ESP: %m");
+ }
+ }
+
return 0;
}
@@ -2594,9 +2635,25 @@ static int inner_child(
return -ESRCH;
}
- r = mount_systemd_cgroup_writable("", arg_unified_cgroup_hierarchy);
- if (r < 0)
- return r;
+ if (arg_use_cgns && cg_ns_supported()) {
+ r = unshare(CLONE_NEWCGROUP);
+ if (r < 0)
+ return log_error_errno(errno, "Failed to unshare cgroup namespace");
+ r = mount_cgroups(
+ "",
+ arg_unified_cgroup_hierarchy,
+ arg_userns_mode != USER_NAMESPACE_NO,
+ arg_uid_shift,
+ arg_uid_range,
+ arg_selinux_apifs_context,
+ arg_use_cgns);
+ if (r < 0)
+ return r;
+ } else {
+ r = mount_systemd_cgroup_writable("", arg_unified_cgroup_hierarchy);
+ if (r < 0)
+ return r;
+ }
r = reset_uid_gid();
if (r < 0)
@@ -2662,9 +2719,9 @@ static int inner_child(
(asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0))
return log_oom();
- assert(!sd_id128_equal(arg_uuid, SD_ID128_NULL));
+ assert(!sd_id128_is_null(arg_uuid));
- if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_format_as_uuid(arg_uuid, as_uuid)) < 0)
+ if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_to_uuid_string(arg_uuid, as_uuid)) < 0)
return log_oom();
if (fdset_size(fds) > 0) {
@@ -2785,6 +2842,7 @@ static int outer_child(
const char *root_device, bool root_device_rw,
const char *home_device, bool home_device_rw,
const char *srv_device, bool srv_device_rw,
+ const char *esp_device,
bool interactive,
bool secondary,
int pid_socket,
@@ -2846,7 +2904,8 @@ static int outer_child(
r = mount_devices(directory,
root_device, root_device_rw,
home_device, home_device_rw,
- srv_device, srv_device_rw);
+ srv_device, srv_device_rw,
+ esp_device);
if (r < 0)
return r;
@@ -2978,15 +3037,18 @@ static int outer_child(
if (r < 0)
return r;
- r = mount_cgroups(
- directory,
- arg_unified_cgroup_hierarchy,
- arg_userns_mode != USER_NAMESPACE_NO,
- arg_uid_shift,
- arg_uid_range,
- arg_selinux_apifs_context);
- if (r < 0)
- return r;
+ if (!arg_use_cgns || !cg_ns_supported()) {
+ r = mount_cgroups(
+ directory,
+ arg_unified_cgroup_hierarchy,
+ arg_userns_mode != USER_NAMESPACE_NO,
+ arg_uid_shift,
+ arg_uid_range,
+ arg_selinux_apifs_context,
+ arg_use_cgns);
+ if (r < 0)
+ return r;
+ }
r = mount_move_root(directory);
if (r < 0)
@@ -3449,7 +3511,7 @@ static int load_settings(void) {
int main(int argc, char *argv[]) {
- _cleanup_free_ char *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL, *console = NULL;
+ _cleanup_free_ char *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL, *esp_device = NULL, *console = NULL;
bool root_device_rw = true, home_device_rw = true, srv_device_rw = true;
_cleanup_close_ int master = -1, image_fd = -1;
_cleanup_fdset_free_ FDSet *fds = NULL;
@@ -3558,7 +3620,7 @@ int main(int argc, char *argv[]) {
}
if (r < 0) {
log_error_errno(r, "Failed to lock %s: %m", arg_directory);
- return r;
+ goto finish;
}
if (arg_template) {
@@ -3631,6 +3693,7 @@ int main(int argc, char *argv[]) {
&root_device, &root_device_rw,
&home_device, &home_device_rw,
&srv_device, &srv_device_rw,
+ &esp_device,
&secondary);
if (r < 0)
goto finish;
@@ -3805,6 +3868,7 @@ int main(int argc, char *argv[]) {
root_device, root_device_rw,
home_device, home_device_rw,
srv_device, srv_device_rw,
+ esp_device,
interactive,
secondary,
pid_socket_pair[1],