diff options
Diffstat (limited to 'src/nspawn/nspawn.c')
-rw-r--r-- | src/nspawn/nspawn.c | 2651 |
1 files changed, 460 insertions, 2191 deletions
diff --git a/src/nspawn/nspawn.c b/src/nspawn/nspawn.c index a56960506c..99e24cf4ff 100644 --- a/src/nspawn/nspawn.c +++ b/src/nspawn/nspawn.c @@ -19,95 +19,77 @@ along with systemd; If not, see <http://www.gnu.org/licenses/>. ***/ -#include <signal.h> -#include <sched.h> -#include <unistd.h> -#include <sys/types.h> -#include <sys/mount.h> -#include <stdlib.h> -#include <string.h> -#include <stdio.h> +#ifdef HAVE_BLKID +#include <blkid/blkid.h> +#endif #include <errno.h> -#include <sys/prctl.h> #include <getopt.h> -#include <grp.h> -#include <linux/fs.h> -#include <sys/socket.h> -#include <linux/netlink.h> -#include <net/if.h> -#include <linux/veth.h> -#include <sys/personality.h> #include <linux/loop.h> -#include <sys/file.h> - -#ifdef HAVE_SELINUX -#include <selinux/selinux.h> -#endif - +#include <sched.h> #ifdef HAVE_SECCOMP #include <seccomp.h> #endif - -#ifdef HAVE_BLKID -#include <blkid/blkid.h> +#ifdef HAVE_SELINUX +#include <selinux/selinux.h> #endif +#include <signal.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/file.h> +#include <sys/mount.h> +#include <sys/personality.h> +#include <sys/prctl.h> +#include <sys/types.h> +#include <unistd.h> #include "sd-daemon.h" -#include "sd-bus.h" #include "sd-id128.h" -#include "sd-netlink.h" -#include "random-util.h" -#include "log.h" -#include "util.h" -#include "mkdir.h" -#include "rm-rf.h" -#include "macro.h" -#include "missing.h" + +#include "barrier.h" +#include "base-filesystem.h" +#include "blkid-util.h" +#include "btrfs-util.h" +#include "cap-list.h" +#include "capability.h" #include "cgroup-util.h" -#include "strv.h" -#include "path-util.h" -#include "loopback-setup.h" +#include "copy.h" #include "dev-setup.h" +#include "env-util.h" +#include "event-util.h" #include "fdset.h" -#include "build.h" #include "fileio.h" -#include "bus-util.h" -#include "bus-error.h" -#include "ptyfwd.h" -#include "env-util.h" -#include "netlink-util.h" -#include "udev-util.h" -#include "blkid-util.h" +#include "formats-util.h" #include "gpt.h" -#include "siphash24.h" -#include "copy.h" -#include "base-filesystem.h" -#include "barrier.h" -#include "event-util.h" -#include "capability.h" -#include "cap-list.h" -#include "btrfs-util.h" +#include "hostname-util.h" +#include "log.h" +#include "loopback-setup.h" #include "machine-image.h" -#include "list.h" -#include "in-addr-util.h" -#include "firewall-util.h" -#include "local-addresses.h" -#include "formats-util.h" +#include "macro.h" +#include "missing.h" +#include "mkdir.h" +#include "netlink-util.h" +#include "path-util.h" #include "process-util.h" -#include "terminal-util.h" -#include "hostname-util.h" -#include "signal-util.h" - +#include "ptyfwd.h" +#include "random-util.h" +#include "rm-rf.h" #ifdef HAVE_SECCOMP #include "seccomp-util.h" #endif +#include "signal-util.h" +#include "strv.h" +#include "terminal-util.h" +#include "udev-util.h" +#include "util.h" -typedef struct ExposePort { - int protocol; - uint16_t host_port; - uint16_t container_port; - LIST_FIELDS(struct ExposePort, ports); -} ExposePort; +#include "nspawn-cgroup.h" +#include "nspawn-expose-ports.h" +#include "nspawn-mount.h" +#include "nspawn-network.h" +#include "nspawn-register.h" +#include "nspawn-settings.h" +#include "nspawn-setuid.h" typedef enum ContainerStatus { CONTAINER_TERMINATED, @@ -121,28 +103,6 @@ typedef enum LinkJournal { LINK_GUEST } LinkJournal; -typedef enum Volatile { - VOLATILE_NO, - VOLATILE_YES, - VOLATILE_STATE, -} Volatile; - -typedef enum CustomMountType { - CUSTOM_MOUNT_BIND, - CUSTOM_MOUNT_TMPFS, - CUSTOM_MOUNT_OVERLAY, -} CustomMountType; - -typedef struct CustomMount { - CustomMountType type; - bool read_only; - char *source; /* for overlayfs this is the upper directory */ - char *destination; - char *options; - char *work_dir; - char **lower; -} CustomMount; - static char *arg_directory = NULL; static char *arg_template = NULL; static char *arg_user = NULL; @@ -195,16 +155,19 @@ static char **arg_network_interfaces = NULL; static char **arg_network_macvlan = NULL; static char **arg_network_ipvlan = NULL; static bool arg_network_veth = false; -static const char *arg_network_bridge = NULL; +static char *arg_network_bridge = NULL; static unsigned long arg_personality = PERSONALITY_INVALID; static char *arg_image = NULL; -static Volatile arg_volatile = VOLATILE_NO; +static VolatileMode arg_volatile_mode = VOLATILE_NO; static ExposePort *arg_expose_ports = NULL; static char **arg_property = NULL; static uid_t arg_uid_shift = UID_INVALID, arg_uid_range = 0x10000U; static bool arg_userns = false; static int arg_kill_signal = 0; static bool arg_unified_cgroup_hierarchy = false; +static SettingsMask arg_settings_mask = 0; +static int arg_settings_trusted = -1; +static char **arg_parameters = NULL; static void help(void) { printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n" @@ -275,62 +238,10 @@ static void help(void) { " --keep-unit Do not register a scope for the machine, reuse\n" " the service unit nspawn is running in\n" " --volatile[=MODE] Run the system in volatile mode\n" + " --settings=BOOLEAN Load additional settings from .nspawn file\n" , program_invocation_short_name); } -static CustomMount* custom_mount_add(CustomMountType t) { - CustomMount *c, *ret; - - c = realloc(arg_custom_mounts, (arg_n_custom_mounts + 1) * sizeof(CustomMount)); - if (!c) - return NULL; - - arg_custom_mounts = c; - ret = arg_custom_mounts + arg_n_custom_mounts; - arg_n_custom_mounts++; - - *ret = (CustomMount) { .type = t }; - - return ret; -} - -static void custom_mount_free_all(void) { - unsigned i; - - for (i = 0; i < arg_n_custom_mounts; i++) { - CustomMount *m = &arg_custom_mounts[i]; - - free(m->source); - free(m->destination); - free(m->options); - - if (m->work_dir) { - (void) rm_rf(m->work_dir, REMOVE_ROOT|REMOVE_PHYSICAL); - free(m->work_dir); - } - - strv_free(m->lower); - } - - arg_custom_mounts = mfree(arg_custom_mounts); - arg_n_custom_mounts = 0; -} - -static int custom_mount_compare(const void *a, const void *b) { - const CustomMount *x = a, *y = b; - int r; - - r = path_compare(x->destination, y->destination); - if (r != 0) - return r; - - if (x->type < y->type) - return -1; - if (x->type > y->type) - return 1; - - return 0; -} static int custom_mounts_prepare(void) { unsigned i; @@ -439,6 +350,7 @@ static int parse_argv(int argc, char *argv[]) { ARG_PROPERTY, ARG_PRIVATE_USERS, ARG_KILL_SIGNAL, + ARG_SETTINGS, }; static const struct option options[] = { @@ -481,11 +393,13 @@ static int parse_argv(int argc, char *argv[]) { { "property", required_argument, NULL, ARG_PROPERTY }, { "private-users", optional_argument, NULL, ARG_PRIVATE_USERS }, { "kill-signal", required_argument, NULL, ARG_KILL_SIGNAL }, + { "settings", required_argument, NULL, ARG_SETTINGS }, {} }; int c, r; uint64_t plus = 0, minus = 0; + bool mask_all_settings = false, mask_no_settings = false; assert(argc >= 0); assert(argv); @@ -499,9 +413,7 @@ static int parse_argv(int argc, char *argv[]) { return 0; case ARG_VERSION: - puts(PACKAGE_STRING); - puts(SYSTEMD_FEATURES); - return 0; + return version(); case 'D': r = set_sanitized_path(&arg_directory, optarg); @@ -533,16 +445,20 @@ static int parse_argv(int argc, char *argv[]) { if (r < 0) return log_oom(); + arg_settings_mask |= SETTING_USER; break; case ARG_NETWORK_BRIDGE: - arg_network_bridge = optarg; + r = free_and_strdup(&arg_network_bridge, optarg); + if (r < 0) + return log_oom(); /* fall through */ case 'n': arg_network_veth = true; arg_private_network = true; + arg_settings_mask |= SETTING_NETWORK; break; case ARG_NETWORK_INTERFACE: @@ -550,6 +466,7 @@ static int parse_argv(int argc, char *argv[]) { return log_oom(); arg_private_network = true; + arg_settings_mask |= SETTING_NETWORK; break; case ARG_NETWORK_MACVLAN: @@ -557,6 +474,7 @@ static int parse_argv(int argc, char *argv[]) { return log_oom(); arg_private_network = true; + arg_settings_mask |= SETTING_NETWORK; break; case ARG_NETWORK_IPVLAN: @@ -567,10 +485,12 @@ static int parse_argv(int argc, char *argv[]) { case ARG_PRIVATE_NETWORK: arg_private_network = true; + arg_settings_mask |= SETTING_NETWORK; break; case 'b': arg_boot = true; + arg_settings_mask |= SETTING_BOOT; break; case ARG_UUID: @@ -579,6 +499,8 @@ static int parse_argv(int argc, char *argv[]) { log_error("Invalid UUID: %s", optarg); return r; } + + arg_settings_mask |= SETTING_MACHINE_ID; break; case 'S': @@ -611,6 +533,7 @@ static int parse_argv(int argc, char *argv[]) { case ARG_READ_ONLY: arg_read_only = true; + arg_settings_mask |= SETTING_READ_ONLY; break; case ARG_CAPABILITY: @@ -646,6 +569,7 @@ static int parse_argv(int argc, char *argv[]) { } } + arg_settings_mask |= SETTING_CAPABILITY; break; } @@ -681,83 +605,21 @@ static int parse_argv(int argc, char *argv[]) { break; case ARG_BIND: - case ARG_BIND_RO: { - const char *current = optarg; - _cleanup_free_ char *source = NULL, *destination = NULL, *opts = NULL; - CustomMount *m; - - r = extract_many_words(¤t, ":", EXTRACT_DONT_COALESCE_SEPARATORS, &source, &destination, &opts, NULL); - switch (r) { - case 1: - destination = strdup(source); - case 2: - case 3: - break; - case -ENOMEM: - return log_oom(); - default: - log_error("Invalid bind mount specification: %s", optarg); - return -EINVAL; - } - - if (!source || !destination) - return log_oom(); - - if (!path_is_absolute(source) || !path_is_absolute(destination)) { - log_error("Invalid bind mount specification: %s", optarg); - return -EINVAL; - } - - m = custom_mount_add(CUSTOM_MOUNT_BIND); - if (!m) - return log_oom(); - - m->source = source; - m->destination = destination; - m->read_only = c == ARG_BIND_RO; - m->options = opts; - - source = destination = opts = NULL; + case ARG_BIND_RO: + r = bind_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg, c == ARG_BIND_RO); + if (r < 0) + return log_error_errno(r, "Failed to parse --bind(-ro)= argument %s: %m", optarg); + arg_settings_mask |= SETTING_CUSTOM_MOUNTS; break; - } - case ARG_TMPFS: { - const char *current = optarg; - _cleanup_free_ char *path = NULL, *opts = NULL; - CustomMount *m; - - r = extract_first_word(¤t, &path, ":", EXTRACT_DONT_COALESCE_SEPARATORS); - if (r == -ENOMEM) - return log_oom(); - else if (r < 0) { - log_error("Invalid tmpfs specification: %s", optarg); - return r; - } - if (r) - opts = strdup(current); - else - opts = strdup("mode=0755"); - - if (!path || !opts) - return log_oom(); - - if (!path_is_absolute(path)) { - log_error("Invalid tmpfs specification: %s", optarg); - return -EINVAL; - } - - m = custom_mount_add(CUSTOM_MOUNT_TMPFS); - if (!m) - return log_oom(); - - m->destination = path; - m->options = opts; - - path = opts = NULL; + case ARG_TMPFS: + r = tmpfs_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg); + if (r < 0) + return log_error_errno(r, "Failed to parse --tmpfs= argument %s: %m", optarg); + arg_settings_mask |= SETTING_CUSTOM_MOUNTS; break; - } case ARG_OVERLAY: case ARG_OVERLAY_RO: { @@ -808,7 +670,7 @@ static int parse_argv(int argc, char *argv[]) { lower[n - 2] = NULL; } - m = custom_mount_add(CUSTOM_MOUNT_OVERLAY); + m = custom_mount_add(&arg_custom_mounts, &arg_n_custom_mounts, CUSTOM_MOUNT_OVERLAY); if (!m) return log_oom(); @@ -820,6 +682,7 @@ static int parse_argv(int argc, char *argv[]) { upper = destination = NULL; lower = NULL; + arg_settings_mask |= SETTING_CUSTOM_MOUNTS; break; } @@ -837,6 +700,8 @@ static int parse_argv(int argc, char *argv[]) { strv_free(arg_setenv); arg_setenv = n; + + arg_settings_mask |= SETTING_ENVIRONMENT; break; } @@ -870,85 +735,36 @@ static int parse_argv(int argc, char *argv[]) { return -EINVAL; } + arg_settings_mask |= SETTING_PERSONALITY; break; case ARG_VOLATILE: if (!optarg) - arg_volatile = VOLATILE_YES; + arg_volatile_mode = VOLATILE_YES; else { - r = parse_boolean(optarg); - if (r < 0) { - if (streq(optarg, "state")) - arg_volatile = VOLATILE_STATE; - else { - log_error("Failed to parse --volatile= argument: %s", optarg); - return r; - } - } else - arg_volatile = r ? VOLATILE_YES : VOLATILE_NO; - } + VolatileMode m; - break; - - case 'p': { - const char *split, *e; - uint16_t container_port, host_port; - int protocol; - ExposePort *p; - - if ((e = startswith(optarg, "tcp:"))) - protocol = IPPROTO_TCP; - else if ((e = startswith(optarg, "udp:"))) - protocol = IPPROTO_UDP; - else { - e = optarg; - protocol = IPPROTO_TCP; - } - - split = strchr(e, ':'); - if (split) { - char v[split - e + 1]; - - memcpy(v, e, split - e); - v[split - e] = 0; - - r = safe_atou16(v, &host_port); - if (r < 0 || host_port <= 0) { - log_error("Failed to parse host port: %s", optarg); + m = volatile_mode_from_string(optarg); + if (m < 0) { + log_error("Failed to parse --volatile= argument: %s", optarg); return -EINVAL; - } - - r = safe_atou16(split + 1, &container_port); - } else { - r = safe_atou16(e, &container_port); - host_port = container_port; - } - - if (r < 0 || container_port <= 0) { - log_error("Failed to parse host port: %s", optarg); - return -EINVAL; - } - - LIST_FOREACH(ports, p, arg_expose_ports) { - if (p->protocol == protocol && p->host_port == host_port) { - log_error("Duplicate port specification: %s", optarg); - return -EINVAL; - } + } else + arg_volatile_mode = m; } - p = new(ExposePort, 1); - if (!p) - return log_oom(); - - p->protocol = protocol; - p->host_port = host_port; - p->container_port = container_port; + arg_settings_mask |= SETTING_VOLATILE_MODE; + break; - LIST_PREPEND(ports, arg_expose_ports, p); + case 'p': + r = expose_port_parse(&arg_expose_ports, optarg); + if (r == -EEXIST) + return log_error_errno(r, "Duplicate port specification: %s", optarg); + if (r < 0) + return log_error_errno(r, "Failed to parse host port %s: %m", optarg); + arg_settings_mask |= SETTING_EXPOSE_PORTS; break; - } case ARG_PROPERTY: if (strv_extend(&arg_property, optarg) < 0) @@ -992,6 +808,42 @@ static int parse_argv(int argc, char *argv[]) { return -EINVAL; } + arg_settings_mask |= SETTING_KILL_SIGNAL; + break; + + case ARG_SETTINGS: + + /* no → do not read files + * yes → read files, do not override cmdline, trust only subset + * override → read files, override cmdline, trust only subset + * trusted → read files, do not override cmdline, trust all + */ + + r = parse_boolean(optarg); + if (r < 0) { + if (streq(optarg, "trusted")) { + mask_all_settings = false; + mask_no_settings = false; + arg_settings_trusted = true; + + } else if (streq(optarg, "override")) { + mask_all_settings = false; + mask_no_settings = true; + arg_settings_trusted = -1; + } else + return log_error_errno(r, "Failed to parse --settings= argument: %s", optarg); + } else if (r > 0) { + /* yes */ + mask_all_settings = false; + mask_no_settings = false; + arg_settings_trusted = -1; + } else { + /* no */ + mask_all_settings = true; + mask_no_settings = false; + arg_settings_trusted = false; + } + break; case '?': @@ -1044,557 +896,48 @@ static int parse_argv(int argc, char *argv[]) { return -EINVAL; } - if (arg_volatile != VOLATILE_NO && arg_read_only) { - log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy."); - return -EINVAL; - } - - if (arg_expose_ports && !arg_private_network) { - log_error("Cannot use --port= without private networking."); - return -EINVAL; - } - if (arg_userns && access("/proc/self/uid_map", F_OK) < 0) return log_error_errno(EOPNOTSUPP, "--private-users= is not supported, kernel compiled without user namespace support."); - arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus; - - if (arg_boot && arg_kill_signal <= 0) - arg_kill_signal = SIGRTMIN+3; - - r = detect_unified_cgroup_hierarchy(); - if (r < 0) - return r; - - return 1; -} - -static int tmpfs_patch_options(const char *options, char **ret) { - char *buf = NULL; - - if (arg_userns && arg_uid_shift != 0) { - assert(arg_uid_shift != UID_INVALID); - - if (options) - (void) asprintf(&buf, "%s,uid=" UID_FMT ",gid=" UID_FMT, options, arg_uid_shift, arg_uid_shift); - else - (void) asprintf(&buf, "uid=" UID_FMT ",gid=" UID_FMT, arg_uid_shift, arg_uid_shift); - if (!buf) - return -ENOMEM; - - options = buf; - } - -#ifdef HAVE_SELINUX - if (arg_selinux_apifs_context) { - char *t; - - if (options) - t = strjoin(options, ",context=\"", arg_selinux_apifs_context, "\"", NULL); - else - t = strjoin("context=\"", arg_selinux_apifs_context, "\"", NULL); - if (!t) { - free(buf); - return -ENOMEM; - } - - free(buf); - buf = t; - } -#endif - - *ret = buf; - return !!buf; -} - -static int mount_all(const char *dest, bool userns) { - - typedef struct MountPoint { - const char *what; - const char *where; - const char *type; - const char *options; - unsigned long flags; - bool fatal; - bool userns; - } MountPoint; - - static const MountPoint mount_table[] = { - { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true, true }, - { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, true, true }, /* Bind mount first */ - { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, true, true }, /* Then, make it r/o */ - { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true, false }, - { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true, false }, - { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true, false }, - { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true, false }, - { "tmpfs", "/tmp", "tmpfs", "mode=1777", MS_STRICTATIME, true, false }, -#ifdef HAVE_SELINUX - { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, false, false }, /* Bind mount first */ - { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, false, false }, /* Then, make it r/o */ -#endif - }; - - unsigned k; - int r; - - for (k = 0; k < ELEMENTSOF(mount_table); k++) { - _cleanup_free_ char *where = NULL, *options = NULL; - const char *o; - - if (userns != mount_table[k].userns) - continue; - - where = prefix_root(dest, mount_table[k].where); - if (!where) - return log_oom(); - - r = path_is_mount_point(where, AT_SYMLINK_FOLLOW); - if (r < 0 && r != -ENOENT) - return log_error_errno(r, "Failed to detect whether %s is a mount point: %m", where); - - /* Skip this entry if it is not a remount. */ - if (mount_table[k].what && r > 0) - continue; - - r = mkdir_p(where, 0755); - if (r < 0) { - if (mount_table[k].fatal) - return log_error_errno(r, "Failed to create directory %s: %m", where); - - log_warning_errno(r, "Failed to create directory %s: %m", where); - continue; - } - - o = mount_table[k].options; - if (streq_ptr(mount_table[k].type, "tmpfs")) { - r = tmpfs_patch_options(o, &options); - if (r < 0) - return log_oom(); - if (r > 0) - o = options; - } - - if (mount(mount_table[k].what, - where, - mount_table[k].type, - mount_table[k].flags, - o) < 0) { - - if (mount_table[k].fatal) - return log_error_errno(errno, "mount(%s) failed: %m", where); - - log_warning_errno(errno, "mount(%s) failed, ignoring: %m", where); - } - } - - return 0; -} - -static int parse_mount_bind_options(const char *options, unsigned long *mount_flags, char **mount_opts) { - const char *p = options; - unsigned long flags = *mount_flags; - char *opts = NULL; - - assert(options); - - for (;;) { - _cleanup_free_ char *word = NULL; - int r = extract_first_word(&p, &word, ",", 0); - if (r < 0) - return log_error_errno(r, "Failed to extract mount option: %m"); - if (r == 0) - break; - - if (streq(word, "rbind")) - flags |= MS_REC; - else if (streq(word, "norbind")) - flags &= ~MS_REC; - else { - log_error("Invalid bind mount option: %s", word); - return -EINVAL; - } - } - - *mount_flags = flags; - /* in the future mount_opts will hold string options for mount(2) */ - *mount_opts = opts; - - return 0; -} - -static int mount_bind(const char *dest, CustomMount *m) { - struct stat source_st, dest_st; - const char *where; - unsigned long mount_flags = MS_BIND | MS_REC; - _cleanup_free_ char *mount_opts = NULL; - int r; - - assert(m); - - if (m->options) { - r = parse_mount_bind_options(m->options, &mount_flags, &mount_opts); - if (r < 0) - return r; - } - - if (stat(m->source, &source_st) < 0) - return log_error_errno(errno, "Failed to stat %s: %m", m->source); - - where = prefix_roota(dest, m->destination); - - if (stat(where, &dest_st) >= 0) { - if (S_ISDIR(source_st.st_mode) && !S_ISDIR(dest_st.st_mode)) { - log_error("Cannot bind mount directory %s on file %s.", m->source, where); - return -EINVAL; - } - - if (!S_ISDIR(source_st.st_mode) && S_ISDIR(dest_st.st_mode)) { - log_error("Cannot bind mount file %s on directory %s.", m->source, where); - return -EINVAL; - } - - } else if (errno == ENOENT) { - r = mkdir_parents_label(where, 0755); - if (r < 0) - return log_error_errno(r, "Failed to make parents of %s: %m", where); - } else { - log_error_errno(errno, "Failed to stat %s: %m", where); - return -errno; - } - - /* Create the mount point. Any non-directory file can be - * mounted on any non-directory file (regular, fifo, socket, - * char, block). - */ - if (S_ISDIR(source_st.st_mode)) - r = mkdir_label(where, 0755); - else - r = touch(where); - if (r < 0 && r != -EEXIST) - return log_error_errno(r, "Failed to create mount point %s: %m", where); - - if (mount(m->source, where, NULL, mount_flags, mount_opts) < 0) - return log_error_errno(errno, "mount(%s) failed: %m", where); - - if (m->read_only) { - r = bind_remount_recursive(where, true); - if (r < 0) - return log_error_errno(r, "Read-only bind mount failed: %m"); - } - - return 0; -} - -static int mount_tmpfs(const char *dest, CustomMount *m) { - const char *where, *options; - _cleanup_free_ char *buf = NULL; - int r; - - assert(dest); - assert(m); - - where = prefix_roota(dest, m->destination); - - r = mkdir_p_label(where, 0755); - if (r < 0 && r != -EEXIST) - return log_error_errno(r, "Creating mount point for tmpfs %s failed: %m", where); - - r = tmpfs_patch_options(m->options, &buf); - if (r < 0) - return log_oom(); - options = r > 0 ? buf : m->options; - - if (mount("tmpfs", where, "tmpfs", MS_NODEV|MS_STRICTATIME, options) < 0) - return log_error_errno(errno, "tmpfs mount to %s failed: %m", where); - - return 0; -} - -static char *joined_and_escaped_lower_dirs(char * const *lower) { - _cleanup_strv_free_ char **sv = NULL; - - sv = strv_copy(lower); - if (!sv) - return NULL; - - strv_reverse(sv); - - if (!strv_shell_escape(sv, ",:")) - return NULL; - - return strv_join(sv, ":"); -} - -static int mount_overlay(const char *dest, CustomMount *m) { - _cleanup_free_ char *lower = NULL; - const char *where, *options; - int r; - - assert(dest); - assert(m); - - where = prefix_roota(dest, m->destination); - - r = mkdir_label(where, 0755); - if (r < 0 && r != -EEXIST) - return log_error_errno(r, "Creating mount point for overlay %s failed: %m", where); - - (void) mkdir_p_label(m->source, 0755); - - lower = joined_and_escaped_lower_dirs(m->lower); - if (!lower) - return log_oom(); - - if (m->read_only) { - _cleanup_free_ char *escaped_source = NULL; - - escaped_source = shell_escape(m->source, ",:"); - if (!escaped_source) + if (argc > optind) { + arg_parameters = strv_copy(argv + optind); + if (!arg_parameters) return log_oom(); - options = strjoina("lowerdir=", escaped_source, ":", lower); - } else { - _cleanup_free_ char *escaped_source = NULL, *escaped_work_dir = NULL; - - assert(m->work_dir); - (void) mkdir_label(m->work_dir, 0700); - - escaped_source = shell_escape(m->source, ",:"); - if (!escaped_source) - return log_oom(); - escaped_work_dir = shell_escape(m->work_dir, ",:"); - if (!escaped_work_dir) - return log_oom(); - - options = strjoina("lowerdir=", lower, ",upperdir=", escaped_source, ",workdir=", escaped_work_dir); + arg_settings_mask |= SETTING_BOOT; } - if (mount("overlay", where, "overlay", m->read_only ? MS_RDONLY : 0, options) < 0) - return log_error_errno(errno, "overlay mount to %s failed: %m", where); - - return 0; -} - -static int mount_custom(const char *dest) { - unsigned i; - int r; - - assert(dest); - - for (i = 0; i < arg_n_custom_mounts; i++) { - CustomMount *m = &arg_custom_mounts[i]; - - switch (m->type) { - - case CUSTOM_MOUNT_BIND: - r = mount_bind(dest, m); - break; - - case CUSTOM_MOUNT_TMPFS: - r = mount_tmpfs(dest, m); - break; - - case CUSTOM_MOUNT_OVERLAY: - r = mount_overlay(dest, m); - break; - - default: - assert_not_reached("Unknown custom mount type"); - } - - if (r < 0) - return r; - } - - return 0; -} - -static int mount_legacy_cgroup_hierarchy(const char *dest, const char *controller, const char *hierarchy, bool read_only) { - char *to; - int r; - - to = strjoina(dest, "/sys/fs/cgroup/", hierarchy); + /* Load all settings from .nspawn files */ + if (mask_no_settings) + arg_settings_mask = 0; - r = path_is_mount_point(to, 0); - if (r < 0 && r != -ENOENT) - return log_error_errno(r, "Failed to determine if %s is mounted already: %m", to); - if (r > 0) - return 0; - - mkdir_p(to, 0755); - - /* The superblock mount options of the mount point need to be - * identical to the hosts', and hence writable... */ - if (mount("cgroup", to, "cgroup", MS_NOSUID|MS_NOEXEC|MS_NODEV, controller) < 0) - return log_error_errno(errno, "Failed to mount to %s: %m", to); - - /* ... hence let's only make the bind mount read-only, not the - * superblock. */ - if (read_only) { - if (mount(NULL, to, NULL, MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL) < 0) - return log_error_errno(errno, "Failed to remount %s read-only: %m", to); - } - return 1; -} - -static int mount_legacy_cgroups(const char *dest) { - _cleanup_set_free_free_ Set *controllers = NULL; - const char *cgroup_root; - int r; - - cgroup_root = prefix_roota(dest, "/sys/fs/cgroup"); - - /* Mount a tmpfs to /sys/fs/cgroup if it's not mounted there yet. */ - r = path_is_mount_point(cgroup_root, AT_SYMLINK_FOLLOW); - if (r < 0) - return log_error_errno(r, "Failed to determine if /sys/fs/cgroup is already mounted: %m"); - if (r == 0) { - _cleanup_free_ char *options = NULL; - - r = tmpfs_patch_options("mode=755", &options); - if (r < 0) - return log_oom(); - - if (mount("tmpfs", cgroup_root, "tmpfs", MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME, options) < 0) - return log_error_errno(errno, "Failed to mount /sys/fs/cgroup: %m"); - } - - if (cg_unified() > 0) - goto skip_controllers; - - controllers = set_new(&string_hash_ops); - if (!controllers) - return log_oom(); - - r = cg_kernel_controllers(controllers); - if (r < 0) - return log_error_errno(r, "Failed to determine cgroup controllers: %m"); - - for (;;) { - _cleanup_free_ char *controller = NULL, *origin = NULL, *combined = NULL; - - controller = set_steal_first(controllers); - if (!controller) - break; - - origin = prefix_root("/sys/fs/cgroup/", controller); - if (!origin) - return log_oom(); - - r = readlink_malloc(origin, &combined); - if (r == -EINVAL) { - /* Not a symbolic link, but directly a single cgroup hierarchy */ - - r = mount_legacy_cgroup_hierarchy(dest, controller, controller, true); - if (r < 0) - return r; - - } else if (r < 0) - return log_error_errno(r, "Failed to read link %s: %m", origin); - else { - _cleanup_free_ char *target = NULL; + /* Don't load any settings from .nspawn files */ + if (mask_all_settings) + arg_settings_mask = _SETTINGS_MASK_ALL; - target = prefix_root(dest, origin); - if (!target) - return log_oom(); - - /* A symbolic link, a combination of controllers in one hierarchy */ - - if (!filename_is_valid(combined)) { - log_warning("Ignoring invalid combined hierarchy %s.", combined); - continue; - } - - r = mount_legacy_cgroup_hierarchy(dest, combined, combined, true); - if (r < 0) - return r; - - r = symlink_idempotent(combined, target); - if (r == -EINVAL) { - log_error("Invalid existing symlink for combined hierarchy"); - return r; - } - if (r < 0) - return log_error_errno(r, "Failed to create symlink for combined hierarchy: %m"); - } - } + arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus; -skip_controllers: - r = mount_legacy_cgroup_hierarchy(dest, "none,name=systemd,xattr", "systemd", false); + r = detect_unified_cgroup_hierarchy(); if (r < 0) return r; - if (mount(NULL, cgroup_root, NULL, MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME|MS_RDONLY, "mode=755") < 0) - return log_error_errno(errno, "Failed to remount %s read-only: %m", cgroup_root); - - return 0; + return 1; } -static int mount_unified_cgroups(const char *dest) { - const char *p; - int r; - - assert(dest); +static int verify_arguments(void) { - p = strjoina(dest, "/sys/fs/cgroup"); - - r = path_is_mount_point(p, AT_SYMLINK_FOLLOW); - if (r < 0) - return log_error_errno(r, "Failed to determine if %s is mounted already: %m", p); - if (r > 0) { - p = strjoina(dest, "/sys/fs/cgroup/cgroup.procs"); - if (access(p, F_OK) >= 0) - return 0; - if (errno != ENOENT) - return log_error_errno(errno, "Failed to determine if mount point %s contains the unified cgroup hierarchy: %m", p); - - log_error("%s is already mounted but not a unified cgroup hierarchy. Refusing.", p); + if (arg_volatile_mode != VOLATILE_NO && arg_read_only) { + log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy."); return -EINVAL; } - if (mount("cgroup", p, "cgroup", MS_NOSUID|MS_NOEXEC|MS_NODEV, "__DEVEL__sane_behavior") < 0) - return log_error_errno(errno, "Failed to mount unified cgroup hierarchy to %s: %m", p); - - return 0; -} - -static int mount_cgroups(const char *dest) { - if (arg_unified_cgroup_hierarchy) - return mount_unified_cgroups(dest); - else - return mount_legacy_cgroups(dest); -} - -static int mount_systemd_cgroup_writable(const char *dest) { - _cleanup_free_ char *own_cgroup_path = NULL; - const char *systemd_root, *systemd_own; - int r; - - assert(dest); - - r = cg_pid_get_path(NULL, 0, &own_cgroup_path); - if (r < 0) - return log_error_errno(r, "Failed to determine our own cgroup path: %m"); - - /* If we are living in the top-level, then there's nothing to do... */ - if (path_equal(own_cgroup_path, "/")) - return 0; - - if (arg_unified_cgroup_hierarchy) { - systemd_own = strjoina(dest, "/sys/fs/cgroup", own_cgroup_path); - systemd_root = prefix_roota(dest, "/sys/fs/cgroup"); - } else { - systemd_own = strjoina(dest, "/sys/fs/cgroup/systemd", own_cgroup_path); - systemd_root = prefix_roota(dest, "/sys/fs/cgroup/systemd"); + if (arg_expose_ports && !arg_private_network) { + log_error("Cannot use --port= without private networking."); + return -EINVAL; } - /* Make our own cgroup a (writable) bind mount */ - if (mount(systemd_own, systemd_own, NULL, MS_BIND, NULL) < 0) - return log_error_errno(errno, "Failed to turn %s into a bind mount: %m", own_cgroup_path); - - /* And then remount the systemd cgroup root read-only */ - if (mount(NULL, systemd_root, NULL, MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL) < 0) - return log_error_errno(errno, "Failed to mount cgroup root read-only: %m"); + if (arg_boot && arg_kill_signal <= 0) + arg_kill_signal = SIGRTMIN+3; return 0; } @@ -1736,114 +1079,6 @@ static int setup_resolv_conf(const char *dest) { return 0; } -static int setup_volatile_state(const char *directory) { - _cleanup_free_ char *buf = NULL; - const char *p, *options; - int r; - - assert(directory); - - if (arg_volatile != VOLATILE_STATE) - return 0; - - /* --volatile=state means we simply overmount /var - with a tmpfs, and the rest read-only. */ - - r = bind_remount_recursive(directory, true); - if (r < 0) - return log_error_errno(r, "Failed to remount %s read-only: %m", directory); - - p = prefix_roota(directory, "/var"); - r = mkdir(p, 0755); - if (r < 0 && errno != EEXIST) - return log_error_errno(errno, "Failed to create %s: %m", directory); - - options = "mode=755"; - r = tmpfs_patch_options(options, &buf); - if (r < 0) - return log_oom(); - if (r > 0) - options = buf; - - if (mount("tmpfs", p, "tmpfs", MS_STRICTATIME, options) < 0) - return log_error_errno(errno, "Failed to mount tmpfs to /var: %m"); - - return 0; -} - -static int setup_volatile(const char *directory) { - bool tmpfs_mounted = false, bind_mounted = false; - char template[] = "/tmp/nspawn-volatile-XXXXXX"; - _cleanup_free_ char *buf = NULL; - const char *f, *t, *options; - int r; - - assert(directory); - - if (arg_volatile != VOLATILE_YES) - return 0; - - /* --volatile=yes means we mount a tmpfs to the root dir, and - the original /usr to use inside it, and that read-only. */ - - if (!mkdtemp(template)) - return log_error_errno(errno, "Failed to create temporary directory: %m"); - - options = "mode=755"; - r = tmpfs_patch_options(options, &buf); - if (r < 0) - return log_oom(); - if (r > 0) - options = buf; - - if (mount("tmpfs", template, "tmpfs", MS_STRICTATIME, options) < 0) { - r = log_error_errno(errno, "Failed to mount tmpfs for root directory: %m"); - goto fail; - } - - tmpfs_mounted = true; - - f = prefix_roota(directory, "/usr"); - t = prefix_roota(template, "/usr"); - - r = mkdir(t, 0755); - if (r < 0 && errno != EEXIST) { - r = log_error_errno(errno, "Failed to create %s: %m", t); - goto fail; - } - - if (mount(f, t, NULL, MS_BIND|MS_REC, NULL) < 0) { - r = log_error_errno(errno, "Failed to create /usr bind mount: %m"); - goto fail; - } - - bind_mounted = true; - - r = bind_remount_recursive(t, true); - if (r < 0) { - log_error_errno(r, "Failed to remount %s read-only: %m", t); - goto fail; - } - - if (mount(template, directory, NULL, MS_MOVE, NULL) < 0) { - r = log_error_errno(errno, "Failed to move root mount: %m"); - goto fail; - } - - (void) rmdir(template); - - return 0; - -fail: - if (bind_mounted) - (void) umount(t); - - if (tmpfs_mounted) - (void) umount(template); - (void) rmdir(template); - return r; -} - static char* id128_format_as_uuid(sd_id128_t id, char s[37]) { assert(s); @@ -2026,16 +1261,7 @@ static int setup_dev_console(const char *dest, const char *console) { static int setup_kmsg(const char *dest, int kmsg_socket) { const char *from, *to; _cleanup_umask_ mode_t u; - int fd, k; - union { - struct cmsghdr cmsghdr; - uint8_t buf[CMSG_SPACE(sizeof(int))]; - } control = {}; - struct msghdr mh = { - .msg_control = &control, - .msg_controllen = sizeof(control), - }; - struct cmsghdr *cmsg; + int fd, r; assert(kmsg_socket >= 0); @@ -2060,21 +1286,13 @@ static int setup_kmsg(const char *dest, int kmsg_socket) { if (fd < 0) return log_error_errno(errno, "Failed to open fifo: %m"); - cmsg = CMSG_FIRSTHDR(&mh); - cmsg->cmsg_level = SOL_SOCKET; - cmsg->cmsg_type = SCM_RIGHTS; - cmsg->cmsg_len = CMSG_LEN(sizeof(int)); - memcpy(CMSG_DATA(cmsg), &fd, sizeof(int)); - - mh.msg_controllen = cmsg->cmsg_len; - /* Store away the fd in the socket, so that it stays open as * long as we run the child */ - k = sendmsg(kmsg_socket, &mh, MSG_NOSIGNAL); + r = send_one_fd(kmsg_socket, fd, 0); safe_close(fd); - if (k < 0) - return log_error_errno(errno, "Failed to send FIFO fd: %m"); + if (r < 0) + return log_error_errno(r, "Failed to send FIFO fd: %m"); /* And now make the FIFO unavailable as /run/kmsg... */ (void) unlink(from); @@ -2082,132 +1300,6 @@ static int setup_kmsg(const char *dest, int kmsg_socket) { return 0; } -static int send_rtnl(int send_fd) { - union { - struct cmsghdr cmsghdr; - uint8_t buf[CMSG_SPACE(sizeof(int))]; - } control = {}; - struct msghdr mh = { - .msg_control = &control, - .msg_controllen = sizeof(control), - }; - struct cmsghdr *cmsg; - _cleanup_close_ int fd = -1; - ssize_t k; - - assert(send_fd >= 0); - - if (!arg_expose_ports) - return 0; - - fd = socket(PF_NETLINK, SOCK_RAW|SOCK_CLOEXEC|SOCK_NONBLOCK, NETLINK_ROUTE); - if (fd < 0) - return log_error_errno(errno, "Failed to allocate container netlink: %m"); - - cmsg = CMSG_FIRSTHDR(&mh); - cmsg->cmsg_level = SOL_SOCKET; - cmsg->cmsg_type = SCM_RIGHTS; - cmsg->cmsg_len = CMSG_LEN(sizeof(int)); - memcpy(CMSG_DATA(cmsg), &fd, sizeof(int)); - - mh.msg_controllen = cmsg->cmsg_len; - - /* Store away the fd in the socket, so that it stays open as - * long as we run the child */ - k = sendmsg(send_fd, &mh, MSG_NOSIGNAL); - if (k < 0) - return log_error_errno(errno, "Failed to send netlink fd: %m"); - - return 0; -} - -static int flush_ports(union in_addr_union *exposed) { - ExposePort *p; - int r, af = AF_INET; - - assert(exposed); - - if (!arg_expose_ports) - return 0; - - if (in_addr_is_null(af, exposed)) - return 0; - - log_debug("Lost IP address."); - - LIST_FOREACH(ports, p, arg_expose_ports) { - r = fw_add_local_dnat(false, - af, - p->protocol, - NULL, - NULL, 0, - NULL, 0, - p->host_port, - exposed, - p->container_port, - NULL); - if (r < 0) - log_warning_errno(r, "Failed to modify firewall: %m"); - } - - *exposed = IN_ADDR_NULL; - return 0; -} - -static int expose_ports(sd_netlink *rtnl, union in_addr_union *exposed) { - _cleanup_free_ struct local_address *addresses = NULL; - _cleanup_free_ char *pretty = NULL; - union in_addr_union new_exposed; - ExposePort *p; - bool add; - int af = AF_INET, r; - - assert(exposed); - - /* Invoked each time an address is added or removed inside the - * container */ - - if (!arg_expose_ports) - return 0; - - r = local_addresses(rtnl, 0, af, &addresses); - if (r < 0) - return log_error_errno(r, "Failed to enumerate local addresses: %m"); - - add = r > 0 && - addresses[0].family == af && - addresses[0].scope < RT_SCOPE_LINK; - - if (!add) - return flush_ports(exposed); - - new_exposed = addresses[0].address; - if (in_addr_equal(af, exposed, &new_exposed)) - return 0; - - in_addr_to_string(af, &new_exposed, &pretty); - log_debug("New container IP is %s.", strna(pretty)); - - LIST_FOREACH(ports, p, arg_expose_ports) { - - r = fw_add_local_dnat(true, - af, - p->protocol, - NULL, - NULL, 0, - NULL, 0, - p->host_port, - &new_exposed, - p->container_port, - in_addr_is_null(af, exposed) ? NULL : exposed); - if (r < 0) - log_warning_errno(r, "Failed to modify firewall: %m"); - } - - *exposed = new_exposed; - return 0; -} - static int on_address_change(sd_netlink *rtnl, sd_netlink_message *m, void *userdata) { union in_addr_union *exposed = userdata; @@ -2215,62 +1307,7 @@ static int on_address_change(sd_netlink *rtnl, sd_netlink_message *m, void *user assert(m); assert(exposed); - expose_ports(rtnl, exposed); - return 0; -} - -static int watch_rtnl(sd_event *event, int recv_fd, union in_addr_union *exposed, sd_netlink **ret) { - union { - struct cmsghdr cmsghdr; - uint8_t buf[CMSG_SPACE(sizeof(int))]; - } control = {}; - struct msghdr mh = { - .msg_control = &control, - .msg_controllen = sizeof(control), - }; - struct cmsghdr *cmsg; - _cleanup_netlink_unref_ sd_netlink *rtnl = NULL; - int fd, r; - ssize_t k; - - assert(event); - assert(recv_fd >= 0); - assert(ret); - - if (!arg_expose_ports) - return 0; - - k = recvmsg(recv_fd, &mh, MSG_NOSIGNAL); - if (k < 0) - return log_error_errno(errno, "Failed to recv netlink fd: %m"); - - cmsg = CMSG_FIRSTHDR(&mh); - assert(cmsg->cmsg_level == SOL_SOCKET); - assert(cmsg->cmsg_type == SCM_RIGHTS); - assert(cmsg->cmsg_len == CMSG_LEN(sizeof(int))); - memcpy(&fd, CMSG_DATA(cmsg), sizeof(int)); - - r = sd_netlink_open_fd(&rtnl, fd); - if (r < 0) { - safe_close(fd); - return log_error_errno(r, "Failed to create rtnl object: %m"); - } - - r = sd_netlink_add_match(rtnl, RTM_NEWADDR, on_address_change, exposed); - if (r < 0) - return log_error_errno(r, "Failed to subscribe to RTM_NEWADDR messages: %m"); - - r = sd_netlink_add_match(rtnl, RTM_DELADDR, on_address_change, exposed); - if (r < 0) - return log_error_errno(r, "Failed to subscribe to RTM_DELADDR messages: %m"); - - r = sd_netlink_attach_event(rtnl, event, 0); - if (r < 0) - return log_error_errno(r, "Failed to add to even loop: %m"); - - *ret = rtnl; - rtnl = NULL; - + expose_port_execute(rtnl, arg_expose_ports, exposed); return 0; } @@ -2447,220 +1484,6 @@ static int drop_capabilities(void) { return capability_bounding_set_drop(~arg_retain, false); } -static int register_machine(pid_t pid, int local_ifindex) { - _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL; - _cleanup_bus_flush_close_unref_ sd_bus *bus = NULL; - int r; - - if (!arg_register) - return 0; - - r = sd_bus_default_system(&bus); - if (r < 0) - return log_error_errno(r, "Failed to open system bus: %m"); - - if (arg_keep_unit) { - r = sd_bus_call_method( - bus, - "org.freedesktop.machine1", - "/org/freedesktop/machine1", - "org.freedesktop.machine1.Manager", - "RegisterMachineWithNetwork", - &error, - NULL, - "sayssusai", - arg_machine, - SD_BUS_MESSAGE_APPEND_ID128(arg_uuid), - "nspawn", - "container", - (uint32_t) pid, - strempty(arg_directory), - local_ifindex > 0 ? 1 : 0, local_ifindex); - } else { - _cleanup_bus_message_unref_ sd_bus_message *m = NULL; - char **i; - unsigned j; - - r = sd_bus_message_new_method_call( - bus, - &m, - "org.freedesktop.machine1", - "/org/freedesktop/machine1", - "org.freedesktop.machine1.Manager", - "CreateMachineWithNetwork"); - if (r < 0) - return bus_log_create_error(r); - - r = sd_bus_message_append( - m, - "sayssusai", - arg_machine, - SD_BUS_MESSAGE_APPEND_ID128(arg_uuid), - "nspawn", - "container", - (uint32_t) pid, - strempty(arg_directory), - local_ifindex > 0 ? 1 : 0, local_ifindex); - if (r < 0) - return bus_log_create_error(r); - - r = sd_bus_message_open_container(m, 'a', "(sv)"); - if (r < 0) - return bus_log_create_error(r); - - if (!isempty(arg_slice)) { - r = sd_bus_message_append(m, "(sv)", "Slice", "s", arg_slice); - if (r < 0) - return bus_log_create_error(r); - } - - r = sd_bus_message_append(m, "(sv)", "DevicePolicy", "s", "strict"); - if (r < 0) - return bus_log_create_error(r); - - /* If you make changes here, also make sure to update - * systemd-nspawn@.service, to keep the device - * policies in sync regardless if we are run with or - * without the --keep-unit switch. */ - r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 9, - /* Allow the container to - * access and create the API - * device nodes, so that - * PrivateDevices= in the - * container can work - * fine */ - "/dev/null", "rwm", - "/dev/zero", "rwm", - "/dev/full", "rwm", - "/dev/random", "rwm", - "/dev/urandom", "rwm", - "/dev/tty", "rwm", - "/dev/net/tun", "rwm", - /* Allow the container - * access to ptys. However, - * do not permit the - * container to ever create - * these device nodes. */ - "/dev/pts/ptmx", "rw", - "char-pts", "rw"); - if (r < 0) - return bus_log_create_error(r); - - for (j = 0; j < arg_n_custom_mounts; j++) { - CustomMount *cm = &arg_custom_mounts[j]; - - if (cm->type != CUSTOM_MOUNT_BIND) - continue; - - r = is_device_node(cm->source); - if (r < 0) - return log_error_errno(r, "Failed to stat %s: %m", cm->source); - - if (r) { - r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 1, - cm->source, cm->read_only ? "r" : "rw"); - if (r < 0) - return log_error_errno(r, "Failed to append message arguments: %m"); - } - } - - if (arg_kill_signal != 0) { - r = sd_bus_message_append(m, "(sv)", "KillSignal", "i", arg_kill_signal); - if (r < 0) - return bus_log_create_error(r); - - r = sd_bus_message_append(m, "(sv)", "KillMode", "s", "mixed"); - if (r < 0) - return bus_log_create_error(r); - } - - STRV_FOREACH(i, arg_property) { - r = sd_bus_message_open_container(m, 'r', "sv"); - if (r < 0) - return bus_log_create_error(r); - - r = bus_append_unit_property_assignment(m, *i); - if (r < 0) - return r; - - r = sd_bus_message_close_container(m); - if (r < 0) - return bus_log_create_error(r); - } - - r = sd_bus_message_close_container(m); - if (r < 0) - return bus_log_create_error(r); - - r = sd_bus_call(bus, m, 0, &error, NULL); - } - - if (r < 0) { - log_error("Failed to register machine: %s", bus_error_message(&error, r)); - return r; - } - - return 0; -} - -static int terminate_machine(pid_t pid) { - _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL; - _cleanup_bus_message_unref_ sd_bus_message *reply = NULL; - _cleanup_bus_flush_close_unref_ sd_bus *bus = NULL; - const char *path; - int r; - - if (!arg_register) - return 0; - - /* If we are reusing the unit, then just exit, systemd will do - * the right thing when we exit. */ - if (arg_keep_unit) - return 0; - - r = sd_bus_default_system(&bus); - if (r < 0) - return log_error_errno(r, "Failed to open system bus: %m"); - - r = sd_bus_call_method( - bus, - "org.freedesktop.machine1", - "/org/freedesktop/machine1", - "org.freedesktop.machine1.Manager", - "GetMachineByPID", - &error, - &reply, - "u", - (uint32_t) pid); - if (r < 0) { - /* Note that the machine might already have been - * cleaned up automatically, hence don't consider it a - * failure if we cannot get the machine object. */ - log_debug("Failed to get machine: %s", bus_error_message(&error, r)); - return 0; - } - - r = sd_bus_message_read(reply, "o", &path); - if (r < 0) - return bus_log_parse_error(r); - - r = sd_bus_call_method( - bus, - "org.freedesktop.machine1", - path, - "org.freedesktop.machine1.Machine", - "Terminate", - &error, - NULL, - NULL); - if (r < 0) { - log_debug("Failed to terminate machine: %s", bus_error_message(&error, r)); - return 0; - } - - return 0; -} - static int reset_audit_loginuid(void) { _cleanup_free_ char *p = NULL; int r; @@ -2693,427 +1516,6 @@ static int reset_audit_loginuid(void) { return 0; } -#define HOST_HASH_KEY SD_ID128_MAKE(1a,37,6f,c7,46,ec,45,0b,ad,a3,d5,31,06,60,5d,b1) -#define CONTAINER_HASH_KEY SD_ID128_MAKE(c3,c4,f9,19,b5,57,b2,1c,e6,cf,14,27,03,9c,ee,a2) -#define MACVLAN_HASH_KEY SD_ID128_MAKE(00,13,6d,bc,66,83,44,81,bb,0c,f9,51,1f,24,a6,6f) - -static int generate_mac(struct ether_addr *mac, sd_id128_t hash_key, uint64_t idx) { - uint8_t result[8]; - size_t l, sz; - uint8_t *v, *i; - int r; - - l = strlen(arg_machine); - sz = sizeof(sd_id128_t) + l; - if (idx > 0) - sz += sizeof(idx); - - v = alloca(sz); - - /* fetch some persistent data unique to the host */ - r = sd_id128_get_machine((sd_id128_t*) v); - if (r < 0) - return r; - - /* combine with some data unique (on this host) to this - * container instance */ - i = mempcpy(v + sizeof(sd_id128_t), arg_machine, l); - if (idx > 0) { - idx = htole64(idx); - memcpy(i, &idx, sizeof(idx)); - } - - /* Let's hash the host machine ID plus the container name. We - * use a fixed, but originally randomly created hash key here. */ - siphash24(result, v, sz, hash_key.bytes); - - assert_cc(ETH_ALEN <= sizeof(result)); - memcpy(mac->ether_addr_octet, result, ETH_ALEN); - - /* see eth_random_addr in the kernel */ - mac->ether_addr_octet[0] &= 0xfe; /* clear multicast bit */ - mac->ether_addr_octet[0] |= 0x02; /* set local assignment bit (IEEE802) */ - - return 0; -} - -static int setup_veth(pid_t pid, char iface_name[IFNAMSIZ], int *ifi) { - _cleanup_netlink_message_unref_ sd_netlink_message *m = NULL; - _cleanup_netlink_unref_ sd_netlink *rtnl = NULL; - struct ether_addr mac_host, mac_container; - int r, i; - - if (!arg_private_network) - return 0; - - if (!arg_network_veth) - return 0; - - /* Use two different interface name prefixes depending whether - * we are in bridge mode or not. */ - snprintf(iface_name, IFNAMSIZ - 1, "%s-%s", - arg_network_bridge ? "vb" : "ve", arg_machine); - - r = generate_mac(&mac_container, CONTAINER_HASH_KEY, 0); - if (r < 0) - return log_error_errno(r, "Failed to generate predictable MAC address for container side: %m"); - - r = generate_mac(&mac_host, HOST_HASH_KEY, 0); - if (r < 0) - return log_error_errno(r, "Failed to generate predictable MAC address for host side: %m"); - - r = sd_netlink_open(&rtnl); - if (r < 0) - return log_error_errno(r, "Failed to connect to netlink: %m"); - - r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0); - if (r < 0) - return log_error_errno(r, "Failed to allocate netlink message: %m"); - - r = sd_netlink_message_append_string(m, IFLA_IFNAME, iface_name); - if (r < 0) - return log_error_errno(r, "Failed to add netlink interface name: %m"); - - r = sd_netlink_message_append_ether_addr(m, IFLA_ADDRESS, &mac_host); - if (r < 0) - return log_error_errno(r, "Failed to add netlink MAC address: %m"); - - r = sd_netlink_message_open_container(m, IFLA_LINKINFO); - if (r < 0) - return log_error_errno(r, "Failed to open netlink container: %m"); - - r = sd_netlink_message_open_container_union(m, IFLA_INFO_DATA, "veth"); - if (r < 0) - return log_error_errno(r, "Failed to open netlink container: %m"); - - r = sd_netlink_message_open_container(m, VETH_INFO_PEER); - if (r < 0) - return log_error_errno(r, "Failed to open netlink container: %m"); - - r = sd_netlink_message_append_string(m, IFLA_IFNAME, "host0"); - if (r < 0) - return log_error_errno(r, "Failed to add netlink interface name: %m"); - - r = sd_netlink_message_append_ether_addr(m, IFLA_ADDRESS, &mac_container); - if (r < 0) - return log_error_errno(r, "Failed to add netlink MAC address: %m"); - - r = sd_netlink_message_append_u32(m, IFLA_NET_NS_PID, pid); - if (r < 0) - return log_error_errno(r, "Failed to add netlink namespace field: %m"); - - r = sd_netlink_message_close_container(m); - if (r < 0) - return log_error_errno(r, "Failed to close netlink container: %m"); - - r = sd_netlink_message_close_container(m); - if (r < 0) - return log_error_errno(r, "Failed to close netlink container: %m"); - - r = sd_netlink_message_close_container(m); - if (r < 0) - return log_error_errno(r, "Failed to close netlink container: %m"); - - r = sd_netlink_call(rtnl, m, 0, NULL); - if (r < 0) - return log_error_errno(r, "Failed to add new veth interfaces (host0, %s): %m", iface_name); - - i = (int) if_nametoindex(iface_name); - if (i <= 0) - return log_error_errno(errno, "Failed to resolve interface %s: %m", iface_name); - - *ifi = i; - - return 0; -} - -static int setup_bridge(const char veth_name[], int *ifi) { - _cleanup_netlink_message_unref_ sd_netlink_message *m = NULL; - _cleanup_netlink_unref_ sd_netlink *rtnl = NULL; - int r, bridge; - - if (!arg_private_network) - return 0; - - if (!arg_network_veth) - return 0; - - if (!arg_network_bridge) - return 0; - - bridge = (int) if_nametoindex(arg_network_bridge); - if (bridge <= 0) - return log_error_errno(errno, "Failed to resolve interface %s: %m", arg_network_bridge); - - *ifi = bridge; - - r = sd_netlink_open(&rtnl); - if (r < 0) - return log_error_errno(r, "Failed to connect to netlink: %m"); - - r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, 0); - if (r < 0) - return log_error_errno(r, "Failed to allocate netlink message: %m"); - - r = sd_rtnl_message_link_set_flags(m, IFF_UP, IFF_UP); - if (r < 0) - return log_error_errno(r, "Failed to set IFF_UP flag: %m"); - - r = sd_netlink_message_append_string(m, IFLA_IFNAME, veth_name); - if (r < 0) - return log_error_errno(r, "Failed to add netlink interface name field: %m"); - - r = sd_netlink_message_append_u32(m, IFLA_MASTER, bridge); - if (r < 0) - return log_error_errno(r, "Failed to add netlink master field: %m"); - - r = sd_netlink_call(rtnl, m, 0, NULL); - if (r < 0) - return log_error_errno(r, "Failed to add veth interface to bridge: %m"); - - return 0; -} - -static int parse_interface(struct udev *udev, const char *name) { - _cleanup_udev_device_unref_ struct udev_device *d = NULL; - char ifi_str[2 + DECIMAL_STR_MAX(int)]; - int ifi; - - ifi = (int) if_nametoindex(name); - if (ifi <= 0) - return log_error_errno(errno, "Failed to resolve interface %s: %m", name); - - sprintf(ifi_str, "n%i", ifi); - d = udev_device_new_from_device_id(udev, ifi_str); - if (!d) - return log_error_errno(errno, "Failed to get udev device for interface %s: %m", name); - - if (udev_device_get_is_initialized(d) <= 0) { - log_error("Network interface %s is not initialized yet.", name); - return -EBUSY; - } - - return ifi; -} - -static int move_network_interfaces(pid_t pid) { - _cleanup_udev_unref_ struct udev *udev = NULL; - _cleanup_netlink_unref_ sd_netlink *rtnl = NULL; - char **i; - int r; - - if (!arg_private_network) - return 0; - - if (strv_isempty(arg_network_interfaces)) - return 0; - - r = sd_netlink_open(&rtnl); - if (r < 0) - return log_error_errno(r, "Failed to connect to netlink: %m"); - - udev = udev_new(); - if (!udev) { - log_error("Failed to connect to udev."); - return -ENOMEM; - } - - STRV_FOREACH(i, arg_network_interfaces) { - _cleanup_netlink_message_unref_ sd_netlink_message *m = NULL; - int ifi; - - ifi = parse_interface(udev, *i); - if (ifi < 0) - return ifi; - - r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, ifi); - if (r < 0) - return log_error_errno(r, "Failed to allocate netlink message: %m"); - - r = sd_netlink_message_append_u32(m, IFLA_NET_NS_PID, pid); - if (r < 0) - return log_error_errno(r, "Failed to append namespace PID to netlink message: %m"); - - r = sd_netlink_call(rtnl, m, 0, NULL); - if (r < 0) - return log_error_errno(r, "Failed to move interface %s to namespace: %m", *i); - } - - return 0; -} - -static int setup_macvlan(pid_t pid) { - _cleanup_udev_unref_ struct udev *udev = NULL; - _cleanup_netlink_unref_ sd_netlink *rtnl = NULL; - unsigned idx = 0; - char **i; - int r; - - if (!arg_private_network) - return 0; - - if (strv_isempty(arg_network_macvlan)) - return 0; - - r = sd_netlink_open(&rtnl); - if (r < 0) - return log_error_errno(r, "Failed to connect to netlink: %m"); - - udev = udev_new(); - if (!udev) { - log_error("Failed to connect to udev."); - return -ENOMEM; - } - - STRV_FOREACH(i, arg_network_macvlan) { - _cleanup_netlink_message_unref_ sd_netlink_message *m = NULL; - _cleanup_free_ char *n = NULL; - struct ether_addr mac; - int ifi; - - ifi = parse_interface(udev, *i); - if (ifi < 0) - return ifi; - - r = generate_mac(&mac, MACVLAN_HASH_KEY, idx++); - if (r < 0) - return log_error_errno(r, "Failed to create MACVLAN MAC address: %m"); - - r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0); - if (r < 0) - return log_error_errno(r, "Failed to allocate netlink message: %m"); - - r = sd_netlink_message_append_u32(m, IFLA_LINK, ifi); - if (r < 0) - return log_error_errno(r, "Failed to add netlink interface index: %m"); - - n = strappend("mv-", *i); - if (!n) - return log_oom(); - - strshorten(n, IFNAMSIZ-1); - - r = sd_netlink_message_append_string(m, IFLA_IFNAME, n); - if (r < 0) - return log_error_errno(r, "Failed to add netlink interface name: %m"); - - r = sd_netlink_message_append_ether_addr(m, IFLA_ADDRESS, &mac); - if (r < 0) - return log_error_errno(r, "Failed to add netlink MAC address: %m"); - - r = sd_netlink_message_append_u32(m, IFLA_NET_NS_PID, pid); - if (r < 0) - return log_error_errno(r, "Failed to add netlink namespace field: %m"); - - r = sd_netlink_message_open_container(m, IFLA_LINKINFO); - if (r < 0) - return log_error_errno(r, "Failed to open netlink container: %m"); - - r = sd_netlink_message_open_container_union(m, IFLA_INFO_DATA, "macvlan"); - if (r < 0) - return log_error_errno(r, "Failed to open netlink container: %m"); - - r = sd_netlink_message_append_u32(m, IFLA_MACVLAN_MODE, MACVLAN_MODE_BRIDGE); - if (r < 0) - return log_error_errno(r, "Failed to append macvlan mode: %m"); - - r = sd_netlink_message_close_container(m); - if (r < 0) - return log_error_errno(r, "Failed to close netlink container: %m"); - - r = sd_netlink_message_close_container(m); - if (r < 0) - return log_error_errno(r, "Failed to close netlink container: %m"); - - r = sd_netlink_call(rtnl, m, 0, NULL); - if (r < 0) - return log_error_errno(r, "Failed to add new macvlan interfaces: %m"); - } - - return 0; -} - -static int setup_ipvlan(pid_t pid) { - _cleanup_udev_unref_ struct udev *udev = NULL; - _cleanup_netlink_unref_ sd_netlink *rtnl = NULL; - char **i; - int r; - - if (!arg_private_network) - return 0; - - if (strv_isempty(arg_network_ipvlan)) - return 0; - - r = sd_netlink_open(&rtnl); - if (r < 0) - return log_error_errno(r, "Failed to connect to netlink: %m"); - - udev = udev_new(); - if (!udev) { - log_error("Failed to connect to udev."); - return -ENOMEM; - } - - STRV_FOREACH(i, arg_network_ipvlan) { - _cleanup_netlink_message_unref_ sd_netlink_message *m = NULL; - _cleanup_free_ char *n = NULL; - int ifi; - - ifi = parse_interface(udev, *i); - if (ifi < 0) - return ifi; - - r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0); - if (r < 0) - return log_error_errno(r, "Failed to allocate netlink message: %m"); - - r = sd_netlink_message_append_u32(m, IFLA_LINK, ifi); - if (r < 0) - return log_error_errno(r, "Failed to add netlink interface index: %m"); - - n = strappend("iv-", *i); - if (!n) - return log_oom(); - - strshorten(n, IFNAMSIZ-1); - - r = sd_netlink_message_append_string(m, IFLA_IFNAME, n); - if (r < 0) - return log_error_errno(r, "Failed to add netlink interface name: %m"); - - r = sd_netlink_message_append_u32(m, IFLA_NET_NS_PID, pid); - if (r < 0) - return log_error_errno(r, "Failed to add netlink namespace field: %m"); - - r = sd_netlink_message_open_container(m, IFLA_LINKINFO); - if (r < 0) - return log_error_errno(r, "Failed to open netlink container: %m"); - - r = sd_netlink_message_open_container_union(m, IFLA_INFO_DATA, "ipvlan"); - if (r < 0) - return log_error_errno(r, "Failed to open netlink container: %m"); - - r = sd_netlink_message_append_u16(m, IFLA_IPVLAN_MODE, IPVLAN_MODE_L2); - if (r < 0) - return log_error_errno(r, "Failed to add ipvlan mode: %m"); - - r = sd_netlink_message_close_container(m); - if (r < 0) - return log_error_errno(r, "Failed to close netlink container: %m"); - - r = sd_netlink_message_close_container(m); - if (r < 0) - return log_error_errno(r, "Failed to close netlink container: %m"); - - r = sd_netlink_call(rtnl, m, 0, NULL); - if (r < 0) - return log_error_errno(r, "Failed to add new ipvlan interfaces: %m"); - } - - return 0; -} - static int setup_seccomp(void) { #ifdef HAVE_SECCOMP @@ -3812,247 +2214,6 @@ static void loop_remove(int nr, int *image_fd) { log_debug_errno(errno, "Failed to remove loop %d: %m", nr); } -static int spawn_getent(const char *database, const char *key, pid_t *rpid) { - int pipe_fds[2]; - pid_t pid; - - assert(database); - assert(key); - assert(rpid); - - if (pipe2(pipe_fds, O_CLOEXEC) < 0) - return log_error_errno(errno, "Failed to allocate pipe: %m"); - - pid = fork(); - if (pid < 0) - return log_error_errno(errno, "Failed to fork getent child: %m"); - else if (pid == 0) { - int nullfd; - char *empty_env = NULL; - - if (dup3(pipe_fds[1], STDOUT_FILENO, 0) < 0) - _exit(EXIT_FAILURE); - - if (pipe_fds[0] > 2) - safe_close(pipe_fds[0]); - if (pipe_fds[1] > 2) - safe_close(pipe_fds[1]); - - nullfd = open("/dev/null", O_RDWR); - if (nullfd < 0) - _exit(EXIT_FAILURE); - - if (dup3(nullfd, STDIN_FILENO, 0) < 0) - _exit(EXIT_FAILURE); - - if (dup3(nullfd, STDERR_FILENO, 0) < 0) - _exit(EXIT_FAILURE); - - if (nullfd > 2) - safe_close(nullfd); - - (void) reset_all_signal_handlers(); - (void) reset_signal_mask(); - close_all_fds(NULL, 0); - - execle("/usr/bin/getent", "getent", database, key, NULL, &empty_env); - execle("/bin/getent", "getent", database, key, NULL, &empty_env); - _exit(EXIT_FAILURE); - } - - pipe_fds[1] = safe_close(pipe_fds[1]); - - *rpid = pid; - - return pipe_fds[0]; -} - -static int change_uid_gid(char **_home) { - char line[LINE_MAX], *x, *u, *g, *h; - const char *word, *state; - _cleanup_free_ uid_t *uids = NULL; - _cleanup_free_ char *home = NULL; - _cleanup_fclose_ FILE *f = NULL; - _cleanup_close_ int fd = -1; - unsigned n_uids = 0; - size_t sz = 0, l; - uid_t uid; - gid_t gid; - pid_t pid; - int r; - - assert(_home); - - if (!arg_user || streq(arg_user, "root") || streq(arg_user, "0")) { - /* Reset everything fully to 0, just in case */ - - r = reset_uid_gid(); - if (r < 0) - return log_error_errno(r, "Failed to become root: %m"); - - *_home = NULL; - return 0; - } - - /* First, get user credentials */ - fd = spawn_getent("passwd", arg_user, &pid); - if (fd < 0) - return fd; - - f = fdopen(fd, "r"); - if (!f) - return log_oom(); - fd = -1; - - if (!fgets(line, sizeof(line), f)) { - - if (!ferror(f)) { - log_error("Failed to resolve user %s.", arg_user); - return -ESRCH; - } - - log_error_errno(errno, "Failed to read from getent: %m"); - return -errno; - } - - truncate_nl(line); - - wait_for_terminate_and_warn("getent passwd", pid, true); - - x = strchr(line, ':'); - if (!x) { - log_error("/etc/passwd entry has invalid user field."); - return -EIO; - } - - u = strchr(x+1, ':'); - if (!u) { - log_error("/etc/passwd entry has invalid password field."); - return -EIO; - } - - u++; - g = strchr(u, ':'); - if (!g) { - log_error("/etc/passwd entry has invalid UID field."); - return -EIO; - } - - *g = 0; - g++; - x = strchr(g, ':'); - if (!x) { - log_error("/etc/passwd entry has invalid GID field."); - return -EIO; - } - - *x = 0; - h = strchr(x+1, ':'); - if (!h) { - log_error("/etc/passwd entry has invalid GECOS field."); - return -EIO; - } - - h++; - x = strchr(h, ':'); - if (!x) { - log_error("/etc/passwd entry has invalid home directory field."); - return -EIO; - } - - *x = 0; - - r = parse_uid(u, &uid); - if (r < 0) { - log_error("Failed to parse UID of user."); - return -EIO; - } - - r = parse_gid(g, &gid); - if (r < 0) { - log_error("Failed to parse GID of user."); - return -EIO; - } - - home = strdup(h); - if (!home) - return log_oom(); - - /* Second, get group memberships */ - fd = spawn_getent("initgroups", arg_user, &pid); - if (fd < 0) - return fd; - - fclose(f); - f = fdopen(fd, "r"); - if (!f) - return log_oom(); - fd = -1; - - if (!fgets(line, sizeof(line), f)) { - if (!ferror(f)) { - log_error("Failed to resolve user %s.", arg_user); - return -ESRCH; - } - - log_error_errno(errno, "Failed to read from getent: %m"); - return -errno; - } - - truncate_nl(line); - - wait_for_terminate_and_warn("getent initgroups", pid, true); - - /* Skip over the username and subsequent separator whitespace */ - x = line; - x += strcspn(x, WHITESPACE); - x += strspn(x, WHITESPACE); - - FOREACH_WORD(word, l, x, state) { - char c[l+1]; - - memcpy(c, word, l); - c[l] = 0; - - if (!GREEDY_REALLOC(uids, sz, n_uids+1)) - return log_oom(); - - r = parse_uid(c, &uids[n_uids++]); - if (r < 0) { - log_error("Failed to parse group data from getent."); - return -EIO; - } - } - - r = mkdir_parents(home, 0775); - if (r < 0) - return log_error_errno(r, "Failed to make home root directory: %m"); - - r = mkdir_safe(home, 0755, uid, gid); - if (r < 0 && r != -EEXIST) - return log_error_errno(r, "Failed to make home directory: %m"); - - (void) fchown(STDIN_FILENO, uid, gid); - (void) fchown(STDOUT_FILENO, uid, gid); - (void) fchown(STDERR_FILENO, uid, gid); - - if (setgroups(n_uids, uids) < 0) - return log_error_errno(errno, "Failed to set auxiliary groups: %m"); - - if (setresgid(gid, gid, gid) < 0) - return log_error_errno(errno, "setregid() failed: %m"); - - if (setresuid(uid, uid, uid) < 0) - return log_error_errno(errno, "setreuid() failed: %m"); - - if (_home) { - *_home = home; - home = NULL; - } - - return 0; -} - /* * Return values: * < 0 : wait_for_terminate() failed to get the state of the @@ -4118,8 +2279,6 @@ static int wait_for_container(pid_t pid, ContainerStatus *container) { return r; } -static void nop_handler(int sig) {} - static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) { pid_t pid; @@ -4254,9 +2413,7 @@ static int inner_child( bool secondary, int kmsg_socket, int rtnl_socket, - FDSet *fds, - int argc, - char *argv[]) { + FDSet *fds) { _cleanup_free_ char *home = NULL; unsigned n_env = 2; @@ -4293,7 +2450,11 @@ static int inner_child( } } - r = mount_all(NULL, true); + r = mount_all(NULL, arg_userns, true, arg_uid_shift, arg_private_network, arg_uid_range, arg_selinux_apifs_context); + if (r < 0) + return r; + + r = mount_sysfs(NULL); if (r < 0) return r; @@ -4304,7 +2465,7 @@ static int inner_child( return -ESRCH; } - r = mount_systemd_cgroup_writable(""); + r = mount_systemd_cgroup_writable("", arg_unified_cgroup_hierarchy); if (r < 0) return r; @@ -4329,10 +2490,12 @@ static int inner_child( if (arg_private_network) loopback_setup(); - r = send_rtnl(rtnl_socket); - if (r < 0) - return r; - rtnl_socket = safe_close(rtnl_socket); + if (arg_expose_ports) { + r = expose_port_send_rtnl(rtnl_socket); + if (r < 0) + return r; + rtnl_socket = safe_close(rtnl_socket); + } if (drop_capabilities() < 0) return log_error_errno(errno, "drop_capabilities() failed: %m"); @@ -4353,7 +2516,7 @@ static int inner_child( return log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context); #endif - r = change_uid_gid(&home); + r = change_uid_gid(arg_user, &home); if (r < 0) return r; @@ -4412,9 +2575,12 @@ static int inner_child( /* Automatically search for the init system */ - m = 1 + argc - optind; + m = 1 + strv_length(arg_parameters); a = newa(char*, m + 1); - memcpy(a + 1, argv + optind, m * sizeof(char*)); + if (strv_isempty(arg_parameters)) + a[1] = NULL; + else + memcpy(a + 1, arg_parameters, m * sizeof(char*)); a[0] = (char*) "/usr/lib/systemd/systemd"; execve(a[0], a, env_use); @@ -4424,10 +2590,10 @@ static int inner_child( a[0] = (char*) "/sbin/init"; execve(a[0], a, env_use); - } else if (argc > optind) - execvpe(argv[optind], argv + optind, env_use); + } else if (!strv_isempty(arg_parameters)) + execvpe(arg_parameters[0], arg_parameters, env_use); else { - chdir(home ? home : "/root"); + chdir(home ?: "/root"); execle("/bin/bash", "-bash", NULL, env_use); execle("/bin/sh", "-sh", NULL, env_use); } @@ -4449,9 +2615,7 @@ static int outer_child( int kmsg_socket, int rtnl_socket, int uid_shift_socket, - FDSet *fds, - int argc, - char *argv[]) { + FDSet *fds) { pid_t pid; ssize_t l; @@ -4523,11 +2687,11 @@ static int outer_child( if (mount(directory, directory, NULL, MS_BIND|MS_REC, NULL) < 0) return log_error_errno(errno, "Failed to make bind mount: %m"); - r = setup_volatile(directory); + r = setup_volatile(directory, arg_volatile_mode, arg_userns, arg_uid_shift, arg_uid_range, arg_selinux_context); if (r < 0) return r; - r = setup_volatile_state(directory); + r = setup_volatile_state(directory, arg_volatile_mode, arg_userns, arg_uid_shift, arg_uid_range, arg_selinux_context); if (r < 0) return r; @@ -4541,16 +2705,18 @@ static int outer_child( return log_error_errno(r, "Failed to make tree read-only: %m"); } - r = mount_all(directory, false); + r = mount_all(directory, arg_userns, false, arg_private_network, arg_uid_shift, arg_uid_range, arg_selinux_apifs_context); if (r < 0) return r; - if (copy_devnodes(directory) < 0) + r = copy_devnodes(directory); + if (r < 0) return r; dev_setup(directory, arg_uid_shift, arg_uid_shift); - if (setup_pts(directory) < 0) + r = setup_pts(directory); + if (r < 0) return r; r = setup_propagate(directory); @@ -4577,11 +2743,11 @@ static int outer_child( if (r < 0) return r; - r = mount_custom(directory); + r = mount_custom(directory, arg_custom_mounts, arg_n_custom_mounts, arg_userns, arg_uid_shift, arg_uid_range, arg_selinux_apifs_context); if (r < 0) return r; - r = mount_cgroups(directory); + r = mount_cgroups(directory, arg_unified_cgroup_hierarchy, arg_userns, arg_uid_shift, arg_uid_range, arg_selinux_apifs_context); if (r < 0) return r; @@ -4604,7 +2770,7 @@ static int outer_child( * requested, so that we all are owned by the user if * user namespaces are turned on. */ - r = inner_child(barrier, directory, secondary, kmsg_socket, rtnl_socket, fds, argc, argv); + r = inner_child(barrier, directory, secondary, kmsg_socket, rtnl_socket, fds); if (r < 0) _exit(EXIT_FAILURE); @@ -4620,6 +2786,8 @@ static int outer_child( } pid_socket = safe_close(pid_socket); + kmsg_socket = safe_close(kmsg_socket); + rtnl_socket = safe_close(rtnl_socket); return 0; } @@ -4645,130 +2813,203 @@ static int setup_uid_map(pid_t pid) { return 0; } -static int chown_cgroup(pid_t pid) { - _cleanup_free_ char *path = NULL, *fs = NULL; - _cleanup_close_ int fd = -1; - const char *fn; +static int load_settings(void) { + _cleanup_(settings_freep) Settings *settings = NULL; + _cleanup_fclose_ FILE *f = NULL; + _cleanup_free_ char *p = NULL; + const char *fn, *i; int r; - r = cg_pid_get_path(NULL, pid, &path); - if (r < 0) - return log_error_errno(r, "Failed to get container cgroup path: %m"); + /* If all settings are masked, there's no point in looking for + * the settings file */ + if ((arg_settings_mask & _SETTINGS_MASK_ALL) == _SETTINGS_MASK_ALL) + return 0; - r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, path, NULL, &fs); - if (r < 0) - return log_error_errno(r, "Failed to get file system path for container cgroup: %m"); + fn = strjoina(arg_machine, ".nspawn"); - fd = open(fs, O_RDONLY|O_CLOEXEC|O_DIRECTORY); - if (fd < 0) - return log_error_errno(errno, "Failed to open %s: %m", fs); - - FOREACH_STRING(fn, - ".", - "tasks", - "notify_on_release", - "cgroup.procs", - "cgroup.clone_children", - "cgroup.controllers", - "cgroup.subtree_control", - "cgroup.populated") - if (fchownat(fd, fn, arg_uid_shift, arg_uid_shift, 0) < 0) - log_full_errno(errno == ENOENT ? LOG_DEBUG : LOG_WARNING, errno, - "Failed to chown() cgroup file %s, ignoring: %m", fn); + /* We first look in the admin's directories in /etc and /run */ + FOREACH_STRING(i, "/etc/systemd/nspawn", "/run/systemd/nspawn") { + _cleanup_free_ char *j = NULL; - return 0; -} + j = strjoin(i, "/", fn, NULL); + if (!j) + return log_oom(); + + f = fopen(j, "re"); + if (f) { + p = j; + j = NULL; + + /* By default we trust configuration from /etc and /run */ + if (arg_settings_trusted < 0) + arg_settings_trusted = true; + + break; + } -static int sync_cgroup(pid_t pid) { - _cleanup_free_ char *cgroup = NULL; - char tree[] = "/tmp/unifiedXXXXXX", pid_string[DECIMAL_STR_MAX(pid) + 1]; - bool undo_mount = false; - const char *fn; - int unified, r; + if (errno != ENOENT) + return log_error_errno(errno, "Failed to open %s: %m", j); + } - unified = cg_unified(); - if (unified < 0) - return log_error_errno(unified, "Failed to determine whether the unified hierachy is used: %m"); + if (!f) { + /* After that, let's look for a file next to the + * actual image we shall boot. */ - if ((unified > 0) == arg_unified_cgroup_hierarchy) + if (arg_image) { + p = file_in_same_dir(arg_image, fn); + if (!p) + return log_oom(); + } else if (arg_directory) { + p = file_in_same_dir(arg_directory, fn); + if (!p) + return log_oom(); + } + + if (p) { + f = fopen(p, "re"); + if (!f && errno != ENOENT) + return log_error_errno(errno, "Failed to open %s: %m", p); + + /* By default we do not trust configuration from /var/lib/machines */ + if (arg_settings_trusted < 0) + arg_settings_trusted = false; + } + } + + if (!f) return 0; - /* When the host uses the legacy cgroup setup, but the - * container shall use the unified hierarchy, let's make sure - * we copy the path from the name=systemd hierarchy into the - * unified hierarchy. Similar for the reverse situation. */ + log_debug("Settings are trusted: %s", yes_no(arg_settings_trusted)); - r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, pid, &cgroup); + r = settings_load(f, p, &settings); if (r < 0) - return log_error_errno(r, "Failed to get control group of " PID_FMT ": %m", pid); + return r; - /* In order to access the unified hierarchy we need to mount it */ - if (!mkdtemp(tree)) - return log_error_errno(errno, "Failed to generate temporary mount point for unified hierarchy: %m"); + /* Copy over bits from the settings, unless they have been + * explicitly masked by command line switches. */ - if (unified) - r = mount("cgroup", tree, "cgroup", MS_NOSUID|MS_NOEXEC|MS_NODEV, "none,name=systemd,xattr"); - else - r = mount("cgroup", tree, "cgroup", MS_NOSUID|MS_NOEXEC|MS_NODEV, "__DEVEL__sane_behavior"); - if (r < 0) { - r = log_error_errno(errno, "Failed to mount unified hierarchy: %m"); - goto finish; + if ((arg_settings_mask & SETTING_BOOT) == 0 && + settings->boot >= 0) { + arg_boot = settings->boot; + + strv_free(arg_parameters); + arg_parameters = settings->parameters; + settings->parameters = NULL; + } + + if ((arg_settings_mask & SETTING_ENVIRONMENT) == 0 && + settings->environment) { + strv_free(arg_setenv); + arg_setenv = settings->environment; + settings->environment = NULL; } - undo_mount = true; + if ((arg_settings_mask & SETTING_USER) == 0 && + settings->user) { + free(arg_user); + arg_user = settings->user; + settings->user = NULL; + } - fn = strjoina(tree, cgroup, "/cgroup.procs"); - (void) mkdir_parents(fn, 0755); + if ((arg_settings_mask & SETTING_CAPABILITY) == 0) { + uint64_t plus; - sprintf(pid_string, PID_FMT, pid); - r = write_string_file(fn, pid_string, 0); - if (r < 0) - log_error_errno(r, "Failed to move process: %m"); + plus = settings->capability; + if (settings_private_network(settings)) + plus |= (1ULL << CAP_NET_ADMIN); -finish: - if (undo_mount) - (void) umount(tree); + if (!arg_settings_trusted && plus != 0) { + if (settings->capability != 0) + log_warning("Ignoring Capability= setting, file %s is not trusted.", p); + } else + arg_retain |= plus; - (void) rmdir(tree); - return r; -} + arg_retain &= ~settings->drop_capability; + } -static int create_subcgroup(pid_t pid) { - _cleanup_free_ char *cgroup = NULL; - const char *child; - int unified, r; + if ((arg_settings_mask & SETTING_KILL_SIGNAL) == 0 && + settings->kill_signal > 0) + arg_kill_signal = settings->kill_signal; - /* In the unified hierarchy inner nodes may only only contain - * subgroups, but not processes. Hence, if we running in the - * unified hierarchy and the container does the same, and we - * did not create a scope unit for the container move us and - * the container into two separate subcgroups. */ + if ((arg_settings_mask & SETTING_PERSONALITY) == 0 && + settings->personality != PERSONALITY_INVALID) + arg_personality = settings->personality; - if (!arg_keep_unit) - return 0; + if ((arg_settings_mask & SETTING_MACHINE_ID) == 0 && + !sd_id128_is_null(settings->machine_id)) { - if (!arg_unified_cgroup_hierarchy) - return 0; + if (!arg_settings_trusted) + log_warning("Ignoring MachineID= setting, file %s is not trusted.", p); + else + arg_uuid = settings->machine_id; + } - unified = cg_unified(); - if (unified < 0) - return log_error_errno(unified, "Failed to determine whether the unified hierachy is used: %m"); - if (unified == 0) - return 0; + if ((arg_settings_mask & SETTING_READ_ONLY) == 0 && + settings->read_only >= 0) + arg_read_only = settings->read_only; - r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 0, &cgroup); - if (r < 0) - return log_error_errno(r, "Failed to get our control group: %m"); + if ((arg_settings_mask & SETTING_VOLATILE_MODE) == 0 && + settings->volatile_mode != _VOLATILE_MODE_INVALID) + arg_volatile_mode = settings->volatile_mode; - child = strjoina(cgroup, "/payload"); - r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, child, pid); - if (r < 0) - return log_error_errno(r, "Failed to create %s subcgroup: %m", child); + if ((arg_settings_mask & SETTING_CUSTOM_MOUNTS) == 0 && + settings->n_custom_mounts > 0) { - child = strjoina(cgroup, "/supervisor"); - r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, child, 0); - if (r < 0) - return log_error_errno(r, "Failed to create %s subcgroup: %m", child); + if (!arg_settings_trusted) + log_warning("Ignoring TemporaryFileSystem=, Bind= and BindReadOnly= settings, file %s is not trusted.", p); + else { + custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts); + arg_custom_mounts = settings->custom_mounts; + arg_n_custom_mounts = settings->n_custom_mounts; + + settings->custom_mounts = NULL; + settings->n_custom_mounts = 0; + } + } + + if ((arg_settings_mask & SETTING_NETWORK) == 0 && + (settings->private_network >= 0 || + settings->network_veth >= 0 || + settings->network_bridge || + settings->network_interfaces || + settings->network_macvlan || + settings->network_ipvlan)) { + + if (!arg_settings_trusted) + log_warning("Ignoring network settings, file %s is not trusted.", p); + else { + arg_network_veth = settings_private_network(settings); + arg_private_network = settings_private_network(settings); + + strv_free(arg_network_interfaces); + arg_network_interfaces = settings->network_interfaces; + settings->network_interfaces = NULL; + + strv_free(arg_network_macvlan); + arg_network_macvlan = settings->network_macvlan; + settings->network_macvlan = NULL; + + strv_free(arg_network_ipvlan); + arg_network_ipvlan = settings->network_ipvlan; + settings->network_ipvlan = NULL; + + free(arg_network_bridge); + arg_network_bridge = settings->network_bridge; + settings->network_bridge = NULL; + } + } + + if ((arg_settings_mask & SETTING_EXPOSE_PORTS) == 0 && + settings->expose_ports) { + + if (!arg_settings_trusted) + log_warning("Ignoring Port= setting, file %s is not trusted.", p); + else { + expose_port_free_all(arg_expose_ports); + arg_expose_ports = settings->expose_ports; + settings->expose_ports = NULL; + } + } return 0; } @@ -4796,15 +3037,22 @@ int main(int argc, char *argv[]) { if (r <= 0) goto finish; - r = determine_names(); - if (r < 0) - goto finish; - if (geteuid() != 0) { log_error("Need to be root."); r = -EPERM; goto finish; } + r = determine_names(); + if (r < 0) + goto finish; + + r = load_settings(); + if (r < 0) + goto finish; + + r = verify_arguments(); + if (r < 0) + goto finish; n_fd_passed = sd_listen_fds(false); if (n_fd_passed > 0) { @@ -4853,7 +3101,7 @@ int main(int argc, char *argv[]) { goto finish; } - r = btrfs_subvol_snapshot(arg_directory, np, (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) | BTRFS_SNAPSHOT_FALLBACK_COPY | BTRFS_SNAPSHOT_RECURSIVE); + r = btrfs_subvol_snapshot(arg_directory, np, (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) | BTRFS_SNAPSHOT_FALLBACK_COPY | BTRFS_SNAPSHOT_RECURSIVE | BTRFS_SNAPSHOT_QUOTA); if (r < 0) { log_error_errno(r, "Failed to create snapshot %s from %s: %m", np, arg_directory); goto finish; @@ -4877,7 +3125,7 @@ int main(int argc, char *argv[]) { } if (arg_template) { - r = btrfs_subvol_snapshot(arg_template, arg_directory, (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) | BTRFS_SNAPSHOT_FALLBACK_COPY | BTRFS_SNAPSHOT_RECURSIVE); + r = btrfs_subvol_snapshot(arg_template, arg_directory, (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) | BTRFS_SNAPSHOT_FALLBACK_COPY | BTRFS_SNAPSHOT_RECURSIVE | BTRFS_SNAPSHOT_QUOTA); if (r == -EEXIST) { if (!arg_quiet) log_info("Directory %s already exists, not populating from template %s.", arg_directory, arg_template); @@ -4900,10 +3148,9 @@ int main(int argc, char *argv[]) { } else { const char *p; - p = strjoina(arg_directory, - argc > optind && path_is_absolute(argv[optind]) ? argv[optind] : "/usr/bin/"); - if (access(p, F_OK) < 0) { - log_error("Directory %s lacks the binary to execute or doesn't look like a binary tree. Refusing.", arg_directory); + p = strjoina(arg_directory, "/usr/"); + if (laccess(p, F_OK) < 0) { + log_error("Directory %s doesn't look like it has an OS tree. Refusing.", arg_directory); r = -EINVAL; goto finish; } @@ -4997,7 +3244,7 @@ int main(int argc, char *argv[]) { ContainerStatus container_status; _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL; static const struct sigaction sa = { - .sa_handler = nop_handler, + .sa_handler = nop_signal_handler, .sa_flags = SA_NOCLDSTOP, }; int ifi = 0; @@ -5013,23 +3260,23 @@ int main(int argc, char *argv[]) { goto finish; } - if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) { + if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) { r = log_error_errno(errno, "Failed to create kmsg socket pair: %m"); goto finish; } - if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, rtnl_socket_pair) < 0) { + if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, rtnl_socket_pair) < 0) { r = log_error_errno(errno, "Failed to create rtnl socket pair: %m"); goto finish; } - if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, pid_socket_pair) < 0) { + if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, pid_socket_pair) < 0) { r = log_error_errno(errno, "Failed to create pid socket pair: %m"); goto finish; } if (arg_userns) - if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, uid_shift_socket_pair) < 0) { + if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, uid_shift_socket_pair) < 0) { r = log_error_errno(errno, "Failed to create uid shift socket pair: %m"); goto finish; } @@ -5085,8 +3332,7 @@ int main(int argc, char *argv[]) { kmsg_socket_pair[1], rtnl_socket_pair[1], uid_shift_socket_pair[1], - fds, - argc, argv); + fds); if (r < 0) _exit(EXIT_FAILURE); @@ -5095,12 +3341,12 @@ int main(int argc, char *argv[]) { barrier_set_role(&barrier, BARRIER_PARENT); - fdset_free(fds); - fds = NULL; + fds = fdset_free(fds); kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]); rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]); pid_socket_pair[1] = safe_close(pid_socket_pair[1]); + uid_shift_socket_pair[1] = safe_close(uid_shift_socket_pair[1]); /* Wait for the outer child. */ r = wait_for_terminate_and_warn("namespace helper", pid, NULL); @@ -5119,7 +3365,7 @@ int main(int argc, char *argv[]) { goto finish; } if (l != sizeof(pid)) { - log_error("Short read while reading inner child PID: %m"); + log_error("Short read while reading inner child PID."); r = EIO; goto finish; } @@ -5139,7 +3385,7 @@ int main(int argc, char *argv[]) { goto finish; } if (l != sizeof(arg_uid_shift)) { - log_error("Short read while reading UID shift: %m"); + log_error("Short read while reading UID shift."); r = EIO; goto finish; } @@ -5151,39 +3397,64 @@ int main(int argc, char *argv[]) { (void) barrier_place(&barrier); /* #2 */ } - r = move_network_interfaces(pid); - if (r < 0) - goto finish; + if (arg_private_network) { - r = setup_veth(pid, veth_name, &ifi); - if (r < 0) - goto finish; + r = move_network_interfaces(pid, arg_network_interfaces); + if (r < 0) + goto finish; - r = setup_bridge(veth_name, &ifi); - if (r < 0) - goto finish; + if (arg_network_veth) { + r = setup_veth(arg_machine, pid, veth_name, !!arg_network_bridge); + if (r < 0) + goto finish; + else if (r > 0) + ifi = r; - r = setup_macvlan(pid); - if (r < 0) - goto finish; + if (arg_network_bridge) { + r = setup_bridge(veth_name, arg_network_bridge); + if (r < 0) + goto finish; + if (r > 0) + ifi = r; + } + } - r = setup_ipvlan(pid); - if (r < 0) - goto finish; + r = setup_macvlan(arg_machine, pid, arg_network_macvlan); + if (r < 0) + goto finish; - r = register_machine(pid, ifi); - if (r < 0) - goto finish; + r = setup_ipvlan(arg_machine, pid, arg_network_ipvlan); + if (r < 0) + goto finish; + } - r = sync_cgroup(pid); - if (r < 0) - goto finish; + if (arg_register) { + r = register_machine( + arg_machine, + pid, + arg_directory, + arg_uuid, + ifi, + arg_slice, + arg_custom_mounts, arg_n_custom_mounts, + arg_kill_signal, + arg_property, + arg_keep_unit); + if (r < 0) + goto finish; + } - r = create_subcgroup(pid); + r = sync_cgroup(pid, arg_unified_cgroup_hierarchy); if (r < 0) goto finish; - r = chown_cgroup(pid); + if (arg_keep_unit) { + r = create_subcgroup(pid, arg_unified_cgroup_hierarchy); + if (r < 0) + goto finish; + } + + r = chown_cgroup(pid, arg_uid_shift); if (r < 0) goto finish; @@ -5205,8 +3476,8 @@ int main(int argc, char *argv[]) { } /* Let the child know that we are ready and wait that the child is completely ready now. */ - if (!barrier_place_and_sync(&barrier)) { /* #5 */ - log_error("Client died too early."); + if (!barrier_place_and_sync(&barrier)) { /* #4 */ + log_error("Child died too early."); r = -ESRCH; goto finish; } @@ -5236,16 +3507,16 @@ int main(int argc, char *argv[]) { sd_event_add_signal(event, NULL, SIGCHLD, NULL, NULL); if (arg_expose_ports) { - r = watch_rtnl(event, rtnl_socket_pair[0], &exposed, &rtnl); + r = expose_port_watch_rtnl(event, rtnl_socket_pair[0], on_address_change, &exposed, &rtnl); if (r < 0) goto finish; - (void) expose_ports(rtnl, &exposed); + (void) expose_port_execute(rtnl, arg_expose_ports, &exposed); } rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]); - r = pty_forward_new(event, master, true, !interactive, &forward); + r = pty_forward_new(event, master, PTY_FORWARD_IGNORE_VHANGUP | (interactive ? 0 : PTY_FORWARD_READ_ONLY), &forward); if (r < 0) { log_error_errno(r, "Failed to create PTY forwarder: %m"); goto finish; @@ -5265,7 +3536,8 @@ int main(int argc, char *argv[]) { putc('\n', stdout); /* Kill if it is not dead yet anyway */ - terminate_machine(pid); + if (arg_register && !arg_keep_unit) + terminate_machine(pid); /* Normally redundant, but better safe than sorry */ kill(pid, SIGKILL); @@ -5303,7 +3575,7 @@ int main(int argc, char *argv[]) { break; } - flush_ports(&exposed); + expose_port_flush(arg_expose_ports, &exposed); } finish: @@ -5316,14 +3588,14 @@ finish: /* Try to flush whatever is still queued in the pty */ if (master >= 0) - (void) copy_bytes(master, STDOUT_FILENO, (off_t) -1, false); + (void) copy_bytes(master, STDOUT_FILENO, (uint64_t) -1, false); loop_remove(loop_nr, &image_fd); if (remove_subvol && arg_directory) { int k; - k = btrfs_subvol_remove(arg_directory, true); + k = btrfs_subvol_remove(arg_directory, BTRFS_REMOVE_RECURSIVE|BTRFS_REMOVE_QUOTA); if (k < 0) log_warning_errno(k, "Cannot remove subvolume '%s', ignoring: %m", arg_directory); } @@ -5335,24 +3607,21 @@ finish: (void) rm_rf(p, REMOVE_ROOT); } + expose_port_flush(arg_expose_ports, &exposed); + free(arg_directory); free(arg_template); free(arg_image); free(arg_machine); free(arg_user); strv_free(arg_setenv); + free(arg_network_bridge); strv_free(arg_network_interfaces); strv_free(arg_network_macvlan); strv_free(arg_network_ipvlan); - custom_mount_free_all(); - - flush_ports(&exposed); - - while (arg_expose_ports) { - ExposePort *p = arg_expose_ports; - LIST_REMOVE(ports, arg_expose_ports, p); - free(p); - } + strv_free(arg_parameters); + custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts); + expose_port_free_all(arg_expose_ports); return r < 0 ? EXIT_FAILURE : ret; } |