diff options
-rw-r--r-- | man/bootup.xml | 7 | ||||
-rw-r--r-- | man/machinectl.xml | 6 | ||||
-rw-r--r-- | man/systemd-nspawn.xml | 56 | ||||
-rw-r--r-- | man/systemd.exec.xml | 4 | ||||
-rw-r--r-- | src/basic/btrfs-util.c | 39 | ||||
-rw-r--r-- | src/basic/btrfs-util.h | 4 | ||||
-rw-r--r-- | src/basic/missing.h | 4 | ||||
-rw-r--r-- | src/import/pull-common.c | 12 | ||||
-rw-r--r-- | src/libsystemd/sd-event/sd-event.c | 3 | ||||
-rw-r--r-- | src/libsystemd/sd-event/test-event.c | 3 | ||||
-rw-r--r-- | src/nspawn/nspawn-mount.c | 24 | ||||
-rw-r--r-- | src/nspawn/nspawn.c | 123 | ||||
-rw-r--r-- | src/shared/machine-image.c | 39 | ||||
-rw-r--r-- | src/shared/seccomp-util.c | 72 | ||||
-rw-r--r-- | src/shared/seccomp-util.h | 1 |
15 files changed, 282 insertions, 115 deletions
diff --git a/man/bootup.xml b/man/bootup.xml index 986996398c..b92c60f43a 100644 --- a/man/bootup.xml +++ b/man/bootup.xml @@ -179,6 +179,13 @@ identical to the system manager bootup (see above) until it reaches <filename>basic.target</filename>. From there, systemd approaches the special target <filename>initrd.target</filename>. + + Before any file systems are mounted, it must be determined whether + the system will resume from hibernation or proceed with normal boot. + This is accomplished by <filename>systemd-hibernate-resume@.service</filename> + which must be finished before <filename>local-fs-pre.target</filename>, + so no file systems can be mounted before the check is complete. + When the root device becomes available, <filename>initrd-root-device.target</filename> is reached. If the root device can be mounted at diff --git a/man/machinectl.xml b/man/machinectl.xml index 5a6ec294d2..81192417d8 100644 --- a/man/machinectl.xml +++ b/man/machinectl.xml @@ -599,8 +599,8 @@ <listitem><para>Clones a container or VM image. The arguments specify the name of the image to clone and the name of the newly cloned image. Note that plain directory container images are cloned into btrfs subvolume images with this command, if the underlying file system supports this. Note that cloning a container or VM - image is optimized for btrfs file systems, and might not be efficient on others, due to file system - limitations.</para> + image is optimized for file systems that support copy-on-write, and might not be efficient on others, due to + file system limitations.</para> <para>Note that this command leaves host name, machine ID and all other settings that could identify the instance @@ -910,7 +910,7 @@ <filename>/var/lib/machines/</filename> to make them available for control with <command>machinectl</command>.</para> - <para>Note that many image operations are only supported, + <para>Note that some image operations are only supported, efficient or atomic on btrfs file systems.
Due to this, if the <command>pull-tar</command>, <command>pull-raw</command>, <command>import-tar</command>, <command>import-raw</command> and diff --git a/man/systemd-nspawn.xml b/man/systemd-nspawn.xml index f153034296..dbbf9890c8 100644 --- a/man/systemd-nspawn.xml +++ b/man/systemd-nspawn.xml @@ -181,25 +181,15 @@ <varlistentry> <term><option>--template=</option></term> - <listitem><para>Directory or <literal>btrfs</literal> - subvolume to use as template for the container's root - directory. If this is specified and the container's root - directory (as configured by <option>--directory=</option>) - does not yet exist it is created as <literal>btrfs</literal> - subvolume and populated from this template tree. Ideally, the - specified template path refers to the root of a - <literal>btrfs</literal> subvolume, in which case a simple - copy-on-write snapshot is taken, and populating the root - directory is instant. If the specified template path does not - refer to the root of a <literal>btrfs</literal> subvolume (or - not even to a <literal>btrfs</literal> file system at all), - the tree is copied, which can be substantially more - time-consuming. Note that if this option is used the - container's root directory (in contrast to the template - directory!) must be located on a <literal>btrfs</literal> file - system, so that the <literal>btrfs</literal> subvolume may be - created. May not be specified together with - <option>--image=</option> or + <listitem><para>Directory or <literal>btrfs</literal> subvolume to use as template for the container's root + directory. If this is specified and the container's root directory (as configured by + <option>--directory=</option>) does not yet exist it is created as <literal>btrfs</literal> snapshot (if + supported) or plain directory (otherwise) and populated from this template tree. 
Ideally, the specified + template path refers to the root of a <literal>btrfs</literal> subvolume, in which case a simple copy-on-write + snapshot is taken, and populating the root directory is instant. If the specified template path does not refer + to the root of a <literal>btrfs</literal> subvolume (or not even to a <literal>btrfs</literal> file system at + all), the tree is copied (though possibly in a copy-on-write scheme — if the file system supports that), which + can be substantially more time-consuming. May not be specified together with <option>--image=</option> or <option>--ephemeral</option>.</para> <para>Note that this switch leaves host name, machine ID and @@ -211,13 +201,8 @@ <term><option>-x</option></term> <term><option>--ephemeral</option></term> - <listitem><para>If specified, the container is run with a - temporary <literal>btrfs</literal> snapshot of its root - directory (as configured with <option>--directory=</option>), - that is removed immediately when the container terminates. - This option is only supported if the root file system is - <literal>btrfs</literal>. May not be specified together with - <option>--image=</option> or + <listitem><para>If specified, the container is run with a temporary snapshot of its file system that is removed + immediately when the container terminates. May not be specified together with <option>--template=</option>.</para> <para>Note that this switch leaves host name, machine ID and all other settings that could identify the instance @@ -252,11 +237,12 @@ Partitions Specification</ulink>.</para></listitem> </itemizedlist> - <para>Any other partitions, such as foreign partitions, swap - partitions or EFI system partitions are not mounted. 
May not - be specified together with <option>--directory=</option>, - <option>--template=</option> or - <option>--ephemeral</option>.</para></listitem> + <para>On GPT images, if an EFI System Partition (ESP) is discovered, it is automatically mounted to + <filename>/efi</filename> (or <filename>/boot</filename> as fallback) in case a directory by this name exists + and is empty.</para> + + <para>Any other partitions, such as foreign partitions or swap partitions, are not mounted. May not be specified + together with <option>--directory=</option> or <option>--template=</option>.</para></listitem> </varlistentry> <varlistentry> @@ -1056,14 +1042,12 @@ </example> <example> - <title>Boot into an ephemeral <literal>btrfs</literal> snapshot of the host system</title> + <title>Boot into an ephemeral snapshot of the host system</title> <programlisting># systemd-nspawn -D / -xb</programlisting> - <para>This runs a copy of the host system in a - <literal>btrfs</literal> snapshot which is removed immediately - when the container exits. All file system changes made during - runtime will be lost on shutdown, hence.</para> + <para>This runs a copy of the host system in a snapshot which is removed immediately when the container + exits.
All file system changes made during runtime will be lost on shutdown, hence.</para> </example> <example> diff --git a/man/systemd.exec.xml b/man/systemd.exec.xml index 2ea4a53d18..03e55a7aff 100644 --- a/man/systemd.exec.xml +++ b/man/systemd.exec.xml @@ -1356,6 +1356,10 @@ <entry>Debugging, performance monitoring and tracing functionality (<citerefentry project='man-pages'><refentrytitle>ptrace</refentrytitle><manvolnum>2</manvolnum></citerefentry>, <citerefentry project='man-pages'><refentrytitle>perf_event_open</refentrytitle><manvolnum>2</manvolnum></citerefentry> and related calls)</entry> </row> <row> + <entry>@file-system</entry> + <entry>File system operations: opening, creating files and directories for read and write, renaming and removing them, reading file properties, or creating hard and symbolic links.</entry> + </row> + <row> <entry>@io-event</entry> <entry>Event loop system calls (<citerefentry project='man-pages'><refentrytitle>poll</refentrytitle><manvolnum>2</manvolnum></citerefentry>, <citerefentry project='man-pages'><refentrytitle>select</refentrytitle><manvolnum>2</manvolnum></citerefentry>, <citerefentry project='man-pages'><refentrytitle>epoll</refentrytitle><manvolnum>7</manvolnum></citerefentry>, <citerefentry project='man-pages'><refentrytitle>eventfd</refentrytitle><manvolnum>2</manvolnum></citerefentry> and related calls)</entry> </row> diff --git a/src/basic/btrfs-util.c b/src/basic/btrfs-util.c index 656bb13719..5f9e21dcba 100644 --- a/src/basic/btrfs-util.c +++ b/src/basic/btrfs-util.c @@ -20,6 +20,7 @@ #include <errno.h> #include <fcntl.h> #include <inttypes.h> +#include <linux/fs.h> #include <linux/loop.h> #include <stddef.h> #include <stdio.h> @@ -38,6 +39,7 @@ #include "alloc-util.h" #include "btrfs-ctree.h" #include "btrfs-util.h" +#include "chattr-util.h" #include "copy.h" #include "fd-util.h" #include "fileio.h" @@ -45,6 +47,7 @@ #include "macro.h" #include "missing.h" #include "path-util.h" +#include "rm-rf.h" #include 
"selinux-util.h" #include "smack-util.h" #include "sparse-endian.h" @@ -1718,28 +1721,46 @@ int btrfs_subvol_snapshot_fd(int old_fd, const char *new_path, BtrfsSnapshotFlag if (r < 0) return r; if (r == 0) { + bool plain_directory = false; + + /* If the source isn't a proper subvolume, fail unless fallback is requested */ if (!(flags & BTRFS_SNAPSHOT_FALLBACK_COPY)) return -EISDIR; r = btrfs_subvol_make(new_path); - if (r < 0) + if (r == -ENOTTY && (flags & BTRFS_SNAPSHOT_FALLBACK_DIRECTORY)) { + /* If the destination doesn't support subvolumes, then use a plain directory, if that's requested. */ + if (mkdir(new_path, 0755) < 0) + return r; + + plain_directory = true; + } else if (r < 0) return r; r = copy_directory_fd(old_fd, new_path, true); - if (r < 0) { - (void) btrfs_subvol_remove(new_path, BTRFS_REMOVE_QUOTA); - return r; - } + if (r < 0) + goto fallback_fail; if (flags & BTRFS_SNAPSHOT_READ_ONLY) { - r = btrfs_subvol_set_read_only(new_path, true); - if (r < 0) { - (void) btrfs_subvol_remove(new_path, BTRFS_REMOVE_QUOTA); - return r; + + if (plain_directory) { + /* Plain directories have no recursive read-only flag, but something pretty close to + * it: the IMMUTABLE bit. Let's use this here, if this is requested. 
*/ + + if (flags & BTRFS_SNAPSHOT_FALLBACK_IMMUTABLE) + (void) chattr_path(new_path, FS_IMMUTABLE_FL, FS_IMMUTABLE_FL); + } else { + r = btrfs_subvol_set_read_only(new_path, true); + if (r < 0) + goto fallback_fail; } } return 0; + + fallback_fail: + (void) rm_rf(new_path, REMOVE_ROOT|REMOVE_PHYSICAL|REMOVE_SUBVOLUME); + return r; } r = extract_subvolume_name(new_path, &subvolume); diff --git a/src/basic/btrfs-util.h b/src/basic/btrfs-util.h index 1d852d502c..04a2e1274b 100644 --- a/src/basic/btrfs-util.h +++ b/src/basic/btrfs-util.h @@ -45,10 +45,12 @@ typedef struct BtrfsQuotaInfo { } BtrfsQuotaInfo; typedef enum BtrfsSnapshotFlags { - BTRFS_SNAPSHOT_FALLBACK_COPY = 1, + BTRFS_SNAPSHOT_FALLBACK_COPY = 1, /* If the source isn't a subvolume, reflink everything */ BTRFS_SNAPSHOT_READ_ONLY = 2, BTRFS_SNAPSHOT_RECURSIVE = 4, BTRFS_SNAPSHOT_QUOTA = 8, + BTRFS_SNAPSHOT_FALLBACK_DIRECTORY = 16, /* If the destination doesn't support subvolumes, reflink/copy instead */ + BTRFS_SNAPSHOT_FALLBACK_IMMUTABLE = 32, /* When we can't create a subvolume, use the FS_IMMUTABLE attribute for indicating read-only */ } BtrfsSnapshotFlags; typedef enum BtrfsRemoveFlags { diff --git a/src/basic/missing.h b/src/basic/missing.h index a5ae5d9e79..8833617dc6 100644 --- a/src/basic/missing.h +++ b/src/basic/missing.h @@ -143,6 +143,10 @@ #define GRND_RANDOM 0x0002 #endif +#ifndef FS_NOCOW_FL +#define FS_NOCOW_FL 0x00800000 +#endif + #ifndef BTRFS_IOCTL_MAGIC #define BTRFS_IOCTL_MAGIC 0x94 #endif diff --git a/src/import/pull-common.c b/src/import/pull-common.c index 2ae2a4174c..5ddc0c56f4 100644 --- a/src/import/pull-common.c +++ b/src/import/pull-common.c @@ -144,12 +144,12 @@ int pull_make_local_copy(const char *final, const char *image_root, const char * if (force_local) (void) rm_rf(p, REMOVE_ROOT|REMOVE_PHYSICAL|REMOVE_SUBVOLUME); - r = btrfs_subvol_snapshot(final, p, BTRFS_SNAPSHOT_QUOTA); - if (r == -ENOTTY) { - r = copy_tree(final, p, false); - if (r < 0) - return log_error_errno(r, 
"Failed to copy image: %m"); - } else if (r < 0) + r = btrfs_subvol_snapshot(final, p, + BTRFS_SNAPSHOT_QUOTA| + BTRFS_SNAPSHOT_FALLBACK_COPY| + BTRFS_SNAPSHOT_FALLBACK_DIRECTORY| + BTRFS_SNAPSHOT_RECURSIVE); + if (r < 0) return log_error_errno(r, "Failed to create local image: %m"); log_info("Created new local image '%s'.", local); diff --git a/src/libsystemd/sd-event/sd-event.c b/src/libsystemd/sd-event/sd-event.c index 9857f8b1fc..f94959adac 100644 --- a/src/libsystemd/sd-event/sd-event.c +++ b/src/libsystemd/sd-event/sd-event.c @@ -1539,7 +1539,8 @@ _public_ int sd_event_source_get_priority(sd_event_source *s, int64_t *priority) assert_return(s, -EINVAL); assert_return(!event_pid_changed(s->event), -ECHILD); - return s->priority; + *priority = s->priority; + return 0; } _public_ int sd_event_source_set_priority(sd_event_source *s, int64_t priority) { diff --git a/src/libsystemd/sd-event/test-event.c b/src/libsystemd/sd-event/test-event.c index 289114490c..c0e5e06a18 100644 --- a/src/libsystemd/sd-event/test-event.c +++ b/src/libsystemd/sd-event/test-event.c @@ -172,6 +172,7 @@ static void test_basic(void) { static const char ch = 'x'; int a[2] = { -1, -1 }, b[2] = { -1, -1}, d[2] = { -1, -1}, k[2] = { -1, -1 }; uint64_t event_now; + int64_t priority; assert_se(pipe(a) >= 0); assert_se(pipe(b) >= 0); @@ -209,6 +210,8 @@ static void test_basic(void) { assert_se(sd_event_add_exit(e, &q, exit_handler, INT_TO_PTR('g')) >= 0); assert_se(sd_event_source_set_priority(x, 99) >= 0); + assert_se(sd_event_source_get_priority(x, &priority) >= 0); + assert_se(priority == 99); assert_se(sd_event_source_set_enabled(y, SD_EVENT_ONESHOT) >= 0); assert_se(sd_event_source_set_prepare(x, prepare_handler) >= 0); assert_se(sd_event_source_set_priority(z, 50) >= 0); diff --git a/src/nspawn/nspawn-mount.c b/src/nspawn/nspawn-mount.c index 95bb3c09b0..91cb0861d3 100644 --- a/src/nspawn/nspawn-mount.c +++ b/src/nspawn/nspawn-mount.c @@ -298,7 +298,7 @@ int mount_sysfs(const char *dest, 
MountSettingsMask mount_settings) { MS_BIND|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT|extra_flags, NULL); } -static int mkdir_userns(const char *path, mode_t mode, bool in_userns, uid_t uid_shift) { +static int mkdir_userns(const char *path, mode_t mode, MountSettingsMask mask, uid_t uid_shift) { int r; assert(path); @@ -307,16 +307,20 @@ static int mkdir_userns(const char *path, mode_t mode, bool in_userns, uid_t uid if (r < 0 && errno != EEXIST) return -errno; - if (!in_userns) { - r = lchown(path, uid_shift, uid_shift); - if (r < 0) - return -errno; - } + if ((mask & MOUNT_USE_USERNS) == 0) + return 0; + + if (mask & MOUNT_IN_USERNS) + return 0; + + r = lchown(path, uid_shift, uid_shift); + if (r < 0) + return -errno; return 0; } -static int mkdir_userns_p(const char *prefix, const char *path, mode_t mode, bool in_userns, uid_t uid_shift) { +static int mkdir_userns_p(const char *prefix, const char *path, mode_t mode, MountSettingsMask mask, uid_t uid_shift) { const char *p, *e; int r; @@ -343,12 +347,12 @@ static int mkdir_userns_p(const char *prefix, const char *path, mode_t mode, boo if (prefix && path_startswith(prefix, t)) continue; - r = mkdir_userns(t, mode, in_userns, uid_shift); + r = mkdir_userns(t, mode, mask, uid_shift); if (r < 0) return r; } - return mkdir_userns(path, mode, in_userns, uid_shift); + return mkdir_userns(path, mode, mask, uid_shift); } int mount_all(const char *dest, @@ -422,7 +426,7 @@ int mount_all(const char *dest, if (mount_table[k].what && r > 0) continue; - r = mkdir_userns_p(dest, where, 0755, in_userns, uid_shift); + r = mkdir_userns_p(dest, where, 0755, mount_settings, uid_shift); if (r < 0 && r != -EEXIST) { if (fatal) return log_error_errno(r, "Failed to create directory %s: %m", where); diff --git a/src/nspawn/nspawn.c b/src/nspawn/nspawn.c index a6adbbe879..2770770cd0 100644 --- a/src/nspawn/nspawn.c +++ b/src/nspawn/nspawn.c @@ -1143,11 +1143,6 @@ static int parse_argv(int argc, char *argv[]) { return -EINVAL; } - if 
(arg_ephemeral && arg_image) { - log_error("--ephemeral and --image= may not be combined."); - return -EINVAL; - } - if (arg_ephemeral && !IN_SET(arg_link_journal, LINK_NO, LINK_AUTO)) { log_error("--ephemeral and --link-journal= may not be combined."); return -EINVAL; @@ -2605,7 +2600,7 @@ static int determine_names(void) { r = image_find(arg_machine, &i); if (r < 0) return log_error_errno(r, "Failed to find image for machine '%s': %m", arg_machine); - else if (r == 0) { + if (r == 0) { log_error("No image for machine '%s': %m", arg_machine); return -ENOENT; } @@ -2615,14 +2610,14 @@ static int determine_names(void) { else r = free_and_strdup(&arg_directory, i->path); if (r < 0) - return log_error_errno(r, "Invalid image directory: %m"); + return log_oom(); if (!arg_ephemeral) arg_read_only = arg_read_only || i->read_only; } else arg_directory = get_current_dir_name(); - if (!arg_directory && !arg_machine) { + if (!arg_directory && !arg_image) { log_error("Failed to determine path, please use -D or -i."); return -EINVAL; } @@ -2633,7 +2628,6 @@ static int determine_names(void) { arg_machine = gethostname_malloc(); else arg_machine = strdup(basename(arg_image ?: arg_directory)); - if (!arg_machine) return log_oom(); @@ -3795,7 +3789,6 @@ static int run(int master, l = recv(uid_shift_socket_pair[0], &arg_uid_shift, sizeof arg_uid_shift, 0); if (l < 0) return log_error_errno(errno, "Failed to read UID shift: %m"); - if (l != sizeof arg_uid_shift) { log_error("Short read while reading UID shift."); return -EIO; @@ -4029,7 +4022,7 @@ static int run(int master, terminate_machine(*pid); /* Normally redundant, but better safe than sorry */ - kill(*pid, SIGKILL); + (void) kill(*pid, SIGKILL); r = wait_for_container(*pid, &container_status); *pid = 0; @@ -4077,11 +4070,12 @@ int main(int argc, char *argv[]) { _cleanup_fdset_free_ FDSet *fds = NULL; int r, n_fd_passed, loop_nr = -1, ret = EXIT_SUCCESS; char veth_name[IFNAMSIZ] = ""; - bool secondary = false, remove_subvol = 
false; + bool secondary = false, remove_directory = false, remove_image = false; pid_t pid = 0; union in_addr_union exposed = {}; _cleanup_release_lock_file_ LockFile tree_global_lock = LOCK_FILE_INIT, tree_local_lock = LOCK_FILE_INIT; - bool interactive, veth_created = false; + bool interactive, veth_created = false, remove_tmprootdir = false; + char tmprootdir[] = "/tmp/nspawn-root-XXXXXX"; log_parse_environment(); log_open(); @@ -4148,7 +4142,7 @@ int main(int argc, char *argv[]) { else r = tempfn_random(arg_directory, "machine.", &np); if (r < 0) { - log_error_errno(r, "Failed to generate name for snapshot: %m"); + log_error_errno(r, "Failed to generate name for directory snapshot: %m"); goto finish; } @@ -4158,7 +4152,12 @@ int main(int argc, char *argv[]) { goto finish; } - r = btrfs_subvol_snapshot(arg_directory, np, (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) | BTRFS_SNAPSHOT_FALLBACK_COPY | BTRFS_SNAPSHOT_RECURSIVE | BTRFS_SNAPSHOT_QUOTA); + r = btrfs_subvol_snapshot(arg_directory, np, + (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) | + BTRFS_SNAPSHOT_FALLBACK_COPY | + BTRFS_SNAPSHOT_FALLBACK_DIRECTORY | + BTRFS_SNAPSHOT_RECURSIVE | + BTRFS_SNAPSHOT_QUOTA); if (r < 0) { log_error_errno(r, "Failed to create snapshot %s from %s: %m", np, arg_directory); goto finish; @@ -4168,7 +4167,7 @@ int main(int argc, char *argv[]) { arg_directory = np; np = NULL; - remove_subvol = true; + remove_directory = true; } else { r = image_path_lock(arg_directory, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock); @@ -4182,7 +4181,13 @@ int main(int argc, char *argv[]) { } if (arg_template) { - r = btrfs_subvol_snapshot(arg_template, arg_directory, (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) | BTRFS_SNAPSHOT_FALLBACK_COPY | BTRFS_SNAPSHOT_RECURSIVE | BTRFS_SNAPSHOT_QUOTA); + r = btrfs_subvol_snapshot(arg_template, arg_directory, + (arg_read_only ? 
BTRFS_SNAPSHOT_READ_ONLY : 0) | + BTRFS_SNAPSHOT_FALLBACK_COPY | + BTRFS_SNAPSHOT_FALLBACK_DIRECTORY | + BTRFS_SNAPSHOT_FALLBACK_IMMUTABLE | + BTRFS_SNAPSHOT_RECURSIVE | + BTRFS_SNAPSHOT_QUOTA); if (r == -EEXIST) { if (!arg_quiet) log_info("Directory %s already exists, not populating from template %s.", arg_directory, arg_template); @@ -4214,28 +4219,55 @@ int main(int argc, char *argv[]) { } } else { - char template[] = "/tmp/nspawn-root-XXXXXX"; - assert(arg_image); assert(!arg_template); - r = image_path_lock(arg_image, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock); - if (r == -EBUSY) { - r = log_error_errno(r, "Disk image %s is currently busy.", arg_image); - goto finish; - } - if (r < 0) { - r = log_error_errno(r, "Failed to create image lock: %m"); - goto finish; + if (arg_ephemeral) { + _cleanup_free_ char *np = NULL; + + r = tempfn_random(arg_image, "machine.", &np); + if (r < 0) { + log_error_errno(r, "Failed to generate name for image snapshot: %m"); + goto finish; + } + + r = image_path_lock(np, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock); + if (r < 0) { + r = log_error_errno(r, "Failed to create image lock: %m"); + goto finish; + } + + r = copy_file(arg_image, np, O_EXCL, arg_read_only ? 0400 : 0600, FS_NOCOW_FL); + if (r < 0) { + r = log_error_errno(r, "Failed to copy image file: %m"); + goto finish; + } + + free(arg_image); + arg_image = np; + np = NULL; + + remove_image = true; + } else { + r = image_path_lock(arg_image, (arg_read_only ? 
LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock); + if (r == -EBUSY) { + r = log_error_errno(r, "Disk image %s is currently busy.", arg_image); + goto finish; + } + if (r < 0) { + r = log_error_errno(r, "Failed to create image lock: %m"); + goto finish; + } } - if (!mkdtemp(template)) { - log_error_errno(errno, "Failed to create temporary directory: %m"); - r = -errno; + if (!mkdtemp(tmprootdir)) { + r = log_error_errno(errno, "Failed to create temporary directory: %m"); goto finish; } - arg_directory = strdup(template); + remove_tmprootdir = true; + + arg_directory = strdup(tmprootdir); if (!arg_directory) { r = log_oom(); goto finish; @@ -4255,6 +4287,10 @@ int main(int argc, char *argv[]) { &secondary); if (r < 0) goto finish; + + /* Now that we mounted the image, let's try to remove it again, if it is ephemeral */ + if (remove_image && unlink(arg_image) >= 0) + remove_image = false; } r = custom_mounts_prepare(); @@ -4321,20 +4357,35 @@ finish: "STOPPING=1\nSTATUS=Terminating..."); if (pid > 0) - kill(pid, SIGKILL); + (void) kill(pid, SIGKILL); /* Try to flush whatever is still queued in the pty */ - if (master >= 0) + if (master >= 0) { (void) copy_bytes(master, STDOUT_FILENO, (uint64_t) -1, false); + master = safe_close(master); + } + + if (pid > 0) + (void) wait_for_terminate(pid, NULL); loop_remove(loop_nr, &image_fd); - if (remove_subvol && arg_directory) { + if (remove_directory && arg_directory) { int k; - k = btrfs_subvol_remove(arg_directory, BTRFS_REMOVE_RECURSIVE|BTRFS_REMOVE_QUOTA); + k = rm_rf(arg_directory, REMOVE_ROOT|REMOVE_PHYSICAL|REMOVE_SUBVOLUME); if (k < 0) - log_warning_errno(k, "Cannot remove subvolume '%s', ignoring: %m", arg_directory); + log_warning_errno(k, "Cannot remove '%s', ignoring: %m", arg_directory); + } + + if (remove_image && arg_image) { + if (unlink(arg_image) < 0) + log_warning_errno(errno, "Can't remove image file '%s', ignoring: %m", arg_image); + } + + if (remove_tmprootdir) { + if (rmdir(tmprootdir) 
< 0) + log_debug_errno(errno, "Can't remove temporary root directory '%s', ignoring: %m", tmprootdir); } if (arg_machine) { diff --git a/src/shared/machine-image.c b/src/shared/machine-image.c index 6414ba5246..712aff65b9 100644 --- a/src/shared/machine-image.c +++ b/src/shared/machine-image.c @@ -27,18 +27,20 @@ #include <sys/stat.h> #include <unistd.h> #include <linux/fs.h> + #include "alloc-util.h" #include "btrfs-util.h" #include "chattr-util.h" #include "copy.h" #include "dirent-util.h" +#include "env-util.h" #include "fd-util.h" #include "fs-util.h" #include "hashmap.h" #include "lockfile-util.h" #include "log.h" -#include "macro.h" #include "machine-image.h" +#include "macro.h" #include "mkdir.h" #include "path-util.h" #include "rm-rf.h" @@ -607,14 +609,14 @@ int image_clone(Image *i, const char *new_name, bool read_only) { new_path = strjoina("/var/lib/machines/", new_name); - r = btrfs_subvol_snapshot(i->path, new_path, (read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) | BTRFS_SNAPSHOT_FALLBACK_COPY | BTRFS_SNAPSHOT_RECURSIVE | BTRFS_SNAPSHOT_QUOTA); - if (r == -EOPNOTSUPP) { - /* No btrfs snapshots supported, create a normal directory then. */ - - r = copy_directory(i->path, new_path, false); - if (r >= 0) - (void) chattr_path(new_path, read_only ? FS_IMMUTABLE_FL : 0, FS_IMMUTABLE_FL); - } else if (r >= 0) + r = btrfs_subvol_snapshot(i->path, new_path, + (read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) | + BTRFS_SNAPSHOT_FALLBACK_COPY | + BTRFS_SNAPSHOT_FALLBACK_DIRECTORY | + BTRFS_SNAPSHOT_FALLBACK_IMMUTABLE | + BTRFS_SNAPSHOT_RECURSIVE | + BTRFS_SNAPSHOT_QUOTA); + if (r >= 0) /* Enable "subtree" quotas for the copy, if we didn't copy any quota from the source. */ (void) btrfs_subvol_auto_qgroup(new_path, 0, true); @@ -723,12 +725,17 @@ int image_path_lock(const char *path, int operation, LockFile *global, LockFile * uses the device/inode number. This has the benefit that we * can even lock a tree that is a mount point, correctly. 
*/ - if (path_equal(path, "/")) - return -EBUSY; - if (!path_is_absolute(path)) return -EINVAL; + if (getenv_bool("SYSTEMD_NSPAWN_LOCK") == 0) { + *local = *global = (LockFile) LOCK_FILE_INIT; + return 0; + } + + if (path_equal(path, "/")) + return -EBUSY; + if (stat(path, &st) >= 0) { if (asprintf(&p, "/run/systemd/nspawn/locks/inode-%lu:%lu", (unsigned long) st.st_dev, (unsigned long) st.st_ino) < 0) return -ENOMEM; @@ -746,7 +753,8 @@ int image_path_lock(const char *path, int operation, LockFile *global, LockFile release_lock_file(&t); return r; } - } + } else + *global = (LockFile) LOCK_FILE_INIT; *local = t; return 0; @@ -782,6 +790,11 @@ int image_name_lock(const char *name, int operation, LockFile *ret) { if (!image_name_is_valid(name)) return -EINVAL; + if (getenv_bool("SYSTEMD_NSPAWN_LOCK") == 0) { + *ret = (LockFile) LOCK_FILE_INIT; + return 0; + } + if (streq(name, ".host")) return -EBUSY; diff --git a/src/shared/seccomp-util.c b/src/shared/seccomp-util.c index 4e4b2faca9..66b72b2b27 100644 --- a/src/shared/seccomp-util.c +++ b/src/shared/seccomp-util.c @@ -290,6 +290,78 @@ const SyscallFilterSet syscall_filter_sets[_SYSCALL_FILTER_SET_MAX] = { #endif "sys_debug_setcontext\0" }, + [SYSCALL_FILTER_SET_FILE_SYSTEM] = { + .name = "@file-system", + .help = "File system operations", + .value = + "access\0" + "chdir\0" + "chmod\0" + "close\0" + "creat\0" + "faccessat\0" + "fallocate\0" + "fchdir\0" + "fchmod\0" + "fchmodat\0" + "fcntl64\0" + "fcntl\0" + "fgetxattr\0" + "flistxattr\0" + "fsetxattr\0" + "fstat64\0" + "fstat\0" + "fstatat64\0" + "fstatfs64\0" + "fstatfs\0" + "ftruncate64\0" + "ftruncate\0" + "futimesat\0" + "getcwd\0" + "getdents64\0" + "getdents\0" + "getxattr\0" + "inotify_add_watch\0" + "inotify_init1\0" + "inotify_rm_watch\0" + "lgetxattr\0" + "link\0" + "linkat\0" + "listxattr\0" + "llistxattr\0" + "lremovexattr\0" + "lsetxattr\0" + "lstat64\0" + "lstat\0" + "mkdir\0" + "mkdirat\0" + "mknod\0" + "mknodat\0" + "mmap2\0" + "mmap\0" + 
"newfstatat\0" + "open\0" + "openat\0" + "readlink\0" + "readlinkat\0" + "removexattr\0" + "rename\0" + "renameat2\0" + "renameat\0" + "rmdir\0" + "setxattr\0" + "stat64\0" + "stat\0" + "statfs\0" + "symlink\0" + "symlinkat\0" + "truncate64\0" + "truncate\0" + "unlink\0" + "unlinkat\0" + "utimensat\0" + "utimes\0" + }, [SYSCALL_FILTER_SET_IO_EVENT] = { .name = "@io-event", .help = "Event loop system calls", diff --git a/src/shared/seccomp-util.h b/src/shared/seccomp-util.h index 438a6671bc..01cf331b29 100644 --- a/src/shared/seccomp-util.h +++ b/src/shared/seccomp-util.h @@ -45,6 +45,7 @@ enum { SYSCALL_FILTER_SET_CLOCK, SYSCALL_FILTER_SET_CPU_EMULATION, SYSCALL_FILTER_SET_DEBUG, + SYSCALL_FILTER_SET_FILE_SYSTEM, SYSCALL_FILTER_SET_IO_EVENT, SYSCALL_FILTER_SET_IPC, SYSCALL_FILTER_SET_KEYRING, |