summaryrefslogtreecommitdiff
path: root/src/core
diff options
context:
space:
mode:
authorZbigniew Jędrzejewski-Szmek <zbyszek@in.waw.pl>2017-02-08 23:05:05 -0500
committerGitHub <noreply@github.com>2017-02-08 23:05:05 -0500
commitfc6149a6ce7a5560ae239a317b7f43039a3f80fd (patch)
treed6411b7b1fe425234524e8baea3c297fdd22202f /src/core
parent52a4aafb4dd178afae5ce8ceadd852233cac10f3 (diff)
parentef3116b5d4b9f12ae9f0fc25c8b40a04712c6d56 (diff)
Merge pull request #4962 from poettering/root-directory-2
Add new MountAPIVFS= boolean unit file setting + RootImage=
Diffstat (limited to 'src/core')
-rw-r--r--src/core/dbus-execute.c22
-rw-r--r--src/core/execute.c39
-rw-r--r--src/core/execute.h3
-rw-r--r--src/core/load-fragment-gperf.gperf.m42
-rw-r--r--src/core/load-fragment.c19
-rw-r--r--src/core/namespace.c187
-rw-r--r--src/core/namespace.h6
-rw-r--r--src/core/unit.c6
8 files changed, 243 insertions, 41 deletions
diff --git a/src/core/dbus-execute.c b/src/core/dbus-execute.c
index cc10e2d8e7..7df4cab3f6 100644
--- a/src/core/dbus-execute.c
+++ b/src/core/dbus-execute.c
@@ -758,6 +758,7 @@ const sd_bus_vtable bus_exec_vtable[] = {
SD_BUS_PROPERTY("LimitRTTIMESoft", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_RTTIME]), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("WorkingDirectory", "s", property_get_working_directory, 0, SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("RootDirectory", "s", NULL, offsetof(ExecContext, root_directory), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("RootImage", "s", NULL, offsetof(ExecContext, root_image), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("OOMScoreAdjust", "i", property_get_oom_score_adjust, 0, SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("Nice", "i", property_get_nice, 0, SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("IOScheduling", "i", property_get_ioprio, 0, SD_BUS_VTABLE_PROPERTY_CONST),
@@ -828,6 +829,7 @@ const sd_bus_vtable bus_exec_vtable[] = {
SD_BUS_PROPERTY("RestrictNamespaces", "t", bus_property_get_ulong, offsetof(ExecContext, restrict_namespaces), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("BindPaths", "a(ssbt)", property_get_bind_paths, 0, SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("BindReadOnlyPaths", "a(ssbt)", property_get_bind_paths, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("MountAPIVFS", "b", bus_property_get_bool, offsetof(ExecContext, mount_apivfs), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_VTABLE_END
};
@@ -1047,7 +1049,7 @@ int bus_exec_context_set_transient_property(
return 1;
- } else if (STR_IN_SET(name, "TTYPath", "RootDirectory")) {
+ } else if (STR_IN_SET(name, "TTYPath", "RootDirectory", "RootImage")) {
const char *s;
r = sd_bus_message_read(message, "s", &s);
@@ -1060,6 +1062,8 @@ int bus_exec_context_set_transient_property(
if (mode != UNIT_CHECK) {
if (streq(name, "TTYPath"))
r = free_and_strdup(&c->tty_path, s);
+ else if (streq(name, "RootImage"))
+ r = free_and_strdup(&c->root_image, s);
else {
assert(streq(name, "RootDirectory"));
r = free_and_strdup(&c->root_directory, s);
@@ -1207,7 +1211,7 @@ int bus_exec_context_set_transient_property(
"PrivateTmp", "PrivateDevices", "PrivateNetwork", "PrivateUsers",
"NoNewPrivileges", "SyslogLevelPrefix", "MemoryDenyWriteExecute",
"RestrictRealtime", "DynamicUser", "RemoveIPC", "ProtectKernelTunables",
- "ProtectKernelModules", "ProtectControlGroups")) {
+ "ProtectKernelModules", "ProtectControlGroups", "MountAPIVFS")) {
int b;
r = sd_bus_message_read(message, "b", &b);
@@ -1247,6 +1251,8 @@ int bus_exec_context_set_transient_property(
c->protect_kernel_modules = b;
else if (streq(name, "ProtectControlGroups"))
c->protect_control_groups = b;
+ else if (streq(name, "MountAPIVFS"))
+ c->mount_apivfs = b;
unit_write_drop_in_private_format(u, mode, name, "%s=%s", name, yes_no(b));
}
@@ -1495,12 +1501,15 @@ int bus_exec_context_set_transient_property(
return r;
STRV_FOREACH(p, l) {
- int offset;
- if (!utf8_is_valid(*p))
+ const char *i = *p;
+ size_t offset;
+
+ if (!utf8_is_valid(i))
return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid %s", name);
- offset = **p == '-';
- if (!path_is_absolute(*p + offset))
+ offset = i[0] == '-';
+ offset += i[offset] == '+';
+ if (!path_is_absolute(i + offset))
return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid %s", name);
}
@@ -1519,7 +1528,6 @@ int bus_exec_context_set_transient_property(
unit_write_drop_in_private_format(u, mode, name, "%s=", name);
} else {
r = strv_extend_strv(dirs, l, true);
-
if (r < 0)
return -ENOMEM;
diff --git a/src/core/execute.c b/src/core/execute.c
index aa0ddb564e..f57eb26388 100644
--- a/src/core/execute.c
+++ b/src/core/execute.c
@@ -1640,6 +1640,9 @@ static bool exec_needs_mount_namespace(
assert(context);
assert(params);
+ if (context->root_image)
+ return true;
+
if (!strv_isempty(context->read_write_paths) ||
!strv_isempty(context->read_only_paths) ||
!strv_isempty(context->inaccessible_paths))
@@ -1662,6 +1665,9 @@ static bool exec_needs_mount_namespace(
context->protect_control_groups)
return true;
+ if (context->mount_apivfs)
+ return true;
+
return false;
}
@@ -1935,13 +1941,14 @@ static int apply_mount_namespace(Unit *u, const ExecContext *context,
int r;
_cleanup_strv_free_ char **rw = NULL;
char *tmp = NULL, *var = NULL;
- const char *root_dir = NULL;
+ const char *root_dir = NULL, *root_image = NULL;
NameSpaceInfo ns_info = {
.ignore_protect_paths = false,
.private_dev = context->private_devices,
.protect_control_groups = context->protect_control_groups,
.protect_kernel_tunables = context->protect_kernel_tunables,
.protect_kernel_modules = context->protect_kernel_modules,
+ .mount_apivfs = context->mount_apivfs,
};
assert(context);
@@ -1961,8 +1968,12 @@ static int apply_mount_namespace(Unit *u, const ExecContext *context,
if (r < 0)
return r;
- if (params->flags & EXEC_APPLY_CHROOT)
- root_dir = context->root_directory;
+ if (params->flags & EXEC_APPLY_CHROOT) {
+ root_image = context->root_image;
+
+ if (!root_image)
+ root_dir = context->root_directory;
+ }
/*
* If DynamicUser=no and RootDirectory= is set then lets pass a relaxed
@@ -1972,7 +1983,8 @@ static int apply_mount_namespace(Unit *u, const ExecContext *context,
if (!context->dynamic_user && root_dir)
ns_info.ignore_protect_paths = true;
- r = setup_namespace(root_dir, &ns_info, rw,
+ r = setup_namespace(root_dir, root_image,
+ &ns_info, rw,
context->read_only_paths,
context->inaccessible_paths,
context->bind_mounts,
@@ -1981,7 +1993,8 @@ static int apply_mount_namespace(Unit *u, const ExecContext *context,
var,
context->protect_home,
context->protect_system,
- context->mount_flags);
+ context->mount_flags,
+ DISSECT_IMAGE_DISCARD_ON_LOOP);
/* If we couldn't set up the namespace this is probably due to a
* missing capability. In this case, silently proceeed. */
@@ -1995,10 +2008,12 @@ static int apply_mount_namespace(Unit *u, const ExecContext *context,
return r;
}
-static int apply_working_directory(const ExecContext *context,
- const ExecParameters *params,
- const char *home,
- const bool needs_mount_ns) {
+static int apply_working_directory(
+ const ExecContext *context,
+ const ExecParameters *params,
+ const char *home,
+ const bool needs_mount_ns) {
+
const char *d;
const char *wd;
@@ -2979,6 +2994,7 @@ void exec_context_done(ExecContext *c) {
c->working_directory = mfree(c->working_directory);
c->root_directory = mfree(c->root_directory);
+ c->root_image = mfree(c->root_image);
c->tty_path = mfree(c->tty_path);
c->syslog_identifier = mfree(c->syslog_identifier);
c->user = mfree(c->user);
@@ -3294,6 +3310,7 @@ void exec_context_dump(ExecContext *c, FILE* f, const char *prefix) {
"%sPrivateUsers: %s\n"
"%sProtectHome: %s\n"
"%sProtectSystem: %s\n"
+ "%sMountAPIVFS: %s\n"
"%sIgnoreSIGPIPE: %s\n"
"%sMemoryDenyWriteExecute: %s\n"
"%sRestrictRealtime: %s\n",
@@ -3310,10 +3327,14 @@ void exec_context_dump(ExecContext *c, FILE* f, const char *prefix) {
prefix, yes_no(c->private_users),
prefix, protect_home_to_string(c->protect_home),
prefix, protect_system_to_string(c->protect_system),
+ prefix, yes_no(c->mount_apivfs),
prefix, yes_no(c->ignore_sigpipe),
prefix, yes_no(c->memory_deny_write_execute),
prefix, yes_no(c->restrict_realtime));
+ if (c->root_image)
+ fprintf(f, "%sRootImage: %s\n", prefix, c->root_image);
+
STRV_FOREACH(e, c->environment)
fprintf(f, "%sEnvironment: %s\n", prefix, *e);
diff --git a/src/core/execute.h b/src/core/execute.h
index f8694ef520..9f2b6fd39e 100644
--- a/src/core/execute.h
+++ b/src/core/execute.h
@@ -106,7 +106,7 @@ struct ExecContext {
char **pass_environment;
struct rlimit *rlimit[_RLIMIT_MAX];
- char *working_directory, *root_directory;
+ char *working_directory, *root_directory, *root_image;
bool working_directory_missing_ok;
bool working_directory_home;
@@ -183,6 +183,7 @@ struct ExecContext {
bool protect_kernel_tunables;
bool protect_kernel_modules;
bool protect_control_groups;
+ bool mount_apivfs;
bool no_new_privileges;
diff --git a/src/core/load-fragment-gperf.gperf.m4 b/src/core/load-fragment-gperf.gperf.m4
index 15f22a2681..cb9e6fea27 100644
--- a/src/core/load-fragment-gperf.gperf.m4
+++ b/src/core/load-fragment-gperf.gperf.m4
@@ -19,6 +19,7 @@ m4_dnl Define the context options only once
m4_define(`EXEC_CONTEXT_CONFIG_ITEMS',
`$1.WorkingDirectory, config_parse_working_directory, 0, offsetof($1, exec_context)
$1.RootDirectory, config_parse_unit_path_printf, 0, offsetof($1, exec_context.root_directory)
+$1.RootImage, config_parse_unit_path_printf, 0, offsetof($1, exec_context.root_image)
$1.User, config_parse_user_group, 0, offsetof($1, exec_context.user)
$1.Group, config_parse_user_group, 0, offsetof($1, exec_context.group)
$1.SupplementaryGroups, config_parse_user_group_strv, 0, offsetof($1, exec_context.supplementary_groups)
@@ -101,6 +102,7 @@ $1.PrivateUsers, config_parse_bool, 0,
$1.ProtectSystem, config_parse_protect_system, 0, offsetof($1, exec_context)
$1.ProtectHome, config_parse_protect_home, 0, offsetof($1, exec_context)
$1.MountFlags, config_parse_exec_mount_flags, 0, offsetof($1, exec_context)
+$1.MountAPIVFS, config_parse_bool, 0, offsetof($1, exec_context.mount_apivfs)
$1.Personality, config_parse_personality, 0, offsetof($1, exec_context.personality)
$1.RuntimeDirectoryMode, config_parse_mode, 0, offsetof($1, exec_context.runtime_directory_mode)
$1.RuntimeDirectory, config_parse_runtime_directory, 0, offsetof($1, exec_context.runtime_directory)
diff --git a/src/core/load-fragment.c b/src/core/load-fragment.c
index 243c288885..5b7471c0d0 100644
--- a/src/core/load-fragment.c
+++ b/src/core/load-fragment.c
@@ -3839,7 +3839,8 @@ int config_parse_namespace_path_strv(
cur = rvalue;
for (;;) {
_cleanup_free_ char *word = NULL, *resolved = NULL, *joined = NULL;
- bool ignore_enoent;
+ const char *w;
+ bool ignore_enoent = false, shall_prefix = false;
r = extract_first_word(&cur, &word, NULL, EXTRACT_QUOTES);
if (r == 0)
@@ -3856,9 +3857,17 @@ int config_parse_namespace_path_strv(
continue;
}
- ignore_enoent = word[0] == '-';
+ w = word;
+ if (startswith(w, "-")) {
+ ignore_enoent = true;
+ w++;
+ }
+ if (startswith(w, "+")) {
+ shall_prefix = true;
+ w++;
+ }
- r = unit_full_printf(u, word + ignore_enoent, &resolved);
+ r = unit_full_printf(u, w, &resolved);
if (r < 0) {
log_syntax(unit, LOG_ERR, filename, line, r, "Failed to resolve specifiers in %s: %m", word);
continue;
@@ -3871,7 +3880,9 @@ int config_parse_namespace_path_strv(
path_kill_slashes(resolved);
- joined = strjoin(ignore_enoent ? "-" : "", resolved);
+ joined = strjoin(ignore_enoent ? "-" : "",
+ shall_prefix ? "+" : "",
+ resolved);
r = strv_push(sv, joined);
if (r < 0)
diff --git a/src/core/namespace.c b/src/core/namespace.c
index 834883267c..75dca5b791 100644
--- a/src/core/namespace.c
+++ b/src/core/namespace.c
@@ -30,6 +30,7 @@
#include "dev-setup.h"
#include "fd-util.h"
#include "fs-util.h"
+#include "loop-util.h"
#include "loopback-setup.h"
#include "missing.h"
#include "mkdir.h"
@@ -52,10 +53,13 @@ typedef enum MountMode {
INACCESSIBLE,
BIND_MOUNT,
BIND_MOUNT_RECURSIVE,
- READONLY,
PRIVATE_TMP,
PRIVATE_VAR_TMP,
PRIVATE_DEV,
+ BIND_DEV,
+ SYSFS,
+ PROCFS,
+ READONLY,
READWRITE,
} MountMode;
@@ -70,13 +74,13 @@ typedef struct MountEntry {
char *source_malloc;
} MountEntry;
-/*
- * The following Protect tables are to protect paths and mark some of them
- * READONLY, in case a path is covered by an option from another table, then
- * it is marked READWRITE in the current one, and the more restrictive mode is
- * applied from that other table. This way all options can be combined in a
- * safe and comprehensible way for users.
- */
+/* If MountAPIVFS= is used, let's mount /sys and /proc into the it, but only as a fallback if the user hasn't mounted
+ * something there already. These mounts are hence overriden by any other explicitly configured mounts. */
+static const MountEntry apivfs_table[] = {
+ { "/proc", PROCFS, false },
+ { "/dev", BIND_DEV, false },
+ { "/sys", SYSFS, false },
+};
/* ProtectKernelTunables= option and the related filesystem APIs */
static const MountEntry protect_kernel_tunables_table[] = {
@@ -176,6 +180,13 @@ static const char *mount_entry_source(const MountEntry *p) {
return p->source_malloc ?: p->source_const;
}
+static void mount_entry_done(MountEntry *p) {
+ assert(p);
+
+ p->path_malloc = mfree(p->path_malloc);
+ p->source_malloc = mfree(p->source_malloc);
+}
+
static int append_access_mounts(MountEntry **p, char **strv, MountMode mode) {
char **i;
@@ -351,7 +362,7 @@ static void drop_duplicates(MountEntry *m, unsigned *n) {
if (previous && path_equal(mount_entry_path(f), mount_entry_path(previous))) {
log_debug("%s is duplicate.", mount_entry_path(f));
previous->read_only = previous->read_only || mount_entry_read_only(f); /* Propagate the read-only flag to the remaining entry */
- f->path_malloc = mfree(f->path_malloc);
+ mount_entry_done(f);
continue;
}
@@ -379,7 +390,7 @@ static void drop_inaccessible(MountEntry *m, unsigned *n) {
* it, as inaccessible paths really should drop the entire subtree. */
if (clear && path_startswith(mount_entry_path(f), clear)) {
log_debug("%s is masked by %s.", mount_entry_path(f), clear);
- f->path_malloc = mfree(f->path_malloc);
+ mount_entry_done(f);
continue;
}
@@ -419,7 +430,7 @@ static void drop_nop(MountEntry *m, unsigned *n) {
/* We found it, let's see if it's the same mode, if so, we can drop this entry */
if (found && p->mode == f->mode) {
log_debug("%s is redundant by %s", mount_entry_path(f), mount_entry_path(p));
- f->path_malloc = mfree(f->path_malloc);
+ mount_entry_done(f);
continue;
}
}
@@ -447,7 +458,7 @@ static void drop_outside_root(const char *root_directory, MountEntry *m, unsigne
if (!path_startswith(mount_entry_path(f), root_directory)) {
log_debug("%s is outside of root directory.", mount_entry_path(f));
- f->path_malloc = mfree(f->path_malloc);
+ mount_entry_done(f);
continue;
}
@@ -458,7 +469,7 @@ static void drop_outside_root(const char *root_directory, MountEntry *m, unsigne
*n = t - m;
}
-static int mount_dev(MountEntry *m) {
+static int mount_private_dev(MountEntry *m) {
static const char devnodes[] =
"/dev/null\0"
"/dev/zero\0"
@@ -597,6 +608,62 @@ fail:
return r;
}
+static int mount_bind_dev(MountEntry *m) {
+ int r;
+
+ assert(m);
+
+ /* Implements the little brother of mount_private_dev(): simply bind mounts the host's /dev into the service's
+ * /dev. This is only used when RootDirectory= is set. */
+
+ r = path_is_mount_point(mount_entry_path(m), NULL, 0);
+ if (r < 0)
+ return log_debug_errno(r, "Unable to determine whether /dev is already mounted: %m");
+ if (r > 0) /* make this a NOP if /dev is already a mount point */
+ return 0;
+
+ if (mount("/dev", mount_entry_path(m), NULL, MS_BIND|MS_REC, NULL) < 0)
+ return log_debug_errno(errno, "Failed to bind mount %s: %m", mount_entry_path(m));
+
+ return 1;
+}
+
+static int mount_sysfs(MountEntry *m) {
+ int r;
+
+ assert(m);
+
+ r = path_is_mount_point(mount_entry_path(m), NULL, 0);
+ if (r < 0)
+ return log_debug_errno(r, "Unable to determine whether /sys is already mounted: %m");
+ if (r > 0) /* make this a NOP if /sys is already a mount point */
+ return 0;
+
+ /* Bind mount the host's version so that we get all child mounts of it, too. */
+ if (mount("/sys", mount_entry_path(m), NULL, MS_BIND|MS_REC, NULL) < 0)
+ return log_debug_errno(errno, "Failed to mount %s: %m", mount_entry_path(m));
+
+ return 1;
+}
+
+static int mount_procfs(MountEntry *m) {
+ int r;
+
+ assert(m);
+
+ r = path_is_mount_point(mount_entry_path(m), NULL, 0);
+ if (r < 0)
+ return log_debug_errno(r, "Unable to determine whether /proc is already mounted: %m");
+ if (r > 0) /* make this a NOP if /proc is already a mount point */
+ return 0;
+
+ /* Mount a new instance, so that we get the one that matches our user namespace, if we are running in one */
+ if (mount("proc", mount_entry_path(m), "proc", MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL) < 0)
+ return log_debug_errno(errno, "Failed to mount %s: %m", mount_entry_path(m));
+
+ return 1;
+}
+
static int mount_entry_chase(
const char *root_directory,
MountEntry *m,
@@ -684,6 +751,7 @@ static int apply_mount(
case BIND_MOUNT_RECURSIVE:
/* Also chase the source mount */
+
r = mount_entry_chase(root_directory, m, mount_entry_source(m), &m->source_malloc);
if (r <= 0)
return r;
@@ -700,7 +768,16 @@ static int apply_mount(
break;
case PRIVATE_DEV:
- return mount_dev(m);
+ return mount_private_dev(m);
+
+ case BIND_DEV:
+ return mount_bind_dev(m);
+
+ case SYSFS:
+ return mount_sysfs(m);
+
+ case PROCFS:
+ return mount_procfs(m);
default:
assert_not_reached("Unknown mode");
@@ -722,7 +799,7 @@ static int make_read_only(MountEntry *m, char **blacklist) {
if (mount_entry_read_only(m))
r = bind_remount_recursive(mount_entry_path(m), true, blacklist);
- else if (m->mode == PRIVATE_DEV) { /* Can be readonly but the submounts can't*/
+ else if (m->mode == PRIVATE_DEV) { /* Superblock can be readonly but the submounts can't*/
if (mount(NULL, mount_entry_path(m), NULL, MS_REMOUNT|DEV_MOUNT_OPTIONS|MS_RDONLY, NULL) < 0)
r = -errno;
} else
@@ -738,6 +815,17 @@ static int make_read_only(MountEntry *m, char **blacklist) {
return r;
}
+static bool namespace_info_mount_apivfs(const NameSpaceInfo *ns_info) {
+ assert(ns_info);
+
+ /* ProtectControlGroups= and ProtectKernelTunables= imply MountAPIVFS=, since to protect the API VFS mounts,
+ * they need to be around in the first place... */
+
+ return ns_info->mount_apivfs ||
+ ns_info->protect_control_groups ||
+ ns_info->protect_kernel_tunables;
+}
+
static unsigned namespace_calculate_mounts(
const NameSpaceInfo *ns_info,
char** read_write_paths,
@@ -774,11 +862,13 @@ static unsigned namespace_calculate_mounts(
(ns_info->protect_kernel_tunables ? ELEMENTSOF(protect_kernel_tunables_table) : 0) +
(ns_info->protect_control_groups ? 1 : 0) +
(ns_info->protect_kernel_modules ? ELEMENTSOF(protect_kernel_modules_table) : 0) +
- protect_home_cnt + protect_system_cnt;
+ protect_home_cnt + protect_system_cnt +
+ (namespace_info_mount_apivfs(ns_info) ? ELEMENTSOF(apivfs_table) : 0);
}
int setup_namespace(
const char* root_directory,
+ const char* root_image,
const NameSpaceInfo *ns_info,
char** read_write_paths,
char** read_only_paths,
@@ -789,16 +879,57 @@ int setup_namespace(
const char* var_tmp_dir,
ProtectHome protect_home,
ProtectSystem protect_system,
- unsigned long mount_flags) {
+ unsigned long mount_flags,
+ DissectImageFlags dissect_image_flags) {
+ _cleanup_(loop_device_unrefp) LoopDevice *loop_device = NULL;
+ _cleanup_(decrypted_image_unrefp) DecryptedImage *decrypted_image = NULL;
+ _cleanup_(dissected_image_unrefp) DissectedImage *dissected_image = NULL;
+ _cleanup_free_ void *root_hash = NULL;
MountEntry *m, *mounts = NULL;
+ size_t root_hash_size = 0;
bool make_slave = false;
unsigned n_mounts;
int r = 0;
+ assert(ns_info);
+
if (mount_flags == 0)
mount_flags = MS_SHARED;
+ if (root_image) {
+ dissect_image_flags |= DISSECT_IMAGE_REQUIRE_ROOT;
+
+ if (protect_system == PROTECT_SYSTEM_STRICT && strv_isempty(read_write_paths))
+ dissect_image_flags |= DISSECT_IMAGE_READ_ONLY;
+
+ r = loop_device_make_by_path(root_image,
+ dissect_image_flags & DISSECT_IMAGE_READ_ONLY ? O_RDONLY : O_RDWR,
+ &loop_device);
+ if (r < 0)
+ return r;
+
+ r = root_hash_load(root_image, &root_hash, &root_hash_size);
+ if (r < 0)
+ return r;
+
+ r = dissect_image(loop_device->fd, root_hash, root_hash_size, dissect_image_flags, &dissected_image);
+ if (r < 0)
+ return r;
+
+ r = dissected_image_decrypt(dissected_image, NULL, root_hash, root_hash_size, dissect_image_flags, &decrypted_image);
+ if (r < 0)
+ return r;
+
+ if (!root_directory) {
+ /* Create a mount point for the image, if it's still missing. We use the same mount point for
+ * all images, which is safe, since they all live in their own namespaces after all, and hence
+ * won't see each other. */
+ root_directory = "/run/systemd/unit-root";
+ (void) mkdir(root_directory, 0700);
+ }
+ }
+
n_mounts = namespace_calculate_mounts(
ns_info,
read_write_paths,
@@ -878,6 +1009,12 @@ int setup_namespace(
if (r < 0)
goto finish;
+ if (namespace_info_mount_apivfs(ns_info)) {
+ r = append_static_mounts(&m, apivfs_table, ELEMENTSOF(apivfs_table), ns_info->ignore_protect_paths);
+ if (r < 0)
+ goto finish;
+ }
+
assert(mounts + n_mounts == m);
/* Prepend the root directory where that's necessary */
@@ -907,7 +1044,19 @@ int setup_namespace(
}
}
- if (root_directory) {
+ if (root_image) {
+ r = dissected_image_mount(dissected_image, root_directory, dissect_image_flags);
+ if (r < 0)
+ goto finish;
+
+ r = decrypted_image_relinquish(decrypted_image);
+ if (r < 0)
+ goto finish;
+
+ loop_device_relinquish(loop_device);
+
+ } else if (root_directory) {
+
/* Turn directory into bind mount, if it isn't one yet */
r = path_is_mount_point(root_directory, NULL, AT_SYMLINK_FOLLOW);
if (r < 0)
@@ -964,7 +1113,7 @@ int setup_namespace(
finish:
for (m = mounts; m < mounts + n_mounts; m++)
- free(m->path_malloc);
+ mount_entry_done(m);
return r;
}
diff --git a/src/core/namespace.h b/src/core/namespace.h
index de3edc419c..f54954bd86 100644
--- a/src/core/namespace.h
+++ b/src/core/namespace.h
@@ -25,6 +25,7 @@ typedef struct BindMount BindMount;
#include <stdbool.h>
+#include "dissect-image.h"
#include "macro.h"
typedef enum ProtectHome {
@@ -50,6 +51,7 @@ struct NameSpaceInfo {
bool protect_control_groups:1;
bool protect_kernel_tunables:1;
bool protect_kernel_modules:1;
+ bool mount_apivfs:1;
};
struct BindMount {
@@ -62,6 +64,7 @@ struct BindMount {
int setup_namespace(
const char *root_directory,
+ const char *root_image,
const NameSpaceInfo *ns_info,
char **read_write_paths,
char **read_only_paths,
@@ -72,7 +75,8 @@ int setup_namespace(
const char *var_tmp_dir,
ProtectHome protect_home,
ProtectSystem protect_system,
- unsigned long mount_flags);
+ unsigned long mount_flags,
+ DissectImageFlags dissected_image_flags);
int setup_tmp_dirs(
const char *id,
diff --git a/src/core/unit.c b/src/core/unit.c
index 44f1d5e206..90d7eea956 100644
--- a/src/core/unit.c
+++ b/src/core/unit.c
@@ -862,6 +862,12 @@ int unit_add_exec_dependencies(Unit *u, ExecContext *c) {
return r;
}
+ if (c->root_image) {
+ r = unit_require_mounts_for(u, c->root_image);
+ if (r < 0)
+ return r;
+ }
+
if (!MANAGER_IS_SYSTEM(u->manager))
return 0;