diff options
Diffstat (limited to 'src')
| -rw-r--r-- | src/core/dbus-execute.c | 22 | ||||
| -rw-r--r-- | src/core/execute.c | 39 | ||||
| -rw-r--r-- | src/core/execute.h | 3 | ||||
| -rw-r--r-- | src/core/load-fragment-gperf.gperf.m4 | 2 | ||||
| -rw-r--r-- | src/core/load-fragment.c | 19 | ||||
| -rw-r--r-- | src/core/namespace.c | 187 | ||||
| -rw-r--r-- | src/core/namespace.h | 6 | ||||
| -rw-r--r-- | src/core/unit.c | 6 | ||||
| -rw-r--r-- | src/dissect/dissect.c | 8 | ||||
| -rw-r--r-- | src/nspawn/nspawn.c | 57 | ||||
| -rw-r--r-- | src/shared/bus-unit-util.c | 8 | ||||
| -rw-r--r-- | src/shared/dissect-image.c | 98 | ||||
| -rw-r--r-- | src/shared/dissect-image.h | 2 | ||||
| -rw-r--r-- | src/test/test-ns.c | 2 | 
14 files changed, 350 insertions, 109 deletions
| diff --git a/src/core/dbus-execute.c b/src/core/dbus-execute.c index cc10e2d8e7..7df4cab3f6 100644 --- a/src/core/dbus-execute.c +++ b/src/core/dbus-execute.c @@ -758,6 +758,7 @@ const sd_bus_vtable bus_exec_vtable[] = {          SD_BUS_PROPERTY("LimitRTTIMESoft", "t", bus_property_get_rlimit, offsetof(ExecContext, rlimit[RLIMIT_RTTIME]), SD_BUS_VTABLE_PROPERTY_CONST),          SD_BUS_PROPERTY("WorkingDirectory", "s", property_get_working_directory, 0, SD_BUS_VTABLE_PROPERTY_CONST),          SD_BUS_PROPERTY("RootDirectory", "s", NULL, offsetof(ExecContext, root_directory), SD_BUS_VTABLE_PROPERTY_CONST), +        SD_BUS_PROPERTY("RootImage", "s", NULL, offsetof(ExecContext, root_image), SD_BUS_VTABLE_PROPERTY_CONST),          SD_BUS_PROPERTY("OOMScoreAdjust", "i", property_get_oom_score_adjust, 0, SD_BUS_VTABLE_PROPERTY_CONST),          SD_BUS_PROPERTY("Nice", "i", property_get_nice, 0, SD_BUS_VTABLE_PROPERTY_CONST),          SD_BUS_PROPERTY("IOScheduling", "i", property_get_ioprio, 0, SD_BUS_VTABLE_PROPERTY_CONST), @@ -828,6 +829,7 @@ const sd_bus_vtable bus_exec_vtable[] = {          SD_BUS_PROPERTY("RestrictNamespaces", "t", bus_property_get_ulong, offsetof(ExecContext, restrict_namespaces), SD_BUS_VTABLE_PROPERTY_CONST),          SD_BUS_PROPERTY("BindPaths", "a(ssbt)", property_get_bind_paths, 0, SD_BUS_VTABLE_PROPERTY_CONST),          SD_BUS_PROPERTY("BindReadOnlyPaths", "a(ssbt)", property_get_bind_paths, 0, SD_BUS_VTABLE_PROPERTY_CONST), +        SD_BUS_PROPERTY("MountAPIVFS", "b", bus_property_get_bool, offsetof(ExecContext, mount_apivfs), SD_BUS_VTABLE_PROPERTY_CONST),          SD_BUS_VTABLE_END  }; @@ -1047,7 +1049,7 @@ int bus_exec_context_set_transient_property(                  return 1; -        } else if (STR_IN_SET(name, "TTYPath", "RootDirectory")) { +        } else if (STR_IN_SET(name, "TTYPath", "RootDirectory", "RootImage")) {                  const char *s;                  r = sd_bus_message_read(message, "s", &s); @@ -1060,6 +1062,8 @@ int bus_exec_context_set_transient_property(                  if (mode != UNIT_CHECK) {                          if (streq(name, "TTYPath"))                                  r = free_and_strdup(&c->tty_path, s); +                        else if (streq(name, "RootImage")) +                                r = free_and_strdup(&c->root_image, s);                          else {                                  assert(streq(name, "RootDirectory"));                                  r = free_and_strdup(&c->root_directory, s); @@ -1207,7 +1211,7 @@ int bus_exec_context_set_transient_property(                                "PrivateTmp", "PrivateDevices", "PrivateNetwork", "PrivateUsers",                                "NoNewPrivileges", "SyslogLevelPrefix", "MemoryDenyWriteExecute",                                "RestrictRealtime", "DynamicUser", "RemoveIPC", "ProtectKernelTunables", -                              "ProtectKernelModules", "ProtectControlGroups")) { +                              "ProtectKernelModules", "ProtectControlGroups", "MountAPIVFS")) {                  int b;                  r = sd_bus_message_read(message, "b", &b); @@ -1247,6 +1251,8 @@ int bus_exec_context_set_transient_property(                                  c->protect_kernel_modules = b;                          else if (streq(name, "ProtectControlGroups"))                                  c->protect_control_groups = b; +                        else if (streq(name, "MountAPIVFS")) +                                c->mount_apivfs = b;                          unit_write_drop_in_private_format(u, mode, name, "%s=%s", name, yes_no(b));                  } @@ -1495,12 +1501,15 @@ int bus_exec_context_set_transient_property(                          return r;                  STRV_FOREACH(p, l) { -                        int offset; -                        if (!utf8_is_valid(*p)) +                        const char *i = *p; +                        size_t offset; + +                        if (!utf8_is_valid(i))                                  return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid %s", name); -                        offset = **p == '-'; -                        if (!path_is_absolute(*p + offset)) +                        offset = i[0] == '-'; +                        offset += i[offset] == '+'; +                        if (!path_is_absolute(i + offset))                                  return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid %s", name);                  } @@ -1519,7 +1528,6 @@ int bus_exec_context_set_transient_property(                                  unit_write_drop_in_private_format(u, mode, name, "%s=", name);                          } else {                                  r = strv_extend_strv(dirs, l, true); -                                  if (r < 0)                                          return -ENOMEM; diff --git a/src/core/execute.c b/src/core/execute.c index aa0ddb564e..f57eb26388 100644 --- a/src/core/execute.c +++ b/src/core/execute.c @@ -1640,6 +1640,9 @@ static bool exec_needs_mount_namespace(          assert(context);          assert(params); +        if (context->root_image) +                return true; +          if (!strv_isempty(context->read_write_paths) ||              !strv_isempty(context->read_only_paths) ||              !strv_isempty(context->inaccessible_paths)) @@ -1662,6 +1665,9 @@ static bool exec_needs_mount_namespace(              context->protect_control_groups)                  return true; +        if (context->mount_apivfs) +                return true; +          return false;  } @@ -1935,13 +1941,14 @@ static int apply_mount_namespace(Unit *u, const ExecContext *context,          int r;          _cleanup_strv_free_ char **rw = NULL;          char *tmp = NULL, *var = NULL; -        const char *root_dir = NULL; +        const char *root_dir = NULL, *root_image = NULL;          NameSpaceInfo ns_info = {                  .ignore_protect_paths = false,                  .private_dev = context->private_devices,                  .protect_control_groups = context->protect_control_groups,                  .protect_kernel_tunables = context->protect_kernel_tunables,                  .protect_kernel_modules = context->protect_kernel_modules, +                .mount_apivfs = context->mount_apivfs,          };          assert(context); @@ -1961,8 +1968,12 @@ static int apply_mount_namespace(Unit *u, const ExecContext *context,          if (r < 0)                  return r; -        if (params->flags & EXEC_APPLY_CHROOT) -                root_dir = context->root_directory; +        if (params->flags & EXEC_APPLY_CHROOT) { +                root_image = context->root_image; + +                if (!root_image) +                        root_dir = context->root_directory; +        }          /*           * If DynamicUser=no and RootDirectory= is set then lets pass a relaxed @@ -1972,7 +1983,8 @@ static int apply_mount_namespace(Unit *u, const ExecContext *context,          if (!context->dynamic_user && root_dir)                  ns_info.ignore_protect_paths = true; -        r = setup_namespace(root_dir, &ns_info, rw, +        r = setup_namespace(root_dir, root_image, +                            &ns_info, rw,                              context->read_only_paths,                              context->inaccessible_paths,                              context->bind_mounts, @@ -1981,7 +1993,8 @@ static int apply_mount_namespace(Unit *u, const ExecContext *context,                              var,                              context->protect_home,                              context->protect_system, -                            context->mount_flags); +                            context->mount_flags, +                            DISSECT_IMAGE_DISCARD_ON_LOOP);          /* If we couldn't set up the namespace this is probably due to a           * missing capability. In this case, silently proceeed. */ @@ -1995,10 +2008,12 @@ static int apply_mount_namespace(Unit *u, const ExecContext *context,          return r;  } -static int apply_working_directory(const ExecContext *context, -                                   const ExecParameters *params, -                                   const char *home, -                                   const bool needs_mount_ns) { +static int apply_working_directory( +                const ExecContext *context, +                const ExecParameters *params, +                const char *home, +                const bool needs_mount_ns) { +          const char *d;          const char *wd; @@ -2979,6 +2994,7 @@ void exec_context_done(ExecContext *c) {          c->working_directory = mfree(c->working_directory);          c->root_directory = mfree(c->root_directory); +        c->root_image = mfree(c->root_image);          c->tty_path = mfree(c->tty_path);          c->syslog_identifier = mfree(c->syslog_identifier);          c->user = mfree(c->user); @@ -3294,6 +3310,7 @@ void exec_context_dump(ExecContext *c, FILE* f, const char *prefix) {                  "%sPrivateUsers: %s\n"                  "%sProtectHome: %s\n"                  "%sProtectSystem: %s\n" +                "%sMountAPIVFS: %s\n"                  "%sIgnoreSIGPIPE: %s\n"                  "%sMemoryDenyWriteExecute: %s\n"                  "%sRestrictRealtime: %s\n", @@ -3310,10 +3327,14 @@ void exec_context_dump(ExecContext *c, FILE* f, const char *prefix) {                  prefix, yes_no(c->private_users),                  prefix, protect_home_to_string(c->protect_home),                  prefix, protect_system_to_string(c->protect_system), +                prefix, yes_no(c->mount_apivfs),                  prefix, yes_no(c->ignore_sigpipe),                  prefix, yes_no(c->memory_deny_write_execute),                  prefix, yes_no(c->restrict_realtime)); +        if (c->root_image) +                fprintf(f, "%sRootImage: %s\n", prefix, c->root_image); +          STRV_FOREACH(e, c->environment)                  fprintf(f, "%sEnvironment: %s\n", prefix, *e); diff --git a/src/core/execute.h b/src/core/execute.h index f8694ef520..9f2b6fd39e 100644 --- a/src/core/execute.h +++ b/src/core/execute.h @@ -106,7 +106,7 @@ struct ExecContext {          char **pass_environment;          struct rlimit *rlimit[_RLIMIT_MAX]; -        char *working_directory, *root_directory; +        char *working_directory, *root_directory, *root_image;          bool working_directory_missing_ok;          bool working_directory_home; @@ -183,6 +183,7 @@ struct ExecContext {          bool protect_kernel_tunables;          bool protect_kernel_modules;          bool protect_control_groups; +        bool mount_apivfs;          bool no_new_privileges; diff --git a/src/core/load-fragment-gperf.gperf.m4 b/src/core/load-fragment-gperf.gperf.m4 index 15f22a2681..cb9e6fea27 100644 --- a/src/core/load-fragment-gperf.gperf.m4 +++ b/src/core/load-fragment-gperf.gperf.m4 @@ -19,6 +19,7 @@ m4_dnl Define the context options only once  m4_define(`EXEC_CONTEXT_CONFIG_ITEMS',  `$1.WorkingDirectory,            config_parse_working_directory,     0,                             offsetof($1, exec_context)  $1.RootDirectory,                config_parse_unit_path_printf,      0,                             offsetof($1, exec_context.root_directory) +$1.RootImage,                    config_parse_unit_path_printf,      0,                             offsetof($1, exec_context.root_image)  $1.User,                         config_parse_user_group,            0,                             offsetof($1, exec_context.user)  $1.Group,                        config_parse_user_group,            0,                             offsetof($1, exec_context.group)  $1.SupplementaryGroups,          config_parse_user_group_strv,       0,                             offsetof($1, exec_context.supplementary_groups) @@ -101,6 +102,7 @@ $1.PrivateUsers,                 config_parse_bool,                  0,  $1.ProtectSystem,                config_parse_protect_system,        0,                             offsetof($1, exec_context)  $1.ProtectHome,                  config_parse_protect_home,          0,                             offsetof($1, exec_context)  $1.MountFlags,                   config_parse_exec_mount_flags,      0,                             offsetof($1, exec_context) +$1.MountAPIVFS,                  config_parse_bool,                  0,                             offsetof($1, exec_context.mount_apivfs)  $1.Personality,                  config_parse_personality,           0,                             offsetof($1, exec_context.personality)  $1.RuntimeDirectoryMode,         config_parse_mode,                  0,                             offsetof($1, exec_context.runtime_directory_mode)  $1.RuntimeDirectory,             config_parse_runtime_directory,     0,                             offsetof($1, exec_context.runtime_directory) diff --git a/src/core/load-fragment.c b/src/core/load-fragment.c index 243c288885..5b7471c0d0 100644 --- a/src/core/load-fragment.c +++ b/src/core/load-fragment.c @@ -3839,7 +3839,8 @@ int config_parse_namespace_path_strv(          cur = rvalue;          for (;;) {                  _cleanup_free_ char *word = NULL, *resolved = NULL, *joined = NULL; -                bool ignore_enoent; +                const char *w; +                bool ignore_enoent = false, shall_prefix = false;                  r = extract_first_word(&cur, &word, NULL, EXTRACT_QUOTES);                  if (r == 0) @@ -3856,9 +3857,17 @@ int config_parse_namespace_path_strv(                          continue;                  } -                ignore_enoent = word[0] == '-'; +                w = word; +                if (startswith(w, "-")) { +                        ignore_enoent = true; +                        w++; +                } +                if (startswith(w, "+")) { +                        shall_prefix = true; +                        w++; +                } -                r = unit_full_printf(u, word + ignore_enoent, &resolved); +                r = unit_full_printf(u, w, &resolved);                  if (r < 0) {                          log_syntax(unit, LOG_ERR, filename, line, r, "Failed to resolve specifiers in %s: %m", word);                          continue; @@ -3871,7 +3880,9 @@ int config_parse_namespace_path_strv(                  path_kill_slashes(resolved); -                joined = strjoin(ignore_enoent ? "-" : "", resolved); +                joined = strjoin(ignore_enoent ? "-" : "", +                                 shall_prefix ? "+" : "", +                                 resolved);                  r = strv_push(sv, joined);                  if (r < 0) diff --git a/src/core/namespace.c b/src/core/namespace.c index 834883267c..75dca5b791 100644 --- a/src/core/namespace.c +++ b/src/core/namespace.c @@ -30,6 +30,7 @@  #include "dev-setup.h"  #include "fd-util.h"  #include "fs-util.h" +#include "loop-util.h"  #include "loopback-setup.h"  #include "missing.h"  #include "mkdir.h" @@ -52,10 +53,13 @@ typedef enum MountMode {          INACCESSIBLE,          BIND_MOUNT,          BIND_MOUNT_RECURSIVE, -        READONLY,          PRIVATE_TMP,          PRIVATE_VAR_TMP,          PRIVATE_DEV, +        BIND_DEV, +        SYSFS, +        PROCFS, +        READONLY,          READWRITE,  } MountMode; @@ -70,13 +74,13 @@ typedef struct MountEntry {          char *source_malloc;  } MountEntry; -/* - * The following Protect tables are to protect paths and mark some of them - * READONLY, in case a path is covered by an option from another table, then - * it is marked READWRITE in the current one, and the more restrictive mode is - * applied from that other table. This way all options can be combined in a - * safe and comprehensible way for users. - */ +/* If MountAPIVFS= is used, let's mount /sys and /proc into the it, but only as a fallback if the user hasn't mounted + * something there already. These mounts are hence overriden by any other explicitly configured mounts. */ +static const MountEntry apivfs_table[] = { +        { "/proc",               PROCFS,       false }, +        { "/dev",                BIND_DEV,     false }, +        { "/sys",                SYSFS,        false }, +};  /* ProtectKernelTunables= option and the related filesystem APIs */  static const MountEntry protect_kernel_tunables_table[] = { @@ -176,6 +180,13 @@ static const char *mount_entry_source(const MountEntry *p) {          return p->source_malloc ?: p->source_const;  } +static void mount_entry_done(MountEntry *p) { +        assert(p); + +        p->path_malloc = mfree(p->path_malloc); +        p->source_malloc = mfree(p->source_malloc); +} +  static int append_access_mounts(MountEntry **p, char **strv, MountMode mode) {          char **i; @@ -351,7 +362,7 @@ static void drop_duplicates(MountEntry *m, unsigned *n) {                  if (previous && path_equal(mount_entry_path(f), mount_entry_path(previous))) {                          log_debug("%s is duplicate.", mount_entry_path(f));                          previous->read_only = previous->read_only || mount_entry_read_only(f); /* Propagate the read-only flag to the remaining entry */ -                        f->path_malloc = mfree(f->path_malloc); +                        mount_entry_done(f);                          continue;                  } @@ -379,7 +390,7 @@ static void drop_inaccessible(MountEntry *m, unsigned *n) {                   * it, as inaccessible paths really should drop the entire subtree. */                  if (clear && path_startswith(mount_entry_path(f), clear)) {                          log_debug("%s is masked by %s.", mount_entry_path(f), clear); -                        f->path_malloc = mfree(f->path_malloc); +                        mount_entry_done(f);                          continue;                  } @@ -419,7 +430,7 @@ static void drop_nop(MountEntry *m, unsigned *n) {                          /* We found it, let's see if it's the same mode, if so, we can drop this entry */                          if (found && p->mode == f->mode) {                                  log_debug("%s is redundant by %s", mount_entry_path(f), mount_entry_path(p)); -                                f->path_malloc = mfree(f->path_malloc); +                                mount_entry_done(f);                                  continue;                          }                  } @@ -447,7 +458,7 @@ static void drop_outside_root(const char *root_directory, MountEntry *m, unsigne                  if (!path_startswith(mount_entry_path(f), root_directory)) {                          log_debug("%s is outside of root directory.", mount_entry_path(f)); -                        f->path_malloc = mfree(f->path_malloc); +                        mount_entry_done(f);                          continue;                  } @@ -458,7 +469,7 @@ static void drop_outside_root(const char *root_directory, MountEntry *m, unsigne          *n = t - m;  } -static int mount_dev(MountEntry *m) { +static int mount_private_dev(MountEntry *m) {          static const char devnodes[] =                  "/dev/null\0"                  "/dev/zero\0" @@ -597,6 +608,62 @@ fail:          return r;  } +static int mount_bind_dev(MountEntry *m) { +        int r; + +        assert(m); + +        /* Implements the little brother of mount_private_dev(): simply bind mounts the host's /dev into the service's +         * /dev. This is only used when RootDirectory= is set. */ + +        r = path_is_mount_point(mount_entry_path(m), NULL, 0); +        if (r < 0) +                return log_debug_errno(r, "Unable to determine whether /dev is already mounted: %m"); +        if (r > 0) /* make this a NOP if /dev is already a mount point */ +                return 0; + +        if (mount("/dev", mount_entry_path(m), NULL, MS_BIND|MS_REC, NULL) < 0) +                return log_debug_errno(errno, "Failed to bind mount %s: %m", mount_entry_path(m)); + +        return 1; +} + +static int mount_sysfs(MountEntry *m) { +        int r; + +        assert(m); + +        r = path_is_mount_point(mount_entry_path(m), NULL, 0); +        if (r < 0) +                return log_debug_errno(r, "Unable to determine whether /sys is already mounted: %m"); +        if (r > 0) /* make this a NOP if /sys is already a mount point */ +                return 0; + +        /* Bind mount the host's version so that we get all child mounts of it, too. */ +        if (mount("/sys", mount_entry_path(m), NULL, MS_BIND|MS_REC, NULL) < 0) +                return log_debug_errno(errno, "Failed to mount %s: %m", mount_entry_path(m)); + +        return 1; +} + +static int mount_procfs(MountEntry *m) { +        int r; + +        assert(m); + +        r = path_is_mount_point(mount_entry_path(m), NULL, 0); +        if (r < 0) +                return log_debug_errno(r, "Unable to determine whether /proc is already mounted: %m"); +        if (r > 0) /* make this a NOP if /proc is already a mount point */ +                return 0; + +        /* Mount a new instance, so that we get the one that matches our user namespace, if we are running in one */ +        if (mount("proc", mount_entry_path(m), "proc", MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL) < 0) +                return log_debug_errno(errno, "Failed to mount %s: %m", mount_entry_path(m)); + +        return 1; +} +  static int mount_entry_chase(                  const char *root_directory,                  MountEntry *m, @@ -684,6 +751,7 @@ static int apply_mount(          case BIND_MOUNT_RECURSIVE:                  /* Also chase the source mount */ +                  r = mount_entry_chase(root_directory, m, mount_entry_source(m), &m->source_malloc);                  if (r <= 0)                          return r; @@ -700,7 +768,16 @@ static int apply_mount(                  break;          case PRIVATE_DEV: -                return mount_dev(m); +                return mount_private_dev(m); + +        case BIND_DEV: +                return mount_bind_dev(m); + +        case SYSFS: +                return mount_sysfs(m); + +        case PROCFS: +                return mount_procfs(m);          default:                  assert_not_reached("Unknown mode"); @@ -722,7 +799,7 @@ static int make_read_only(MountEntry *m, char **blacklist) {          if (mount_entry_read_only(m))                  r = bind_remount_recursive(mount_entry_path(m), true, blacklist); -        else if (m->mode == PRIVATE_DEV) { /* Can be readonly but the submounts can't*/ +        else if (m->mode == PRIVATE_DEV) { /* Superblock can be readonly but the submounts can't*/                  if (mount(NULL, mount_entry_path(m), NULL, MS_REMOUNT|DEV_MOUNT_OPTIONS|MS_RDONLY, NULL) < 0)                          r = -errno;          } else @@ -738,6 +815,17 @@ static int make_read_only(MountEntry *m, char **blacklist) {          return r;  } +static bool namespace_info_mount_apivfs(const NameSpaceInfo *ns_info) { +        assert(ns_info); + +        /* ProtectControlGroups= and ProtectKernelTunables= imply MountAPIVFS=, since to protect the API VFS mounts, +         * they need to be around in the first place... */ + +        return ns_info->mount_apivfs || +                ns_info->protect_control_groups || +                ns_info->protect_kernel_tunables; +} +  static unsigned namespace_calculate_mounts(                  const NameSpaceInfo *ns_info,                  char** read_write_paths, @@ -774,11 +862,13 @@ static unsigned namespace_calculate_mounts(                  (ns_info->protect_kernel_tunables ? ELEMENTSOF(protect_kernel_tunables_table) : 0) +                  (ns_info->protect_control_groups ? 1 : 0) +                  (ns_info->protect_kernel_modules ? ELEMENTSOF(protect_kernel_modules_table) : 0) + -                protect_home_cnt + protect_system_cnt; +                protect_home_cnt + protect_system_cnt + +                (namespace_info_mount_apivfs(ns_info) ? ELEMENTSOF(apivfs_table) : 0);  }  int setup_namespace(                  const char* root_directory, +                const char* root_image,                  const NameSpaceInfo *ns_info,                  char** read_write_paths,                  char** read_only_paths, @@ -789,16 +879,57 @@ int setup_namespace(                  const char* var_tmp_dir,                  ProtectHome protect_home,                  ProtectSystem protect_system, -                unsigned long mount_flags) { +                unsigned long mount_flags, +                DissectImageFlags dissect_image_flags) { +        _cleanup_(loop_device_unrefp) LoopDevice *loop_device = NULL; +        _cleanup_(decrypted_image_unrefp) DecryptedImage *decrypted_image = NULL; +        _cleanup_(dissected_image_unrefp) DissectedImage *dissected_image = NULL; +        _cleanup_free_ void *root_hash = NULL;          MountEntry *m, *mounts = NULL; +        size_t root_hash_size = 0;          bool make_slave = false;          unsigned n_mounts;          int r = 0; +        assert(ns_info); +          if (mount_flags == 0)                  mount_flags = MS_SHARED; +        if (root_image) { +                dissect_image_flags |= DISSECT_IMAGE_REQUIRE_ROOT; + +                if (protect_system == PROTECT_SYSTEM_STRICT && strv_isempty(read_write_paths)) +                        dissect_image_flags |= DISSECT_IMAGE_READ_ONLY; + +                r = loop_device_make_by_path(root_image, +                                             dissect_image_flags & DISSECT_IMAGE_READ_ONLY ? O_RDONLY : O_RDWR, +                                             &loop_device); +                if (r < 0) +                        return r; + +                r = root_hash_load(root_image, &root_hash, &root_hash_size); +                if (r < 0) +                        return r; + +                r = dissect_image(loop_device->fd, root_hash, root_hash_size, dissect_image_flags, &dissected_image); +                if (r < 0) +                        return r; + +                r = dissected_image_decrypt(dissected_image, NULL, root_hash, root_hash_size, dissect_image_flags, &decrypted_image); +                if (r < 0) +                        return r; + +                if (!root_directory) { +                        /* Create a mount point for the image, if it's still missing. We use the same mount point for +                         * all images, which is safe, since they all live in their own namespaces after all, and hence +                         * won't see each other. */ +                        root_directory = "/run/systemd/unit-root"; +                        (void) mkdir(root_directory, 0700); +                } +        } +          n_mounts = namespace_calculate_mounts(                          ns_info,                          read_write_paths, @@ -878,6 +1009,12 @@ int setup_namespace(                  if (r < 0)                          goto finish; +                if (namespace_info_mount_apivfs(ns_info)) { +                        r = append_static_mounts(&m, apivfs_table, ELEMENTSOF(apivfs_table), ns_info->ignore_protect_paths); +                        if (r < 0) +                                goto finish; +                } +                  assert(mounts + n_mounts == m);                  /* Prepend the root directory where that's necessary */ @@ -907,7 +1044,19 @@ int setup_namespace(                  }          } -        if (root_directory) { +        if (root_image) { +                r = dissected_image_mount(dissected_image, root_directory, dissect_image_flags); +                if (r < 0) +                        goto finish; + +                r = decrypted_image_relinquish(decrypted_image); +                if (r < 0) +                        goto finish; + +                loop_device_relinquish(loop_device); + +        } else if (root_directory) { +                  /* Turn directory into bind mount, if it isn't one yet */                  r = path_is_mount_point(root_directory, NULL, AT_SYMLINK_FOLLOW);                  if (r < 0) @@ -964,7 +1113,7 @@ int setup_namespace(  finish:          for (m = mounts; m < mounts + n_mounts; m++) -                free(m->path_malloc); +                mount_entry_done(m);          return r;  } diff --git a/src/core/namespace.h b/src/core/namespace.h index de3edc419c..f54954bd86 100644 --- a/src/core/namespace.h +++ b/src/core/namespace.h @@ -25,6 +25,7 @@ typedef struct BindMount BindMount;  #include <stdbool.h> +#include "dissect-image.h"  #include "macro.h"  typedef enum ProtectHome { @@ -50,6 +51,7 @@ struct NameSpaceInfo {          bool protect_control_groups:1;          bool protect_kernel_tunables:1;          bool protect_kernel_modules:1; +        bool mount_apivfs:1;  };  struct BindMount { @@ -62,6 +64,7 @@ struct BindMount {  int setup_namespace(                  const char *root_directory, +                const char *root_image,                  const NameSpaceInfo *ns_info,                  char **read_write_paths,                  char **read_only_paths, @@ -72,7 +75,8 @@ int setup_namespace(                  const char *var_tmp_dir,                  ProtectHome protect_home,                  ProtectSystem protect_system, -                unsigned long mount_flags); +                unsigned long mount_flags, +                DissectImageFlags dissected_image_flags);  int setup_tmp_dirs(                  const char *id, diff --git a/src/core/unit.c b/src/core/unit.c index 44f1d5e206..90d7eea956 100644 --- a/src/core/unit.c +++ b/src/core/unit.c @@ -862,6 +862,12 @@ int unit_add_exec_dependencies(Unit *u, ExecContext *c) {                          return r;          } +        if (c->root_image) { +                r = unit_require_mounts_for(u, c->root_image); +                if (r < 0) +                        return r; +        } +          if (!MANAGER_IS_SYSTEM(u->manager))                  return 0; diff --git a/src/dissect/dissect.c b/src/dissect/dissect.c index fd9db5ba87..59bd7d9e84 100644 --- a/src/dissect/dissect.c +++ b/src/dissect/dissect.c @@ -191,6 +191,14 @@ int main(int argc, char *argv[]) {                  goto finish;          } +        if (!arg_root_hash) { +                r = root_hash_load(arg_image, &arg_root_hash, &arg_root_hash_size); +                if (r < 0) { +                        log_error_errno(r, "Failed to read root hash file for %s: %m", arg_image); +                        goto finish; +                } +        } +          r = dissect_image(d->fd, arg_root_hash, arg_root_hash_size, arg_flags, &m);          if (r == -ENOPKG) {                  log_error_errno(r, "Couldn't identify a suitable partition table or file system in %s.", arg_image); diff --git a/src/nspawn/nspawn.c b/src/nspawn/nspawn.c index a8d33ad907..b172b44933 100644 --- a/src/nspawn/nspawn.c +++ b/src/nspawn/nspawn.c @@ -3507,53 +3507,6 @@ static int run(int master,          return 1; /* loop again */  } -static int load_root_hash(const char *image) { -        _cleanup_free_ char *text = NULL, *fn = NULL; -        char *n, *e; -        void *k; -        size_t l; -        int r; - -        assert_se(image); - -        /* Try to load the root hash from a file next to the image file if it exists. */ - -        if (arg_root_hash) -                return 0; - -        fn = new(char, strlen(image) + strlen(".roothash") + 1); -        if (!fn) -                return log_oom(); - -        n = stpcpy(fn, image); -        e = endswith(fn, ".raw"); -        if (e) -                n = e; - -        strcpy(n, ".roothash"); - -        r = read_one_line_file(fn, &text); -        if (r == -ENOENT) -                return 0; -        if (r < 0) { -                log_warning_errno(r, "Failed to read %s, ignoring: %m", fn); -                return 0; -        } - -        r = unhexmem(text, strlen(text), &k, &l); -        if (r < 0) -                return log_error_errno(r, "Invalid root hash: %s", text); -        if (l < sizeof(sd_id128_t)) { -                free(k); -                return log_error_errno(r, "Root hash too short: %s", text); -        } - -        arg_root_hash = k; -        arg_root_hash_size = l; - -        return 0; -} -  int main(int argc, char *argv[]) {          _cleanup_free_ char *console = NULL; @@ -3769,9 +3722,13 @@ int main(int argc, char *argv[]) {                                  goto finish;                          } -                        r = load_root_hash(arg_image); -                        if (r < 0) -                                goto finish; +                        if (!arg_root_hash) { +                                r = root_hash_load(arg_image, &arg_root_hash, &arg_root_hash_size); +                                if (r < 0) { +                                        log_error_errno(r, "Failed to load root hash file for %s: %m", arg_image); +                                        goto finish; +                                } +                        }                  }                  if (!mkdtemp(tmprootdir)) { diff --git a/src/shared/bus-unit-util.c b/src/shared/bus-unit-util.c index 829be2c6da..20c1085697 100644 --- a/src/shared/bus-unit-util.c +++ b/src/shared/bus-unit-util.c @@ -208,7 +208,7 @@ int bus_append_unit_property_assignment(sd_bus_message *m, const char *assignmen                                "PrivateTmp", "PrivateDevices", "PrivateNetwork", "PrivateUsers", "NoNewPrivileges",                                "SyslogLevelPrefix", "Delegate", "RemainAfterElapse", "MemoryDenyWriteExecute",                                "RestrictRealtime", "DynamicUser", "RemoveIPC", "ProtectKernelTunables", -                              "ProtectKernelModules", "ProtectControlGroups")) { +                              "ProtectKernelModules", "ProtectControlGroups", "MountAPIVFS")) {                  r = parse_boolean(eq);                  if (r < 0) @@ -266,7 +266,7 @@ int bus_append_unit_property_assignment(sd_bus_message *m, const char *assignmen                                "StandardInput", "StandardOutput", "StandardError",                                "Description", "Slice", "Type", "WorkingDirectory",                                "RootDirectory", "SyslogIdentifier", "ProtectSystem", -                              "ProtectHome", "SELinuxContext", "Restart")) +                              "ProtectHome", "SELinuxContext", "Restart", "RootImage"))                  r = sd_bus_message_append(m, "v", "s", eq);          else if (streq(field, "SyslogLevel")) { @@ -484,7 +484,7 @@ int bus_append_unit_property_assignment(sd_bus_message *m, const char *assignmen                  for (p = eq;;) {                          _cleanup_free_ char *word = NULL; -                        int offset; +                        size_t offset;                          r = extract_first_word(&p, &word, NULL, EXTRACT_QUOTES);                          if (r < 0) { @@ -500,6 +500,8 @@ int bus_append_unit_property_assignment(sd_bus_message *m, const char *assignmen                          }                          offset = word[0] == '-'; +                        offset += word[offset] == '+'; +                          if (!path_is_absolute(word + offset)) {                                  log_error("Failed to parse %s value %s", field, eq);                                  return -EINVAL; diff --git a/src/shared/dissect-image.c b/src/shared/dissect-image.c index c17486cba2..410a7764ed 100644 --- a/src/shared/dissect-image.c +++ b/src/shared/dissect-image.c @@ -28,14 +28,19 @@  #include "blkid-util.h"  #include "dissect-image.h"  #include "fd-util.h" +#include "fileio.h" +#include "fs-util.h"  #include "gpt.h" +#include "hexdecoct.h"  #include "mount-util.h"  #include "path-util.h"  #include "stat-util.h"  #include "stdio-util.h"  #include "string-table.h"  #include "string-util.h" +#include "strv.h"  #include "udev-util.h" +#include "xattr-util.h"  static int probe_filesystem(const char *node, char **ret_fstype) {  #ifdef HAVE_BLKID @@ -657,7 +662,9 @@ static int mount_partition(                  DissectImageFlags flags) {          const char *p, *options = NULL, *node, *fstype; +        _cleanup_free_ char *chased = NULL;          bool rw; +        int r;          assert(m);          assert(where); @@ -674,9 +681,13 @@ static int mount_partition(          rw = m->rw && !(flags & DISSECT_IMAGE_READ_ONLY); -        if (directory) -                p = strjoina(where, directory); -        else +        if (directory) { +                r = chase_symlinks(directory, where, CHASE_PREFIX_ROOT, &chased); +                if (r < 0) +                        return r; + +                p = chased; +        } else                  p = where;          /* If requested, turn on discard support. */ @@ -710,22 +721,23 @@ int dissected_image_mount(DissectedImage *m, const char *where, DissectImageFlag                  return r;          if (m->partitions[PARTITION_ESP].found) { -                const char *mp, *x; +                const char *mp;                  /* Mount the ESP to /efi if it exists and is empty. If it doesn't exist, use /boot instead. */ -                mp = "/efi"; -                x = strjoina(where, mp); -                r = dir_is_empty(x); -                if (r == -ENOENT) { -                        mp = "/boot"; -                        x = strjoina(where, mp); -                        r = dir_is_empty(x); -                } -                if (r > 0) { -                        r = mount_partition(m->partitions + PARTITION_ESP, where, mp, flags); +                FOREACH_STRING(mp, "/efi", "/boot") { +                        _cleanup_free_ char *p = NULL; + +                        r = chase_symlinks(mp, where, CHASE_PREFIX_ROOT, &p);                          if (r < 0) -                                return r; +                                continue; + +                        r = dir_is_empty(p); +                        if (r > 0) { +                                r = mount_partition(m->partitions + PARTITION_ESP, where, mp, flags); +                                if (r < 0) +                                        return r; +                        }                  }          } @@ -1111,6 +1123,62 @@ int decrypted_image_relinquish(DecryptedImage *d) {          return 0;  } +int root_hash_load(const char *image, void **ret, size_t *ret_size) { +        _cleanup_free_ char *text = NULL; +        _cleanup_free_ void *k = NULL; +        size_t l; +        int r; + +        assert(image); +        assert(ret); +        assert(ret_size); + +        if (is_device_path(image)) { +                /* If we are asked to load the root hash for a device node, exit early */ +                *ret = NULL; +                *ret_size = 0; +                return 0; +        } + +        r = getxattr_malloc(image, "user.verity.roothash", &text, true); +        if (r < 0) { +                char *fn, *e, *n; + +                if (!IN_SET(r, -ENODATA, -EOPNOTSUPP, -ENOENT)) +                        return r; + +                fn = newa(char, strlen(image) + strlen(".roothash") + 1); +                n = stpcpy(fn, image); +                e = endswith(fn, ".raw"); +                if (e) +                        n = e; + +                strcpy(n, ".roothash"); + +                r = read_one_line_file(fn, &text); +                if (r == -ENOENT) { +                        *ret = NULL; +                        *ret_size = 0; +                        return 0; +                } +                if (r < 0) +                        return r; +        } + +        r = unhexmem(text, strlen(text), &k, &l); +        if (r < 0) +                return r; +        if (l < sizeof(sd_id128_t)) +                return -EINVAL; + +        *ret = k; +        *ret_size = l; + +        k = NULL; + +        return 1; +} +  static const char *const partition_designator_table[] = {          [PARTITION_ROOT] = "root",          [PARTITION_ROOT_SECONDARY] = "root-secondary", diff --git a/src/shared/dissect-image.h b/src/shared/dissect-image.h index 26319bd8e7..cdb083be6f 100644 --- a/src/shared/dissect-image.h +++ b/src/shared/dissect-image.h @@ -94,3 +94,5 @@ int decrypted_image_relinquish(DecryptedImage *d);  const char* partition_designator_to_string(int i) _const_;  int partition_designator_from_string(const char *name) _pure_; + +int root_hash_load(const char *image, void **ret, size_t *ret_size); diff --git a/src/test/test-ns.c b/src/test/test-ns.c index c99bcb371b..0125d905a6 100644 --- a/src/test/test-ns.c +++ b/src/test/test-ns.c @@ -77,6 +77,7 @@ int main(int argc, char *argv[]) {                  log_info("Not chrooted");          r = setup_namespace(root_directory, +                            NULL,                              &ns_info,                              (char **) writable,                              (char **) readonly, @@ -86,6 +87,7 @@ int main(int argc, char *argv[]) {                              var_tmp_dir,                              PROTECT_HOME_NO,                              PROTECT_SYSTEM_NO, +                            0,                              0);          if (r < 0) {                  log_error_errno(r, "Failed to setup namespace: %m"); | 
