diff options
Diffstat (limited to 'src')
| -rw-r--r-- | src/core/namespace.c | 171 | 
1 files changed, 113 insertions, 58 deletions
diff --git a/src/core/namespace.c b/src/core/namespace.c
index 3234fab4bc..985e343096 100644
--- a/src/core/namespace.c
+++ b/src/core/namespace.c
@@ -70,6 +70,14 @@ typedef struct TargetMount {
         bool ignore; /* Ignore if path does not exist */
 } TargetMount;
 
+/*
+ * The following Protect tables are to protect paths and mark some of them
+ * READONLY, in case a path is covered by an option from another table, then
+ * it is marked READWRITE in the current one, and the more restrictive mode is
+ * applied from that other table. This way all options can be combined in a
+ * safe and comprehensible way for users.
+ */
+
 /* ProtectKernelTunables= option and the related filesystem APIs */
 static const TargetMount protect_kernel_tunables_table[] = {
         { "/proc/sys",                  READONLY,       false },
@@ -89,6 +97,45 @@ static const TargetMount protect_kernel_tunables_table[] = {
         { "/sys/fs/cgroup",             READWRITE,      false }, /* READONLY is set by ProtectControlGroups= option */
 };
 
+/* ProtectSystem=yes table */
+static const TargetMount protect_system_yes_table[] = {
+        { "/usr",       READONLY,       false },
+        { "/boot",      READONLY,       true  },
+        { "/efi",       READONLY,       true  },
+};
+
+/* ProtectSystem=full includes ProtectSystem=yes */
+static const TargetMount protect_system_full_table[] = {
+        { "/usr",       READONLY,       false },
+        { "/boot",      READONLY,       true  },
+        { "/efi",       READONLY,       true  },
+        { "/etc",       READONLY,       false },
+};
+
+/*
+ * ProtectSystem=strict table. In this strict mode, we mount everything
+ * read-only, except for /proc, /dev, /sys which are the kernel API VFS,
+ * which are left writable, but PrivateDevices= + ProtectKernelTunables=
+ * protect those, and these options should be fully orthogonal.
+ * (And of course /home and friends are also left writable, as ProtectHome=
+ * shall manage those, orthogonally).
+ */
+static const TargetMount protect_system_strict_table[] = {
+        { "/",          READONLY,       false },
+        { "/proc",      READWRITE,      false },      /* ProtectKernelTunables= */
+        { "/sys",       READWRITE,      false },      /* ProtectKernelTunables= */
+        { "/dev",       READWRITE,      false },      /* PrivateDevices= */
+        { "/home",      READWRITE,      true  },      /* ProtectHome= */
+        { "/run/user",  READWRITE,      true  },      /* ProtectHome= */
+        { "/root",      READWRITE,      true  },      /* ProtectHome= */
+};
+
+static void set_bind_mount(BindMount **p, const char *path, MountMode mode, bool ignore) {
+        (*p)->path = path;
+        (*p)->mode = mode;
+        (*p)->ignore = ignore;
+}
+
 static int append_mounts(BindMount **p, char **strv, MountMode mode) {
         char **i;
 
@@ -105,27 +152,71 @@ static int append_mounts(BindMount **p, char **strv, MountMode mode) {
                 if (!path_is_absolute(*i))
                         return -EINVAL;
 
-                (*p)->path = *i;
-                (*p)->mode = mode;
-                (*p)->ignore = ignore;
+                set_bind_mount(p, *i, mode, ignore);
                 (*p)++;
         }
 
         return 0;
 }
 
-static void append_protect_kernel_tunables(BindMount **p, const char *root_directory) {
-        unsigned int i;
+static int append_target_mounts(BindMount **p, const char *root_directory, const TargetMount *mounts, const size_t size) {
+        unsigned i;
 
         assert(p);
+        assert(mounts);
 
-        for (i = 0; i < ELEMENTSOF(protect_kernel_tunables_table); i++) {
-                const TargetMount *t = &protect_kernel_tunables_table[i];
-                (*p)->path = prefix_roota(root_directory, t->path);
-                (*p)->mode = t->mode;
-                (*p)->ignore = t->ignore;
+        for (i = 0; i < size; i++) {
+                /*
+                 * Here we assume that the ignore field is set during
+                 * declaration we do not support "-" at the beginning.
+                 */
+                const TargetMount *m = &mounts[i];
+                const char *path = prefix_roota(root_directory, m->path);
+
+                if (!path_is_absolute(path))
+                        return -EINVAL;
+
+                set_bind_mount(p, path, m->mode, m->ignore);
                 (*p)++;
         }
+
+        return 0;
+}
+
+static int append_protect_kernel_tunables(BindMount **p, const char *root_directory) {
+        assert(p);
+
+        return append_target_mounts(p, root_directory, protect_kernel_tunables_table,
+                                    ELEMENTSOF(protect_kernel_tunables_table));
+}
+
+static int append_protect_system(BindMount **p, const char *root_directory, ProtectSystem protect_system) {
+        int r = 0;
+
+        assert(p);
+
+        if (protect_system == PROTECT_SYSTEM_NO)
+                return 0;
+
+        switch (protect_system) {
+        case PROTECT_SYSTEM_STRICT:
+                r = append_target_mounts(p, root_directory, protect_system_strict_table,
+                                         ELEMENTSOF(protect_system_strict_table));
+                break;
+        case PROTECT_SYSTEM_YES:
+                r = append_target_mounts(p, root_directory, protect_system_yes_table,
+                                         ELEMENTSOF(protect_system_yes_table));
+                break;
+        case PROTECT_SYSTEM_FULL:
+                r = append_target_mounts(p, root_directory, protect_system_full_table,
+                                         ELEMENTSOF(protect_system_full_table));
+                break;
+        default:
+                r = -EINVAL;
+                break;
+        }
+
+        return r;
 }
 
 static int mount_path_compare(const void *a, const void *b) {
@@ -538,6 +629,14 @@ static unsigned namespace_calculate_mounts(
                 ProtectHome protect_home,
                 ProtectSystem protect_system) {
 
+        unsigned protect_system_cnt =
+                (protect_system == PROTECT_SYSTEM_STRICT ?
+                 ELEMENTSOF(protect_system_strict_table) :
+                 ((protect_system == PROTECT_SYSTEM_FULL) ?
+                  ELEMENTSOF(protect_system_full_table) :
+                  ((protect_system == PROTECT_SYSTEM_YES) ?
+                   ELEMENTSOF(protect_system_yes_table) : 0)));
+
         return !!tmp_dir + !!var_tmp_dir +
                 strv_length(read_write_paths) +
                 strv_length(read_only_paths) +
@@ -546,10 +645,7 @@ static unsigned namespace_calculate_mounts(
                 (protect_sysctl ? ELEMENTSOF(protect_kernel_tunables_table) : 0) +
                 (protect_cgroups ? 1 : 0) +
                 (protect_home != PROTECT_HOME_NO || protect_system == PROTECT_SYSTEM_STRICT ? 3 : 0) +
-                (protect_system == PROTECT_SYSTEM_STRICT ?
-                 (2 + !private_dev + !protect_sysctl) :
-                 ((protect_system != PROTECT_SYSTEM_NO ? 3 : 0) +
-                  (protect_system == PROTECT_SYSTEM_FULL ? 1 : 0)));
+                protect_system_cnt;
 }
 
 int setup_namespace(
@@ -648,50 +744,9 @@ int setup_namespace(
                                 return r;
                 }
 
-                if (protect_system == PROTECT_SYSTEM_STRICT) {
-                        /* In strict mode, we mount everything read-only, except for /proc, /dev, /sys which are the
-                         * kernel API VFS, which are left writable, but PrivateDevices= + ProtectKernelTunables=
-                         * protect those, and these options should be fully orthogonal. (And of course /home and
-                         * friends are also left writable, as ProtectHome= shall manage those, orthogonally, see
-                         * above). */
-
-                        m->path = prefix_roota(root_directory, "/");
-                        m->mode = READONLY;
-                        m++;
-
-                        m->path = prefix_roota(root_directory, "/proc");
-                        m->mode = READWRITE;
-                        m++;
-
-                        if (!private_dev) {
-                                m->path = prefix_roota(root_directory, "/dev");
-                                m->mode = READWRITE;
-                                m++;
-                        }
-                        if (!protect_sysctl) {
-                                m->path = prefix_roota(root_directory, "/sys");
-                                m->mode = READWRITE;
-                                m++;
-                        }
-
-                } else if (protect_system != PROTECT_SYSTEM_NO) {
-                        const char *usr_dir, *boot_dir, *efi_dir, *etc_dir;
-
-                        /* In any other mode we simply mark the relevant three directories ready-only. */
-
-                        usr_dir = prefix_roota(root_directory, "/usr");
-                        boot_dir = prefix_roota(root_directory, "/boot");
-                        boot_dir = strjoina("-", boot_dir);
-                        efi_dir = prefix_roota(root_directory, "/efi");
-                        efi_dir = strjoina("-", efi_dir);
-                        etc_dir = prefix_roota(root_directory, "/etc");
-
-                        r = append_mounts(&m, protect_system == PROTECT_SYSTEM_FULL
-                                          ? STRV_MAKE(usr_dir, boot_dir, efi_dir, etc_dir)
-                                          : STRV_MAKE(usr_dir, boot_dir, efi_dir), READONLY);
-                        if (r < 0)
-                                return r;
-                }
+                r = append_protect_system(&m, root_directory, protect_system);
+                if (r < 0)
+                        return r;
 
                 assert(mounts + n == m);
 
