diff options
Diffstat (limited to 'src')
| -rw-r--r-- | src/core/dbus-execute.c | 5 | ||||
| -rw-r--r-- | src/core/execute.c | 67 | ||||
| -rw-r--r-- | src/core/execute.h | 1 | ||||
| -rw-r--r-- | src/core/load-fragment-gperf.gperf.m4 | 1 | ||||
| -rw-r--r-- | src/core/namespace.c | 54 | ||||
| -rw-r--r-- | src/core/namespace.h | 14 | ||||
| -rw-r--r-- | src/core/unit.c | 5 | ||||
| -rw-r--r-- | src/shared/bus-unit-util.c | 3 | ||||
| -rw-r--r-- | src/test/test-execute.c | 14 | ||||
| -rw-r--r-- | src/test/test-ns.c | 12 | 
10 files changed, 147 insertions, 29 deletions
| diff --git a/src/core/dbus-execute.c b/src/core/dbus-execute.c index eec4500c8c..b8720d7d3d 100644 --- a/src/core/dbus-execute.c +++ b/src/core/dbus-execute.c @@ -708,6 +708,7 @@ const sd_bus_vtable bus_exec_vtable[] = {          SD_BUS_PROPERTY("PrivateTmp", "b", bus_property_get_bool, offsetof(ExecContext, private_tmp), SD_BUS_VTABLE_PROPERTY_CONST),          SD_BUS_PROPERTY("PrivateDevices", "b", bus_property_get_bool, offsetof(ExecContext, private_devices), SD_BUS_VTABLE_PROPERTY_CONST),          SD_BUS_PROPERTY("ProtectKernelTunables", "b", bus_property_get_bool, offsetof(ExecContext, protect_kernel_tunables), SD_BUS_VTABLE_PROPERTY_CONST), +        SD_BUS_PROPERTY("ProtectKernelModules", "b", bus_property_get_bool, offsetof(ExecContext, protect_kernel_modules), SD_BUS_VTABLE_PROPERTY_CONST),          SD_BUS_PROPERTY("ProtectControlGroups", "b", bus_property_get_bool, offsetof(ExecContext, protect_control_groups), SD_BUS_VTABLE_PROPERTY_CONST),          SD_BUS_PROPERTY("PrivateNetwork", "b", bus_property_get_bool, offsetof(ExecContext, private_network), SD_BUS_VTABLE_PROPERTY_CONST),          SD_BUS_PROPERTY("PrivateUsers", "b", bus_property_get_bool, offsetof(ExecContext, private_users), SD_BUS_VTABLE_PROPERTY_CONST), @@ -1075,7 +1076,7 @@ int bus_exec_context_set_transient_property(                                "PrivateTmp", "PrivateDevices", "PrivateNetwork", "PrivateUsers",                                "NoNewPrivileges", "SyslogLevelPrefix", "MemoryDenyWriteExecute",                                "RestrictRealtime", "DynamicUser", "RemoveIPC", "ProtectKernelTunables", -                              "ProtectControlGroups")) { +                              "ProtectKernelModules", "ProtectControlGroups")) {                  int b;                  r = sd_bus_message_read(message, "b", &b); @@ -1111,6 +1112,8 @@ int bus_exec_context_set_transient_property(                                  c->remove_ipc = b;                          else if (streq(name, "ProtectKernelTunables"))                                  c->protect_kernel_tunables = b; +                        else if (streq(name, "ProtectKernelModules")) +                                c->protect_kernel_modules = b;                          else if (streq(name, "ProtectControlGroups"))                                  c->protect_control_groups = b; diff --git a/src/core/execute.c b/src/core/execute.c index 0c983f4953..869522704a 100644 --- a/src/core/execute.c +++ b/src/core/execute.c @@ -1436,6 +1436,50 @@ finish:          return r;  } +static int apply_protect_kernel_modules(Unit *u, const ExecContext *c) { +        static const int module_syscalls[] = { +                SCMP_SYS(delete_module), +                SCMP_SYS(finit_module), +                SCMP_SYS(init_module), +        }; + +        scmp_filter_ctx *seccomp; +        unsigned i; +        int r; + +        assert(c); + +        /* Turn of module syscalls on ProtectKernelModules=yes */ + +        if (skip_seccomp_unavailable(u, "ProtectKernelModules=")) +                return 0; + +        seccomp = seccomp_init(SCMP_ACT_ALLOW); +        if (!seccomp) +                return -ENOMEM; + +        r = seccomp_add_secondary_archs(seccomp); +        if (r < 0) +                goto finish; + +        for (i = 0; i < ELEMENTSOF(module_syscalls); i++) { +                r = seccomp_rule_add(seccomp, SCMP_ACT_ERRNO(EPERM), +                                     module_syscalls[i], 0); +                if (r < 0) +                        goto finish; +        } + +        r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0); +        if (r < 0) +                goto finish; + +        r = seccomp_load(seccomp); + +finish: +        seccomp_release(seccomp); +        return r; +} +  static int apply_private_devices(Unit *u, const ExecContext *c) {          const SystemCallFilterSet *set;          scmp_filter_ctx *seccomp; @@ -1722,6 +1766,7 @@ static bool exec_needs_mount_namespace(              context->protect_system != PROTECT_SYSTEM_NO ||              context->protect_home != PROTECT_HOME_NO ||              context->protect_kernel_tunables || +            context->protect_kernel_modules ||              context->protect_control_groups)                  return true; @@ -2070,6 +2115,8 @@ static bool context_has_no_new_privileges(const ExecContext *c) {                  c->memory_deny_write_execute ||                  c->restrict_realtime ||                  c->protect_kernel_tunables || +                c->protect_kernel_modules || +                c->private_devices ||                  context_has_syscall_filters(c);  } @@ -2449,6 +2496,12 @@ static int exec_child(          if (needs_mount_namespace) {                  _cleanup_free_ char **rw = NULL;                  char *tmp = NULL, *var = NULL; +                NameSpaceInfo ns_info = { +                        .private_dev = context->private_devices, +                        .protect_control_groups = context->protect_control_groups, +                        .protect_kernel_tunables = context->protect_kernel_tunables, +                        .protect_kernel_modules = context->protect_kernel_modules, +                };                  /* The runtime struct only contains the parent                   * of the private /tmp, which is @@ -2471,14 +2524,12 @@ static int exec_child(                  r = setup_namespace(                                  (params->flags & EXEC_APPLY_CHROOT) ? context->root_directory : NULL, +                                &ns_info,                                  rw,                                  context->read_only_paths,                                  context->inaccessible_paths,                                  tmp,                                  var, -                                context->private_devices, -                                context->protect_kernel_tunables, -                                context->protect_control_groups,                                  context->protect_home,                                  context->protect_system,                                  context->mount_flags); @@ -2690,6 +2741,14 @@ static int exec_child(                          }                  } +                if (context->protect_kernel_modules) { +                        r = apply_protect_kernel_modules(unit, context); +                        if (r < 0) { +                                *exit_status = EXIT_SECCOMP; +                                return r; +                        } +                } +                  if (context->private_devices) {                          r = apply_private_devices(unit, context);                          if (r < 0) { @@ -3131,6 +3190,7 @@ void exec_context_dump(ExecContext *c, FILE* f, const char *prefix) {                  "%sPrivateTmp: %s\n"                  "%sPrivateDevices: %s\n"                  "%sProtectKernelTunables: %s\n" +                "%sProtectKernelModules: %s\n"                  "%sProtectControlGroups: %s\n"                  "%sPrivateNetwork: %s\n"                  "%sPrivateUsers: %s\n" @@ -3146,6 +3206,7 @@ void exec_context_dump(ExecContext *c, FILE* f, const char *prefix) {                  prefix, yes_no(c->private_tmp),                  prefix, yes_no(c->private_devices),                  prefix, yes_no(c->protect_kernel_tunables), +                prefix, yes_no(c->protect_kernel_modules),                  prefix, yes_no(c->protect_control_groups),                  prefix, yes_no(c->private_network),                  prefix, yes_no(c->private_users), diff --git a/src/core/execute.h b/src/core/execute.h index 449180c903..1de439c3ad 100644 --- a/src/core/execute.h +++ b/src/core/execute.h @@ -175,6 +175,7 @@ struct ExecContext {          ProtectSystem protect_system;          ProtectHome protect_home;          bool protect_kernel_tunables; +        bool protect_kernel_modules;          bool protect_control_groups;          bool no_new_privileges; diff --git a/src/core/load-fragment-gperf.gperf.m4 b/src/core/load-fragment-gperf.gperf.m4 index c49c1d6732..a700d853cc 100644 --- a/src/core/load-fragment-gperf.gperf.m4 +++ b/src/core/load-fragment-gperf.gperf.m4 @@ -90,6 +90,7 @@ $1.InaccessiblePaths,            config_parse_namespace_path_strv,   0,  $1.PrivateTmp,                   config_parse_bool,                  0,                             offsetof($1, exec_context.private_tmp)  $1.PrivateDevices,               config_parse_bool,                  0,                             offsetof($1, exec_context.private_devices)  $1.ProtectKernelTunables,        config_parse_bool,                  0,                             offsetof($1, exec_context.protect_kernel_tunables) +$1.ProtectKernelModules,         config_parse_bool,                  0,                             offsetof($1, exec_context.protect_kernel_modules)  $1.ProtectControlGroups,         config_parse_bool,                  0,                             offsetof($1, exec_context.protect_control_groups)  $1.PrivateNetwork,               config_parse_bool,                  0,                             offsetof($1, exec_context.private_network)  $1.PrivateUsers,                 config_parse_bool,                  0,                             offsetof($1, exec_context.private_users) diff --git a/src/core/namespace.c b/src/core/namespace.c index 43a2f4ba6e..1195e9a854 100644 --- a/src/core/namespace.c +++ b/src/core/namespace.c @@ -97,6 +97,14 @@ static const TargetMount protect_kernel_tunables_table[] = {          { "/sys/fs/cgroup",             READWRITE,      false }, /* READONLY is set by ProtectControlGroups= option */  }; +/* ProtectKernelModules= option */ +static const TargetMount protect_kernel_modules_table[] = { +#ifdef HAVE_SPLIT_USR +        { "/lib/modules",               INACCESSIBLE,   true  }, +#endif +        { "/usr/lib/modules",           INACCESSIBLE,   true  }, +}; +  /*   * ProtectHome=read-only table, protect $HOME and $XDG_RUNTIME_DIR and rest of   * system should be protected by ProtectSystem= @@ -207,6 +215,13 @@ static int append_protect_kernel_tunables(BindMount **p, const char *root_direct                                      ELEMENTSOF(protect_kernel_tunables_table));  } +static int append_protect_kernel_modules(BindMount **p, const char *root_directory) { +        assert(p); + +        return append_target_mounts(p, root_directory, protect_kernel_modules_table, +                                    ELEMENTSOF(protect_kernel_modules_table)); +} +  static int append_protect_home(BindMount **p, const char *root_directory, ProtectHome protect_home) {          int r = 0; @@ -660,14 +675,12 @@ static int chase_all_symlinks(const char *root_directory, BindMount *m, unsigned  }  static unsigned namespace_calculate_mounts( +                const NameSpaceInfo *ns_info,                  char** read_write_paths,                  char** read_only_paths,                  char** inaccessible_paths,                  const char* tmp_dir,                  const char* var_tmp_dir, -                bool private_dev, -                bool protect_sysctl, -                bool protect_cgroups,                  ProtectHome protect_home,                  ProtectSystem protect_system) { @@ -690,22 +703,21 @@ static unsigned namespace_calculate_mounts(                  strv_length(read_write_paths) +                  strv_length(read_only_paths) +                  strv_length(inaccessible_paths) + -                private_dev + -                (protect_sysctl ? ELEMENTSOF(protect_kernel_tunables_table) : 0) + -                (protect_cgroups ? 1 : 0) + +                ns_info->private_dev + +                (ns_info->protect_kernel_tunables ? ELEMENTSOF(protect_kernel_tunables_table) : 0) + +                (ns_info->protect_control_groups ? 1 : 0) + +                (ns_info->protect_kernel_modules ? ELEMENTSOF(protect_kernel_modules_table) : 0) +                  protect_home_cnt + protect_system_cnt;  }  int setup_namespace(                  const char* root_directory, +                const NameSpaceInfo *ns_info,                  char** read_write_paths,                  char** read_only_paths,                  char** inaccessible_paths,                  const char* tmp_dir,                  const char* var_tmp_dir, -                bool private_dev, -                bool protect_sysctl, -                bool protect_cgroups,                  ProtectHome protect_home,                  ProtectSystem protect_system,                  unsigned long mount_flags) { @@ -718,13 +730,12 @@ int setup_namespace(          if (mount_flags == 0)                  mount_flags = MS_SHARED; -        n = namespace_calculate_mounts(read_write_paths, +        n = namespace_calculate_mounts(ns_info, +                                       read_write_paths,                                         read_only_paths,                                         inaccessible_paths,                                         tmp_dir, var_tmp_dir, -                                       private_dev, protect_sysctl, -                                       protect_cgroups, protect_home, -                                       protect_system); +                                       protect_home, protect_system);          /* Set mount slave mode */          if (root_directory || n > 0) @@ -756,16 +767,25 @@ int setup_namespace(                          m++;                  } -                if (private_dev) { +                if (ns_info->private_dev) {                          m->path = prefix_roota(root_directory, "/dev");                          m->mode = PRIVATE_DEV;                          m++;                  } -                if (protect_sysctl) -                        append_protect_kernel_tunables(&m, root_directory); +                if (ns_info->protect_kernel_tunables) { +                        r = append_protect_kernel_tunables(&m, root_directory); +                        if (r < 0) +                                return r; +                } + +                if (ns_info->protect_kernel_modules) { +                        r = append_protect_kernel_modules(&m, root_directory); +                        if (r < 0) +                                return r; +                } -                if (protect_cgroups) { +                if (ns_info->protect_control_groups) {                          m->path = prefix_roota(root_directory, "/sys/fs/cgroup");                          m->mode = READONLY;                          m++; diff --git a/src/core/namespace.h b/src/core/namespace.h index 6505bcc499..6310638e9a 100644 --- a/src/core/namespace.h +++ b/src/core/namespace.h @@ -4,6 +4,7 @@    This file is part of systemd.    Copyright 2010 Lennart Poettering +  Copyright 2016 Djalal Harouni    systemd is free software; you can redistribute it and/or modify it    under the terms of the GNU Lesser General Public License as published by @@ -19,6 +20,8 @@    along with systemd; If not, see <http://www.gnu.org/licenses/>.  ***/ +typedef struct NameSpaceInfo NameSpaceInfo; +  #include <stdbool.h>  #include "macro.h" @@ -40,15 +43,20 @@ typedef enum ProtectSystem {          _PROTECT_SYSTEM_INVALID = -1  } ProtectSystem; +struct NameSpaceInfo { +        bool private_dev:1; +        bool protect_control_groups:1; +        bool protect_kernel_tunables:1; +        bool protect_kernel_modules:1; +}; +  int setup_namespace(const char *chroot, +                    const NameSpaceInfo *ns_info,                      char **read_write_paths,                      char **read_only_paths,                      char **inaccessible_paths,                      const char *tmp_dir,                      const char *var_tmp_dir, -                    bool private_dev, -                    bool protect_sysctl, -                    bool protect_cgroups,                      ProtectHome protect_home,                      ProtectSystem protect_system,                      unsigned long mount_flags); diff --git a/src/core/unit.c b/src/core/unit.c index 690f7f7dd9..67668bdc48 100644 --- a/src/core/unit.c +++ b/src/core/unit.c @@ -3399,7 +3399,10 @@ int unit_patch_contexts(Unit *u) {                          ec->no_new_privileges = true;                  if (ec->private_devices) -                        ec->capability_bounding_set &= ~(UINT64_C(1) << CAP_MKNOD); +                        ec->capability_bounding_set &= ~((UINT64_C(1) << CAP_MKNOD) | (UINT64_C(1) << CAP_SYS_RAWIO)); + +                if (ec->protect_kernel_modules) +                        ec->capability_bounding_set &= ~(UINT64_C(1) << CAP_SYS_MODULE);                  if (ec->dynamic_user) {                          if (!ec->user) { diff --git a/src/shared/bus-unit-util.c b/src/shared/bus-unit-util.c index a550a370b5..f639e0e832 100644 --- a/src/shared/bus-unit-util.c +++ b/src/shared/bus-unit-util.c @@ -204,7 +204,8 @@ int bus_append_unit_property_assignment(sd_bus_message *m, const char *assignmen                                "IgnoreSIGPIPE", "TTYVHangup", "TTYReset", "RemainAfterExit",                                "PrivateTmp", "PrivateDevices", "PrivateNetwork", "PrivateUsers", "NoNewPrivileges",                                "SyslogLevelPrefix", "Delegate", "RemainAfterElapse", "MemoryDenyWriteExecute", -                              "RestrictRealtime", "DynamicUser", "RemoveIPC", "ProtectKernelTunables", "ProtectControlGroups")) { +                              "RestrictRealtime", "DynamicUser", "RemoveIPC", "ProtectKernelTunables", +                              "ProtectKernelModules", "ProtectControlGroups")) {                  r = parse_boolean(eq);                  if (r < 0) diff --git a/src/test/test-execute.c b/src/test/test-execute.c index 8b4ff22495..e8ff02adaf 100644 --- a/src/test/test-execute.c +++ b/src/test/test-execute.c @@ -140,6 +140,19 @@ static void test_exec_privatedevices_capabilities(Manager *m) {          }          test(m, "exec-privatedevices-yes-capability-mknod.service", 0, CLD_EXITED);          test(m, "exec-privatedevices-no-capability-mknod.service", 0, CLD_EXITED); +        test(m, "exec-privatedevices-yes-capability-sys-rawio.service", 0, CLD_EXITED); +        test(m, "exec-privatedevices-no-capability-sys-rawio.service", 0, CLD_EXITED); +} + +static void test_exec_protectkernelmodules(Manager *m) { +        if (detect_container() > 0) { +                log_notice("testing in container, skipping protectkernelmodules tests"); +                return; +        } + +        test(m, "exec-protectkernelmodules-no-capabilities.service", 0, CLD_EXITED); +        test(m, "exec-protectkernelmodules-yes-capabilities.service", 0, CLD_EXITED); +        test(m, "exec-protectkernelmodules-yes-mount-propagation.service", 0, CLD_EXITED);  }  static void test_exec_readonlypaths(Manager *m) { @@ -368,6 +381,7 @@ int main(int argc, char *argv[]) {                  test_exec_privatetmp,                  test_exec_privatedevices,                  test_exec_privatedevices_capabilities, +                test_exec_protectkernelmodules,                  test_exec_readonlypaths,                  test_exec_readwritepaths,                  test_exec_inaccessiblepaths, diff --git a/src/test/test-ns.c b/src/test/test-ns.c index c4d4da6d05..da7a8b0565 100644 --- a/src/test/test-ns.c +++ b/src/test/test-ns.c @@ -45,6 +45,14 @@ int main(int argc, char *argv[]) {                  "/home/lennart/projects",                  NULL          }; + +        static const NameSpaceInfo ns_info = { +                .private_dev = true, +                .protect_control_groups = true, +                .protect_kernel_tunables = true, +                .protect_kernel_modules = true, +        }; +          char *root_directory;          char *projects_directory;          int r; @@ -69,14 +77,12 @@ int main(int argc, char *argv[]) {                  log_info("Not chrooted");          r = setup_namespace(root_directory, +                            &ns_info,                              (char **) writable,                              (char **) readonly,                              (char **) inaccessible,                              tmp_dir,                              var_tmp_dir, -                            true, -                            true, -                            true,                              PROTECT_HOME_NO,                              PROTECT_SYSTEM_NO,                              0); | 
