summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorLennart Poettering <lennart@poettering.net>2016-10-13 18:36:29 +0200
committerGitHub <noreply@github.com>2016-10-13 18:36:29 +0200
commit8bfdf29b2492f7df721d20455ee10b2fd158395b (patch)
treecf83f461afa4925386f5d7467d3480d01c79e5c6 /src
parentf5df066d1d28920e49cf03d5950330138ea4f236 (diff)
parent4982dbcc300d4599aa6ac143e922d6fbee31a860 (diff)
Merge pull request #4243 from endocode/djalal/sandbox-first-protection-kernelmodules-v1
core:sandbox: Add ProtectKernelModules= and some fixes
Diffstat (limited to 'src')
-rw-r--r--src/core/dbus-execute.c5
-rw-r--r--src/core/execute.c67
-rw-r--r--src/core/execute.h1
-rw-r--r--src/core/load-fragment-gperf.gperf.m41
-rw-r--r--src/core/namespace.c54
-rw-r--r--src/core/namespace.h14
-rw-r--r--src/core/unit.c5
-rw-r--r--src/shared/bus-unit-util.c3
-rw-r--r--src/test/test-execute.c14
-rw-r--r--src/test/test-ns.c12
10 files changed, 147 insertions, 29 deletions
diff --git a/src/core/dbus-execute.c b/src/core/dbus-execute.c
index eec4500c8c..b8720d7d3d 100644
--- a/src/core/dbus-execute.c
+++ b/src/core/dbus-execute.c
@@ -708,6 +708,7 @@ const sd_bus_vtable bus_exec_vtable[] = {
SD_BUS_PROPERTY("PrivateTmp", "b", bus_property_get_bool, offsetof(ExecContext, private_tmp), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("PrivateDevices", "b", bus_property_get_bool, offsetof(ExecContext, private_devices), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("ProtectKernelTunables", "b", bus_property_get_bool, offsetof(ExecContext, protect_kernel_tunables), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("ProtectKernelModules", "b", bus_property_get_bool, offsetof(ExecContext, protect_kernel_modules), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("ProtectControlGroups", "b", bus_property_get_bool, offsetof(ExecContext, protect_control_groups), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("PrivateNetwork", "b", bus_property_get_bool, offsetof(ExecContext, private_network), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("PrivateUsers", "b", bus_property_get_bool, offsetof(ExecContext, private_users), SD_BUS_VTABLE_PROPERTY_CONST),
@@ -1075,7 +1076,7 @@ int bus_exec_context_set_transient_property(
"PrivateTmp", "PrivateDevices", "PrivateNetwork", "PrivateUsers",
"NoNewPrivileges", "SyslogLevelPrefix", "MemoryDenyWriteExecute",
"RestrictRealtime", "DynamicUser", "RemoveIPC", "ProtectKernelTunables",
- "ProtectControlGroups")) {
+ "ProtectKernelModules", "ProtectControlGroups")) {
int b;
r = sd_bus_message_read(message, "b", &b);
@@ -1111,6 +1112,8 @@ int bus_exec_context_set_transient_property(
c->remove_ipc = b;
else if (streq(name, "ProtectKernelTunables"))
c->protect_kernel_tunables = b;
+ else if (streq(name, "ProtectKernelModules"))
+ c->protect_kernel_modules = b;
else if (streq(name, "ProtectControlGroups"))
c->protect_control_groups = b;
diff --git a/src/core/execute.c b/src/core/execute.c
index 0c983f4953..869522704a 100644
--- a/src/core/execute.c
+++ b/src/core/execute.c
@@ -1436,6 +1436,50 @@ finish:
return r;
}
+static int apply_protect_kernel_modules(Unit *u, const ExecContext *c) {
+ static const int module_syscalls[] = {
+ SCMP_SYS(delete_module),
+ SCMP_SYS(finit_module),
+ SCMP_SYS(init_module),
+ };
+
+ scmp_filter_ctx *seccomp;
+ unsigned i;
+ int r;
+
+ assert(c);
+
+ /* Turn off module syscalls on ProtectKernelModules=yes */
+
+ if (skip_seccomp_unavailable(u, "ProtectKernelModules="))
+ return 0;
+
+ seccomp = seccomp_init(SCMP_ACT_ALLOW);
+ if (!seccomp)
+ return -ENOMEM;
+
+ r = seccomp_add_secondary_archs(seccomp);
+ if (r < 0)
+ goto finish;
+
+ for (i = 0; i < ELEMENTSOF(module_syscalls); i++) {
+ r = seccomp_rule_add(seccomp, SCMP_ACT_ERRNO(EPERM),
+ module_syscalls[i], 0);
+ if (r < 0)
+ goto finish;
+ }
+
+ r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
+ if (r < 0)
+ goto finish;
+
+ r = seccomp_load(seccomp);
+
+finish:
+ seccomp_release(seccomp);
+ return r;
+}
+
static int apply_private_devices(Unit *u, const ExecContext *c) {
const SystemCallFilterSet *set;
scmp_filter_ctx *seccomp;
@@ -1722,6 +1766,7 @@ static bool exec_needs_mount_namespace(
context->protect_system != PROTECT_SYSTEM_NO ||
context->protect_home != PROTECT_HOME_NO ||
context->protect_kernel_tunables ||
+ context->protect_kernel_modules ||
context->protect_control_groups)
return true;
@@ -2070,6 +2115,8 @@ static bool context_has_no_new_privileges(const ExecContext *c) {
c->memory_deny_write_execute ||
c->restrict_realtime ||
c->protect_kernel_tunables ||
+ c->protect_kernel_modules ||
+ c->private_devices ||
context_has_syscall_filters(c);
}
@@ -2449,6 +2496,12 @@ static int exec_child(
if (needs_mount_namespace) {
_cleanup_free_ char **rw = NULL;
char *tmp = NULL, *var = NULL;
+ NameSpaceInfo ns_info = {
+ .private_dev = context->private_devices,
+ .protect_control_groups = context->protect_control_groups,
+ .protect_kernel_tunables = context->protect_kernel_tunables,
+ .protect_kernel_modules = context->protect_kernel_modules,
+ };
/* The runtime struct only contains the parent
* of the private /tmp, which is
@@ -2471,14 +2524,12 @@ static int exec_child(
r = setup_namespace(
(params->flags & EXEC_APPLY_CHROOT) ? context->root_directory : NULL,
+ &ns_info,
rw,
context->read_only_paths,
context->inaccessible_paths,
tmp,
var,
- context->private_devices,
- context->protect_kernel_tunables,
- context->protect_control_groups,
context->protect_home,
context->protect_system,
context->mount_flags);
@@ -2690,6 +2741,14 @@ static int exec_child(
}
}
+ if (context->protect_kernel_modules) {
+ r = apply_protect_kernel_modules(unit, context);
+ if (r < 0) {
+ *exit_status = EXIT_SECCOMP;
+ return r;
+ }
+ }
+
if (context->private_devices) {
r = apply_private_devices(unit, context);
if (r < 0) {
@@ -3131,6 +3190,7 @@ void exec_context_dump(ExecContext *c, FILE* f, const char *prefix) {
"%sPrivateTmp: %s\n"
"%sPrivateDevices: %s\n"
"%sProtectKernelTunables: %s\n"
+ "%sProtectKernelModules: %s\n"
"%sProtectControlGroups: %s\n"
"%sPrivateNetwork: %s\n"
"%sPrivateUsers: %s\n"
@@ -3146,6 +3206,7 @@ void exec_context_dump(ExecContext *c, FILE* f, const char *prefix) {
prefix, yes_no(c->private_tmp),
prefix, yes_no(c->private_devices),
prefix, yes_no(c->protect_kernel_tunables),
+ prefix, yes_no(c->protect_kernel_modules),
prefix, yes_no(c->protect_control_groups),
prefix, yes_no(c->private_network),
prefix, yes_no(c->private_users),
diff --git a/src/core/execute.h b/src/core/execute.h
index 449180c903..1de439c3ad 100644
--- a/src/core/execute.h
+++ b/src/core/execute.h
@@ -175,6 +175,7 @@ struct ExecContext {
ProtectSystem protect_system;
ProtectHome protect_home;
bool protect_kernel_tunables;
+ bool protect_kernel_modules;
bool protect_control_groups;
bool no_new_privileges;
diff --git a/src/core/load-fragment-gperf.gperf.m4 b/src/core/load-fragment-gperf.gperf.m4
index c49c1d6732..a700d853cc 100644
--- a/src/core/load-fragment-gperf.gperf.m4
+++ b/src/core/load-fragment-gperf.gperf.m4
@@ -90,6 +90,7 @@ $1.InaccessiblePaths, config_parse_namespace_path_strv, 0,
$1.PrivateTmp, config_parse_bool, 0, offsetof($1, exec_context.private_tmp)
$1.PrivateDevices, config_parse_bool, 0, offsetof($1, exec_context.private_devices)
$1.ProtectKernelTunables, config_parse_bool, 0, offsetof($1, exec_context.protect_kernel_tunables)
+$1.ProtectKernelModules, config_parse_bool, 0, offsetof($1, exec_context.protect_kernel_modules)
$1.ProtectControlGroups, config_parse_bool, 0, offsetof($1, exec_context.protect_control_groups)
$1.PrivateNetwork, config_parse_bool, 0, offsetof($1, exec_context.private_network)
$1.PrivateUsers, config_parse_bool, 0, offsetof($1, exec_context.private_users)
diff --git a/src/core/namespace.c b/src/core/namespace.c
index 43a2f4ba6e..1195e9a854 100644
--- a/src/core/namespace.c
+++ b/src/core/namespace.c
@@ -97,6 +97,14 @@ static const TargetMount protect_kernel_tunables_table[] = {
{ "/sys/fs/cgroup", READWRITE, false }, /* READONLY is set by ProtectControlGroups= option */
};
+/* ProtectKernelModules= option */
+static const TargetMount protect_kernel_modules_table[] = {
+#ifdef HAVE_SPLIT_USR
+ { "/lib/modules", INACCESSIBLE, true },
+#endif
+ { "/usr/lib/modules", INACCESSIBLE, true },
+};
+
/*
* ProtectHome=read-only table, protect $HOME and $XDG_RUNTIME_DIR and rest of
* system should be protected by ProtectSystem=
@@ -207,6 +215,13 @@ static int append_protect_kernel_tunables(BindMount **p, const char *root_direct
ELEMENTSOF(protect_kernel_tunables_table));
}
+static int append_protect_kernel_modules(BindMount **p, const char *root_directory) {
+ assert(p);
+
+ return append_target_mounts(p, root_directory, protect_kernel_modules_table,
+ ELEMENTSOF(protect_kernel_modules_table));
+}
+
static int append_protect_home(BindMount **p, const char *root_directory, ProtectHome protect_home) {
int r = 0;
@@ -660,14 +675,12 @@ static int chase_all_symlinks(const char *root_directory, BindMount *m, unsigned
}
static unsigned namespace_calculate_mounts(
+ const NameSpaceInfo *ns_info,
char** read_write_paths,
char** read_only_paths,
char** inaccessible_paths,
const char* tmp_dir,
const char* var_tmp_dir,
- bool private_dev,
- bool protect_sysctl,
- bool protect_cgroups,
ProtectHome protect_home,
ProtectSystem protect_system) {
@@ -690,22 +703,21 @@ static unsigned namespace_calculate_mounts(
strv_length(read_write_paths) +
strv_length(read_only_paths) +
strv_length(inaccessible_paths) +
- private_dev +
- (protect_sysctl ? ELEMENTSOF(protect_kernel_tunables_table) : 0) +
- (protect_cgroups ? 1 : 0) +
+ ns_info->private_dev +
+ (ns_info->protect_kernel_tunables ? ELEMENTSOF(protect_kernel_tunables_table) : 0) +
+ (ns_info->protect_control_groups ? 1 : 0) +
+ (ns_info->protect_kernel_modules ? ELEMENTSOF(protect_kernel_modules_table) : 0) +
protect_home_cnt + protect_system_cnt;
}
int setup_namespace(
const char* root_directory,
+ const NameSpaceInfo *ns_info,
char** read_write_paths,
char** read_only_paths,
char** inaccessible_paths,
const char* tmp_dir,
const char* var_tmp_dir,
- bool private_dev,
- bool protect_sysctl,
- bool protect_cgroups,
ProtectHome protect_home,
ProtectSystem protect_system,
unsigned long mount_flags) {
@@ -718,13 +730,12 @@ int setup_namespace(
if (mount_flags == 0)
mount_flags = MS_SHARED;
- n = namespace_calculate_mounts(read_write_paths,
+ n = namespace_calculate_mounts(ns_info,
+ read_write_paths,
read_only_paths,
inaccessible_paths,
tmp_dir, var_tmp_dir,
- private_dev, protect_sysctl,
- protect_cgroups, protect_home,
- protect_system);
+ protect_home, protect_system);
/* Set mount slave mode */
if (root_directory || n > 0)
@@ -756,16 +767,25 @@ int setup_namespace(
m++;
}
- if (private_dev) {
+ if (ns_info->private_dev) {
m->path = prefix_roota(root_directory, "/dev");
m->mode = PRIVATE_DEV;
m++;
}
- if (protect_sysctl)
- append_protect_kernel_tunables(&m, root_directory);
+ if (ns_info->protect_kernel_tunables) {
+ r = append_protect_kernel_tunables(&m, root_directory);
+ if (r < 0)
+ return r;
+ }
+
+ if (ns_info->protect_kernel_modules) {
+ r = append_protect_kernel_modules(&m, root_directory);
+ if (r < 0)
+ return r;
+ }
- if (protect_cgroups) {
+ if (ns_info->protect_control_groups) {
m->path = prefix_roota(root_directory, "/sys/fs/cgroup");
m->mode = READONLY;
m++;
diff --git a/src/core/namespace.h b/src/core/namespace.h
index 6505bcc499..6310638e9a 100644
--- a/src/core/namespace.h
+++ b/src/core/namespace.h
@@ -4,6 +4,7 @@
This file is part of systemd.
Copyright 2010 Lennart Poettering
+ Copyright 2016 Djalal Harouni
systemd is free software; you can redistribute it and/or modify it
under the terms of the GNU Lesser General Public License as published by
@@ -19,6 +20,8 @@
along with systemd; If not, see <http://www.gnu.org/licenses/>.
***/
+typedef struct NameSpaceInfo NameSpaceInfo;
+
#include <stdbool.h>
#include "macro.h"
@@ -40,15 +43,20 @@ typedef enum ProtectSystem {
_PROTECT_SYSTEM_INVALID = -1
} ProtectSystem;
+struct NameSpaceInfo {
+ bool private_dev:1;
+ bool protect_control_groups:1;
+ bool protect_kernel_tunables:1;
+ bool protect_kernel_modules:1;
+};
+
int setup_namespace(const char *chroot,
+ const NameSpaceInfo *ns_info,
char **read_write_paths,
char **read_only_paths,
char **inaccessible_paths,
const char *tmp_dir,
const char *var_tmp_dir,
- bool private_dev,
- bool protect_sysctl,
- bool protect_cgroups,
ProtectHome protect_home,
ProtectSystem protect_system,
unsigned long mount_flags);
diff --git a/src/core/unit.c b/src/core/unit.c
index 690f7f7dd9..67668bdc48 100644
--- a/src/core/unit.c
+++ b/src/core/unit.c
@@ -3399,7 +3399,10 @@ int unit_patch_contexts(Unit *u) {
ec->no_new_privileges = true;
if (ec->private_devices)
- ec->capability_bounding_set &= ~(UINT64_C(1) << CAP_MKNOD);
+ ec->capability_bounding_set &= ~((UINT64_C(1) << CAP_MKNOD) | (UINT64_C(1) << CAP_SYS_RAWIO));
+
+ if (ec->protect_kernel_modules)
+ ec->capability_bounding_set &= ~(UINT64_C(1) << CAP_SYS_MODULE);
if (ec->dynamic_user) {
if (!ec->user) {
diff --git a/src/shared/bus-unit-util.c b/src/shared/bus-unit-util.c
index a550a370b5..f639e0e832 100644
--- a/src/shared/bus-unit-util.c
+++ b/src/shared/bus-unit-util.c
@@ -204,7 +204,8 @@ int bus_append_unit_property_assignment(sd_bus_message *m, const char *assignmen
"IgnoreSIGPIPE", "TTYVHangup", "TTYReset", "RemainAfterExit",
"PrivateTmp", "PrivateDevices", "PrivateNetwork", "PrivateUsers", "NoNewPrivileges",
"SyslogLevelPrefix", "Delegate", "RemainAfterElapse", "MemoryDenyWriteExecute",
- "RestrictRealtime", "DynamicUser", "RemoveIPC", "ProtectKernelTunables", "ProtectControlGroups")) {
+ "RestrictRealtime", "DynamicUser", "RemoveIPC", "ProtectKernelTunables",
+ "ProtectKernelModules", "ProtectControlGroups")) {
r = parse_boolean(eq);
if (r < 0)
diff --git a/src/test/test-execute.c b/src/test/test-execute.c
index 8b4ff22495..e8ff02adaf 100644
--- a/src/test/test-execute.c
+++ b/src/test/test-execute.c
@@ -140,6 +140,19 @@ static void test_exec_privatedevices_capabilities(Manager *m) {
}
test(m, "exec-privatedevices-yes-capability-mknod.service", 0, CLD_EXITED);
test(m, "exec-privatedevices-no-capability-mknod.service", 0, CLD_EXITED);
+ test(m, "exec-privatedevices-yes-capability-sys-rawio.service", 0, CLD_EXITED);
+ test(m, "exec-privatedevices-no-capability-sys-rawio.service", 0, CLD_EXITED);
+}
+
+static void test_exec_protectkernelmodules(Manager *m) {
+ if (detect_container() > 0) {
+ log_notice("testing in container, skipping protectkernelmodules tests");
+ return;
+ }
+
+ test(m, "exec-protectkernelmodules-no-capabilities.service", 0, CLD_EXITED);
+ test(m, "exec-protectkernelmodules-yes-capabilities.service", 0, CLD_EXITED);
+ test(m, "exec-protectkernelmodules-yes-mount-propagation.service", 0, CLD_EXITED);
}
static void test_exec_readonlypaths(Manager *m) {
@@ -368,6 +381,7 @@ int main(int argc, char *argv[]) {
test_exec_privatetmp,
test_exec_privatedevices,
test_exec_privatedevices_capabilities,
+ test_exec_protectkernelmodules,
test_exec_readonlypaths,
test_exec_readwritepaths,
test_exec_inaccessiblepaths,
diff --git a/src/test/test-ns.c b/src/test/test-ns.c
index c4d4da6d05..da7a8b0565 100644
--- a/src/test/test-ns.c
+++ b/src/test/test-ns.c
@@ -45,6 +45,14 @@ int main(int argc, char *argv[]) {
"/home/lennart/projects",
NULL
};
+
+ static const NameSpaceInfo ns_info = {
+ .private_dev = true,
+ .protect_control_groups = true,
+ .protect_kernel_tunables = true,
+ .protect_kernel_modules = true,
+ };
+
char *root_directory;
char *projects_directory;
int r;
@@ -69,14 +77,12 @@ int main(int argc, char *argv[]) {
log_info("Not chrooted");
r = setup_namespace(root_directory,
+ &ns_info,
(char **) writable,
(char **) readonly,
(char **) inaccessible,
tmp_dir,
var_tmp_dir,
- true,
- true,
- true,
PROTECT_HOME_NO,
PROTECT_SYSTEM_NO,
0);