From 502d704e5ed2d288069471f4e3611115cde107d6 Mon Sep 17 00:00:00 2001 From: Djalal Harouni Date: Wed, 12 Oct 2016 13:31:21 +0200 Subject: core:sandbox: Add ProtectKernelModules= option This is useful to turn off explicit module load and unload operations on modular kernels. This option removes CAP_SYS_MODULE from the capability bounding set for the unit, and installs a system call filter to block module system calls. This option will not prevent the kernel from loading modules using the module auto-load feature which is a system wide operation. --- man/systemd.exec.xml | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'man') diff --git a/man/systemd.exec.xml b/man/systemd.exec.xml index 986985ad35..3bea4976b3 100644 --- a/man/systemd.exec.xml +++ b/man/systemd.exec.xml @@ -1404,6 +1404,23 @@ logging. This does not affect commands prefixed with +. + + ProtectKernelModules= + + Takes a boolean argument. If true, explicit module loading will + be denied. This allows to turn off module load and unload operations on modular + kernels. It is recomended to turn this on for most services that do not need special + file systems or extra kernel modules to work. Default to off. Enabling this option + removes CAP_SYS_MODULE from the capability bounding set for + the unit, and installs a system call filter to block module system calls. + Note that limited automatic module loading due to user configuration or kernel + mapping tables might still happen as side effect of requested user operations, + both privileged and unprivileged. To disable module auto-load feature please see + sysctl.d5 + kernel.modules_disabled mechanism and + /proc/sys/kernel/modules_disabled documentation. 
+ + Personality= -- cgit v1.2.3-54-g00ecf From 2cd0a735470894bd2d25147442285744764633a1 Mon Sep 17 00:00:00 2001 From: Djalal Harouni Date: Fri, 7 Oct 2016 20:38:05 +0200 Subject: core:sandbox: remove CAP_SYS_RAWIO on PrivateDevices=yes The rawio system calls were filtered, but CAP_SYS_RAWIO allows access to raw data through /proc, ioctl and some other exotic system calls... --- man/systemd.exec.xml | 4 ++-- src/core/unit.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'man') diff --git a/man/systemd.exec.xml b/man/systemd.exec.xml index 3bea4976b3..c46c0f6dd8 100644 --- a/man/systemd.exec.xml +++ b/man/systemd.exec.xml @@ -946,8 +946,8 @@ /dev/port and others. This is useful to securely turn off physical device access by the executed process. Defaults to false. Enabling this option will install a system call filter to block low-level I/O system calls that are grouped in the @raw-io set, will also remove - CAP_MKNOD from the capability bounding set for the unit (see above), and set - DevicePolicy=closed (see + CAP_MKNOD and CAP_SYS_RAWIO from the capability bounding set for + the unit (see above), and set DevicePolicy=closed (see systemd.resource-control5 for details). Note that using this setting will disconnect propagation of mounts from the service to the host (propagation in the opposite direction continues to work). 
This means that this setting may not be used for diff --git a/src/core/unit.c b/src/core/unit.c index 71f95c0b96..67668bdc48 100644 --- a/src/core/unit.c +++ b/src/core/unit.c @@ -3399,7 +3399,7 @@ int unit_patch_contexts(Unit *u) { ec->no_new_privileges = true; if (ec->private_devices) - ec->capability_bounding_set &= ~(UINT64_C(1) << CAP_MKNOD); + ec->capability_bounding_set &= ~((UINT64_C(1) << CAP_MKNOD) | (UINT64_C(1) << CAP_SYS_RAWIO)); if (ec->protect_kernel_modules) ec->capability_bounding_set &= ~(UINT64_C(1) << CAP_SYS_MODULE); -- cgit v1.2.3-54-g00ecf From ac246d9868bd476297e2702e0a7ef52294f9cfa8 Mon Sep 17 00:00:00 2001 From: Djalal Harouni Date: Sat, 8 Oct 2016 17:48:35 +0200 Subject: doc: minor hint about InaccessiblePaths= in regard of ProtectKernelTunables= --- man/systemd.exec.xml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'man') diff --git a/man/systemd.exec.xml b/man/systemd.exec.xml index c46c0f6dd8..4a68695348 100644 --- a/man/systemd.exec.xml +++ b/man/systemd.exec.xml @@ -1046,7 +1046,10 @@ boot-time, with the sysctl.d5 mechanism. Almost no services need to write to these at runtime; it is hence recommended to turn this on for most services. For this setting the same restrictions regarding mount propagation and privileges apply as for - ReadOnlyPaths= and related calls, see above. Defaults to off. + ReadOnlyPaths= and related calls, see above. Defaults to off. + Note that this option does not prevent kernel tuning through IPC interfaces and external programs. However + InaccessiblePaths= can be used to make some IPC file system objects + inaccessible. 
-- cgit v1.2.3-54-g00ecf From c575770b75b6cd15684fbacd249147bf5fd6ead7 Mon Sep 17 00:00:00 2001 From: Djalal Harouni Date: Wed, 12 Oct 2016 14:11:16 +0200 Subject: core:sandbox: lets make /lib/modules/ inaccessible on ProtectKernelModules= Let's go further and make /lib/modules/ inaccessible for services that do not have business with modules, this is a minor improvement but it may help on setups with custom modules and they are limited... with regard to the kernel auto-load feature. This change introduces the NameSpaceInfo struct which we may embed later inside ExecContext but for now let's just reduce the argument number to setup_namespace() and merge the ProtectKernelModules feature. --- man/systemd.exec.xml | 5 ++++- src/core/execute.c | 11 ++++++++--- src/core/namespace.c | 54 +++++++++++++++++++++++++++++++++++----------------- src/core/namespace.h | 14 +++++++++++--- src/test/test-ns.c | 12 +++++++++--- 5 files changed, 69 insertions(+), 27 deletions(-) (limited to 'man') diff --git a/man/systemd.exec.xml b/man/systemd.exec.xml index 4a68695348..249fcb0363 100644 --- a/man/systemd.exec.xml +++ b/man/systemd.exec.xml @@ -1415,7 +1415,10 @@ kernels. It is recomended to turn this on for most services that do not need special file systems or extra kernel modules to work. Default to off. Enabling this option removes CAP_SYS_MODULE from the capability bounding set for - the unit, and installs a system call filter to block module system calls. + the unit, and installs a system call filter to block module system calls, + also /usr/lib/modules is made inaccessible. For this + setting the same restrictions regarding mount propagation and privileges + apply as for ReadOnlyPaths= and related calls, see above. Note that limited automatic module loading due to user configuration or kernel mapping tables might still happen as side effect of requested user operations, both privileged and unprivileged. 
To disable module auto-load feature please see diff --git a/src/core/execute.c b/src/core/execute.c index 7a278b7d31..dc078d96f0 100644 --- a/src/core/execute.c +++ b/src/core/execute.c @@ -1766,6 +1766,7 @@ static bool exec_needs_mount_namespace( context->protect_system != PROTECT_SYSTEM_NO || context->protect_home != PROTECT_HOME_NO || context->protect_kernel_tunables || + context->protect_kernel_modules || context->protect_control_groups) return true; @@ -2493,6 +2494,12 @@ static int exec_child( if (needs_mount_namespace) { _cleanup_free_ char **rw = NULL; char *tmp = NULL, *var = NULL; + NameSpaceInfo ns_info = { + .private_dev = context->private_devices, + .protect_control_groups = context->protect_control_groups, + .protect_kernel_tunables = context->protect_kernel_tunables, + .protect_kernel_modules = context->protect_kernel_modules, + }; /* The runtime struct only contains the parent * of the private /tmp, which is @@ -2515,14 +2522,12 @@ static int exec_child( r = setup_namespace( (params->flags & EXEC_APPLY_CHROOT) ? 
context->root_directory : NULL, + &ns_info, rw, context->read_only_paths, context->inaccessible_paths, tmp, var, - context->private_devices, - context->protect_kernel_tunables, - context->protect_control_groups, context->protect_home, context->protect_system, context->mount_flags); diff --git a/src/core/namespace.c b/src/core/namespace.c index 43a2f4ba6e..1195e9a854 100644 --- a/src/core/namespace.c +++ b/src/core/namespace.c @@ -97,6 +97,14 @@ static const TargetMount protect_kernel_tunables_table[] = { { "/sys/fs/cgroup", READWRITE, false }, /* READONLY is set by ProtectControlGroups= option */ }; +/* ProtectKernelModules= option */ +static const TargetMount protect_kernel_modules_table[] = { +#ifdef HAVE_SPLIT_USR + { "/lib/modules", INACCESSIBLE, true }, +#endif + { "/usr/lib/modules", INACCESSIBLE, true }, +}; + /* * ProtectHome=read-only table, protect $HOME and $XDG_RUNTIME_DIR and rest of * system should be protected by ProtectSystem= @@ -207,6 +215,13 @@ static int append_protect_kernel_tunables(BindMount **p, const char *root_direct ELEMENTSOF(protect_kernel_tunables_table)); } +static int append_protect_kernel_modules(BindMount **p, const char *root_directory) { + assert(p); + + return append_target_mounts(p, root_directory, protect_kernel_modules_table, + ELEMENTSOF(protect_kernel_modules_table)); +} + static int append_protect_home(BindMount **p, const char *root_directory, ProtectHome protect_home) { int r = 0; @@ -660,14 +675,12 @@ static int chase_all_symlinks(const char *root_directory, BindMount *m, unsigned } static unsigned namespace_calculate_mounts( + const NameSpaceInfo *ns_info, char** read_write_paths, char** read_only_paths, char** inaccessible_paths, const char* tmp_dir, const char* var_tmp_dir, - bool private_dev, - bool protect_sysctl, - bool protect_cgroups, ProtectHome protect_home, ProtectSystem protect_system) { @@ -690,22 +703,21 @@ static unsigned namespace_calculate_mounts( strv_length(read_write_paths) + 
strv_length(read_only_paths) + strv_length(inaccessible_paths) + - private_dev + - (protect_sysctl ? ELEMENTSOF(protect_kernel_tunables_table) : 0) + - (protect_cgroups ? 1 : 0) + + ns_info->private_dev + + (ns_info->protect_kernel_tunables ? ELEMENTSOF(protect_kernel_tunables_table) : 0) + + (ns_info->protect_control_groups ? 1 : 0) + + (ns_info->protect_kernel_modules ? ELEMENTSOF(protect_kernel_modules_table) : 0) + protect_home_cnt + protect_system_cnt; } int setup_namespace( const char* root_directory, + const NameSpaceInfo *ns_info, char** read_write_paths, char** read_only_paths, char** inaccessible_paths, const char* tmp_dir, const char* var_tmp_dir, - bool private_dev, - bool protect_sysctl, - bool protect_cgroups, ProtectHome protect_home, ProtectSystem protect_system, unsigned long mount_flags) { @@ -718,13 +730,12 @@ int setup_namespace( if (mount_flags == 0) mount_flags = MS_SHARED; - n = namespace_calculate_mounts(read_write_paths, + n = namespace_calculate_mounts(ns_info, + read_write_paths, read_only_paths, inaccessible_paths, tmp_dir, var_tmp_dir, - private_dev, protect_sysctl, - protect_cgroups, protect_home, - protect_system); + protect_home, protect_system); /* Set mount slave mode */ if (root_directory || n > 0) @@ -756,16 +767,25 @@ int setup_namespace( m++; } - if (private_dev) { + if (ns_info->private_dev) { m->path = prefix_roota(root_directory, "/dev"); m->mode = PRIVATE_DEV; m++; } - if (protect_sysctl) - append_protect_kernel_tunables(&m, root_directory); + if (ns_info->protect_kernel_tunables) { + r = append_protect_kernel_tunables(&m, root_directory); + if (r < 0) + return r; + } + + if (ns_info->protect_kernel_modules) { + r = append_protect_kernel_modules(&m, root_directory); + if (r < 0) + return r; + } - if (protect_cgroups) { + if (ns_info->protect_control_groups) { m->path = prefix_roota(root_directory, "/sys/fs/cgroup"); m->mode = READONLY; m++; diff --git a/src/core/namespace.h b/src/core/namespace.h index 
6505bcc499..6310638e9a 100644 --- a/src/core/namespace.h +++ b/src/core/namespace.h @@ -4,6 +4,7 @@ This file is part of systemd. Copyright 2010 Lennart Poettering + Copyright 2016 Djalal Harouni systemd is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by @@ -19,6 +20,8 @@ along with systemd; If not, see . ***/ +typedef struct NameSpaceInfo NameSpaceInfo; + #include #include "macro.h" @@ -40,15 +43,20 @@ typedef enum ProtectSystem { _PROTECT_SYSTEM_INVALID = -1 } ProtectSystem; +struct NameSpaceInfo { + bool private_dev:1; + bool protect_control_groups:1; + bool protect_kernel_tunables:1; + bool protect_kernel_modules:1; +}; + int setup_namespace(const char *chroot, + const NameSpaceInfo *ns_info, char **read_write_paths, char **read_only_paths, char **inaccessible_paths, const char *tmp_dir, const char *var_tmp_dir, - bool private_dev, - bool protect_sysctl, - bool protect_cgroups, ProtectHome protect_home, ProtectSystem protect_system, unsigned long mount_flags); diff --git a/src/test/test-ns.c b/src/test/test-ns.c index c4d4da6d05..da7a8b0565 100644 --- a/src/test/test-ns.c +++ b/src/test/test-ns.c @@ -45,6 +45,14 @@ int main(int argc, char *argv[]) { "/home/lennart/projects", NULL }; + + static const NameSpaceInfo ns_info = { + .private_dev = true, + .protect_control_groups = true, + .protect_kernel_tunables = true, + .protect_kernel_modules = true, + }; + char *root_directory; char *projects_directory; int r; @@ -69,14 +77,12 @@ int main(int argc, char *argv[]) { log_info("Not chrooted"); r = setup_namespace(root_directory, + &ns_info, (char **) writable, (char **) readonly, (char **) inaccessible, tmp_dir, var_tmp_dir, - true, - true, - true, PROTECT_HOME_NO, PROTECT_SYSTEM_NO, 0); -- cgit v1.2.3-54-g00ecf