From 59eeb84ba65483c5543d1bc840c2ac75642ef638 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Mon, 22 Aug 2016 18:43:59 +0200 Subject: core: add two new service settings ProtectKernelTunables= and ProtectControlGroups= If enabled, these will block write access to /sys, /proc/sys and /sys/fs/cgroup. --- man/systemd.exec.xml | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'man') diff --git a/man/systemd.exec.xml b/man/systemd.exec.xml index bcedebd5bb..07128b489e 100644 --- a/man/systemd.exec.xml +++ b/man/systemd.exec.xml @@ -1059,6 +1059,26 @@ Defaults to off. + + ProtectKernelTunables= + + Takes a boolean argument. If true, kernel variables accessible through + /proc/sys and /sys will be made read-only to all processes of the + unit. Usually, tunable kernel variables should only be written at boot-time, with the + sysctl.d5 mechanism. Almost + no services need to write to these at runtime; it is hence recommended to turn this on for most + services. Defaults to off. + + + + ProtectControlGroups= + + Takes a boolean argument. If true, the Linux Control Groups ("cgroups") hierarchies accessible + through /sys/fs/cgroup will be made read-only to all processes of the unit. Except for + container managers no services should require write access to the control groups hierarchies; it is hence + recommended to turn this on for most services. Defaults to off. + + MountFlags= -- cgit v1.2.3-54-g00ecf From 3f815163ff8fdcdbd329680580df36f94e15325d Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Thu, 25 Aug 2016 15:57:21 +0200 Subject: core: introduce ProtectSystem=strict Let's tighten our sandbox a bit more: with this change ProtectSystem= gains a new setting "strict". If set, the entire directory tree of the system is mounted read-only, but the API file systems /proc, /dev, /sys are excluded (they may be managed with PrivateDevices= and ProtectKernelTunables=). 
Also, /home and /root are excluded as those are left for ProtectHome= to manage. In this mode, all "real" file systems (i.e. non-API file systems) are mounted read-only, and specific directories may only be excluded via ReadWriteDirectories=, thus implementing an effective whitelist instead of blacklist of writable directories. While we are at it, also add /efi to the list of paths always affected by ProtectSystem=. This is a follow-up for b52a109ad38cd37b660ccd5394ff5c171a5e5355 which added /efi as alternative for /boot. Our namespacing logic should respect that too. --- man/systemd.exec.xml | 33 ++++++++++++++++--------------- src/core/namespace.c | 56 +++++++++++++++++++++++++++++++++++++++++++--------- src/core/namespace.h | 1 + 3 files changed, 65 insertions(+), 25 deletions(-) (limited to 'man') diff --git a/man/systemd.exec.xml b/man/systemd.exec.xml index 07128b489e..1b672fe0c9 100644 --- a/man/systemd.exec.xml +++ b/man/systemd.exec.xml @@ -1020,22 +1020,23 @@ ProtectSystem= - Takes a boolean argument or - full. If true, mounts the - /usr and /boot - directories read-only for processes invoked by this unit. If - set to full, the /etc - directory is mounted read-only, too. This setting ensures that - any modification of the vendor-supplied operating system (and - optionally its configuration) is prohibited for the service. - It is recommended to enable this setting for all long-running - services, unless they are involved with system updates or need - to modify the operating system in other ways. Note however - that processes retaining the CAP_SYS_ADMIN capability can undo - the effect of this setting. This setting is hence particularly - useful for daemons which have this capability removed, for - example with CapabilityBoundingSet=. - Defaults to off. + Takes a boolean argument or the special values full or + strict. If true, mounts the /usr and /boot + directories read-only for processes invoked by this unit. 
If set to full, the + /etc directory is mounted read-only, too. If set to strict the entire + file system hierarchy is mounted read-only, except for the API file system subtrees /dev, + /proc and /sys (protect these directories using + PrivateDevices=, ProtectKernelTunables=, + ProtectControlGroups=). This setting ensures that any modification of the vendor-supplied + operating system (and optionally its configuration, and local mounts) is prohibited for the service. It is + recommended to enable this setting for all long-running services, unless they are involved with system updates + or need to modify the operating system in other ways. If this option is used, + ReadWritePaths= may be used to exclude specific directories from being made read-only. Note + that processes retaining the CAP_SYS_ADMIN capability (and with no system call filter that + prohibits mount-related system calls applied) can undo the effect of this setting. This setting is hence + particularly useful for daemons which have this either the @mount set filtered using + SystemCallFilter=, or have the CAP_SYS_ADMIN capability removed, for + example with CapabilityBoundingSet=. Defaults to off. diff --git a/src/core/namespace.c b/src/core/namespace.c index e08d7459c5..498cd139bf 100644 --- a/src/core/namespace.c +++ b/src/core/namespace.c @@ -472,9 +472,11 @@ int setup_namespace( private_dev + (protect_sysctl ? 3 : 0) + (protect_cgroups != protect_sysctl) + - (protect_home != PROTECT_HOME_NO ? 3 : 0) + - (protect_system != PROTECT_SYSTEM_NO ? 2 : 0) + - (protect_system == PROTECT_SYSTEM_FULL ? 1 : 0); + (protect_home != PROTECT_HOME_NO || protect_system == PROTECT_SYSTEM_STRICT ? 3 : 0) + + (protect_system == PROTECT_SYSTEM_STRICT ? + (2 + !private_dev + !protect_sysctl) : + ((protect_system != PROTECT_SYSTEM_NO ? 3 : 0) + + (protect_system == PROTECT_SYSTEM_FULL ? 
1 : 0))); if (n > 0) { m = mounts = (BindMount *) alloca0(n * sizeof(BindMount)); @@ -529,9 +531,13 @@ int setup_namespace( m++; } - if (protect_home != PROTECT_HOME_NO) { + if (protect_home != PROTECT_HOME_NO || protect_system == PROTECT_SYSTEM_STRICT) { const char *home_dir, *run_user_dir, *root_dir; + /* If protection of $HOME and $XDG_RUNTIME_DIR is requested, then go for it. If we are in + * strict system protection mode, then also add entries for these directories, but mark them + * writable. This is because we want ProtectHome= and ProtectSystem= to be fully orthogonal. */ + home_dir = prefix_roota(root_directory, "/home"); home_dir = strjoina("-", home_dir); run_user_dir = prefix_roota(root_directory, "/run/user"); @@ -540,22 +546,53 @@ int setup_namespace( root_dir = strjoina("-", root_dir); r = append_mounts(&m, STRV_MAKE(home_dir, run_user_dir, root_dir), - protect_home == PROTECT_HOME_READ_ONLY ? READONLY : INACCESSIBLE); + protect_home == PROTECT_HOME_READ_ONLY ? READONLY : + protect_home == PROTECT_HOME_YES ? INACCESSIBLE : READWRITE); if (r < 0) return r; } - if (protect_system != PROTECT_SYSTEM_NO) { - const char *usr_dir, *boot_dir, *etc_dir; + if (protect_system == PROTECT_SYSTEM_STRICT) { + /* In strict mode, we mount everything read-only, except for /proc, /dev, /sys which are the + * kernel API VFS, which are left writable, but PrivateDevices= + ProtectKernelTunables= + * protect those, and these options should be fully orthogonal. (And of course /home and + * friends are also left writable, as ProtectHome= shall manage those, orthogonally, see + * above). 
*/ + + m->path = prefix_roota(root_directory, "/"); + m->mode = READONLY; + m++; + + m->path = prefix_roota(root_directory, "/proc"); + m->mode = READWRITE; + m++; + + if (!private_dev) { + m->path = prefix_roota(root_directory, "/dev"); + m->mode = READWRITE; + m++; + } + if (!protect_sysctl) { + m->path = prefix_roota(root_directory, "/sys"); + m->mode = READWRITE; + m++; + } + + } else if (protect_system != PROTECT_SYSTEM_NO) { + const char *usr_dir, *boot_dir, *efi_dir, *etc_dir; + + /* In any other mode we simply mark the relevant three directories ready-only. */ usr_dir = prefix_roota(root_directory, "/usr"); boot_dir = prefix_roota(root_directory, "/boot"); boot_dir = strjoina("-", boot_dir); + efi_dir = prefix_roota(root_directory, "/efi"); + efi_dir = strjoina("-", efi_dir); etc_dir = prefix_roota(root_directory, "/etc"); r = append_mounts(&m, protect_system == PROTECT_SYSTEM_FULL - ? STRV_MAKE(usr_dir, boot_dir, etc_dir) - : STRV_MAKE(usr_dir, boot_dir), READONLY); + ? STRV_MAKE(usr_dir, boot_dir, efi_dir, etc_dir) + : STRV_MAKE(usr_dir, boot_dir, efi_dir), READONLY); if (r < 0) return r; } @@ -780,6 +817,7 @@ static const char *const protect_system_table[_PROTECT_SYSTEM_MAX] = { [PROTECT_SYSTEM_NO] = "no", [PROTECT_SYSTEM_YES] = "yes", [PROTECT_SYSTEM_FULL] = "full", + [PROTECT_SYSTEM_STRICT] = "strict", }; DEFINE_STRING_TABLE_LOOKUP(protect_system, ProtectSystem); diff --git a/src/core/namespace.h b/src/core/namespace.h index 3845336287..6505bcc499 100644 --- a/src/core/namespace.h +++ b/src/core/namespace.h @@ -35,6 +35,7 @@ typedef enum ProtectSystem { PROTECT_SYSTEM_NO, PROTECT_SYSTEM_YES, PROTECT_SYSTEM_FULL, + PROTECT_SYSTEM_STRICT, _PROTECT_SYSTEM_MAX, _PROTECT_SYSTEM_INVALID = -1 } ProtectSystem; -- cgit v1.2.3-54-g00ecf From 63bb64a056113d4be5fefb16604accf08c8c204a Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Thu, 25 Aug 2016 16:12:46 +0200 Subject: core: imply ProtectHome=read-only and ProtectSystem=strict if DynamicUser=1 Let's make 
sure that services that use DynamicUser=1 cannot leave files in the file system should the system accidentally have a world-writable directory somewhere. This effectively ensures that directories need to be whitelisted rather than blacklisted for access when DynamicUser=1 is set. --- man/systemd.exec.xml | 12 ++++++++---- src/core/unit.c | 6 ++++++ 2 files changed, 14 insertions(+), 4 deletions(-) (limited to 'man') diff --git a/man/systemd.exec.xml b/man/systemd.exec.xml index 1b672fe0c9..e4d9c0ef1b 100644 --- a/man/systemd.exec.xml +++ b/man/systemd.exec.xml @@ -160,14 +160,18 @@ use. However, UID/GIDs are recycled after a unit is terminated. Care should be taken that any processes running as part of a unit for which dynamic users/groups are enabled do not leave files or directories owned by these users/groups around, as a different unit might get the same UID/GID assigned later on, and thus gain access to - these files or directories. If DynamicUser= is enabled, RemoveIPC= and + these files or directories. If DynamicUser= is enabled, RemoveIPC=, PrivateTmp= are implied. This ensures that the lifetime of IPC objects and temporary files created by the executed processes is bound to the runtime of the service, and hence the lifetime of the dynamic user/group. Since /tmp and /var/tmp are usually the only world-writable directories on a system this ensures that a unit making use of dynamic user/group allocation - cannot leave files around after unit termination. Use RuntimeDirectory= (see below) in order - to assign a writable runtime directory to a service, owned by the dynamic user/group and removed automatically - when the unit is terminated. Defaults to off. + cannot leave files around after unit termination. Moreover ProtectSystem=strict and + ProtectHome=read-only are implied, thus prohibiting the service to write to arbitrary file + system locations. 
In order to allow the service to write to certain directories, they have to be whitelisted + using ReadWritePaths=, but care must be taken so that that UID/GID recycling doesn't + create security issues involving files created by the service. Use RuntimeDirectory= (see + below) in order to assign a writable runtime directory to a service, owned by the dynamic user/group and + removed automatically when the unit is terminated. Defaults to off. diff --git a/src/core/unit.c b/src/core/unit.c index de22f657c6..5d284a359d 100644 --- a/src/core/unit.c +++ b/src/core/unit.c @@ -3377,8 +3377,14 @@ int unit_patch_contexts(Unit *u) { return -ENOMEM; } + /* If the dynamic user option is on, let's make sure that the unit can't leave its UID/GID + * around in the file system or on IPC objects. Hence enforce a strict sandbox. */ + ec->private_tmp = true; ec->remove_ipc = true; + ec->protect_system = PROTECT_SYSTEM_STRICT; + if (ec->protect_home == PROTECT_HOME_NO) + ec->protect_home = PROTECT_HOME_READ_ONLY; } } -- cgit v1.2.3-54-g00ecf From b2656f1b1ca94fc8b6a0eb44986df78d23ff7950 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Fri, 26 Aug 2016 12:22:23 +0200 Subject: man: in user-facing documentation don't reference C function names Let's drop the reference to the cap_from_name() function in the documentation for the capabilities setting, as it is hardly helpful. Our readers are not necessarily C hackers knowing the semantics of cap_from_name(). Moreover, the strings we accept are just the plain capability names as listed in capabilities(7) hence there's really no point in confusing the user with anything else. 
--- man/systemd.exec.xml | 64 +++++++++++++++++++++------------------------------- 1 file changed, 26 insertions(+), 38 deletions(-) (limited to 'man') diff --git a/man/systemd.exec.xml b/man/systemd.exec.xml index e4d9c0ef1b..67182f17dc 100644 --- a/man/systemd.exec.xml +++ b/man/systemd.exec.xml @@ -821,49 +821,37 @@ Controls which capabilities to include in the capability bounding set for the executed process. See capabilities7 for - details. Takes a whitespace-separated list of capability names as read by cap_from_name3, - e.g. CAP_SYS_ADMIN, CAP_DAC_OVERRIDE, - CAP_SYS_PTRACE. Capabilities listed will be included in the bounding set, all others are - removed. If the list of capabilities is prefixed with ~, all but the listed capabilities - will be included, the effect of the assignment inverted. Note that this option also affects the respective - capabilities in the effective, permitted and inheritable capability sets. If this option is not used, the - capability bounding set is not modified on process execution, hence no limits on the capabilities of the - process are enforced. This option may appear more than once, in which case the bounding sets are merged. If the - empty string is assigned to this option, the bounding set is reset to the empty capability set, and all prior - settings have no effect. If set to ~ (without any further argument), the bounding set is - reset to the full set of available capabilities, also undoing any previous settings. This does not affect - commands prefixed with +. + details. Takes a whitespace-separated list of capability names, e.g. CAP_SYS_ADMIN, + CAP_DAC_OVERRIDE, CAP_SYS_PTRACE. Capabilities listed will be + included in the bounding set, all others are removed. If the list of capabilities is prefixed with + ~, all but the listed capabilities will be included, the effect of the assignment + inverted. 
Note that this option also affects the respective capabilities in the effective, permitted and + inheritable capability sets. If this option is not used, the capability bounding set is not modified on process + execution, hence no limits on the capabilities of the process are enforced. This option may appear more than + once, in which case the bounding sets are merged. If the empty string is assigned to this option, the bounding + set is reset to the empty capability set, and all prior settings have no effect. If set to + ~ (without any further argument), the bounding set is reset to the full set of available + capabilities, also undoing any previous settings. This does not affect commands prefixed with + +. AmbientCapabilities= - Controls which capabilities to include in the - ambient capability set for the executed process. Takes a - whitespace-separated list of capability names as read by - cap_from_name3, - e.g. CAP_SYS_ADMIN, - CAP_DAC_OVERRIDE, - CAP_SYS_PTRACE. This option may appear more than - once in which case the ambient capability sets are merged. - If the list of capabilities is prefixed with ~, all - but the listed capabilities will be included, the effect of the - assignment inverted. If the empty string is - assigned to this option, the ambient capability set is reset to - the empty capability set, and all prior settings have no effect. - If set to ~ (without any further argument), the - ambient capability set is reset to the full set of available - capabilities, also undoing any previous settings. Note that adding - capabilities to ambient capability set adds them to the process's - inherited capability set. - - Ambient capability sets are useful if you want to execute a process - as a non-privileged user but still want to give it some capabilities. - Note that in this case option keep-caps is - automatically added to SecureBits= to retain the - capabilities over the user change. AmbientCapabilities= does not affect - commands prefixed with +. 
+ Controls which capabilities to include in the ambient capability set for the executed + process. Takes a whitespace-separated list of capability names, e.g. CAP_SYS_ADMIN, + CAP_DAC_OVERRIDE, CAP_SYS_PTRACE. This option may appear more than + once in which case the ambient capability sets are merged. If the list of capabilities is prefixed with + ~, all but the listed capabilities will be included, the effect of the assignment + inverted. If the empty string is assigned to this option, the ambient capability set is reset to the empty + capability set, and all prior settings have no effect. If set to ~ (without any further + argument), the ambient capability set is reset to the full set of available capabilities, also undoing any + previous settings. Note that adding capabilities to ambient capability set adds them to the process's inherited + capability set. Ambient capability sets are useful if you want to execute a process as a + non-privileged user but still want to give it some capabilities. Note that in this case option + keep-caps is automatically added to SecureBits= to retain the + capabilities over the user change. AmbientCapabilities= does not affect commands prefixed + with +. -- cgit v1.2.3-54-g00ecf From effbd6d2eadb61bd236d118afc7901940c4c6b37 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Fri, 26 Aug 2016 12:24:37 +0200 Subject: man: rework documentation for ReadOnlyPaths= and related settings This reworks the documentation for ReadOnlyPaths=, ReadWritePaths=, InaccessiblePaths=. It no longer claims that we'd follow symlinks relative to the host file system. (Which wasn't true actually, as we didn't follow symlinks at all in the most recent releases, and we now do follow them, but relative to RootDirectory=). 
This also replaces all references to the fact that all fs namespacing options can be undone with enough privileges and disable propagation by a single one in the documentation of ReadOnlyPaths= and friends, and then directs the reader to this in all other places. Moreover a hint is added to the documentation of SystemCallFilter=, suggesting usage of ~@mount in case any of the fs namespacing related options are used. --- man/systemd.exec.xml | 214 ++++++++++++++++++++++----------------------------- 1 file changed, 92 insertions(+), 122 deletions(-) (limited to 'man') diff --git a/man/systemd.exec.xml b/man/systemd.exec.xml index 67182f17dc..84f81fe38e 100644 --- a/man/systemd.exec.xml +++ b/man/systemd.exec.xml @@ -877,48 +877,34 @@ ReadOnlyPaths= InaccessiblePaths= - Sets up a new file system namespace for - executed processes. These options may be used to limit access - a process might have to the main file system hierarchy. Each - setting takes a space-separated list of paths relative to - the host's root directory (i.e. the system running the service manager). - Note that if entries contain symlinks, they are resolved from the host's root directory as well. - Entries (files or directories) listed in - ReadWritePaths= are accessible from - within the namespace with the same access rights as from - outside. Entries listed in - ReadOnlyPaths= are accessible for - reading only, writing will be refused even if the usual file - access controls would permit this. Entries listed in - InaccessiblePaths= will be made - inaccessible for processes inside the namespace, and may not - countain any other mountpoints, including those specified by - ReadWritePaths= or - ReadOnlyPaths=. - Note that restricting access with these options does not extend - to submounts of a directory that are created later on. - Non-directory paths can be specified as well. 
These - options may be specified more than once, in which case all - paths listed will have limited access from within the - namespace. If the empty string is assigned to this option, the - specific list is reset, and all prior assignments have no - effect. - Paths in - ReadOnlyPaths= - and - InaccessiblePaths= - may be prefixed with - -, in which case - they will be ignored when they do not - exist. Note that using this - setting will disconnect propagation of - mounts from the service to the host - (propagation in the opposite direction - continues to work). This means that - this setting may not be used for - services which shall be able to - install mount points in the main mount - namespace. + Sets up a new file system namespace for executed processes. These options may be used to limit + access a process might have to the file system hierarchy. Each setting takes a space-separated list of paths + relative to the host's root directory (i.e. the system running the service manager). Note that if paths + contain symlinks, they are resolved relative to the root directory set with + RootDirectory=. + + Paths listed in ReadWritePaths= are accessible from within the namespace with the same + access modes as from outside of it. Paths listed in ReadOnlyPaths= are accessible for + reading only, writing will be refused even if the usual file access controls would permit this. Nest + ReadWritePaths= inside of ReadOnlyPaths= in order to provide writable + subdirectories within read-only directories. Use ReadWritePaths= in order to whitelist + specific paths for write access if ProtectSystem=strict is used. Paths listed in + InaccessiblePaths= will be made inaccessible for processes inside the namespace (along with + everything below them in the file system hierarchy). + + Note that restricting access with these options does not extend to submounts of a directory that are + created later on. Non-directory paths may be specified as well. 
These options may be specified more than once, + in which case all paths listed will have limited access from within the namespace. If the empty string is + assigned to this option, the specific list is reset, and all prior assignments have no effect. + + Paths in ReadOnlyPaths= and InaccessiblePaths= may be prefixed with + -, in which case they will be ignored when they do not exist. Note that using this setting + will disconnect propagation of mounts from the service to the host (propagation in the opposite direction + continues to work). This means that this setting may not be used for services which shall be able to install + mount points in the main mount namespace. Note that the effect of these settings may be undone by privileged + processes. In order to set up an effective sandboxed environment for a unit it is thus recommended to combine + these settings with either CapabilityBoundingSet=~CAP_SYS_ADMIN or + SystemCallFilter=~@mount. @@ -933,37 +919,30 @@ private /tmp and /var/tmp namespace by using the JoinsNamespaceOf= directive, see systemd.unit5 for - details. Note that using this setting will disconnect propagation of mounts from the service to the host - (propagation in the opposite direction continues to work). This means that this setting may not be used for - services which shall be able to install mount points in the main mount namespace. This setting is implied if - DynamicUser= is set. + details. This setting is implied if DynamicUser= is set. For this setting the same + restrictions regarding mount propagation and privileges apply as for ReadOnlyPaths= and + related calls, see above. + PrivateDevices= - Takes a boolean argument. If true, sets up a - new /dev namespace for the executed processes and only adds - API pseudo devices such as /dev/null, - /dev/zero or - /dev/random (as well as the pseudo TTY - subsystem) to it, but no physical devices such as - /dev/sda. 
This is useful to securely turn - off physical device access by the executed process. Defaults - to false. Enabling this option will also remove - CAP_MKNOD from the capability bounding - set for the unit (see above), and set - DevicePolicy=closed (see + Takes a boolean argument. If true, sets up a new /dev namespace for the executed processes and + only adds API pseudo devices such as /dev/null, /dev/zero or + /dev/random (as well as the pseudo TTY subsystem) to it, but no physical devices such as + /dev/sda. This is useful to securely turn off physical device access by the executed + process. Defaults to false. Enabling this option will also remove CAP_MKNOD from the + capability bounding set for the unit (see above), and set DevicePolicy=closed (see systemd.resource-control5 - for details). Note that using this setting will disconnect - propagation of mounts from the service to the host - (propagation in the opposite direction continues to work). - This means that this setting may not be used for services - which shall be able to install mount points in the main mount - namespace. The /dev namespace will be mounted read-only and 'noexec'. - The latter may break old programs which try to set up executable - memory by using mmap2 - of /dev/zero instead of using MAP_ANON. + for details). Note that using this setting will disconnect propagation of mounts from the service to the host + (propagation in the opposite direction continues to work). This means that this setting may not be used for + services which shall be able to install mount points in the main mount namespace. The /dev namespace will be + mounted read-only and 'noexec'. The latter may break old programs which try to set up executable memory by + using mmap2 of + /dev/zero instead of using MAP_ANON. This setting is implied if + DynamicUser= is set. For this setting the same restrictions regarding mount propagation and + privileges apply as for ReadOnlyPaths= and related calls, see above. 
@@ -1023,33 +1002,23 @@ operating system (and optionally its configuration, and local mounts) is prohibited for the service. It is recommended to enable this setting for all long-running services, unless they are involved with system updates or need to modify the operating system in other ways. If this option is used, - ReadWritePaths= may be used to exclude specific directories from being made read-only. Note - that processes retaining the CAP_SYS_ADMIN capability (and with no system call filter that - prohibits mount-related system calls applied) can undo the effect of this setting. This setting is hence - particularly useful for daemons which have this either the @mount set filtered using - SystemCallFilter=, or have the CAP_SYS_ADMIN capability removed, for - example with CapabilityBoundingSet=. Defaults to off. + ReadWritePaths= may be used to exclude specific directories from being made read-only. This + setting is implied if DynamicUser= is set. For this setting the same restrictions regarding + mount propagation and privileges apply as for ReadOnlyPaths= and related calls, see + above. Defaults to off. ProtectHome= - Takes a boolean argument or - read-only. If true, the directories - /home, /root and - /run/user - are made inaccessible and empty for processes invoked by this - unit. If set to read-only, the three - directories are made read-only instead. It is recommended to - enable this setting for all long-running services (in - particular network-facing ones), to ensure they cannot get - access to private user data, unless the services actually - require access to the user's private data. Note however that - processes retaining the CAP_SYS_ADMIN capability can undo the - effect of this setting. This setting is hence particularly - useful for daemons which have this capability removed, for - example with CapabilityBoundingSet=. - Defaults to off. + Takes a boolean argument or read-only. 
If true, the directories + /home, /root and /run/user are made inaccessible + and empty for processes invoked by this unit. If set to read-only, the three directories are + made read-only instead. It is recommended to enable this setting for all long-running services (in particular + network-facing ones), to ensure they cannot get access to private user data, unless the services actually + require access to the user's private data. This setting is implied if DynamicUser= is + set. For this setting the same restrictions regarding mount propagation and privileges apply as for + ReadOnlyPaths= and related calls, see above. @@ -1059,48 +1028,41 @@ /proc/sys and /sys will be made read-only to all processes of the unit. Usually, tunable kernel variables should only be written at boot-time, with the sysctl.d5 mechanism. Almost - no services need to write to these at runtime; it is hence recommended to turn this on for most - services. Defaults to off. + no services need to write to these at runtime; it is hence recommended to turn this on for most services. For + this setting the same restrictions regarding mount propagation and privileges apply as for + ReadOnlyPaths= and related calls, see above. Defaults to off. ProtectControlGroups= - Takes a boolean argument. If true, the Linux Control Groups ("cgroups") hierarchies accessible - through /sys/fs/cgroup will be made read-only to all processes of the unit. Except for - container managers no services should require write access to the control groups hierarchies; it is hence - recommended to turn this on for most services. Defaults to off. + Takes a boolean argument. If true, the Linux Control Groups (cgroups7) hierarchies + accessible through /sys/fs/cgroup will be made read-only to all processes of the + unit. Except for container managers no services should require write access to the control groups hierarchies; + it is hence recommended to turn this on for most services. 
For this setting the same restrictions regarding + mount propagation and privileges apply as for ReadOnlyPaths= and related calls, see + above. Defaults to off. MountFlags= - Takes a mount propagation flag: - , or - , which control whether mounts in the - file system namespace set up for this unit's processes will - receive or propagate mounts or unmounts. See - mount2 - for details. Defaults to . Use - to ensure that mounts and unmounts are - propagated from the host to the container and vice versa. Use - to run processes so that none of their - mounts and unmounts will propagate to the host. Use - to also ensure that no mounts and - unmounts from the host will propagate into the unit processes' - namespace. Note that means that file - systems mounted on the host might stay mounted continuously in - the unit's namespace, and thus keep the device busy. Note that - the file system namespace related options - (PrivateTmp=, - PrivateDevices=, - ProtectSystem=, - ProtectHome=, - ReadOnlyPaths=, - InaccessiblePaths= and - ReadWritePaths=) require that mount - and unmount propagation from the unit's file system namespace - is disabled, and hence downgrade to + Takes a mount propagation flag: , or + , which control whether mounts in the file system namespace set up for this unit's + processes will receive or propagate mounts or unmounts. See mount2 for + details. Defaults to . Use to ensure that mounts and unmounts + are propagated from the host to the container and vice versa. Use to run processes so + that none of their mounts and unmounts will propagate to the host. Use to also ensure + that no mounts and unmounts from the host will propagate into the unit processes' namespace. Note that + means that file systems mounted on the host might stay mounted continuously in the + unit's namespace, and thus keep the device busy. 
Note that the file system namespace related options + (PrivateTmp=, PrivateDevices=, ProtectSystem=, + ProtectHome=, ProtectKernelTunables=, + ProtectControlGroups=, ReadOnlyPaths=, + InaccessiblePaths=, ReadWritePaths=) require that mount and unmount + propagation from the unit's file system namespace is disabled, and hence downgrade to . @@ -1335,7 +1297,15 @@ Note, that as new system calls are added to the kernel, additional system calls might be added to the groups - above, so the contents of the sets may change between systemd versions. + above, so the contents of the sets may change between systemd versions. + + It is recommended to combine the file system namespacing related options with + SystemCallFilter=~@mount, in order to prohibit the unit's processes to undo the + mappings. Specifically these are the options PrivateTmp=, + PrivateDevices=, ProtectSystem=, ProtectHome=, + ProtectKernelTunables=, ProtectControlGroups=, + ReadOnlyPaths=, InaccessiblePaths= and + ReadWritePaths=. -- cgit v1.2.3-54-g00ecf From 81c8aceed4a0cabd605788e46a266cc4cefdc16a Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Fri, 26 Aug 2016 12:29:28 +0200 Subject: man: the exit code/signal is stored in $EXIT_CODE, not $EXIT_STATUS --- man/systemd.exec.xml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'man') diff --git a/man/systemd.exec.xml b/man/systemd.exec.xml index 84f81fe38e..6811e7cc53 100644 --- a/man/systemd.exec.xml +++ b/man/systemd.exec.xml @@ -1612,8 +1612,8 @@ ExecStop= and ExecStopPost= processes, and encodes the service "result". 
Currently, the following values are defined: timeout (in case of an operation timeout), exit-code (if a service process exited with a non-zero exit code; see - $EXIT_STATUS below for the actual exit status returned), signal (if a - service process was terminated abnormally by a signal; see $EXIT_STATUS below for the actual + $EXIT_CODE below for the actual exit code returned), signal (if a + service process was terminated abnormally by a signal; see $EXIT_CODE below for the actual signal used for the termination), core-dump (if a service process terminated abnormally and dumped core), watchdog (if the watchdog keep-alive ping was enabled for the service but it missed the deadline), or resources (a catch-all condition in case a system operation -- cgit v1.2.3-54-g00ecf From 6757c06a1a8dd3755338ca76e598e0d81dc164f2 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Fri, 26 Aug 2016 12:29:52 +0200 Subject: man: shorten the exit status table a bit Let's merge a couple of columns, to make the table a bit shorter. This effectively just drops whitespace, not contents, but makes the currently humungous table much much more compact. 
--- man/systemd.exec.xml | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'man') diff --git a/man/systemd.exec.xml b/man/systemd.exec.xml index 6811e7cc53..403aa471c8 100644 --- a/man/systemd.exec.xml +++ b/man/systemd.exec.xml @@ -1658,32 +1658,32 @@ timeout killed - TERMKILL + TERM, KILL exited - 0123255 + 0, 1, 2, 3, …, 255 exit-code exited - 0123255 + 0, 1, 2, 3, …, 255 signal killed - HUPINTKILL + HUP, INT, KILL, … core-dump dumped - ABRTSEGVQUIT + ABRT, SEGV, QUIT, … @@ -1693,12 +1693,12 @@ killed - TERMKILL + TERM, KILL exited - 0123255 + 0, 1, 2, 3, …, 255 -- cgit v1.2.3-54-g00ecf From e778185bb55320e8242b57c19079377fe33e01bc Mon Sep 17 00:00:00 2001 From: Djalal Harouni Date: Mon, 19 Sep 2016 21:46:17 +0200 Subject: doc: documentation fixes for ReadWritePaths= and ProtectKernelTunables= Documentation fixes for ReadWritePaths= and ProtectKernelTunables= as reported by Evgeny Vereshchagin. --- man/systemd.exec.xml | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) (limited to 'man') diff --git a/man/systemd.exec.xml b/man/systemd.exec.xml index 403aa471c8..79ceee3ec0 100644 --- a/man/systemd.exec.xml +++ b/man/systemd.exec.xml @@ -897,14 +897,14 @@ in which case all paths listed will have limited access from within the namespace. If the empty string is assigned to this option, the specific list is reset, and all prior assignments have no effect. - Paths in ReadOnlyPaths= and InaccessiblePaths= may be prefixed with - -, in which case they will be ignored when they do not exist. Note that using this setting - will disconnect propagation of mounts from the service to the host (propagation in the opposite direction - continues to work). This means that this setting may not be used for services which shall be able to install - mount points in the main mount namespace. Note that the effect of these settings may be undone by privileged - processes. 
In order to set up an effective sandboxed environment for a unit it is thus recommended to combine - these settings with either CapabilityBoundingSet=~CAP_SYS_ADMIN or - SystemCallFilter=~@mount. + Paths in ReadWritePaths=, ReadOnlyPaths= and + InaccessiblePaths= may be prefixed with -, in which case they will be ignored + when they do not exist. Note that using this setting will disconnect propagation of mounts from the service to + the host (propagation in the opposite direction continues to work). This means that this setting may not be used + for services which shall be able to install mount points in the main mount namespace. Note that the effect of + these settings may be undone by privileged processes. In order to set up an effective sandboxed environment for + a unit it is thus recommended to combine these settings with either + CapabilityBoundingSet=~CAP_SYS_ADMIN or SystemCallFilter=~@mount. @@ -1025,11 +1025,11 @@ ProtectKernelTunables= Takes a boolean argument. If true, kernel variables accessible through - /proc/sys and /sys will be made read-only to all processes of the - unit. Usually, tunable kernel variables should only be written at boot-time, with the - sysctl.d(5) mechanism. Almost - no services need to write to these at runtime; it is hence recommended to turn this on for most services. For - this setting the same restrictions regarding mount propagation and privileges apply as for + /proc/sys, /sys and /proc/sysrq-trigger will be + made read-only to all processes of the unit. Usually, tunable kernel variables should only be written at + boot-time, with the sysctl.d(5) + mechanism. Almost no services need to write to these at runtime; it is hence recommended to turn this on for + most services. For this setting the same restrictions regarding mount propagation and privileges apply as for ReadOnlyPaths= and related calls, see above. Defaults to off. 
-- cgit v1.2.3-54-g00ecf From 9221aec8d09f3b55a08fcbe8012e48129474ab54 Mon Sep 17 00:00:00 2001 From: Djalal Harouni Date: Mon, 19 Sep 2016 21:46:17 +0200 Subject: doc: explicitly document that /dev/mem and /dev/port are blocked by PrivateDevices=true --- man/systemd.exec.xml | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'man') diff --git a/man/systemd.exec.xml b/man/systemd.exec.xml index 79ceee3ec0..a3a431c82b 100644 --- a/man/systemd.exec.xml +++ b/man/systemd.exec.xml @@ -931,9 +931,10 @@ Takes a boolean argument. If true, sets up a new /dev namespace for the executed processes and only adds API pseudo devices such as /dev/null, /dev/zero or /dev/random (as well as the pseudo TTY subsystem) to it, but no physical devices such as - /dev/sda. This is useful to securely turn off physical device access by the executed - process. Defaults to false. Enabling this option will also remove CAP_MKNOD from the - capability bounding set for the unit (see above), and set DevicePolicy=closed (see + /dev/sda, system memory /dev/mem, system ports + /dev/port and others. This is useful to securely turn off physical device access by the + executed process. Defaults to false. Enabling this option will also remove CAP_MKNOD from + the capability bounding set for the unit (see above), and set DevicePolicy=closed (see systemd.resource-control(5) for details). Note that using this setting will disconnect propagation of mounts from the service to the host (propagation in the opposite direction continues to work). This means that this setting may not be used for -- cgit v1.2.3-54-g00ecf From 49accde7bd915944d99c947dca0cf26ae0f24165 Mon Sep 17 00:00:00 2001 From: Djalal Harouni Date: Sun, 25 Sep 2016 11:30:11 +0200 Subject: core:sandbox: add more /proc/* entries to ProtectKernelTunables= Make ALSA entries, latency interface, mtrr, apm/acpi, suspend interface, filesystems configuration and IRQ tuning readonly. 
Most of these interfaces nowadays should be in /sys but they are still available through /proc, so just protect them. This patch does not touch /proc/net/... --- man/systemd.exec.xml | 6 ++++-- src/core/namespace.c | 11 +++++++++++ 2 files changed, 15 insertions(+), 2 deletions(-) (limited to 'man') diff --git a/man/systemd.exec.xml b/man/systemd.exec.xml index a3a431c82b..f19e7f6ee9 100644 --- a/man/systemd.exec.xml +++ b/man/systemd.exec.xml @@ -1026,8 +1026,10 @@ ProtectKernelTunables= Takes a boolean argument. If true, kernel variables accessible through - /proc/sys, /sys and /proc/sysrq-trigger will be - made read-only to all processes of the unit. Usually, tunable kernel variables should only be written at + /proc/sys, /sys, /proc/sysrq-trigger, + /proc/latency_stats, /proc/acpi, + /proc/timer_stats, /proc/fs and /proc/irq will + be made read-only to all processes of the unit. Usually, tunable kernel variables should only be written at boot-time, with the sysctl.d(5) mechanism. Almost no services need to write to these at runtime; it is hence recommended to turn this on for most services. 
For this setting the same restrictions regarding mount propagation and privileges apply as for diff --git a/src/core/namespace.c b/src/core/namespace.c index 8aa8b83c88..3234fab4bc 100644 --- a/src/core/namespace.c +++ b/src/core/namespace.c @@ -74,7 +74,18 @@ typedef struct TargetMount { static const TargetMount protect_kernel_tunables_table[] = { { "/proc/sys", READONLY, false }, { "/proc/sysrq-trigger", READONLY, true }, + { "/proc/latency_stats", READONLY, true }, + { "/proc/mtrr", READONLY, true }, + { "/proc/apm", READONLY, true }, + { "/proc/acpi", READONLY, true }, + { "/proc/timer_stats", READONLY, true }, + { "/proc/asound", READONLY, true }, + { "/proc/bus", READONLY, true }, + { "/proc/fs", READONLY, true }, + { "/proc/irq", READONLY, true }, { "/sys", READONLY, false }, + { "/sys/kernel/debug", READONLY, true }, + { "/sys/kernel/tracing", READONLY, true }, { "/sys/fs/cgroup", READWRITE, false }, /* READONLY is set by ProtectControlGroups= option */ }; -- cgit v1.2.3-54-g00ecf From 8f81a5f61bcf745bae3acad599d7a9da686643e3 Mon Sep 17 00:00:00 2001 From: Djalal Harouni Date: Sun, 25 Sep 2016 12:52:27 +0200 Subject: core: Use @raw-io syscall group to filter I/O syscalls when PrivateDevices= is set Instead of having a local syscall list, use the @raw-io group which contains the same set of syscalls to filter. --- man/systemd.exec.xml | 6 ++++-- src/core/execute.c | 55 +++++++++++++++++++++++++++++++++------------------- 2 files changed, 39 insertions(+), 22 deletions(-) (limited to 'man') diff --git a/man/systemd.exec.xml b/man/systemd.exec.xml index f19e7f6ee9..f70e5c36d4 100644 --- a/man/systemd.exec.xml +++ b/man/systemd.exec.xml @@ -933,8 +933,10 @@ /dev/random (as well as the pseudo TTY subsystem) to it, but no physical devices such as /dev/sda, system memory /dev/mem, system ports /dev/port and others. This is useful to securely turn off physical device access by the - executed process. Defaults to false. 
Enabling this option will also remove CAP_MKNOD from - the capability bounding set for the unit (see above), and set DevicePolicy=closed (see + executed process. Defaults to false. Enabling this option will install a system call filter to block low-level + I/O system calls that are grouped in the @raw-io set, will also remove + CAP_MKNOD from the capability bounding set for the unit (see above), and set + DevicePolicy=closed (see systemd.resource-control(5) for details). Note that using this setting will disconnect propagation of mounts from the service to the host (propagation in the opposite direction continues to work). This means that this setting may not be used for diff --git a/src/core/execute.c b/src/core/execute.c index 0488ba2ca9..3da7ef3be6 100644 --- a/src/core/execute.c +++ b/src/core/execute.c @@ -1429,28 +1429,15 @@ finish: } static int apply_private_devices(Unit *u, const ExecContext *c) { - - static const int device_syscalls[] = { - SCMP_SYS(ioperm), - SCMP_SYS(iopl), - SCMP_SYS(pciconfig_iobase), - SCMP_SYS(pciconfig_read), - SCMP_SYS(pciconfig_write), -#ifdef __NR_s390_pci_mmio_read - SCMP_SYS(s390_pci_mmio_read), -#endif -#ifdef __NR_s390_pci_mmio_write - SCMP_SYS(s390_pci_mmio_write), -#endif - }; - + const SystemCallFilterSet *set; scmp_filter_ctx *seccomp; - unsigned i; + const char *sys; + bool syscalls_found = false; int r; assert(c); - /* If PrivateDevices= is set, also turn off iopl and friends. */ + /* If PrivateDevices= is set, also turn off iopl and all @raw-io syscalls. 
*/ if (skip_seccomp_unavailable(u, "PrivateDevices=")) return 0; @@ -1463,12 +1450,40 @@ static int apply_private_devices(Unit *u, const ExecContext *c) { if (r < 0) goto finish; - for (i = 0; i < ELEMENTSOF(device_syscalls); i++) { + for (set = syscall_filter_sets; set->set_name; set++) + if (streq(set->set_name, "@raw-io")) { + syscalls_found = true; + break; + } + + /* We should never fail here */ + if (!syscalls_found) { + r = -EOPNOTSUPP; + goto finish; + } + + NULSTR_FOREACH(sys, set->value) { + int id; + bool add = true; + +#ifndef __NR_s390_pci_mmio_read + if (streq(sys, "s390_pci_mmio_read")) + add = false; +#endif +#ifndef __NR_s390_pci_mmio_write + if (streq(sys, "s390_pci_mmio_write")) + add = false; +#endif + + if (!add) + continue; + + id = seccomp_syscall_resolve_name(sys); + r = seccomp_rule_add( seccomp, SCMP_ACT_ERRNO(EPERM), - device_syscalls[i], - 0); + id, 0); if (r < 0) goto finish; } -- cgit v1.2.3-54-g00ecf