summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--TODO10
-rw-r--r--man/systemd.exec.xml65
-rw-r--r--src/core/dbus-execute.c7
-rw-r--r--src/core/execute.c168
-rw-r--r--src/core/execute.h1
-rw-r--r--src/core/load-fragment-gperf.gperf.m43
-rw-r--r--src/shared/bus-unit-util.c2
7 files changed, 222 insertions, 34 deletions
diff --git a/TODO b/TODO
index 723292cde0..33afe39783 100644
--- a/TODO
+++ b/TODO
@@ -56,11 +56,10 @@ Features:
* ProtectKeyRing= to take keyring calls away
-* PrivateUsers= which maps the all user ids except root and the one specified
- in User= to nobody
-
* ProtectControlGroups= which mounts all of /sys/fs/cgroup read-only
+* RemoveKeyRing= to remove all keyring entries of the specified user
+
* Add DataDirectory=, CacheDirectory= and LogDirectory= to match
RuntimeDirectory=, and create it as necessary when starting a service, owned by the right user.
@@ -80,6 +79,11 @@ Features:
* expose the "privileged" flag of ExecCommand on the bus, and open it up to
transient units
+* in nss-systemd, if we run inside of RootDirectory= with PrivateUsers= set,
+ find a way to map the User=/Group= of the service to the right name. This way
+ a user/group for a service only has to exist on the host for the right
+ mapping to work.
+
* allow attaching additional journald log fields to cgroups
* rework fopen_temporary() to make use of open_tmpfile_linkable() (problem: the
diff --git a/man/systemd.exec.xml b/man/systemd.exec.xml
index 0fc658f180..2495998295 100644
--- a/man/systemd.exec.xml
+++ b/man/systemd.exec.xml
@@ -107,36 +107,29 @@
<varlistentry>
<term><varname>WorkingDirectory=</varname></term>
- <listitem><para>Takes a directory path relative to the service's root
- directory specified by <varname>RootDirectory=</varname>, or the
- special value <literal>~</literal>. Sets the working directory
- for executed processes. If set to <literal>~</literal>, the
- home directory of the user specified in
- <varname>User=</varname> is used. If not set, defaults to the
- root directory when systemd is running as a system instance
- and the respective user's home directory if run as user. If
- the setting is prefixed with the <literal>-</literal>
- character, a missing working directory is not considered
- fatal. If <varname>RootDirectory=</varname> is not set, then
- <varname>WorkingDirectory=</varname> is relative to the root of
- the system running the service manager.
- Note that setting this parameter might result in
- additional dependencies to be added to the unit (see
- above).</para></listitem>
+ <listitem><para>Takes a directory path relative to the service's root directory specified by
+ <varname>RootDirectory=</varname>, or the special value <literal>~</literal>. Sets the working directory for
+ executed processes. If set to <literal>~</literal>, the home directory of the user specified in
+ <varname>User=</varname> is used. If not set, defaults to the root directory when systemd is running as a
+ system instance and the respective user's home directory if run as user. If the setting is prefixed with the
+ <literal>-</literal> character, a missing working directory is not considered fatal. If
+ <varname>RootDirectory=</varname> is not set, then <varname>WorkingDirectory=</varname> is relative to the root
+ of the system running the service manager. Note that setting this parameter might result in additional
+ dependencies to be added to the unit (see above).</para></listitem>
</varlistentry>
<varlistentry>
<term><varname>RootDirectory=</varname></term>
- <listitem><para>Takes a directory path relative to the host's root directory
- (i.e. the root of the system running the service manager). Sets the
- root directory for executed processes, with the <citerefentry
- project='man-pages'><refentrytitle>chroot</refentrytitle><manvolnum>2</manvolnum></citerefentry>
- system call. If this is used, it must be ensured that the
- process binary and all its auxiliary files are available in
- the <function>chroot()</function> jail. Note that setting this
- parameter might result in additional dependencies to be added
- to the unit (see above).</para></listitem>
+ <listitem><para>Takes a directory path relative to the host's root directory (i.e. the root of the system
+ running the service manager). Sets the root directory for executed processes, with the <citerefentry
+ project='man-pages'><refentrytitle>chroot</refentrytitle><manvolnum>2</manvolnum></citerefentry> system
+ call. If this is used, it must be ensured that the process binary and all its auxiliary files are available in
+ the <function>chroot()</function> jail. Note that setting this parameter might result in additional
+ dependencies to be added to the unit (see above).</para>
+
+ <para>The <varname>PrivateUsers=</varname> setting is particularly useful in conjunction with
+ <varname>RootDirectory=</varname>. For details, see below.</para></listitem>
</varlistentry>
<varlistentry>
@@ -999,6 +992,28 @@
</varlistentry>
<varlistentry>
+ <term><varname>PrivateUsers=</varname></term>
+
+ <listitem><para>Takes a boolean argument. If true, sets up a new user namespace for the executed processes and
+ configures a minimal user and group mapping, that maps the <literal>root</literal> user and group as well as
+ the unit's own user and group to themselves and everything else to the <literal>nobody</literal> user and
+ group. This is useful to securely detach the user and group databases used by the unit from the rest of the
+ system, and thus to create an effective sandbox environment. All files, directories, processes, IPC objects and
+ other resources owned by users/groups not equalling <literal>root</literal> or the unit's own will stay visible
+ from within the unit but appear owned by the <literal>nobody</literal> user and group. If this mode is enabled,
+ all unit processes are run without privileges in the host user namespace (regardless if the unit's own
+ user/group is <literal>root</literal> or not). Specifically this means that the process will have zero process
+ capabilities on the host's user namespace, but full capabilities within the service's user namespace. Settings
+ such as <varname>CapabilityBoundingSet=</varname> will affect only the latter, and there's no way to acquire
+ additional capabilities in the host's user namespace. Defaults to off.</para>
+
+ <para>This setting is particularly useful in conjunction with <varname>RootDirectory=</varname>, as the need to
+ synchronize the user and group databases in the root directory and on the host is reduced, as the only users
+ and groups who need to be matched are <literal>root</literal>, <literal>nobody</literal> and the unit's own
+ user and group.</para></listitem>
+ </varlistentry>
+
+ <varlistentry>
<term><varname>ProtectSystem=</varname></term>
<listitem><para>Takes a boolean argument or
diff --git a/src/core/dbus-execute.c b/src/core/dbus-execute.c
index 346c8b973e..e35d3ccd2e 100644
--- a/src/core/dbus-execute.c
+++ b/src/core/dbus-execute.c
@@ -705,8 +705,9 @@ const sd_bus_vtable bus_exec_vtable[] = {
SD_BUS_PROPERTY("InaccessiblePaths", "as", NULL, offsetof(ExecContext, inaccessible_paths), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("MountFlags", "t", bus_property_get_ulong, offsetof(ExecContext, mount_flags), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("PrivateTmp", "b", bus_property_get_bool, offsetof(ExecContext, private_tmp), SD_BUS_VTABLE_PROPERTY_CONST),
- SD_BUS_PROPERTY("PrivateNetwork", "b", bus_property_get_bool, offsetof(ExecContext, private_network), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("PrivateDevices", "b", bus_property_get_bool, offsetof(ExecContext, private_devices), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("PrivateNetwork", "b", bus_property_get_bool, offsetof(ExecContext, private_network), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("PrivateUsers", "b", bus_property_get_bool, offsetof(ExecContext, private_users), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("ProtectHome", "s", bus_property_get_protect_home, offsetof(ExecContext, protect_home), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("ProtectSystem", "s", bus_property_get_protect_system, offsetof(ExecContext, protect_system), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("SameProcessGroup", "b", bus_property_get_bool, offsetof(ExecContext, same_pgrp), SD_BUS_VTABLE_PROPERTY_CONST),
@@ -1068,7 +1069,7 @@ int bus_exec_context_set_transient_property(
} else if (STR_IN_SET(name,
"IgnoreSIGPIPE", "TTYVHangup", "TTYReset",
- "PrivateTmp", "PrivateDevices", "PrivateNetwork",
+ "PrivateTmp", "PrivateDevices", "PrivateNetwork", "PrivateUsers",
"NoNewPrivileges", "SyslogLevelPrefix", "MemoryDenyWriteExecute",
"RestrictRealtime", "DynamicUser")) {
int b;
@@ -1090,6 +1091,8 @@ int bus_exec_context_set_transient_property(
c->private_devices = b;
else if (streq(name, "PrivateNetwork"))
c->private_network = b;
+ else if (streq(name, "PrivateUsers"))
+ c->private_users = b;
else if (streq(name, "NoNewPrivileges"))
c->no_new_privileges = b;
else if (streq(name, "SyslogLevelPrefix"))
diff --git a/src/core/execute.c b/src/core/execute.c
index 7aafb1b058..6019df7ea6 100644
--- a/src/core/execute.c
+++ b/src/core/execute.c
@@ -25,6 +25,7 @@
#include <signal.h>
#include <string.h>
#include <sys/capability.h>
+#include <sys/eventfd.h>
#include <sys/mman.h>
#include <sys/personality.h>
#include <sys/prctl.h>
@@ -1552,6 +1553,159 @@ static bool exec_needs_mount_namespace(
return false;
}
+static int setup_private_users(uid_t uid, gid_t gid) {
+ _cleanup_free_ char *uid_map = NULL, *gid_map = NULL;
+ _cleanup_close_pair_ int errno_pipe[2] = { -1, -1 };
+ _cleanup_close_ int unshare_ready_fd = -1;
+ _cleanup_(sigkill_waitp) pid_t pid = 0;
+ uint64_t c = 1;
+ siginfo_t si;
+ ssize_t n;
+ int r;
+
+ /* Set up a user namespace and map root to root, the selected UID/GID to itself, and everything else to
+ * nobody. In order to be able to write this mapping we need CAP_SETUID in the original user namespace, which
+ * we however lack after opening the user namespace. To work around this we fork() a temporary child process,
+ * which waits for the parent to create the new user namespace while staying in the original namespace. The
+ * child then writes the UID mapping, under full privileges. The parent waits for the child to finish and
+ * continues execution normally. */
+
+ if (uid != 0 && uid_is_valid(uid))
+ asprintf(&uid_map,
+ "0 0 1\n" /* Map root → root */
+ UID_FMT " " UID_FMT " 1\n", /* Map $UID → $UID */
+ uid, uid); /* The case where the above is the same */
+ else
+ uid_map = strdup("0 0 1\n");
+ if (!uid_map)
+ return -ENOMEM;
+
+ if (gid != 0 && gid_is_valid(gid))
+ asprintf(&gid_map,
+ "0 0 1\n" /* Map root → root */
+ GID_FMT " " GID_FMT " 1\n", /* Map $GID → $GID */
+ gid, gid);
+ else
+ gid_map = strdup("0 0 1\n"); /* The case where the above is the same */
+ if (!gid_map)
+ return -ENOMEM;
+
+ /* Create a communication channel so that the parent can tell the child when it finished creating the user
+ * namespace. */
+ unshare_ready_fd = eventfd(0, EFD_CLOEXEC);
+ if (unshare_ready_fd < 0)
+ return -errno;
+
+ /* Create a communication channel so that the child can tell the parent a proper error code in case it
+ * failed. */
+ if (pipe2(errno_pipe, O_CLOEXEC) < 0)
+ return -errno;
+
+ pid = fork();
+ if (pid < 0)
+ return -errno;
+
+ if (pid == 0) {
+ _cleanup_close_ int fd = -1;
+ const char *a;
+ pid_t ppid;
+
+ /* Child process, running in the original user namespace. Let's update the parent's UID/GID map from
+ * here, after the parent opened its own user namespace. */
+
+ ppid = getppid();
+ errno_pipe[0] = safe_close(errno_pipe[0]);
+
+ /* Wait until the parent unshared the user namespace */
+ if (read(unshare_ready_fd, &c, sizeof(c)) < 0) {
+ r = -errno;
+ goto child_fail;
+ }
+
+ /* Disable the setgroups() system call in the child user namespace, for good. */
+ a = procfs_file_alloca(ppid, "setgroups");
+ fd = open(a, O_WRONLY|O_CLOEXEC);
+ if (fd < 0) {
+ if (errno != ENOENT) {
+ r = -errno;
+ goto child_fail;
+ }
+
+ /* If the file is missing the kernel is too old, let's continue anyway. */
+ } else {
+ if (write(fd, "deny\n", 5) < 0) {
+ r = -errno;
+ goto child_fail;
+ }
+
+ fd = safe_close(fd);
+ }
+
+ /* First write the GID map */
+ a = procfs_file_alloca(ppid, "gid_map");
+ fd = open(a, O_WRONLY|O_CLOEXEC);
+ if (fd < 0) {
+ r = -errno;
+ goto child_fail;
+ }
+ if (write(fd, gid_map, strlen(gid_map)) < 0) {
+ r = -errno;
+ goto child_fail;
+ }
+ fd = safe_close(fd);
+
+ /* The write the UID map */
+ a = procfs_file_alloca(ppid, "uid_map");
+ fd = open(a, O_WRONLY|O_CLOEXEC);
+ if (fd < 0) {
+ r = -errno;
+ goto child_fail;
+ }
+ if (write(fd, uid_map, strlen(uid_map)) < 0) {
+ r = -errno;
+ goto child_fail;
+ }
+
+ _exit(EXIT_SUCCESS);
+
+ child_fail:
+ (void) write(errno_pipe[1], &r, sizeof(r));
+ _exit(EXIT_FAILURE);
+ }
+
+ errno_pipe[1] = safe_close(errno_pipe[1]);
+
+ if (unshare(CLONE_NEWUSER) < 0)
+ return -errno;
+
+ /* Let the child know that the namespace is ready now */
+ if (write(unshare_ready_fd, &c, sizeof(c)) < 0)
+ return -errno;
+
+ /* Try to read an error code from the child */
+ n = read(errno_pipe[0], &r, sizeof(r));
+ if (n < 0)
+ return -errno;
+ if (n == sizeof(r)) { /* an error code was sent to us */
+ if (r < 0)
+ return r;
+ return -EIO;
+ }
+ if (n != 0) /* on success we should have read 0 bytes */
+ return -EIO;
+
+ r = wait_for_terminate(pid, &si);
+ if (r < 0)
+ return r;
+ pid = 0;
+
+ /* If something strange happened with the child, let's consider this fatal, too */
+ if (si.si_code != CLD_EXITED || si.si_status != 0)
+ return -EIO;
+
+ return 0;
+}
+
static void append_socket_pair(int *array, unsigned *n, int pair[2]) {
assert(array);
assert(n);
@@ -2079,6 +2233,14 @@ static int exec_child(
}
#endif
+ if ((params->flags & EXEC_APPLY_PERMISSIONS) && context->private_users) {
+ r = setup_private_users(uid, gid);
+ if (r < 0) {
+ *exit_status = EXIT_USER;
+ return r;
+ }
+ }
+
/* We repeat the fd closing here, to make sure that
* nothing is leaked from the PAM modules. Note that
* we are more aggressive this time since socket_fd
@@ -2640,8 +2802,9 @@ void exec_context_dump(ExecContext *c, FILE* f, const char *prefix) {
"%sRootDirectory: %s\n"
"%sNonBlocking: %s\n"
"%sPrivateTmp: %s\n"
- "%sPrivateNetwork: %s\n"
"%sPrivateDevices: %s\n"
+ "%sPrivateNetwork: %s\n"
+ "%sPrivateUsers: %s\n"
"%sProtectHome: %s\n"
"%sProtectSystem: %s\n"
"%sIgnoreSIGPIPE: %s\n"
@@ -2652,8 +2815,9 @@ void exec_context_dump(ExecContext *c, FILE* f, const char *prefix) {
prefix, c->root_directory ? c->root_directory : "/",
prefix, yes_no(c->non_blocking),
prefix, yes_no(c->private_tmp),
- prefix, yes_no(c->private_network),
prefix, yes_no(c->private_devices),
+ prefix, yes_no(c->private_network),
+ prefix, yes_no(c->private_users),
prefix, protect_home_to_string(c->protect_home),
prefix, protect_system_to_string(c->protect_system),
prefix, yes_no(c->ignore_sigpipe),
diff --git a/src/core/execute.h b/src/core/execute.h
index 2d05ca39aa..106154f81a 100644
--- a/src/core/execute.h
+++ b/src/core/execute.h
@@ -171,6 +171,7 @@ struct ExecContext {
bool private_tmp;
bool private_network;
bool private_devices;
+ bool private_users;
ProtectSystem protect_system;
ProtectHome protect_home;
diff --git a/src/core/load-fragment-gperf.gperf.m4 b/src/core/load-fragment-gperf.gperf.m4
index 396f847213..251155b428 100644
--- a/src/core/load-fragment-gperf.gperf.m4
+++ b/src/core/load-fragment-gperf.gperf.m4
@@ -88,8 +88,9 @@ $1.ReadWritePaths, config_parse_namespace_path_strv, 0,
$1.ReadOnlyPaths, config_parse_namespace_path_strv, 0, offsetof($1, exec_context.read_only_paths)
$1.InaccessiblePaths, config_parse_namespace_path_strv, 0, offsetof($1, exec_context.inaccessible_paths)
$1.PrivateTmp, config_parse_bool, 0, offsetof($1, exec_context.private_tmp)
-$1.PrivateNetwork, config_parse_bool, 0, offsetof($1, exec_context.private_network)
$1.PrivateDevices, config_parse_bool, 0, offsetof($1, exec_context.private_devices)
+$1.PrivateNetwork, config_parse_bool, 0, offsetof($1, exec_context.private_network)
+$1.PrivateUsers, config_parse_bool, 0, offsetof($1, exec_context.private_users)
$1.ProtectSystem, config_parse_protect_system, 0, offsetof($1, exec_context)
$1.ProtectHome, config_parse_protect_home, 0, offsetof($1, exec_context)
$1.MountFlags, config_parse_exec_mount_flags, 0, offsetof($1, exec_context)
diff --git a/src/shared/bus-unit-util.c b/src/shared/bus-unit-util.c
index c3a5233532..7774d607c7 100644
--- a/src/shared/bus-unit-util.c
+++ b/src/shared/bus-unit-util.c
@@ -202,7 +202,7 @@ int bus_append_unit_property_assignment(sd_bus_message *m, const char *assignmen
"CPUAccounting", "MemoryAccounting", "IOAccounting", "BlockIOAccounting", "TasksAccounting",
"SendSIGHUP", "SendSIGKILL", "WakeSystem", "DefaultDependencies",
"IgnoreSIGPIPE", "TTYVHangup", "TTYReset", "RemainAfterExit",
- "PrivateTmp", "PrivateDevices", "PrivateNetwork", "NoNewPrivileges",
+ "PrivateTmp", "PrivateDevices", "PrivateNetwork", "PrivateUsers", "NoNewPrivileges",
"SyslogLevelPrefix", "Delegate", "RemainAfterElapse", "MemoryDenyWriteExecute",
"RestrictRealtime", "DynamicUser")) {