From 29206d4619843252c2e04f20dc03c246547600a2 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Thu, 14 Jul 2016 12:37:28 +0200 Subject: core: add a concept of "dynamic" user ids, that are allocated as long as a service is running This adds a new boolean setting DynamicUser= to service files. If set, a new user will be allocated dynamically when the unit is started, and released when it is stopped. The user ID is allocated from the range 61184..65519. The user will not be added to /etc/passwd (but an NSS module to be added later should make it show up in getent passwd). For now, care should be taken that the service writes no files to disk, since this might result in files owned by UIDs that might get assigned dynamically to a different service later on. Later patches will tighten sandboxing in order to ensure that this cannot happen, except for a few selected directories. A simple way to test this is: systemd-run -p DynamicUser=1 /bin/sleep 99999 --- src/core/execute.c | 75 +++++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 57 insertions(+), 18 deletions(-) (limited to 'src/core/execute.c') diff --git a/src/core/execute.c b/src/core/execute.c index 7c178b97c3..c186f2a705 100644 --- a/src/core/execute.c +++ b/src/core/execute.c @@ -1526,14 +1526,28 @@ static bool exec_needs_mount_namespace( return false; } +static void append_socket_pair(int *array, unsigned *n, int pair[2]) { + assert(array); + assert(n); + + if (!pair) + return; + + if (pair[0] >= 0) + array[(*n)++] = pair[0]; + if (pair[1] >= 0) + array[(*n)++] = pair[1]; +} + static int close_remaining_fds( const ExecParameters *params, ExecRuntime *runtime, + DynamicCreds *dcreds, int socket_fd, int *fds, unsigned n_fds) { unsigned n_dont_close = 0; - int dont_close[n_fds + 7]; + int dont_close[n_fds + 11]; assert(params); @@ -1551,11 +1565,14 @@ static int close_remaining_fds( n_dont_close += n_fds; } - if (runtime) { - if (runtime->netns_storage_socket[0] >= 0) - dont_close[n_dont_close++] = runtime->netns_storage_socket[0]; - if (runtime->netns_storage_socket[1] >= 0) - dont_close[n_dont_close++] = runtime->netns_storage_socket[1]; + if (runtime) + append_socket_pair(dont_close, &n_dont_close, runtime->netns_storage_socket); + + if (dcreds) { + if (dcreds->user) + append_socket_pair(dont_close, &n_dont_close, dcreds->user->storage_socket); + if (dcreds->group) + append_socket_pair(dont_close, &n_dont_close, dcreds->group->storage_socket); } return close_all_fds(dont_close, n_dont_close); @@ -1567,6 +1584,7 @@ static int exec_child( const ExecContext *context, const ExecParameters *params, ExecRuntime *runtime, + DynamicCreds *dcreds, char **argv, int socket_fd, int *fds, unsigned n_fds, @@ -1617,7 +1635,7 @@ static int exec_child( log_forget_fds(); - r = close_remaining_fds(params, runtime, socket_fd, fds, n_fds); + r = close_remaining_fds(params, runtime, dcreds, socket_fd, fds, n_fds); if (r < 0) { *exit_status = EXIT_FDS; return r; @@ -1650,25 +1668,42 @@ static int exec_child( } } - if (context->user) { - username = context->user; - r = get_user_creds(&username, &uid, &gid, &home, &shell); + if (context->dynamic_user && dcreds) { + + r = dynamic_creds_realize(dcreds, &uid, &gid); if (r < 0) { *exit_status = EXIT_USER; return r; } - } - if (context->group) { - const char *g = context->group; + if (uid == UID_INVALID || gid == GID_INVALID) { + *exit_status = EXIT_USER; + return -ESRCH; + } - r = get_group_creds(&g, &gid); - if (r < 0) { - *exit_status = EXIT_GROUP; - return r; + if (dcreds->user) + username = dcreds->user->name; + + } else { + if (context->user) { + username = context->user; + r = get_user_creds(&username, &uid, &gid, &home, &shell); + if (r < 0) { + *exit_status = EXIT_USER; + return r; + } } - } + if (context->group) { + const char *g = context->group; + + r = get_group_creds(&g, &gid); + if (r < 0) { + *exit_status = EXIT_GROUP; + return r; + } + } + } /* If a socket is connected to STDIN/STDOUT/STDERR, we * must sure to drop O_NONBLOCK */ @@ -2192,6 +2227,7 @@ int exec_spawn(Unit *unit, const ExecContext *context, const ExecParameters *params, ExecRuntime *runtime, + DynamicCreds *dcreds, pid_t *ret) { _cleanup_strv_free_ char **files_env = NULL; @@ -2250,6 +2286,7 @@ int exec_spawn(Unit *unit, context, params, runtime, + dcreds, argv, socket_fd, fds, n_fds, @@ -2723,6 +2760,8 @@ void exec_context_dump(ExecContext *c, FILE* f, const char *prefix) { if (c->group) fprintf(f, "%sGroup: %s\n", prefix, c->group); + fprintf(f, "%sDynamicUser: %s\n", prefix, yes_no(c->dynamic_user)); + if (strv_length(c->supplementary_groups) > 0) { fprintf(f, "%sSupplementaryGroups:", prefix); strv_fprintf(f, c->supplementary_groups); -- cgit v1.2.3-54-g00ecf From 409093fe10685ed55915ef256f09cdf144b6528b Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Thu, 14 Jul 2016 19:19:49 +0200 Subject: nss: add new "nss-systemd" NSS module for mapping dynamic users With this NSS module all dynamic service users will be resolvable via NSS like any real user. --- Makefile-man.am | 7 + Makefile.am | 21 +++ README | 21 ++- man/nss-myhostname.xml | 5 +- man/nss-mymachines.xml | 5 +- man/nss-resolve.xml | 7 +- man/nss-systemd.xml | 107 +++++++++++++ src/core/execute.c | 6 + src/nss-systemd/Makefile | 1 + src/nss-systemd/nss-systemd.c | 332 ++++++++++++++++++++++++++++++++++++++++ src/nss-systemd/nss-systemd.sym | 17 ++ 11 files changed, 515 insertions(+), 14 deletions(-) create mode 100644 man/nss-systemd.xml create mode 120000 src/nss-systemd/Makefile create mode 100644 src/nss-systemd/nss-systemd.c create mode 100644 src/nss-systemd/nss-systemd.sym (limited to 'src/core/execute.c') diff --git a/Makefile-man.am b/Makefile-man.am index 8ab733360d..3ac1906a4a 100644 --- a/Makefile-man.am +++ b/Makefile-man.am @@ -23,6 +23,7 @@ MANPAGES += \ man/localtime.5 \ man/machine-id.5 \ man/machine-info.5 \ + man/nss-systemd.8 \ man/os-release.5 \ man/sd-bus-errors.3 \ man/sd-bus.3 \ @@ -255,6 +256,7 @@ MANPAGES_ALIAS += \ man/SD_WARNING.3 \ man/init.1 \ man/journald.conf.d.5 \ + man/libnss_systemd.so.2.8 \ man/poweroff.8 \ man/reboot.8 \ man/sd_bus_creds_get_audit_login_uid.3 \ @@ -587,6 +589,7 @@ man/SD_NOTICE.3: man/sd-daemon.3 man/SD_WARNING.3: man/sd-daemon.3 man/init.1: man/systemd.1 man/journald.conf.d.5: man/journald.conf.5 +man/libnss_systemd.so.2.8: man/nss-systemd.8 man/poweroff.8: man/halt.8 man/reboot.8: man/halt.8 man/sd_bus_creds_get_audit_login_uid.3: man/sd_bus_creds_get_pid.3 @@ -1071,6 +1074,9 @@ man/init.html: man/systemd.html man/journald.conf.d.html: man/journald.conf.html $(html-alias) +man/libnss_systemd.so.2.html: man/nss-systemd.html + $(html-alias) + man/poweroff.html: man/halt.html $(html-alias) @@ -2519,6 +2525,7 @@ EXTRA_DIST += \ man/nss-myhostname.xml \ man/nss-mymachines.xml \ man/nss-resolve.xml \ + man/nss-systemd.xml \ man/os-release.xml \ man/pam_systemd.xml \ man/resolved.conf.xml \ diff --git a/Makefile.am b/Makefile.am index 3d5ce1e2c3..a4241122d5 100644 --- a/Makefile.am +++ b/Makefile.am @@ -5001,6 +5001,27 @@ test_nss_LDADD = \ manual_tests += \ test-nss +# ------------------------------------------------------------------------------ +libnss_systemd_la_SOURCES = \ + src/nss-systemd/nss-systemd.sym \ + src/nss-systemd/nss-systemd.c + +libnss_systemd_la_LDFLAGS = \ + $(AM_LDFLAGS) \ + -module \ + -export-dynamic \ + -avoid-version \ + -shared \ + -shrext .so.2 \ + -Wl,--version-script=$(top_srcdir)/src/nss-systemd/nss-systemd.sym + +libnss_systemd_la_LIBADD = \ + libsystemd-internal.la \ + libbasic.la + +lib_LTLIBRARIES += \ + libnss_systemd.la + # ------------------------------------------------------------------------------ if HAVE_MYHOSTNAME libnss_myhostname_la_SOURCES = \ diff --git a/README b/README index ca8993cb12..19c15a70b0 100644 --- a/README +++ b/README @@ -201,7 +201,7 @@ USERS AND GROUPS: "systemd-coredump" system user and group to exist. NSS: - systemd ships with three NSS modules: + systemd ships with four glibc NSS modules: nss-myhostname resolves the local hostname to locally configured IP addresses, as well as "localhost" to @@ -210,15 +210,22 @@ NSS: nss-resolve enables DNS resolution via the systemd-resolved DNS/LLMNR caching stub resolver "systemd-resolved". - nss-mymachines enables resolution of all local containers - registered with machined to their respective IP addresses. + nss-mymachines enables resolution of all local containers registered + with machined to their respective IP addresses. It also maps UID/GIDs + ranges used by containers to useful names. - To make use of these NSS modules, please add them to the - "hosts: " line in /etc/nsswitch.conf. The "resolve" module - should replace the glibc "dns" module in this file. + nss-systemd enables resolution of all dynamically allocated service + users. (See the DynamicUser= setting in unit files.) - The three modules should be used in the following order: + To make use of these NSS modules, please add them to the "hosts:", + "passwd:" and "group:" lines in /etc/nsswitch.conf. The "resolve" + module should replace the glibc "dns" module in this file (and don't + worry, it chain-loads the "dns" module if it can't talk to resolved). + The four modules should be used in the following order: + + passwd: compat mymachines systemd + group: compat mymachines systemd hosts: files mymachines resolve myhostname SYSV INIT.D SCRIPTS: diff --git a/man/nss-myhostname.xml b/man/nss-myhostname.xml index a920ec334f..b1daaba02b 100644 --- a/man/nss-myhostname.xml +++ b/man/nss-myhostname.xml @@ -106,8 +106,8 @@ Here is an example /etc/nsswitch.conf file that enables nss-myhostname correctly: -passwd: compat mymachines -group: compat mymachines +passwd: compat mymachines systemd +group: compat mymachines systemd shadow: compat hosts: files mymachines resolve myhostname @@ -138,6 +138,7 @@ netgroup: nis See Also systemd1, + nss-systemd8, nss-resolve8, nss-mymachines8, nsswitch.conf5, diff --git a/man/nss-mymachines.xml b/man/nss-mymachines.xml index ec047449bf..a70119e256 100644 --- a/man/nss-mymachines.xml +++ b/man/nss-mymachines.xml @@ -82,8 +82,8 @@ Here is an example /etc/nsswitch.conf file that enables nss-mymachines correctly: - passwd: compat mymachines -group: compat mymachines + passwd: compat mymachines systemd +group: compat mymachines systemd shadow: compat hosts: files mymachines resolve myhostname @@ -103,6 +103,7 @@ netgroup: nis systemd1, systemd-machined.service8, + nss-systemd8, nss-resolve8, nss-myhostname8, nsswitch.conf5, diff --git a/man/nss-resolve.xml b/man/nss-resolve.xml index d9e56453e8..e6cc1d982a 100644 --- a/man/nss-resolve.xml +++ b/man/nss-resolve.xml @@ -81,8 +81,8 @@ Here is an example /etc/nsswitch.conf file that enables nss-resolve correctly: -passwd: compat mymachines -group: compat mymachines +passwd: compat mymachines systemd +group: compat mymachines systemd shadow: compat hosts: files mymachines resolve myhostname @@ -102,8 +102,9 @@ netgroup: nis systemd1, systemd-resolved8, - nss-mymachines8, + nss-systemd8, nss-myhostname8, + nss-mymachines8, nsswitch.conf5 diff --git a/man/nss-systemd.xml b/man/nss-systemd.xml new file mode 100644 index 0000000000..4228372e51 --- /dev/null +++ b/man/nss-systemd.xml @@ -0,0 +1,107 @@ + + + + + + + + + nss-systemd + systemd + + + + Developer + Lennart + Poettering + lennart@poettering.net + + + + + + nss-systemd + 8 + + + + nss-systemd + libnss_systemd.so.2 + Provide UNIX user and group name resolution for dynamic users and groups. + + + + libnss_systemd.so.2 + + + + Description + + nss-systemd is a plug-in module for the GNU Name Service Switch (NSS) functionality of the + GNU C Library (glibc), providing UNIX user and group name resolution for dynamic users and + groups allocated through the DynamicUser= option in systemd unit files. See + systemd.exec5 for details on + this option. + + To activate the NSS module, add systemd to the lines starting with + passwd: and group: in /etc/nsswitch.conf. + + It is recommended to place systemd after the files or + compat entry of the /etc/nsswitch.conf lines so that + /etc/passwd and /etc/group based mappings take precedence. + + + + Example + + Here is an example /etc/nsswitch.conf file that enables + nss-systemd correctly: + + passwd: compat mymachines systemd +group: compat mymachines systemd +shadow: compat + +hosts: files mymachines resolve myhostname +networks: files + +protocols: db files +services: db files +ethers: db files +rpc: db files + +netgroup: nis + + + + + See Also + + systemd1, + systemd.exec5, + nss-resolve8, + nss-myhostname8, + nss-mymachines8, + nsswitch.conf5, + getent1 + + + + diff --git a/src/core/execute.c b/src/core/execute.c index c186f2a705..26e9cd5339 100644 --- a/src/core/execute.c +++ b/src/core/execute.c @@ -1670,6 +1670,12 @@ static int exec_child( if (context->dynamic_user && dcreds) { + /* Make sure we bypass our own NSS module for any NSS checks */ + if (putenv((char*) "SYSTEMD_NSS_DYNAMIC_BYPASS=1") != 0) { + *exit_status = EXIT_USER; + return -errno; + } + r = dynamic_creds_realize(dcreds, &uid, &gid); if (r < 0) { *exit_status = EXIT_USER; diff --git a/src/nss-systemd/Makefile b/src/nss-systemd/Makefile new file mode 120000 index 0000000000..d0b0e8e008 --- /dev/null +++ b/src/nss-systemd/Makefile @@ -0,0 +1 @@ +../Makefile \ No newline at end of file diff --git a/src/nss-systemd/nss-systemd.c b/src/nss-systemd/nss-systemd.c new file mode 100644 index 0000000000..e7a4393bb0 --- /dev/null +++ b/src/nss-systemd/nss-systemd.c @@ -0,0 +1,332 @@ +/*** + This file is part of systemd. + + Copyright 2016 Lennart Poettering + + systemd is free software; you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published by + the Free Software Foundation; either version 2.1 of the License, or + (at your option) any later version. + + systemd is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with systemd; If not, see . +***/ + +#include + +#include "sd-bus.h" + +#include "bus-common-errors.h" +#include "env-util.h" +#include "macro.h" +#include "nss-util.h" +#include "signal-util.h" +#include "user-util.h" +#include "util.h" + +NSS_GETPW_PROTOTYPES(systemd); +NSS_GETGR_PROTOTYPES(systemd); + +enum nss_status _nss_systemd_getpwnam_r( + const char *name, + struct passwd *pwd, + char *buffer, size_t buflen, + int *errnop) { + + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + _cleanup_(sd_bus_message_unrefp) sd_bus_message* reply = NULL; + _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL; + uint32_t translated; + size_t l; + int r; + + BLOCK_SIGNALS(NSS_SIGNALS_BLOCK); + + assert(name); + assert(pwd); + + /* Make sure that we don't go in circles when allocating a dynamic UID by checking our own database */ + if (getenv_bool("SYSTEMD_NSS_DYNAMIC_BYPASS") > 0) + goto not_found; + + r = sd_bus_open_system(&bus); + if (r < 0) + goto fail; + + r = sd_bus_call_method(bus, + "org.freedesktop.systemd1", + "/org/freedesktop/systemd1", + "org.freedesktop.systemd1.Manager", + "LookupDynamicUserByName", + &error, + &reply, + "s", + name); + if (r < 0) { + if (sd_bus_error_has_name(&error, BUS_ERROR_NO_SUCH_DYNAMIC_USER)) + goto not_found; + + goto fail; + } + + r = sd_bus_message_read(reply, "u", &translated); + if (r < 0) + goto fail; + + l = strlen(name); + if (buflen < l+1) { + *errnop = ENOMEM; + return NSS_STATUS_TRYAGAIN; + } + + memcpy(buffer, name, l+1); + + pwd->pw_name = buffer; + pwd->pw_uid = (uid_t) translated; + pwd->pw_gid = (uid_t) translated; + pwd->pw_gecos = (char*) "Dynamic User"; + pwd->pw_passwd = (char*) "*"; /* locked */ + pwd->pw_dir = (char*) "/"; + pwd->pw_shell = (char*) "/sbin/nologin"; + + *errnop = 0; + return NSS_STATUS_SUCCESS; + +not_found: + *errnop = 0; + return NSS_STATUS_NOTFOUND; + +fail: + *errnop = -r; + return NSS_STATUS_UNAVAIL; +} + +enum nss_status _nss_systemd_getpwuid_r( + uid_t uid, + struct passwd *pwd, + char *buffer, size_t buflen, + int *errnop) { + + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + _cleanup_(sd_bus_message_unrefp) sd_bus_message* reply = NULL; + _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL; + const char *translated; + size_t l; + int r; + + BLOCK_SIGNALS(NSS_SIGNALS_BLOCK); + + if (!uid_is_valid(uid)) { + r = -EINVAL; + goto fail; + } + + if (uid <= SYSTEM_UID_MAX) + goto not_found; + + if (getenv_bool("SYSTEMD_NSS_DYNAMIC_BYPASS") > 0) + goto not_found; + + r = sd_bus_open_system(&bus); + if (r < 0) + goto fail; + + r = sd_bus_call_method(bus, + "org.freedesktop.systemd1", + "/org/freedesktop/systemd1", + "org.freedesktop.systemd1.Manager", + "LookupDynamicUserByUID", + &error, + &reply, + "u", + (uint32_t) uid); + if (r < 0) { + if (sd_bus_error_has_name(&error, BUS_ERROR_NO_SUCH_DYNAMIC_USER)) + goto not_found; + + goto fail; + } + + r = sd_bus_message_read(reply, "s", &translated); + if (r < 0) + goto fail; + + l = strlen(translated) + 1; + if (buflen < l) { + *errnop = ENOMEM; + return NSS_STATUS_TRYAGAIN; + } + + memcpy(buffer, translated, l); + + pwd->pw_name = buffer; + pwd->pw_uid = uid; + pwd->pw_gid = uid; + pwd->pw_gecos = (char*) "Dynamic User"; + pwd->pw_passwd = (char*) "*"; /* locked */ + pwd->pw_dir = (char*) "/"; + pwd->pw_shell = (char*) "/sbin/nologin"; + + *errnop = 0; + return NSS_STATUS_SUCCESS; + +not_found: + *errnop = 0; + return NSS_STATUS_NOTFOUND; + +fail: + *errnop = -r; + return NSS_STATUS_UNAVAIL; +} + +enum nss_status _nss_systemd_getgrnam_r( + const char *name, + struct group *gr, + char *buffer, size_t buflen, + int *errnop) { + + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + _cleanup_(sd_bus_message_unrefp) sd_bus_message* reply = NULL; + _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL; + uint32_t translated; + size_t l; + int r; + + BLOCK_SIGNALS(NSS_SIGNALS_BLOCK); + + assert(name); + assert(gr); + + if (getenv_bool("SYSTEMD_NSS_DYNAMIC_BYPASS") > 0) + goto not_found; + + r = sd_bus_open_system(&bus); + if (r < 0) + goto fail; + + r = sd_bus_call_method(bus, + "org.freedesktop.systemd1", + "/org/freedesktop/systemd1", + "org.freedesktop.systemd1.Manager", + "LookupDynamicUserByName", + &error, + &reply, + "s", + name); + if (r < 0) { + if (sd_bus_error_has_name(&error, BUS_ERROR_NO_SUCH_DYNAMIC_USER)) + goto not_found; + + goto fail; + } + + r = sd_bus_message_read(reply, "u", &translated); + if (r < 0) + goto fail; + + l = sizeof(char*) + strlen(name) + 1; + if (buflen < l) { + *errnop = ENOMEM; + return NSS_STATUS_TRYAGAIN; + } + + memzero(buffer, sizeof(char*)); + strcpy(buffer + sizeof(char*), name); + + gr->gr_name = buffer + sizeof(char*); + gr->gr_gid = (gid_t) translated; + gr->gr_passwd = (char*) "*"; /* locked */ + gr->gr_mem = (char**) buffer; + + *errnop = 0; + return NSS_STATUS_SUCCESS; + +not_found: + *errnop = 0; + return NSS_STATUS_NOTFOUND; + +fail: + *errnop = -r; + return NSS_STATUS_UNAVAIL; +} + +enum nss_status _nss_systemd_getgrgid_r( + gid_t gid, + struct group *gr, + char *buffer, size_t buflen, + int *errnop) { + + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + _cleanup_(sd_bus_message_unrefp) sd_bus_message* reply = NULL; + _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL; + const char *translated; + size_t l; + int r; + + BLOCK_SIGNALS(NSS_SIGNALS_BLOCK); + + if (!gid_is_valid(gid)) { + r = -EINVAL; + goto fail; + } + + if (gid <= SYSTEM_GID_MAX) + goto not_found; + + if (getenv_bool("SYSTEMD_NSS_DYNAMIC_BYPASS") > 0) + goto not_found; + + r = sd_bus_open_system(&bus); + if (r < 0) + goto fail; + + r = sd_bus_call_method(bus, + "org.freedesktop.systemd1", + "/org/freedesktop/systemd1", + "org.freedesktop.systemd1.Manager", + "LookupDynamicUserByUID", + &error, + &reply, + "u", + (uint32_t) gid); + if (r < 0) { + if (sd_bus_error_has_name(&error, BUS_ERROR_NO_SUCH_DYNAMIC_USER)) + goto not_found; + + goto fail; + } + + r = sd_bus_message_read(reply, "s", &translated); + if (r < 0) + goto fail; + + l = sizeof(char*) + strlen(translated) + 1; + if (buflen < l) { + *errnop = ENOMEM; + return NSS_STATUS_TRYAGAIN; + } + + memzero(buffer, sizeof(char*)); + strcpy(buffer + sizeof(char*), translated); + + gr->gr_name = buffer + sizeof(char*); + gr->gr_gid = gid; + gr->gr_passwd = (char*) "*"; /* locked */ + gr->gr_mem = (char**) buffer; + + *errnop = 0; + return NSS_STATUS_SUCCESS; + +not_found: + *errnop = 0; + return NSS_STATUS_NOTFOUND; + +fail: + *errnop = -r; + return NSS_STATUS_UNAVAIL; +} diff --git a/src/nss-systemd/nss-systemd.sym b/src/nss-systemd/nss-systemd.sym new file mode 100644 index 0000000000..955078788a --- /dev/null +++ b/src/nss-systemd/nss-systemd.sym @@ -0,0 +1,17 @@ +/*** + This file is part of systemd. + + systemd is free software; you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published by + the Free Software Foundation; either version 2.1 of the License, or + (at your option) any later version. +***/ + +{ +global: + _nss_systemd_getpwnam_r; + _nss_systemd_getpwuid_r; + _nss_systemd_getgrnam_r; + _nss_systemd_getgrgid_r; +local: *; +}; -- cgit v1.2.3-54-g00ecf From 6af760f3b263d3ddfa80a4168ad0a0c5e59bae1f Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Wed, 27 Jul 2016 15:25:55 +0200 Subject: core: inherit TERM from PID 1 for all services started on /dev/console This way, invoking nspawn from a shell in the best case inherits the TERM setting all the way down into the login shell spawned in the container. Fixes: #3697 --- src/basic/terminal-util.c | 2 +- src/core/execute.c | 60 +++++++++++++++++++++++++++++++++-------------- src/core/main.c | 2 +- 3 files changed, 45 insertions(+), 19 deletions(-) (limited to 'src/core/execute.c') diff --git a/src/basic/terminal-util.c b/src/basic/terminal-util.c index df56d85317..f0a46c48cf 100644 --- a/src/basic/terminal-util.c +++ b/src/basic/terminal-util.c @@ -785,7 +785,7 @@ bool tty_is_vc_resolve(const char *tty) { } const char *default_term_for_tty(const char *tty) { - return tty && tty_is_vc_resolve(tty) ? "TERM=linux" : "TERM=vt220"; + return tty && tty_is_vc_resolve(tty) ? "linux" : "vt220"; } int fd_columns(int fd) { diff --git a/src/core/execute.c b/src/core/execute.c index 26e9cd5339..0bf80fc437 100644 --- a/src/core/execute.c +++ b/src/core/execute.c @@ -219,12 +219,36 @@ static void exec_context_tty_reset(const ExecContext *context, const ExecParamet (void) vt_disallocate(path); } +static bool is_terminal_input(ExecInput i) { + return IN_SET(i, + EXEC_INPUT_TTY, + EXEC_INPUT_TTY_FORCE, + EXEC_INPUT_TTY_FAIL); +} + static bool is_terminal_output(ExecOutput o) { - return - o == EXEC_OUTPUT_TTY || - o == EXEC_OUTPUT_SYSLOG_AND_CONSOLE || - o == EXEC_OUTPUT_KMSG_AND_CONSOLE || - o == EXEC_OUTPUT_JOURNAL_AND_CONSOLE; + return IN_SET(o, + EXEC_OUTPUT_TTY, + EXEC_OUTPUT_SYSLOG_AND_CONSOLE, + EXEC_OUTPUT_KMSG_AND_CONSOLE, + EXEC_OUTPUT_JOURNAL_AND_CONSOLE); +} + +static bool exec_context_needs_term(const ExecContext *c) { + assert(c); + + /* Return true if the execution context suggests we should set $TERM to something useful. */ + + if (is_terminal_input(c->std_input)) + return true; + + if (is_terminal_output(c->std_output)) + return true; + + if (is_terminal_output(c->std_error)) + return true; + + return !!c->tty_path; } static int open_null_as(int flags, int nfd) { @@ -363,13 +387,6 @@ static int open_terminal_as(const char *path, mode_t mode, int nfd) { return r; } -static bool is_terminal_input(ExecInput i) { - return - i == EXEC_INPUT_TTY || - i == EXEC_INPUT_TTY_FORCE || - i == EXEC_INPUT_TTY_FAIL; -} - static int fixup_input(ExecInput std_input, int socket_fd, bool apply_tty_stdin) { if (is_terminal_input(std_input) && !apply_tty_stdin) @@ -1444,12 +1461,21 @@ static int build_environment( our_env[n_env++] = x; } - if (is_terminal_input(c->std_input) || - c->std_output == EXEC_OUTPUT_TTY || - c->std_error == EXEC_OUTPUT_TTY || - c->tty_path) { + if (exec_context_needs_term(c)) { + const char *tty_path, *term = NULL; + + tty_path = exec_context_tty_path(c); + + /* If we are forked off PID 1 and we are supposed to operate on /dev/console, then let's try to inherit + * the $TERM set for PID 1. This is useful for containers so that the $TERM the container manager + * passes to PID 1 ends up all the way in the console login shown. */ + + if (path_equal(tty_path, "/dev/console") && getppid() == 1) + term = getenv("TERM"); + if (!term) + term = default_term_for_tty(tty_path); - x = strdup(default_term_for_tty(exec_context_tty_path(c))); + x = strappend("TERM=", term); if (!x) return -ENOMEM; our_env[n_env++] = x; diff --git a/src/core/main.c b/src/core/main.c index c46d886653..094bbef964 100644 --- a/src/core/main.c +++ b/src/core/main.c @@ -1318,7 +1318,7 @@ static int fixup_environment(void) { return r; if (r == 0) { - term = strdup(default_term_for_tty("/dev/console") + 5); + term = strdup(default_term_for_tty("/dev/console")); if (!term) return -ENOMEM; } -- cgit v1.2.3-54-g00ecf From 70493828032abc74e5134563a915c4a3ccdde7f2 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Wed, 27 Jul 2016 20:00:33 +0200 Subject: execute: don't set $SHELL and $HOME for services, if they don't contain interesting data --- src/core/execute.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'src/core/execute.c') diff --git a/src/core/execute.c b/src/core/execute.c index 0bf80fc437..77a75245cb 100644 --- a/src/core/execute.c +++ b/src/core/execute.c @@ -1724,6 +1724,17 @@ static int exec_child( *exit_status = EXIT_USER; return r; } + + /* Don't set $HOME or $SHELL if they are are not particularly enlightening anyway. */ + if (isempty(home) || path_equal(home, "/")) + home = NULL; + + if (isempty(shell) || PATH_IN_SET(shell, + "/bin/nologin", + "/sbin/nologin", + "/usr/bin/nologin", + "/usr/sbin/nologin")) + shell = NULL; } if (context->group) { -- cgit v1.2.3-54-g00ecf From d251207d555a1a0d97924980e49b0ba563b9fc67 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Wed, 3 Aug 2016 18:44:51 +0200 Subject: core: add new PrivateUsers= option to service execution This setting adds minimal user namespacing support to a service. When set the invoked processes will run in their own user namespace. Only a trivial mapping will be set up: the root user/group is mapped to root, and the user/group of the service will be mapped to itself, everything else is mapped to nobody. If this setting is used the service runs with no capabilities on the host, but configurable capabilities within the service. This setting is particularly useful in conjunction with RootDirectory= as the need to synchronize /etc/passwd and /etc/group between the host and the service OS tree is reduced, as only three UID/GIDs need to match: root, nobody and the user of the service itself. But even outside the RootDirectory= case this setting is useful to substantially reduce the attack surface of a service. Example command to test this: systemd-run -p PrivateUsers=1 -p User=foobar -t /bin/sh This runs a shell as user "foobar". When typing "ps" only processes owned by "root", by "foobar", and by "nobody" should be visible. --- man/systemd.exec.xml | 65 ++++++++----- src/core/dbus-execute.c | 7 +- src/core/execute.c | 168 +++++++++++++++++++++++++++++++++- src/core/execute.h | 1 + src/core/load-fragment-gperf.gperf.m4 | 3 +- src/shared/bus-unit-util.c | 2 +- 6 files changed, 215 insertions(+), 31 deletions(-) (limited to 'src/core/execute.c') diff --git a/man/systemd.exec.xml b/man/systemd.exec.xml index 58ba582911..2190da55d4 100644 --- a/man/systemd.exec.xml +++ b/man/systemd.exec.xml @@ -107,36 +107,29 @@ WorkingDirectory= - Takes a directory path relative to the service's root - directory specified by RootDirectory=, or the - special value ~. Sets the working directory - for executed processes. If set to ~, the - home directory of the user specified in - User= is used. If not set, defaults to the - root directory when systemd is running as a system instance - and the respective user's home directory if run as user. If - the setting is prefixed with the - - character, a missing working directory is not considered - fatal. If RootDirectory= is not set, then - WorkingDirectory= is relative to the root of - the system running the service manager. - Note that setting this parameter might result in - additional dependencies to be added to the unit (see - above). + Takes a directory path relative to the service's root directory specified by + RootDirectory=, or the special value ~. Sets the working directory for + executed processes. If set to ~, the home directory of the user specified in + User= is used. If not set, defaults to the root directory when systemd is running as a + system instance and the respective user's home directory if run as user. If the setting is prefixed with the + - character, a missing working directory is not considered fatal. If + RootDirectory= is not set, then WorkingDirectory= is relative to the root + of the system running the service manager. Note that setting this parameter might result in additional + dependencies to be added to the unit (see above). RootDirectory= - Takes a directory path relative to the host's root directory - (i.e. the root of the system running the service manager). Sets the - root directory for executed processes, with the chroot2 - system call. If this is used, it must be ensured that the - process binary and all its auxiliary files are available in - the chroot() jail. Note that setting this - parameter might result in additional dependencies to be added - to the unit (see above). + Takes a directory path relative to the host's root directory (i.e. the root of the system + running the service manager). Sets the root directory for executed processes, with the chroot2 system + call. If this is used, it must be ensured that the process binary and all its auxiliary files are available in + the chroot() jail. Note that setting this parameter might result in additional + dependencies to be added to the unit (see above). + + The PrivateUsers= setting is particularly useful in conjunction with + RootDirectory=. For details, see below. @@ -998,6 +991,28 @@ accessible). + + PrivateUsers= + + Takes a boolean argument. If true, sets up a new user namespace for the executed processes and + configures a minimal user and group mapping, that maps the root user and group as well as + the unit's own user and group to themselves and everything else to the nobody user and + group. This is useful to securely detach the user and group databases used by the unit from the rest of the + system, and thus to create an effective sandbox environment. All files, directories, processes, IPC objects and + other resources owned by users/groups not equalling root or the unit's own will stay visible + from within the unit but appear owned by the nobody user and group. If this mode is enabled, + all unit processes are run without privileges in the host user namespace (regardless if the unit's own + user/group is root or not). Specifically this means that the process will have zero process + capabilities on the host's user namespace, but full capabilities within the service's user namespace. Settings + such as CapabilityBoundingSet= will affect only the latter, and there's no way to acquire + additional capabilities in the host's user namespace. Defaults to off. + + This setting is particularly useful in conjunction with RootDirectory=, as the need to + synchronize the user and group databases in the root directory and on the host is reduced, as the only users + and groups who need to be matched are root, nobody and the unit's own + user and group. + + ProtectSystem= diff --git a/src/core/dbus-execute.c b/src/core/dbus-execute.c index 9c50cd93e5..4b3bbfbc7d 100644 --- a/src/core/dbus-execute.c +++ b/src/core/dbus-execute.c @@ -705,8 +705,9 @@ const sd_bus_vtable bus_exec_vtable[] = { SD_BUS_PROPERTY("InaccessiblePaths", "as", NULL, offsetof(ExecContext, inaccessible_paths), SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("MountFlags", "t", bus_property_get_ulong, offsetof(ExecContext, mount_flags), SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("PrivateTmp", "b", bus_property_get_bool, offsetof(ExecContext, private_tmp), SD_BUS_VTABLE_PROPERTY_CONST), - SD_BUS_PROPERTY("PrivateNetwork", "b", bus_property_get_bool, offsetof(ExecContext, private_network), SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("PrivateDevices", "b", bus_property_get_bool, offsetof(ExecContext, private_devices), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("PrivateNetwork", "b", bus_property_get_bool, offsetof(ExecContext, private_network), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("PrivateUsers", "b", bus_property_get_bool, offsetof(ExecContext, private_users), SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("ProtectHome", "s", bus_property_get_protect_home, offsetof(ExecContext, protect_home), SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("ProtectSystem", "s", bus_property_get_protect_system, offsetof(ExecContext, protect_system), SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("SameProcessGroup", "b", bus_property_get_bool, offsetof(ExecContext, same_pgrp), SD_BUS_VTABLE_PROPERTY_CONST), @@ -1068,7 +1069,7 @@ int bus_exec_context_set_transient_property( } else if (STR_IN_SET(name, "IgnoreSIGPIPE", "TTYVHangup", "TTYReset", - "PrivateTmp", "PrivateDevices", "PrivateNetwork", + "PrivateTmp", "PrivateDevices", "PrivateNetwork", "PrivateUsers", "NoNewPrivileges", "SyslogLevelPrefix", "MemoryDenyWriteExecute", "RestrictRealtime", "DynamicUser")) { int b; @@ -1090,6 +1091,8 @@ int bus_exec_context_set_transient_property( c->private_devices = b; else if (streq(name, "PrivateNetwork")) c->private_network = b; + else if (streq(name, "PrivateUsers")) + c->private_users = b; else if (streq(name, "NoNewPrivileges")) c->no_new_privileges = b; else if (streq(name, "SyslogLevelPrefix")) diff --git a/src/core/execute.c b/src/core/execute.c index 26e9cd5339..cec3b3cf40 100644 --- a/src/core/execute.c +++ b/src/core/execute.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include @@ -1526,6 +1527,159 @@ static bool exec_needs_mount_namespace( return false; } +static int setup_private_users(uid_t uid, gid_t gid) { + _cleanup_free_ char *uid_map = NULL, *gid_map = NULL; + _cleanup_close_pair_ int errno_pipe[2] = { -1, -1 }; + _cleanup_close_ int unshare_ready_fd = -1; + _cleanup_(sigkill_waitp) pid_t pid = 0; + uint64_t c = 1; + siginfo_t si; + ssize_t n; + int r; + + /* Set up a user namespace and map root to root, the selected UID/GID to itself, and everything else to + * nobody. In order to be able to write this mapping we need CAP_SETUID in the original user namespace, which + * we however lack after opening the user namespace. To work around this we fork() a temporary child process, + * which waits for the parent to create the new user namespace while staying in the original namespace. The + * child then writes the UID mapping, under full privileges. The parent waits for the child to finish and + * continues execution normally. */ + + if (uid != 0 && uid_is_valid(uid)) + asprintf(&uid_map, + "0 0 1\n" /* Map root → root */ + UID_FMT " " UID_FMT " 1\n", /* Map $UID → $UID */ + uid, uid); /* The case where the above is the same */ + else + uid_map = strdup("0 0 1\n"); + if (!uid_map) + return -ENOMEM; + + if (gid != 0 && gid_is_valid(gid)) + asprintf(&gid_map, + "0 0 1\n" /* Map root → root */ + GID_FMT " " GID_FMT " 1\n", /* Map $GID → $GID */ + gid, gid); + else + gid_map = strdup("0 0 1\n"); /* The case where the above is the same */ + if (!gid_map) + return -ENOMEM; + + /* Create a communication channel so that the parent can tell the child when it finished creating the user + * namespace. */ + unshare_ready_fd = eventfd(0, EFD_CLOEXEC); + if (unshare_ready_fd < 0) + return -errno; + + /* Create a communication channel so that the child can tell the parent a proper error code in case it + * failed. */ + if (pipe2(errno_pipe, O_CLOEXEC) < 0) + return -errno; + + pid = fork(); + if (pid < 0) + return -errno; + + if (pid == 0) { + _cleanup_close_ int fd = -1; + const char *a; + pid_t ppid; + + /* Child process, running in the original user namespace. Let's update the parent's UID/GID map from + * here, after the parent opened its own user namespace. */ + + ppid = getppid(); + errno_pipe[0] = safe_close(errno_pipe[0]); + + /* Wait until the parent unshared the user namespace */ + if (read(unshare_ready_fd, &c, sizeof(c)) < 0) { + r = -errno; + goto child_fail; + } + + /* Disable the setgroups() system call in the child user namespace, for good. */ + a = procfs_file_alloca(ppid, "setgroups"); + fd = open(a, O_WRONLY|O_CLOEXEC); + if (fd < 0) { + if (errno != ENOENT) { + r = -errno; + goto child_fail; + } + + /* If the file is missing the kernel is too old, let's continue anyway. */ + } else { + if (write(fd, "deny\n", 5) < 0) { + r = -errno; + goto child_fail; + } + + fd = safe_close(fd); + } + + /* First write the GID map */ + a = procfs_file_alloca(ppid, "gid_map"); + fd = open(a, O_WRONLY|O_CLOEXEC); + if (fd < 0) { + r = -errno; + goto child_fail; + } + if (write(fd, gid_map, strlen(gid_map)) < 0) { + r = -errno; + goto child_fail; + } + fd = safe_close(fd); + + /* The write the UID map */ + a = procfs_file_alloca(ppid, "uid_map"); + fd = open(a, O_WRONLY|O_CLOEXEC); + if (fd < 0) { + r = -errno; + goto child_fail; + } + if (write(fd, uid_map, strlen(uid_map)) < 0) { + r = -errno; + goto child_fail; + } + + _exit(EXIT_SUCCESS); + + child_fail: + (void) write(errno_pipe[1], &r, sizeof(r)); + _exit(EXIT_FAILURE); + } + + errno_pipe[1] = safe_close(errno_pipe[1]); + + if (unshare(CLONE_NEWUSER) < 0) + return -errno; + + /* Let the child know that the namespace is ready now */ + if (write(unshare_ready_fd, &c, sizeof(c)) < 0) + return -errno; + + /* Try to read an error code from the child */ + n = read(errno_pipe[0], &r, sizeof(r)); + if (n < 0) + return -errno; + if (n == sizeof(r)) { /* an error code was sent to us */ + if (r < 0) + return r; + return -EIO; + } + if (n != 0) /* on success we should have read 0 bytes */ + return -EIO; + + r = wait_for_terminate(pid, &si); + if (r < 0) + return r; + pid = 0; + + /* If something strange happened with the child, let's consider this fatal, too */ + if (si.si_code != CLD_EXITED || si.si_status != 0) + return -EIO; + + return 0; +} + static void append_socket_pair(int *array, unsigned *n, int pair[2]) { assert(array); assert(n); @@ -2037,6 +2191,14 @@ static int exec_child( } #endif + if (params->apply_permissions && context->private_users) { + r = setup_private_users(uid, gid); + if (r < 0) { + *exit_status = EXIT_USER; + return r; + } + } + /* We repeat the fd closing here, to make sure that * nothing is leaked from the PAM modules. Note that * we are more aggressive this time since socket_fd @@ -2598,8 +2760,9 @@ void exec_context_dump(ExecContext *c, FILE* f, const char *prefix) { "%sRootDirectory: %s\n" "%sNonBlocking: %s\n" "%sPrivateTmp: %s\n" - "%sPrivateNetwork: %s\n" "%sPrivateDevices: %s\n" + "%sPrivateNetwork: %s\n" + "%sPrivateUsers: %s\n" "%sProtectHome: %s\n" "%sProtectSystem: %s\n" "%sIgnoreSIGPIPE: %s\n" @@ -2610,8 +2773,9 @@ void exec_context_dump(ExecContext *c, FILE* f, const char *prefix) { prefix, c->root_directory ? c->root_directory : "/", prefix, yes_no(c->non_blocking), prefix, yes_no(c->private_tmp), - prefix, yes_no(c->private_network), prefix, yes_no(c->private_devices), + prefix, yes_no(c->private_network), + prefix, yes_no(c->private_users), prefix, protect_home_to_string(c->protect_home), prefix, protect_system_to_string(c->protect_system), prefix, yes_no(c->ignore_sigpipe), diff --git a/src/core/execute.h b/src/core/execute.h index 48cc18fbb3..5fac3e85e8 100644 --- a/src/core/execute.h +++ b/src/core/execute.h @@ -171,6 +171,7 @@ struct ExecContext { bool private_tmp; bool private_network; bool private_devices; + bool private_users; ProtectSystem protect_system; ProtectHome protect_home; diff --git a/src/core/load-fragment-gperf.gperf.m4 b/src/core/load-fragment-gperf.gperf.m4 index 396f847213..251155b428 100644 --- a/src/core/load-fragment-gperf.gperf.m4 +++ b/src/core/load-fragment-gperf.gperf.m4 @@ -88,8 +88,9 @@ $1.ReadWritePaths, config_parse_namespace_path_strv, 0, $1.ReadOnlyPaths, config_parse_namespace_path_strv, 0, offsetof($1, exec_context.read_only_paths) $1.InaccessiblePaths, config_parse_namespace_path_strv, 0, offsetof($1, exec_context.inaccessible_paths) $1.PrivateTmp, config_parse_bool, 0, offsetof($1, exec_context.private_tmp) -$1.PrivateNetwork, config_parse_bool, 0, offsetof($1, exec_context.private_network) $1.PrivateDevices, config_parse_bool, 0, offsetof($1, exec_context.private_devices) +$1.PrivateNetwork, config_parse_bool, 0, offsetof($1, exec_context.private_network) +$1.PrivateUsers, config_parse_bool, 0, offsetof($1, exec_context.private_users) $1.ProtectSystem, config_parse_protect_system, 0, offsetof($1, exec_context) $1.ProtectHome, config_parse_protect_home, 0, offsetof($1, exec_context) $1.MountFlags, config_parse_exec_mount_flags, 0, offsetof($1, exec_context) diff --git a/src/shared/bus-unit-util.c b/src/shared/bus-unit-util.c index 14bf8ad627..9d8061b539 100644 --- a/src/shared/bus-unit-util.c +++ b/src/shared/bus-unit-util.c @@ -202,7 +202,7 @@ int bus_append_unit_property_assignment(sd_bus_message *m, const char *assignmen "CPUAccounting", "MemoryAccounting", "IOAccounting", "BlockIOAccounting", "TasksAccounting", "SendSIGHUP", "SendSIGKILL", "WakeSystem", "DefaultDependencies", "IgnoreSIGPIPE", "TTYVHangup", "TTYReset", "RemainAfterExit", - "PrivateTmp", "PrivateDevices", "PrivateNetwork", "NoNewPrivileges", + "PrivateTmp", "PrivateDevices", "PrivateNetwork", "PrivateUsers", "NoNewPrivileges", "SyslogLevelPrefix", "Delegate", "RemainAfterElapse", "MemoryDenyWriteExecute", "RestrictRealtime", "DynamicUser")) { -- cgit v1.2.3-54-g00ecf From c39f1ce24ddb1aa683991c5099dcc2afbfcbc57c Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Tue, 26 Jul 2016 17:40:35 +0200 Subject: core: turn various execution flags into a proper flags parameter The ExecParameters structure contains a number of bit-flags, that were so far exposed as bool:1, change this to a proper, single binary bit flag field. This makes things a bit more expressive, and is helpful as we add more flags, since these booleans are passed around in various callers, for example service_spawn(), whose signature can be made much shorter now. Not all bit booleans from ExecParameters are moved into the flags field for now, but this can be added later. --- src/core/execute.c | 21 ++++++++----- src/core/execute.h | 17 ++++++++--- src/core/mount.c | 12 +++----- src/core/service.c | 90 +++++++++++++++++++----------------------------------- src/core/socket.c | 12 +++----- src/core/swap.c | 12 +++----- 6 files changed, 72 insertions(+), 92 deletions(-) (limited to 'src/core/execute.c') diff --git a/src/core/execute.c b/src/core/execute.c index 77a75245cb..bc0fd27402 100644 --- a/src/core/execute.c +++ b/src/core/execute.c @@ -427,7 +427,7 @@ static int setup_input( return STDIN_FILENO; } - i = fixup_input(context->std_input, socket_fd, params->apply_tty_stdin); + i = fixup_input(context->std_input, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN); switch (i) { @@ -502,7 +502,7 @@ static int setup_output( return STDERR_FILENO; } - i = fixup_input(context->std_input, socket_fd, params->apply_tty_stdin); + i = fixup_input(context->std_input, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN); o = fixup_output(context->std_output, socket_fd); if (fileno == STDERR_FILENO) { @@ -1675,7 +1675,7 @@ static int exec_child( exec_context_tty_reset(context, params); - if (params->confirm_spawn) { + if (params->flags & EXEC_CONFIRM_SPAWN) { char response; r = ask_for_confirmation(&response, argv); @@ -1940,7 +1940,7 @@ static int exec_child( umask(context->umask); - if (params->apply_permissions && !command->privileged) { + if ((params->flags & EXEC_APPLY_PERMISSIONS) && !command->privileged) { r = enforce_groups(context, username, gid); if (r < 0) { *exit_status = EXIT_GROUP; @@ -2010,7 +2010,7 @@ static int exec_child( } r = setup_namespace( - params->apply_chroot ? context->root_directory : NULL, + (params->flags & EXEC_APPLY_CHROOT) ? context->root_directory : NULL, context->read_write_paths, context->read_only_paths, context->inaccessible_paths, @@ -2041,7 +2041,7 @@ static int exec_child( else wd = "/"; - if (params->apply_chroot) { + if (params->flags & EXEC_APPLY_CHROOT) { if (!needs_mount_namespace && context->root_directory) if (chroot(context->root_directory) < 0) { *exit_status = EXIT_CHROOT; @@ -2065,7 +2065,12 @@ static int exec_child( } #ifdef HAVE_SELINUX - if (params->apply_permissions && mac_selinux_use() && params->selinux_context_net && socket_fd >= 0 && !command->privileged) { + if ((params->flags & EXEC_APPLY_PERMISSIONS) && + mac_selinux_use() && + params->selinux_context_net && + socket_fd >= 0 && + !command->privileged) { + r = mac_selinux_get_child_mls_label(socket_fd, command->path, context->selinux_context, &mac_selinux_context_net); if (r < 0) { *exit_status = EXIT_SELINUX_CONTEXT; @@ -2090,7 +2095,7 @@ static int exec_child( return r; } - if (params->apply_permissions && !command->privileged) { + if ((params->flags & EXEC_APPLY_PERMISSIONS) && !command->privileged) { bool use_address_families = context->address_families_whitelist || !set_isempty(context->address_families); diff --git a/src/core/execute.h b/src/core/execute.h index 48cc18fbb3..77418ea2ad 100644 --- a/src/core/execute.h +++ b/src/core/execute.h @@ -208,6 +208,17 @@ struct ExecContext { bool no_new_privileges_set:1; }; +typedef enum ExecFlags { + EXEC_CONFIRM_SPAWN = 1U << 0, + EXEC_APPLY_PERMISSIONS = 1U << 1, + EXEC_APPLY_CHROOT = 1U << 2, + EXEC_APPLY_TTY_STDIN = 1U << 3, + + /* The following are not usec by execute.c, but by consumers internally */ + EXEC_PASS_FDS = 1U << 4, + EXEC_IS_CONTROL = 1U << 5, +} ExecFlags; + struct ExecParameters { char **argv; char **environment; @@ -216,11 +227,7 @@ struct ExecParameters { char **fd_names; unsigned n_fds; - bool apply_permissions:1; - bool apply_chroot:1; - bool apply_tty_stdin:1; - - bool confirm_spawn:1; + ExecFlags flags; bool selinux_context_net:1; bool cgroup_delegate:1; diff --git a/src/core/mount.c b/src/core/mount.c index afb20af9e2..3f6ac7fcf9 100644 --- a/src/core/mount.c +++ b/src/core/mount.c @@ -701,12 +701,10 @@ static int mount_spawn(Mount *m, ExecCommand *c, pid_t *_pid) { pid_t pid; int r; ExecParameters exec_params = { - .apply_permissions = true, - .apply_chroot = true, - .apply_tty_stdin = true, - .stdin_fd = -1, - .stdout_fd = -1, - .stderr_fd = -1, + .flags = EXEC_APPLY_PERMISSIONS|EXEC_APPLY_CHROOT|EXEC_APPLY_TTY_STDIN, + .stdin_fd = -1, + .stdout_fd = -1, + .stderr_fd = -1, }; assert(m); @@ -732,7 +730,7 @@ static int mount_spawn(Mount *m, ExecCommand *c, pid_t *_pid) { return r; exec_params.environment = UNIT(m)->manager->environment; - exec_params.confirm_spawn = UNIT(m)->manager->confirm_spawn; + exec_params.flags |= UNIT(m)->manager->confirm_spawn ? EXEC_CONFIRM_SPAWN : 0; exec_params.cgroup_supported = UNIT(m)->manager->cgroup_supported; exec_params.cgroup_path = UNIT(m)->cgroup_path; exec_params.cgroup_delegate = m->cgroup_context.delegate; diff --git a/src/core/service.c b/src/core/service.c index eb125cb999..b4db7d17ed 100644 --- a/src/core/service.c +++ b/src/core/service.c @@ -1152,11 +1152,7 @@ static int service_spawn( Service *s, ExecCommand *c, usec_t timeout, - bool pass_fds, - bool apply_permissions, - bool apply_chroot, - bool apply_tty_stdin, - bool is_control, + ExecFlags flags, pid_t *_pid) { _cleanup_strv_free_ char **argv = NULL, **final_env = NULL, **our_env = NULL, **fd_names = NULL; @@ -1166,12 +1162,10 @@ static int service_spawn( pid_t pid; ExecParameters exec_params = { - .apply_permissions = apply_permissions, - .apply_chroot = apply_chroot, - .apply_tty_stdin = apply_tty_stdin, - .stdin_fd = -1, - .stdout_fd = -1, - .stderr_fd = -1, + .flags = flags, + .stdin_fd = -1, + .stdout_fd = -1, + .stderr_fd = -1, }; int r; @@ -1194,7 +1188,7 @@ static int service_spawn( if (r < 0) return r; - if (pass_fds || + if ((flags & EXEC_PASS_FDS) || s->exec_context.std_input == EXEC_INPUT_SOCKET || s->exec_context.std_output == EXEC_OUTPUT_SOCKET || s->exec_context.std_error == EXEC_OUTPUT_SOCKET) { @@ -1218,7 +1212,7 @@ static int service_spawn( if (!our_env) return -ENOMEM; - if (is_control ? s->notify_access == NOTIFY_ALL : s->notify_access != NOTIFY_NONE) + if ((flags & EXEC_IS_CONTROL) ? s->notify_access == NOTIFY_ALL : s->notify_access != NOTIFY_NONE) if (asprintf(our_env + n_env++, "NOTIFY_SOCKET=%s", UNIT(s)->manager->notify_socket) < 0) return -ENOMEM; @@ -1226,7 +1220,7 @@ static int service_spawn( if (asprintf(our_env + n_env++, "MAINPID="PID_FMT, s->main_pid) < 0) return -ENOMEM; - if (!MANAGER_IS_SYSTEM(UNIT(s)->manager)) + if (MANAGER_IS_USER(UNIT(s)->manager)) if (asprintf(our_env + n_env++, "MANAGERPID="PID_FMT, getpid()) < 0) return -ENOMEM; @@ -1266,18 +1260,18 @@ static int service_spawn( if (!final_env) return -ENOMEM; - if (is_control && UNIT(s)->cgroup_path) { + if ((flags & EXEC_IS_CONTROL) && UNIT(s)->cgroup_path) { path = strjoina(UNIT(s)->cgroup_path, "/control"); (void) cg_create(SYSTEMD_CGROUP_CONTROLLER, path); } else path = UNIT(s)->cgroup_path; exec_params.argv = argv; + exec_params.environment = final_env; exec_params.fds = fds; exec_params.fd_names = fd_names; exec_params.n_fds = n_fds; - exec_params.environment = final_env; - exec_params.confirm_spawn = UNIT(s)->manager->confirm_spawn; + exec_params.flags |= UNIT(s)->manager->confirm_spawn ? EXEC_CONFIRM_SPAWN : 0; exec_params.cgroup_supported = UNIT(s)->manager->cgroup_supported; exec_params.cgroup_path = path; exec_params.cgroup_delegate = s->cgroup_context.delegate; @@ -1465,11 +1459,9 @@ static void service_enter_stop_post(Service *s, ServiceResult f) { r = service_spawn(s, s->control_command, s->timeout_stop_usec, - false, - !s->permissions_start_only, - !s->root_directory_start_only, - true, - true, + (s->permissions_start_only ? 0 : EXEC_APPLY_PERMISSIONS) | + (s->root_directory_start_only ? 0 : EXEC_APPLY_CHROOT) | + EXEC_APPLY_TTY_STDIN | EXEC_IS_CONTROL, &s->control_pid); if (r < 0) goto fail; @@ -1580,11 +1572,9 @@ static void service_enter_stop(Service *s, ServiceResult f) { r = service_spawn(s, s->control_command, s->timeout_stop_usec, - false, - !s->permissions_start_only, - !s->root_directory_start_only, - false, - true, + (s->permissions_start_only ? 0 : EXEC_APPLY_PERMISSIONS) | + (s->root_directory_start_only ? 0 : EXEC_APPLY_CHROOT) | + EXEC_IS_CONTROL, &s->control_pid); if (r < 0) goto fail; @@ -1661,11 +1651,9 @@ static void service_enter_start_post(Service *s) { r = service_spawn(s, s->control_command, s->timeout_start_usec, - false, - !s->permissions_start_only, - !s->root_directory_start_only, - false, - true, + (s->permissions_start_only ? 0 : EXEC_APPLY_PERMISSIONS)| + (s->root_directory_start_only ? 0 : EXEC_APPLY_CHROOT)| + EXEC_IS_CONTROL, &s->control_pid); if (r < 0) goto fail; @@ -1735,11 +1723,7 @@ static void service_enter_start(Service *s) { r = service_spawn(s, c, timeout, - true, - true, - true, - true, - false, + EXEC_PASS_FDS|EXEC_APPLY_PERMISSIONS|EXEC_APPLY_CHROOT|EXEC_APPLY_TTY_STDIN, &pid); if (r < 0) goto fail; @@ -1798,11 +1782,9 @@ static void service_enter_start_pre(Service *s) { r = service_spawn(s, s->control_command, s->timeout_start_usec, - false, - !s->permissions_start_only, - !s->root_directory_start_only, - true, - true, + (s->permissions_start_only ? 0 : EXEC_APPLY_PERMISSIONS) | + (s->root_directory_start_only ? 0: EXEC_APPLY_CHROOT) | + EXEC_IS_CONTROL|EXEC_APPLY_TTY_STDIN, &s->control_pid); if (r < 0) goto fail; @@ -1877,11 +1859,9 @@ static void service_enter_reload(Service *s) { r = service_spawn(s, s->control_command, s->timeout_start_usec, - false, - !s->permissions_start_only, - !s->root_directory_start_only, - false, - true, + (s->permissions_start_only ? 0 : EXEC_APPLY_PERMISSIONS) | + (s->root_directory_start_only ? 0 : EXEC_APPLY_CHROOT) | + EXEC_IS_CONTROL, &s->control_pid); if (r < 0) goto fail; @@ -1919,12 +1899,10 @@ static void service_run_next_control(Service *s) { r = service_spawn(s, s->control_command, timeout, - false, - !s->permissions_start_only, - !s->root_directory_start_only, - s->control_command_id == SERVICE_EXEC_START_PRE || - s->control_command_id == SERVICE_EXEC_STOP_POST, - true, + (s->permissions_start_only ? 0 : EXEC_APPLY_PERMISSIONS) | + (s->root_directory_start_only ? 0 : EXEC_APPLY_CHROOT) | + (IN_SET(s->control_command_id, SERVICE_EXEC_START_PRE, SERVICE_EXEC_STOP_POST) ? EXEC_APPLY_TTY_STDIN : 0)| + EXEC_IS_CONTROL, &s->control_pid); if (r < 0) goto fail; @@ -1962,11 +1940,7 @@ static void service_run_next_main(Service *s) { r = service_spawn(s, s->main_command, s->timeout_start_usec, - true, - true, - true, - true, - false, + EXEC_PASS_FDS|EXEC_APPLY_PERMISSIONS|EXEC_APPLY_CHROOT|EXEC_APPLY_TTY_STDIN, &pid); if (r < 0) goto fail; diff --git a/src/core/socket.c b/src/core/socket.c index ff55885fb3..82363e2157 100644 --- a/src/core/socket.c +++ b/src/core/socket.c @@ -1664,12 +1664,10 @@ static int socket_spawn(Socket *s, ExecCommand *c, pid_t *_pid) { pid_t pid; int r; ExecParameters exec_params = { - .apply_permissions = true, - .apply_chroot = true, - .apply_tty_stdin = true, - .stdin_fd = -1, - .stdout_fd = -1, - .stderr_fd = -1, + .flags = EXEC_APPLY_PERMISSIONS|EXEC_APPLY_CHROOT|EXEC_APPLY_TTY_STDIN, + .stdin_fd = -1, + .stdout_fd = -1, + .stderr_fd = -1, }; assert(s); @@ -1700,7 +1698,7 @@ static int socket_spawn(Socket *s, ExecCommand *c, pid_t *_pid) { exec_params.argv = argv; exec_params.environment = UNIT(s)->manager->environment; - exec_params.confirm_spawn = UNIT(s)->manager->confirm_spawn; + exec_params.flags |= UNIT(s)->manager->confirm_spawn ? EXEC_CONFIRM_SPAWN : 0; exec_params.cgroup_supported = UNIT(s)->manager->cgroup_supported; exec_params.cgroup_path = UNIT(s)->cgroup_path; exec_params.cgroup_delegate = s->cgroup_context.delegate; diff --git a/src/core/swap.c b/src/core/swap.c index 66a318d01f..0ba4c4d881 100644 --- a/src/core/swap.c +++ b/src/core/swap.c @@ -611,12 +611,10 @@ static int swap_spawn(Swap *s, ExecCommand *c, pid_t *_pid) { pid_t pid; int r; ExecParameters exec_params = { - .apply_permissions = true, - .apply_chroot = true, - .apply_tty_stdin = true, - .stdin_fd = -1, - .stdout_fd = -1, - .stderr_fd = -1, + .flags = EXEC_APPLY_PERMISSIONS|EXEC_APPLY_CHROOT|EXEC_APPLY_TTY_STDIN, + .stdin_fd = -1, + .stdout_fd = -1, + .stderr_fd = -1, }; assert(s); @@ -642,7 +640,7 @@ static int swap_spawn(Swap *s, ExecCommand *c, pid_t *_pid) { goto fail; exec_params.environment = UNIT(s)->manager->environment; - exec_params.confirm_spawn = UNIT(s)->manager->confirm_spawn; + exec_params.flags |= UNIT(s)->manager->confirm_spawn ? EXEC_CONFIRM_SPAWN : 0; exec_params.cgroup_supported = UNIT(s)->manager->cgroup_supported; exec_params.cgroup_path = UNIT(s)->cgroup_path; exec_params.cgroup_delegate = s->cgroup_context.delegate; -- cgit v1.2.3-54-g00ecf From af9d16e10a23899b821af19e54e339486a86bd82 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Wed, 27 Jul 2016 11:50:37 +0200 Subject: core: use the correct APIs to determine whether a dual timestamp is initialized --- src/core/execute.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'src/core/execute.c') diff --git a/src/core/execute.c b/src/core/execute.c index bc0fd27402..9028139723 100644 --- a/src/core/execute.c +++ b/src/core/execute.c @@ -2969,12 +2969,12 @@ void exec_status_dump(ExecStatus *s, FILE *f, const char *prefix) { "%sPID: "PID_FMT"\n", prefix, s->pid); - if (s->start_timestamp.realtime > 0) + if (dual_timestamp_is_set(&s->start_timestamp)) fprintf(f, "%sStart Timestamp: %s\n", prefix, format_timestamp(buf, sizeof(buf), s->start_timestamp.realtime)); - if (s->exit_timestamp.realtime > 0) + if (dual_timestamp_is_set(&s->exit_timestamp)) fprintf(f, "%sExit Timestamp: %s\n" "%sExit Code: %s\n" -- cgit v1.2.3-54-g00ecf From b08af3b12706f352f651e70e117f6d6dcf11a911 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Thu, 4 Aug 2016 22:11:29 +0200 Subject: core: only set the watchdog variables in ExecStart= lines --- src/core/execute.c | 2 +- src/core/execute.h | 1 + src/core/service.c | 4 ++-- 3 files changed, 4 insertions(+), 3 deletions(-) (limited to 'src/core/execute.c') diff --git a/src/core/execute.c b/src/core/execute.c index 9028139723..7aafb1b058 100644 --- a/src/core/execute.c +++ b/src/core/execute.c @@ -1425,7 +1425,7 @@ static int build_environment( our_env[n_env++] = x; } - if (p->watchdog_usec > 0) { + if ((p->flags & EXEC_SET_WATCHDOG) && p->watchdog_usec > 0) { if (asprintf(&x, "WATCHDOG_PID="PID_FMT, getpid()) < 0) return -ENOMEM; our_env[n_env++] = x; diff --git a/src/core/execute.h b/src/core/execute.h index 2b4238ed7e..2d05ca39aa 100644 --- a/src/core/execute.h +++ b/src/core/execute.h @@ -218,6 +218,7 @@ typedef enum ExecFlags { EXEC_PASS_FDS = 1U << 4, EXEC_IS_CONTROL = 1U << 5, EXEC_SETENV_RESULT = 1U << 6, + EXEC_SET_WATCHDOG = 1U << 7, } ExecFlags; struct ExecParameters { diff --git a/src/core/service.c b/src/core/service.c index a6793b813b..c21653f5fa 100644 --- a/src/core/service.c +++ b/src/core/service.c @@ -1743,7 +1743,7 @@ static void service_enter_start(Service *s) { r = service_spawn(s, c, timeout, - EXEC_PASS_FDS|EXEC_APPLY_PERMISSIONS|EXEC_APPLY_CHROOT|EXEC_APPLY_TTY_STDIN, + EXEC_PASS_FDS|EXEC_APPLY_PERMISSIONS|EXEC_APPLY_CHROOT|EXEC_APPLY_TTY_STDIN|EXEC_SET_WATCHDOG, &pid); if (r < 0) goto fail; @@ -1955,7 +1955,7 @@ static void service_run_next_main(Service *s) { r = service_spawn(s, s->main_command, s->timeout_start_usec, - EXEC_PASS_FDS|EXEC_APPLY_PERMISSIONS|EXEC_APPLY_CHROOT|EXEC_APPLY_TTY_STDIN, + EXEC_PASS_FDS|EXEC_APPLY_PERMISSIONS|EXEC_APPLY_CHROOT|EXEC_APPLY_TTY_STDIN|EXEC_SET_WATCHDOG, &pid); if (r < 0) goto fail; -- cgit v1.2.3-54-g00ecf From 92b25bcabb75846928f5ea1417808dab981d5e25 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Fri, 29 Jul 2016 16:22:30 +0200 Subject: core: make use of uid_is_valid() when checking for UID validity --- src/core/execute.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'src/core/execute.c') diff --git a/src/core/execute.c b/src/core/execute.c index 6019df7ea6..18bb421cab 100644 --- a/src/core/execute.c +++ b/src/core/execute.c @@ -1862,7 +1862,7 @@ static int exec_child( return r; } - if (uid == UID_INVALID || gid == GID_INVALID) { + if (!uid_is_valid(uid) || !gid_is_valid(gid)) { *exit_status = EXIT_USER; return -ESRCH; } -- cgit v1.2.3-54-g00ecf From 00d9ef8560c252d8504be99cb38d1a54d35a9144 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Mon, 1 Aug 2016 19:24:40 +0200 Subject: core: add RemoveIPC= setting This adds the boolean RemoveIPC= setting to service, socket, mount and swap units (i.e. all unit types that may invoke processes). if turned on, and the unit's user/group is not root, all IPC objects of the user/group are removed when the service is shut down. The life-cycle of the IPC objects is hence bound to the unit life-cycle. This is particularly relevant for units with dynamic users, as it is essential that no objects owned by the dynamic users survive the service exiting. In fact, this patch adds code to imply RemoveIPC= if DynamicUser= is set. In order to communicate the UID/GID of an executed process back to PID 1 this adds a new "user lookup" socket pair, that is inherited into the forked processes, and closed before the exec(). This is needed since we cannot do NSS from PID 1 due to deadlock risks, However need to know the used UID/GID in order to clean up IPC owned by it if the unit shuts down. --- man/systemd.exec.xml | 60 ++++--- src/core/dbus-execute.c | 5 +- src/core/dbus-mount.c | 2 + src/core/dbus-service.c | 3 + src/core/dbus-socket.c | 2 + src/core/dbus-swap.c | 2 + src/core/execute.c | 46 ++++- src/core/execute.h | 1 + src/core/manager.c | 411 ++++++++++++++++++++++++++++++++++++++++++++- src/core/manager.h | 23 ++- src/core/mount.c | 2 + src/core/service.c | 3 + src/core/socket.c | 2 + src/core/swap.c | 2 + src/core/unit.c | 174 ++++++++++++++++++- src/core/unit.h | 18 +- src/login/logind-user.c | 11 +- src/shared/bus-unit-util.c | 2 +- src/shared/clean-ipc.c | 66 +++++--- src/shared/clean-ipc.h | 4 +- src/test/test-ipcrm.c | 2 +- 21 files changed, 777 insertions(+), 64 deletions(-) (limited to 'src/core/execute.c') diff --git a/man/systemd.exec.xml b/man/systemd.exec.xml index bf82326096..bcedebd5bb 100644 --- a/man/systemd.exec.xml +++ b/man/systemd.exec.xml @@ -160,14 +160,14 @@ use. However, UID/GIDs are recycled after a unit is terminated. Care should be taken that any processes running as part of a unit for which dynamic users/groups are enabled do not leave files or directories owned by these users/groups around, as a different unit might get the same UID/GID assigned later on, and thus gain access to - these files or directories. If DynamicUser= is enabled, PrivateTmp= is - implied. This ensures that the lifetime of temporary files created by the executed processes is bound to the - runtime of the service, and hence the lifetime of the dynamic user/group. Since /tmp and - /var/tmp are usually the only world-writable directories on a system this ensures that a - unit making use of dynamic user/group allocation cannot leave files around after unit termination. Use - RuntimeDirectory= (see below) in order to assign a writable runtime directory to a service, - owned by the dynamic user/group and removed automatically when the unit is terminated. Defaults to - off. + these files or directories. If DynamicUser= is enabled, RemoveIPC= and + PrivateTmp= are implied. This ensures that the lifetime of IPC objects and temporary files + created by the executed processes is bound to the runtime of the service, and hence the lifetime of the dynamic + user/group. Since /tmp and /var/tmp are usually the only + world-writable directories on a system this ensures that a unit making use of dynamic user/group allocation + cannot leave files around after unit termination. Use RuntimeDirectory= (see below) in order + to assign a writable runtime directory to a service, owned by the dynamic user/group and removed automatically + when the unit is terminated. Defaults to off. @@ -185,6 +185,18 @@ user. This does not affect commands prefixed with +. + + RemoveIPC= + + Takes a boolean parameter. If set, all System V and POSIX IPC objects owned by the user and + group the processes of this unit are run as are removed when the unit is stopped. This setting only has an + effect if at least one of User=, Group= and + DynamicUser= are used. It has no effect on IPC objects owned by the root user. Specifically, + this removes System V semaphores, as well as System V and POSIX shared memory segments and message queues. If + multiple units use the same user or group the IPC objects are removed when the last of these units is + stopped. This setting is implied if DynamicUser= is set. + + Nice= @@ -920,27 +932,19 @@ PrivateTmp= - Takes a boolean argument. If true, sets up a - new file system namespace for the executed processes and - mounts private /tmp and - /var/tmp directories inside it that is - not shared by processes outside of the namespace. This is - useful to secure access to temporary files of the process, but - makes sharing between processes via /tmp - or /var/tmp impossible. If this is - enabled, all temporary files created by a service in these - directories will be removed after the service is stopped. - Defaults to false. It is possible to run two or more units - within the same private /tmp and - /var/tmp namespace by using the + Takes a boolean argument. If true, sets up a new file system namespace for the executed + processes and mounts private /tmp and /var/tmp directories inside it + that is not shared by processes outside of the namespace. This is useful to secure access to temporary files of + the process, but makes sharing between processes via /tmp or /var/tmp + impossible. If this is enabled, all temporary files created by a service in these directories will be removed + after the service is stopped. Defaults to false. It is possible to run two or more units within the same + private /tmp and /var/tmp namespace by using the JoinsNamespaceOf= directive, see - systemd.unit5 - for details. Note that using this setting will disconnect - propagation of mounts from the service to the host - (propagation in the opposite direction continues to work). - This means that this setting may not be used for services - which shall be able to install mount points in the main mount - namespace. + systemd.unit5 for + details. Note that using this setting will disconnect propagation of mounts from the service to the host + (propagation in the opposite direction continues to work). This means that this setting may not be used for + services which shall be able to install mount points in the main mount namespace. This setting is implied if + DynamicUser= is set. diff --git a/src/core/dbus-execute.c b/src/core/dbus-execute.c index e35d3ccd2e..7e33a2d201 100644 --- a/src/core/dbus-execute.c +++ b/src/core/dbus-execute.c @@ -695,6 +695,7 @@ const sd_bus_vtable bus_exec_vtable[] = { SD_BUS_PROPERTY("User", "s", NULL, offsetof(ExecContext, user), SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("Group", "s", NULL, offsetof(ExecContext, group), SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("DynamicUser", "b", bus_property_get_bool, offsetof(ExecContext, dynamic_user), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("RemoveIPC", "b", bus_property_get_bool, offsetof(ExecContext, remove_ipc), SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("SupplementaryGroups", "as", NULL, offsetof(ExecContext, supplementary_groups), SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("PAMName", "s", NULL, offsetof(ExecContext, pam_name), SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("ReadWriteDirectories", "as", NULL, offsetof(ExecContext, read_write_paths), SD_BUS_VTABLE_PROPERTY_CONST|SD_BUS_VTABLE_HIDDEN), @@ -1071,7 +1072,7 @@ int bus_exec_context_set_transient_property( "IgnoreSIGPIPE", "TTYVHangup", "TTYReset", "PrivateTmp", "PrivateDevices", "PrivateNetwork", "PrivateUsers", "NoNewPrivileges", "SyslogLevelPrefix", "MemoryDenyWriteExecute", - "RestrictRealtime", "DynamicUser")) { + "RestrictRealtime", "DynamicUser", "RemoveIPC")) { int b; r = sd_bus_message_read(message, "b", &b); @@ -1103,6 +1104,8 @@ int bus_exec_context_set_transient_property( c->restrict_realtime = b; else if (streq(name, "DynamicUser")) c->dynamic_user = b; + else if (streq(name, "RemoveIPC")) + c->remove_ipc = b; unit_write_drop_in_private_format(u, mode, name, "%s=%s", name, yes_no(b)); } diff --git a/src/core/dbus-mount.c b/src/core/dbus-mount.c index b4bbee0648..3c6bda4073 100644 --- a/src/core/dbus-mount.c +++ b/src/core/dbus-mount.c @@ -117,6 +117,8 @@ const sd_bus_vtable bus_mount_vtable[] = { SD_BUS_PROPERTY("DirectoryMode", "u", bus_property_get_mode, offsetof(Mount, directory_mode), SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("SloppyOptions", "b", bus_property_get_bool, offsetof(Mount, sloppy_options), SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("Result", "s", property_get_result, offsetof(Mount, result), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_PROPERTY("UID", "u", NULL, offsetof(Unit, ref_uid), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_PROPERTY("GID", "u", NULL, offsetof(Unit, ref_gid), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), BUS_EXEC_COMMAND_VTABLE("ExecMount", offsetof(Mount, exec_command[MOUNT_EXEC_MOUNT]), SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION), BUS_EXEC_COMMAND_VTABLE("ExecUnmount", offsetof(Mount, exec_command[MOUNT_EXEC_UNMOUNT]), SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION), BUS_EXEC_COMMAND_VTABLE("ExecRemount", offsetof(Mount, exec_command[MOUNT_EXEC_REMOUNT]), SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION), diff --git a/src/core/dbus-service.c b/src/core/dbus-service.c index 5eac876a6e..3c55e0f7fe 100644 --- a/src/core/dbus-service.c +++ b/src/core/dbus-service.c @@ -65,6 +65,9 @@ const sd_bus_vtable bus_service_vtable[] = { SD_BUS_PROPERTY("Result", "s", property_get_result, offsetof(Service, result), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), SD_BUS_PROPERTY("USBFunctionDescriptors", "s", NULL, offsetof(Service, usb_function_descriptors), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), SD_BUS_PROPERTY("USBFunctionStrings", "s", NULL, offsetof(Service, usb_function_strings), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_PROPERTY("UID", "u", NULL, offsetof(Unit, ref_uid), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_PROPERTY("GID", "u", NULL, offsetof(Unit, ref_gid), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + BUS_EXEC_STATUS_VTABLE("ExecMain", offsetof(Service, main_exec_status), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), BUS_EXEC_COMMAND_LIST_VTABLE("ExecStartPre", offsetof(Service, exec_command[SERVICE_EXEC_START_PRE]), SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION), BUS_EXEC_COMMAND_LIST_VTABLE("ExecStart", offsetof(Service, exec_command[SERVICE_EXEC_START]), SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION), diff --git a/src/core/dbus-socket.c b/src/core/dbus-socket.c index 9a071a1355..21adb64e15 100644 --- a/src/core/dbus-socket.c +++ b/src/core/dbus-socket.c @@ -152,6 +152,8 @@ const sd_bus_vtable bus_socket_vtable[] = { SD_BUS_PROPERTY("SocketProtocol", "i", bus_property_get_int, offsetof(Socket, socket_protocol), SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("TriggerLimitIntervalUSec", "t", bus_property_get_usec, offsetof(Socket, trigger_limit.interval), SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("TriggerLimitBurst", "u", bus_property_get_unsigned, offsetof(Socket, trigger_limit.burst), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("UID", "u", NULL, offsetof(Unit, ref_uid), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_PROPERTY("GID", "u", NULL, offsetof(Unit, ref_gid), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), BUS_EXEC_COMMAND_LIST_VTABLE("ExecStartPre", offsetof(Socket, exec_command[SOCKET_EXEC_START_PRE]), SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION), BUS_EXEC_COMMAND_LIST_VTABLE("ExecStartPost", offsetof(Socket, exec_command[SOCKET_EXEC_START_POST]), SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION), BUS_EXEC_COMMAND_LIST_VTABLE("ExecStopPre", offsetof(Socket, exec_command[SOCKET_EXEC_STOP_PRE]), SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION), diff --git a/src/core/dbus-swap.c b/src/core/dbus-swap.c index 292f8738c6..85a2c26b98 100644 --- a/src/core/dbus-swap.c +++ b/src/core/dbus-swap.c @@ -84,6 +84,8 @@ const sd_bus_vtable bus_swap_vtable[] = { SD_BUS_PROPERTY("TimeoutUSec", "t", bus_property_get_usec, offsetof(Swap, timeout_usec), SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("ControlPID", "u", bus_property_get_pid, offsetof(Swap, control_pid), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), SD_BUS_PROPERTY("Result", "s", property_get_result, offsetof(Swap, result), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_PROPERTY("UID", "u", NULL, offsetof(Unit, ref_uid), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), + SD_BUS_PROPERTY("GID", "u", NULL, offsetof(Unit, ref_gid), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE), BUS_EXEC_COMMAND_VTABLE("ExecActivate", offsetof(Swap, exec_command[SWAP_EXEC_ACTIVATE]), SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION), BUS_EXEC_COMMAND_VTABLE("ExecDeactivate", offsetof(Swap, exec_command[SWAP_EXEC_DEACTIVATE]), SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION), SD_BUS_VTABLE_END diff --git a/src/core/execute.c b/src/core/execute.c index 18bb421cab..4c786a2e33 100644 --- a/src/core/execute.c +++ b/src/core/execute.c @@ -1723,11 +1723,12 @@ static int close_remaining_fds( const ExecParameters *params, ExecRuntime *runtime, DynamicCreds *dcreds, + int user_lookup_fd, int socket_fd, int *fds, unsigned n_fds) { unsigned n_dont_close = 0; - int dont_close[n_fds + 11]; + int dont_close[n_fds + 12]; assert(params); @@ -1755,9 +1756,40 @@ static int close_remaining_fds( append_socket_pair(dont_close, &n_dont_close, dcreds->group->storage_socket); } + if (user_lookup_fd >= 0) + dont_close[n_dont_close++] = user_lookup_fd; + return close_all_fds(dont_close, n_dont_close); } +static int send_user_lookup( + Unit *unit, + int user_lookup_fd, + uid_t uid, + gid_t gid) { + + assert(unit); + + /* Send the resolved UID/GID to PID 1 after we learnt it. We send a single datagram, containing the UID/GID + * data as well as the unit name. Note that we suppress sending this if no user/group to resolve was + * specified. */ + + if (user_lookup_fd < 0) + return 0; + + if (!uid_is_valid(uid) && !gid_is_valid(gid)) + return 0; + + if (writev(user_lookup_fd, + (struct iovec[]) { + { .iov_base = &uid, .iov_len = sizeof(uid) }, + { .iov_base = &gid, .iov_len = sizeof(gid) }, + { .iov_base = unit->id, .iov_len = strlen(unit->id) }}, 3) < 0) + return -errno; + + return 0; +} + static int exec_child( Unit *unit, ExecCommand *command, @@ -1769,6 +1801,7 @@ static int exec_child( int socket_fd, int *fds, unsigned n_fds, char **files_env, + int user_lookup_fd, int *exit_status) { _cleanup_strv_free_ char **our_env = NULL, **pass_env = NULL, **accum_env = NULL, **final_argv = NULL; @@ -1815,7 +1848,7 @@ static int exec_child( log_forget_fds(); - r = close_remaining_fds(params, runtime, dcreds, socket_fd, fds, n_fds); + r = close_remaining_fds(params, runtime, dcreds, user_lookup_fd, socket_fd, fds, n_fds); if (r < 0) { *exit_status = EXIT_FDS; return r; @@ -1902,6 +1935,14 @@ static int exec_child( } } + r = send_user_lookup(unit, user_lookup_fd, uid, gid); + if (r < 0) { + *exit_status = EXIT_USER; + return r; + } + + user_lookup_fd = safe_close(user_lookup_fd); + /* If a socket is connected to STDIN/STDOUT/STDERR, we * must sure to drop O_NONBLOCK */ if (socket_fd >= 0) @@ -2501,6 +2542,7 @@ int exec_spawn(Unit *unit, socket_fd, fds, n_fds, files_env, + unit->manager->user_lookup_fds[1], &exit_status); if (r < 0) { log_open(); diff --git a/src/core/execute.h b/src/core/execute.h index 106154f81a..6082c42aba 100644 --- a/src/core/execute.h +++ b/src/core/execute.h @@ -178,6 +178,7 @@ struct ExecContext { bool no_new_privileges; bool dynamic_user; + bool remove_ipc; /* This is not exposed to the user but available * internally. We need it to make sure that whenever we spawn diff --git a/src/core/manager.c b/src/core/manager.c index c20e185d78..6f2477eef4 100644 --- a/src/core/manager.c +++ b/src/core/manager.c @@ -45,6 +45,7 @@ #include "bus-error.h" #include "bus-kernel.h" #include "bus-util.h" +#include "clean-ipc.h" #include "dbus-job.h" #include "dbus-manager.h" #include "dbus-unit.h" @@ -81,6 +82,7 @@ #include "transaction.h" #include "umask-util.h" #include "unit-name.h" +#include "user-util.h" #include "util.h" #include "virt.h" #include "watchdog.h" @@ -98,6 +100,7 @@ static int manager_dispatch_cgroups_agent_fd(sd_event_source *source, int fd, ui static int manager_dispatch_signal_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata); static int manager_dispatch_time_change_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata); static int manager_dispatch_idle_pipe_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata); +static int manager_dispatch_user_lookup_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata); static int manager_dispatch_jobs_in_progress(sd_event_source *source, usec_t usec, void *userdata); static int manager_dispatch_run_queue(sd_event_source *source, void *userdata); static int manager_run_generators(Manager *m); @@ -590,6 +593,8 @@ int manager_new(UnitFileScope scope, bool test_run, Manager **_m) { m->dev_autofs_fd = m->private_listen_fd = m->kdbus_fd = m->cgroup_inotify_fd = m->ask_password_inotify_fd = -1; + m->user_lookup_fds[0] = m->user_lookup_fds[1] = -1; + m->current_job_id = 1; /* start as id #1, so that we can leave #0 around as "null-like" value */ m->have_ask_password = -EINVAL; /* we don't know */ @@ -812,6 +817,59 @@ static int manager_setup_cgroups_agent(Manager *m) { return 0; } +static int manager_setup_user_lookup_fd(Manager *m) { + int r; + + assert(m); + + /* Set up the socket pair used for passing UID/GID resolution results from forked off processes to PID + * 1. Background: we can't do name lookups (NSS) from PID 1, since it might involve IPC and thus activation, + * and we might hence deadlock on ourselves. Hence we do all user/group lookups asynchronously from the forked + * off processes right before executing the binaries to start. In order to be able to clean up any IPC objects + * created by a unit (see RemoveIPC=) we need to know in PID 1 the used UID/GID of the executed processes, + * hence we establish this communication channel so that forked off processes can pass their UID/GID + * information back to PID 1. The forked off processes send their resolved UID/GID to PID 1 in a simple + * datagram, along with their unit name, so that we can share one communication socket pair among all units for + * this purpose. + * + * You might wonder why we need a communication channel for this that is independent of the usual notification + * socket scheme (i.e. $NOTIFY_SOCKET). The primary difference is about trust: data sent via the $NOTIFY_SOCKET + * channel is only accepted if it originates from the right unit and if reception was enabled for it. The user + * lookup socket OTOH is only accessible by PID 1 and its children until they exec(), and always available. + * + * Note that this function is called under two circumstances: when we first initialize (in which case we + * allocate both the socket pair and the event source to listen on it), and when we deserialize after a reload + * (in which case the socket pair already exists but we still need to allocate the event source for it). */ + + if (m->user_lookup_fds[0] < 0) { + + /* Free all secondary fields */ + safe_close_pair(m->user_lookup_fds); + m->user_lookup_event_source = sd_event_source_unref(m->user_lookup_event_source); + + if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, m->user_lookup_fds) < 0) + return log_error_errno(errno, "Failed to allocate user lookup socket: %m"); + + (void) fd_inc_rcvbuf(m->user_lookup_fds[0], NOTIFY_RCVBUF_SIZE); + } + + if (!m->user_lookup_event_source) { + r = sd_event_add_io(m->event, &m->user_lookup_event_source, m->user_lookup_fds[0], EPOLLIN, manager_dispatch_user_lookup_fd, m); + if (r < 0) + return log_error_errno(errno, "Failed to allocate user lookup event source: %m"); + + /* Process even earlier than the notify event source, so that we always know first about valid UID/GID + * resolutions */ + r = sd_event_source_set_priority(m->user_lookup_event_source, SD_EVENT_PRIORITY_NORMAL-8); + if (r < 0) + return log_error_errno(errno, "Failed to set priority ot user lookup event source: %m"); + + (void) sd_event_source_set_description(m->user_lookup_event_source, "user-lookup"); + } + + return 0; +} + static int manager_connect_bus(Manager *m, bool reexecuting) { bool try_bus_connect; @@ -853,8 +911,7 @@ enum { _GC_OFFSET_MAX }; -static void unit_gc_mark_good(Unit *u, unsigned gc_marker) -{ +static void unit_gc_mark_good(Unit *u, unsigned gc_marker) { Iterator i; Unit *other; @@ -1021,12 +1078,14 @@ Manager* manager_free(Manager *m) { sd_event_source_unref(m->time_change_event_source); sd_event_source_unref(m->jobs_in_progress_event_source); sd_event_source_unref(m->run_queue_event_source); + sd_event_source_unref(m->user_lookup_event_source); safe_close(m->signal_fd); safe_close(m->notify_fd); safe_close(m->cgroups_agent_fd); safe_close(m->time_change_fd); safe_close(m->kdbus_fd); + safe_close_pair(m->user_lookup_fds); manager_close_ask_password(m); @@ -1052,6 +1111,9 @@ Manager* manager_free(Manager *m) { assert(hashmap_isempty(m->units_requiring_mounts_for)); hashmap_free(m->units_requiring_mounts_for); + hashmap_free(m->uid_refs); + hashmap_free(m->gid_refs); + free(m); return NULL; } @@ -1221,6 +1283,10 @@ int manager_startup(Manager *m, FILE *serialization, FDSet *fds) { if (q < 0 && r == 0) r = q; + q = manager_setup_user_lookup_fd(m); + if (q < 0 && r == 0) + r = q; + /* We might have deserialized the kdbus control fd, but if we * didn't, then let's create the bus now. */ manager_connect_bus(m, !!serialization); @@ -1232,6 +1298,10 @@ int manager_startup(Manager *m, FILE *serialization, FDSet *fds) { /* Release any dynamic users no longer referenced */ dynamic_user_vacuum(m, true); + /* Release any references to UIDs/GIDs no longer referenced, and destroy any IPC owned by them */ + manager_vacuum_uid_refs(m); + manager_vacuum_gid_refs(m); + if (serialization) { assert(m->n_reloading > 0); m->n_reloading--; @@ -2396,6 +2466,20 @@ int manager_serialize(Manager *m, FILE *f, FDSet *fds, bool switching_root) { fprintf(f, "cgroups-agent-fd=%i\n", copy); } + if (m->user_lookup_fds[0] >= 0) { + int copy0, copy1; + + copy0 = fdset_put_dup(fds, m->user_lookup_fds[0]); + if (copy0 < 0) + return copy0; + + copy1 = fdset_put_dup(fds, m->user_lookup_fds[1]); + if (copy1 < 0) + return copy1; + + fprintf(f, "user-lookup=%i %i\n", copy0, copy1); + } + if (m->kdbus_fd >= 0) { int copy; @@ -2412,6 +2496,9 @@ int manager_serialize(Manager *m, FILE *f, FDSet *fds, bool switching_root) { if (r < 0) return r; + manager_serialize_uid_refs(m, f); + manager_serialize_gid_refs(m, f); + fputc('\n', f); HASHMAP_FOREACH_KEY(u, t, m->units, i) { @@ -2578,6 +2665,18 @@ int manager_deserialize(Manager *m, FILE *f, FDSet *fds) { m->cgroups_agent_fd = fdset_remove(fds, fd); } + } else if (startswith(l, "user-lookup=")) { + int fd0, fd1; + + if (sscanf(l + 12, "%i %i", &fd0, &fd1) != 2 || fd0 < 0 || fd1 < 0 || fd0 == fd1 || !fdset_contains(fds, fd0) || !fdset_contains(fds, fd1)) + log_debug("Failed to parse user lookup fd: %s", l + 12); + else { + m->user_lookup_event_source = sd_event_source_unref(m->user_lookup_event_source); + safe_close_pair(m->user_lookup_fds); + m->user_lookup_fds[0] = fdset_remove(fds, fd0); + m->user_lookup_fds[1] = fdset_remove(fds, fd1); + } + } else if (startswith(l, "kdbus-fd=")) { int fd; @@ -2590,6 +2689,10 @@ int manager_deserialize(Manager *m, FILE *f, FDSet *fds) { } else if (startswith(l, "dynamic-user=")) dynamic_user_deserialize_one(m, l + 13, fds); + else if (startswith(l, "destroy-ipc-uid=")) + manager_deserialize_uid_refs_one(m, l + 16); + else if (startswith(l, "destroy-ipc-gid=")) + manager_deserialize_gid_refs_one(m, l + 16); else { int k; @@ -2672,6 +2775,8 @@ int manager_reload(Manager *m) { lookup_paths_flush_generator(&m->lookup_paths); lookup_paths_free(&m->lookup_paths); dynamic_user_vacuum(m, false); + m->uid_refs = hashmap_free(m->uid_refs); + m->gid_refs = hashmap_free(m->gid_refs); q = lookup_paths_init(&m->lookup_paths, m->unit_file_scope, 0, NULL); if (q < 0 && r >= 0) @@ -2705,12 +2810,20 @@ int manager_reload(Manager *m) { if (q < 0 && r >= 0) r = q; + q = manager_setup_user_lookup_fd(m); + if (q < 0 && r >= 0) + r = q; + /* Third, fire things up! */ manager_coldplug(m); /* Release any dynamic users no longer referenced */ dynamic_user_vacuum(m, true); + /* Release any references to UIDs/GIDs no longer referenced, and destroy any IPC owned by them */ + manager_vacuum_uid_refs(m); + manager_vacuum_gid_refs(m); + /* Sync current state of bus names with our set of listening units */ if (m->api_bus) manager_sync_bus_names(m, m->api_bus); @@ -3144,6 +3257,300 @@ ManagerState manager_state(Manager *m) { return MANAGER_RUNNING; } +#define DESTROY_IPC_FLAG (UINT32_C(1) << 31) + +static void manager_unref_uid_internal( + Manager *m, + Hashmap **uid_refs, + uid_t uid, + bool destroy_now, + int (*_clean_ipc)(uid_t uid)) { + + uint32_t c, n; + + assert(m); + assert(uid_refs); + assert(uid_is_valid(uid)); + assert(_clean_ipc); + + /* A generic implementation, covering both manager_unref_uid() and manager_unref_gid(), under the assumption + * that uid_t and gid_t are actually defined the same way, with the same validity rules. + * + * We store a hashmap where the UID/GID is they key and the value is a 32bit reference counter, whose highest + * bit is used as flag for marking UIDs/GIDs whose IPC objects to remove when the last reference to the UID/GID + * is dropped. The flag is set to on, once at least one reference from a unit where RemoveIPC= is set is added + * on a UID/GID. It is reset when the UID's/GID's reference counter drops to 0 again. */ + + assert_cc(sizeof(uid_t) == sizeof(gid_t)); + assert_cc(UID_INVALID == (uid_t) GID_INVALID); + + if (uid == 0) /* We don't keep track of root, and will never destroy it */ + return; + + c = PTR_TO_UINT32(hashmap_get(*uid_refs, UID_TO_PTR(uid))); + + n = c & ~DESTROY_IPC_FLAG; + assert(n > 0); + n--; + + if (destroy_now && n == 0) { + hashmap_remove(*uid_refs, UID_TO_PTR(uid)); + + if (c & DESTROY_IPC_FLAG) { + log_debug("%s " UID_FMT " is no longer referenced, cleaning up its IPC.", + _clean_ipc == clean_ipc_by_uid ? "UID" : "GID", + uid); + (void) _clean_ipc(uid); + } + } else { + c = n | (c & DESTROY_IPC_FLAG); + assert_se(hashmap_update(*uid_refs, UID_TO_PTR(uid), UINT32_TO_PTR(c)) >= 0); + } +} + +void manager_unref_uid(Manager *m, uid_t uid, bool destroy_now) { + manager_unref_uid_internal(m, &m->uid_refs, uid, destroy_now, clean_ipc_by_uid); +} + +void manager_unref_gid(Manager *m, gid_t gid, bool destroy_now) { + manager_unref_uid_internal(m, &m->gid_refs, (uid_t) gid, destroy_now, clean_ipc_by_gid); +} + +static int manager_ref_uid_internal( + Manager *m, + Hashmap **uid_refs, + uid_t uid, + bool clean_ipc) { + + uint32_t c, n; + int r; + + assert(m); + assert(uid_refs); + assert(uid_is_valid(uid)); + + /* A generic implementation, covering both manager_ref_uid() and manager_ref_gid(), under the assumption + * that uid_t and gid_t are actually defined the same way, with the same validity rules. */ + + assert_cc(sizeof(uid_t) == sizeof(gid_t)); + assert_cc(UID_INVALID == (uid_t) GID_INVALID); + + if (uid == 0) /* We don't keep track of root, and will never destroy it */ + return 0; + + r = hashmap_ensure_allocated(uid_refs, &trivial_hash_ops); + if (r < 0) + return r; + + c = PTR_TO_UINT32(hashmap_get(*uid_refs, UID_TO_PTR(uid))); + + n = c & ~DESTROY_IPC_FLAG; + n++; + + if (n & DESTROY_IPC_FLAG) /* check for overflow */ + return -EOVERFLOW; + + c = n | (c & DESTROY_IPC_FLAG) | (clean_ipc ? DESTROY_IPC_FLAG : 0); + + return hashmap_replace(*uid_refs, UID_TO_PTR(uid), UINT32_TO_PTR(c)); +} + +int manager_ref_uid(Manager *m, uid_t uid, bool clean_ipc) { + return manager_ref_uid_internal(m, &m->uid_refs, uid, clean_ipc); +} + +int manager_ref_gid(Manager *m, gid_t gid, bool clean_ipc) { + return manager_ref_uid_internal(m, &m->gid_refs, (uid_t) gid, clean_ipc); +} + +static void manager_vacuum_uid_refs_internal( + Manager *m, + Hashmap **uid_refs, + int (*_clean_ipc)(uid_t uid)) { + + Iterator i; + void *p, *k; + + assert(m); + assert(uid_refs); + assert(_clean_ipc); + + HASHMAP_FOREACH_KEY(p, k, *uid_refs, i) { + uint32_t c, n; + uid_t uid; + + uid = PTR_TO_UID(k); + c = PTR_TO_UINT32(p); + + n = c & ~DESTROY_IPC_FLAG; + if (n > 0) + continue; + + if (c & DESTROY_IPC_FLAG) { + log_debug("Found unreferenced %s " UID_FMT " after reload/reexec. Cleaning up.", + _clean_ipc == clean_ipc_by_uid ? "UID" : "GID", + uid); + (void) _clean_ipc(uid); + } + + assert_se(hashmap_remove(*uid_refs, k) == p); + } +} + +void manager_vacuum_uid_refs(Manager *m) { + manager_vacuum_uid_refs_internal(m, &m->uid_refs, clean_ipc_by_uid); +} + +void manager_vacuum_gid_refs(Manager *m) { + manager_vacuum_uid_refs_internal(m, &m->gid_refs, clean_ipc_by_gid); +} + +static void manager_serialize_uid_refs_internal( + Manager *m, + FILE *f, + Hashmap **uid_refs, + const char *field_name) { + + Iterator i; + void *p, *k; + + assert(m); + assert(f); + assert(uid_refs); + assert(field_name); + + /* Serialize the UID reference table. Or actually, just the IPC destruction flag of it, as the actual counter + * of it is better rebuild after a reload/reexec. */ + + HASHMAP_FOREACH_KEY(p, k, *uid_refs, i) { + uint32_t c; + uid_t uid; + + uid = PTR_TO_UID(k); + c = PTR_TO_UINT32(p); + + if (!(c & DESTROY_IPC_FLAG)) + continue; + + fprintf(f, "%s=" UID_FMT "\n", field_name, uid); + } +} + +void manager_serialize_uid_refs(Manager *m, FILE *f) { + manager_serialize_uid_refs_internal(m, f, &m->uid_refs, "destroy-ipc-uid"); +} + +void manager_serialize_gid_refs(Manager *m, FILE *f) { + manager_serialize_uid_refs_internal(m, f, &m->gid_refs, "destroy-ipc-gid"); +} + +static void manager_deserialize_uid_refs_one_internal( + Manager *m, + Hashmap** uid_refs, + const char *value) { + + uid_t uid; + uint32_t c; + int r; + + assert(m); + assert(uid_refs); + assert(value); + + r = parse_uid(value, &uid); + if (r < 0 || uid == 0) { + log_debug("Unable to parse UID reference serialization"); + return; + } + + r = hashmap_ensure_allocated(uid_refs, &trivial_hash_ops); + if (r < 0) { + log_oom(); + return; + } + + c = PTR_TO_UINT32(hashmap_get(*uid_refs, UID_TO_PTR(uid))); + if (c & DESTROY_IPC_FLAG) + return; + + c |= DESTROY_IPC_FLAG; + + r = hashmap_replace(*uid_refs, UID_TO_PTR(uid), UINT32_TO_PTR(c)); + if (r < 0) { + log_debug("Failed to add UID reference entry"); + return; + } +} + +void manager_deserialize_uid_refs_one(Manager *m, const char *value) { + manager_deserialize_uid_refs_one_internal(m, &m->uid_refs, value); +} + +void manager_deserialize_gid_refs_one(Manager *m, const char *value) { + manager_deserialize_uid_refs_one_internal(m, &m->gid_refs, value); +} + +int manager_dispatch_user_lookup_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata) { + struct buffer { + uid_t uid; + gid_t gid; + char unit_name[UNIT_NAME_MAX+1]; + } _packed_ buffer; + + Manager *m = userdata; + ssize_t l; + size_t n; + Unit *u; + + assert_se(source); + assert_se(m); + + /* Invoked whenever a child process succeeded resolving its user/group to use and sent us the resulting UID/GID + * in a datagram. We parse the datagram here and pass it off to the unit, so that it can add a reference to the + * UID/GID so that it can destroy the UID/GID's IPC objects when the reference counter drops to 0. */ + + l = recv(fd, &buffer, sizeof(buffer), MSG_DONTWAIT); + if (l < 0) { + if (errno == EINTR || errno == EAGAIN) + return 0; + + return log_error_errno(errno, "Failed to read from user lookup fd: %m"); + } + + if ((size_t) l <= offsetof(struct buffer, unit_name)) { + log_warning("Received too short user lookup message, ignoring."); + return 0; + } + + if ((size_t) l > offsetof(struct buffer, unit_name) + UNIT_NAME_MAX) { + log_warning("Received too long user lookup message, ignoring."); + return 0; + } + + if (!uid_is_valid(buffer.uid) && !gid_is_valid(buffer.gid)) { + log_warning("Got user lookup message with invalid UID/GID pair, ignoring."); + return 0; + } + + n = (size_t) l - offsetof(struct buffer, unit_name); + if (memchr(buffer.unit_name, 0, n)) { + log_warning("Received lookup message with embedded NUL character, ignoring."); + return 0; + } + + buffer.unit_name[n] = 0; + u = manager_get_unit(m, buffer.unit_name); + if (!u) { + log_debug("Got user lookup message but unit doesn't exist, ignoring."); + return 0; + } + + log_unit_debug(u, "User lookup succeeded: uid=" UID_FMT " gid=" GID_FMT, buffer.uid, buffer.gid); + + unit_notify_user_lookup(u, buffer.uid, buffer.gid); + return 0; +} + static const char *const manager_state_table[_MANAGER_STATE_MAX] = { [MANAGER_INITIALIZING] = "initializing", [MANAGER_STARTING] = "starting", diff --git a/src/core/manager.h b/src/core/manager.h index c681d5dc46..b9f2e4b5a1 100644 --- a/src/core/manager.h +++ b/src/core/manager.h @@ -143,6 +143,9 @@ struct Manager { sd_event_source *jobs_in_progress_event_source; + int user_lookup_fds[2]; + sd_event_source *user_lookup_event_source; + UnitFileScope unit_file_scope; LookupPaths lookup_paths; Set *unit_path_cache; @@ -234,7 +237,6 @@ struct Manager { bool dispatching_dbus_queue:1; bool taint_usr:1; - bool test_run:1; /* If non-zero, exit with the following value when the systemd @@ -301,6 +303,10 @@ struct Manager { /* Dynamic users/groups, indexed by their name */ Hashmap *dynamic_users; + /* Keep track of all UIDs and GIDs any of our services currently use. This is useful for the RemoveIPC= logic. */ + Hashmap *uid_refs; + Hashmap *gid_refs; + /* When the user hits C-A-D more than 7 times per 2s, reboot immediately... */ RateLimit ctrl_alt_del_ratelimit; @@ -378,5 +384,20 @@ ManagerState manager_state(Manager *m); int manager_update_failed_units(Manager *m, Unit *u, bool failed); +void manager_unref_uid(Manager *m, uid_t uid, bool destroy_now); +int manager_ref_uid(Manager *m, uid_t uid, bool clean_ipc); + +void manager_unref_gid(Manager *m, gid_t gid, bool destroy_now); +int manager_ref_gid(Manager *m, gid_t gid, bool destroy_now); + +void manager_vacuum_uid_refs(Manager *m); +void manager_vacuum_gid_refs(Manager *m); + +void manager_serialize_uid_refs(Manager *m, FILE *f); +void manager_deserialize_uid_refs_one(Manager *m, const char *value); + +void manager_serialize_gid_refs(Manager *m, FILE *f); +void manager_deserialize_gid_refs_one(Manager *m, const char *value); + const char *manager_state_to_string(ManagerState m) _const_; ManagerState manager_state_from_string(const char *s) _pure_; diff --git a/src/core/mount.c b/src/core/mount.c index f3ccf6d48a..f2ac8d171f 100644 --- a/src/core/mount.c +++ b/src/core/mount.c @@ -769,6 +769,8 @@ static void mount_enter_dead(Mount *m, MountResult f) { exec_context_destroy_runtime_directory(&m->exec_context, manager_get_runtime_prefix(UNIT(m)->manager)); + unit_unref_uid_gid(UNIT(m), true); + dynamic_creds_destroy(&m->dynamic_creds); } diff --git a/src/core/service.c b/src/core/service.c index 4a37702f52..1951ba9222 100644 --- a/src/core/service.c +++ b/src/core/service.c @@ -1471,6 +1471,9 @@ static void service_enter_dead(Service *s, ServiceResult f, bool allow_restart) /* Also, remove the runtime directory */ exec_context_destroy_runtime_directory(&s->exec_context, manager_get_runtime_prefix(UNIT(s)->manager)); + /* Get rid of the IPC bits of the user */ + unit_unref_uid_gid(UNIT(s), true); + /* Release the user, and destroy it if we are the only remaining owner */ dynamic_creds_destroy(&s->dynamic_creds); diff --git a/src/core/socket.c b/src/core/socket.c index 50872e8366..70d55dd9ed 100644 --- a/src/core/socket.c +++ b/src/core/socket.c @@ -1905,6 +1905,8 @@ static void socket_enter_dead(Socket *s, SocketResult f) { exec_context_destroy_runtime_directory(&s->exec_context, manager_get_runtime_prefix(UNIT(s)->manager)); + unit_unref_uid_gid(UNIT(s), true); + dynamic_creds_destroy(&s->dynamic_creds); } diff --git a/src/core/swap.c b/src/core/swap.c index 2c802da3b5..fb222b6858 100644 --- a/src/core/swap.c +++ b/src/core/swap.c @@ -683,6 +683,8 @@ static void swap_enter_dead(Swap *s, SwapResult f) { exec_context_destroy_runtime_directory(&s->exec_context, manager_get_runtime_prefix(UNIT(s)->manager)); + unit_unref_uid_gid(UNIT(s), true); + dynamic_creds_destroy(&s->dynamic_creds); } diff --git a/src/core/unit.c b/src/core/unit.c index 952604e0db..4b8d81c3f1 100644 --- a/src/core/unit.c +++ b/src/core/unit.c @@ -100,7 +100,8 @@ Unit *unit_new(Manager *m, size_t size) { u->on_failure_job_mode = JOB_REPLACE; u->cgroup_inotify_wd = -1; u->job_timeout = USEC_INFINITY; - u->sigchldgen = 0; + u->ref_uid = UID_INVALID; + u->ref_gid = GID_INVALID; RATELIMIT_INIT(u->start_limit, m->default_start_limit_interval, m->default_start_limit_burst); RATELIMIT_INIT(u->auto_stop_ratelimit, 10 * USEC_PER_SEC, 16); @@ -550,6 +551,8 @@ void unit_free(Unit *u) { unit_release_cgroup(u); + unit_unref_uid_gid(u, false); + (void) manager_update_failed_units(u->manager, u, false); set_remove(u->manager->startup_units, u); @@ -2614,6 +2617,11 @@ int unit_serialize(Unit *u, FILE *f, FDSet *fds, bool serialize_jobs) { unit_serialize_item(u, f, "cgroup", u->cgroup_path); unit_serialize_item(u, f, "cgroup-realized", yes_no(u->cgroup_realized)); + if (uid_is_valid(u->ref_uid)) + unit_serialize_item_format(u, f, "ref-uid", UID_FMT, u->ref_uid); + if (gid_is_valid(u->ref_gid)) + unit_serialize_item_format(u, f, "ref-gid", GID_FMT, u->ref_gid); + if (serialize_jobs) { if (u->job) { fprintf(f, "job\n"); @@ -2851,6 +2859,28 @@ int unit_deserialize(Unit *u, FILE *f, FDSet *fds) { u->cgroup_realized = b; continue; + + } else if (streq(l, "ref-uid")) { + uid_t uid; + + r = parse_uid(v, &uid); + if (r < 0) + log_unit_debug(u, "Failed to parse referenced UID %s, ignoring.", v); + else + unit_ref_uid_gid(u, uid, GID_INVALID); + + continue; + + } else if (streq(l, "ref-gid")) { + gid_t gid; + + r = parse_gid(v, &gid); + if (r < 0) + log_unit_debug(u, "Failed to parse referenced GID %s, ignoring.", v); + else + unit_ref_uid_gid(u, UID_INVALID, gid); + + continue; } if (unit_can_serialize(u)) { @@ -3310,6 +3340,7 @@ int unit_patch_contexts(Unit *u) { } ec->private_tmp = true; + ec->remove_ipc = true; } } @@ -3932,3 +3963,144 @@ pid_t unit_main_pid(Unit *u) { return 0; } + +static void unit_unref_uid_internal( + Unit *u, + uid_t *ref_uid, + bool destroy_now, + void (*_manager_unref_uid)(Manager *m, uid_t uid, bool destroy_now)) { + + assert(u); + assert(ref_uid); + assert(_manager_unref_uid); + + /* Generic implementation of both unit_unref_uid() and unit_unref_gid(), under the assumption that uid_t and + * gid_t are actually the same time, with the same validity rules. + * + * Drops a reference to UID/GID from a unit. */ + + assert_cc(sizeof(uid_t) == sizeof(gid_t)); + assert_cc(UID_INVALID == (uid_t) GID_INVALID); + + if (!uid_is_valid(*ref_uid)) + return; + + _manager_unref_uid(u->manager, *ref_uid, destroy_now); + *ref_uid = UID_INVALID; +} + +void unit_unref_uid(Unit *u, bool destroy_now) { + unit_unref_uid_internal(u, &u->ref_uid, destroy_now, manager_unref_uid); +} + +void unit_unref_gid(Unit *u, bool destroy_now) { + unit_unref_uid_internal(u, (uid_t*) &u->ref_gid, destroy_now, manager_unref_gid); +} + +static int unit_ref_uid_internal( + Unit *u, + uid_t *ref_uid, + uid_t uid, + bool clean_ipc, + int (*_manager_ref_uid)(Manager *m, uid_t uid, bool clean_ipc)) { + + int r; + + assert(u); + assert(ref_uid); + assert(uid_is_valid(uid)); + assert(_manager_ref_uid); + + /* Generic implementation of both unit_ref_uid() and unit_ref_guid(), under the assumption that uid_t and gid_t + * are actually the same type, and have the same validity rules. + * + * Adds a reference on a specific UID/GID to this unit. Each unit referencing the same UID/GID maintains a + * reference so that we can destroy the UID/GID's IPC resources as soon as this is requested and the counter + * drops to zero. */ + + assert_cc(sizeof(uid_t) == sizeof(gid_t)); + assert_cc(UID_INVALID == (uid_t) GID_INVALID); + + if (*ref_uid == uid) + return 0; + + if (uid_is_valid(*ref_uid)) /* Already set? */ + return -EBUSY; + + r = _manager_ref_uid(u->manager, uid, clean_ipc); + if (r < 0) + return r; + + *ref_uid = uid; + return 1; +} + +int unit_ref_uid(Unit *u, uid_t uid, bool clean_ipc) { + return unit_ref_uid_internal(u, &u->ref_uid, uid, clean_ipc, manager_ref_uid); +} + +int unit_ref_gid(Unit *u, gid_t gid, bool clean_ipc) { + return unit_ref_uid_internal(u, (uid_t*) &u->ref_gid, (uid_t) gid, clean_ipc, manager_ref_gid); +} + +static int unit_ref_uid_gid_internal(Unit *u, uid_t uid, gid_t gid, bool clean_ipc) { + int r = 0, q = 0; + + assert(u); + + /* Reference both a UID and a GID in one go. Either references both, or neither. */ + + if (uid_is_valid(uid)) { + r = unit_ref_uid(u, uid, clean_ipc); + if (r < 0) + return r; + } + + if (gid_is_valid(gid)) { + q = unit_ref_gid(u, gid, clean_ipc); + if (q < 0) { + if (r > 0) + unit_unref_uid(u, false); + + return q; + } + } + + return r > 0 || q > 0; +} + +int unit_ref_uid_gid(Unit *u, uid_t uid, gid_t gid) { + ExecContext *c; + int r; + + assert(u); + + c = unit_get_exec_context(u); + + r = unit_ref_uid_gid_internal(u, uid, gid, c ? c->remove_ipc : false); + if (r < 0) + return log_unit_warning_errno(u, r, "Couldn't add UID/GID reference to unit, proceeding without: %m"); + + return r; +} + +void unit_unref_uid_gid(Unit *u, bool destroy_now) { + assert(u); + + unit_unref_uid(u, destroy_now); + unit_unref_gid(u, destroy_now); +} + +void unit_notify_user_lookup(Unit *u, uid_t uid, gid_t gid) { + int r; + + assert(u); + + /* This is invoked whenever one of the forked off processes let's us know the UID/GID its user name/group names + * resolved to. We keep track of which UID/GID is currently assigned in order to be able to destroy its IPC + * objects when no service references the UID/GID anymore. */ + + r = unit_ref_uid_gid(u, uid, gid); + if (r > 0) + bus_unit_send_change_signal(u); +} diff --git a/src/core/unit.h b/src/core/unit.h index 513ea1614c..53875653d7 100644 --- a/src/core/unit.h +++ b/src/core/unit.h @@ -180,6 +180,10 @@ struct Unit { /* Make sure we never enter endless loops with the check unneeded logic, or the BindsTo= logic */ RateLimit auto_stop_ratelimit; + /* Reference to a specific UID/GID */ + uid_t ref_uid; + gid_t ref_gid; + /* Cached unit file state and preset */ UnitFileState unit_file_state; int unit_file_preset; @@ -371,8 +375,7 @@ struct UnitVTable { /* Called whenever a process of this unit sends us a message */ void (*notify_message)(Unit *u, pid_t pid, char **tags, FDSet *fds); - /* Called whenever a name this Unit registered for comes or - * goes away. */ + /* Called whenever a name this Unit registered for comes or goes away. */ void (*bus_name_owner_change)(Unit *u, const char *name, const char *old_owner, const char *new_owner); /* Called for each property that is being set */ @@ -621,6 +624,17 @@ int unit_fail_if_symlink(Unit *u, const char* where); int unit_start_limit_test(Unit *u); +void unit_unref_uid(Unit *u, bool destroy_now); +int unit_ref_uid(Unit *u, uid_t uid, bool clean_ipc); + +void unit_unref_gid(Unit *u, bool destroy_now); +int unit_ref_gid(Unit *u, gid_t gid, bool clean_ipc); + +int unit_ref_uid_gid(Unit *u, uid_t uid, gid_t gid); +void unit_unref_uid_gid(Unit *u, bool destroy_now); + +void unit_notify_user_lookup(Unit *u, uid_t uid, gid_t gid); + /* Macros which append UNIT= or USER_UNIT= to the message */ #define log_unit_full(unit, level, error, ...) \ diff --git a/src/login/logind-user.c b/src/login/logind-user.c index 63363035e7..11951aca5b 100644 --- a/src/login/logind-user.c +++ b/src/login/logind-user.c @@ -612,9 +612,14 @@ int user_finalize(User *u) { if (k < 0) r = k; - /* Clean SysV + POSIX IPC objects */ - if (u->manager->remove_ipc) { - k = clean_ipc(u->uid); + /* Clean SysV + POSIX IPC objects, but only if this is not a system user. Background: in many setups cronjobs + * are run in full PAM and thus logind sessions, even if the code run doesn't belong to actual users but to + * system components. Since enable RemoveIPC= globally for all users, we need to be a bit careful with such + * cases, as we shouldn't accidentally remove a system service's IPC objects while it is running, just because + * a cronjob running as the same user just finished. Hence: exclude system users generally from IPC clean-up, + * and do it only for normal users. */ + if (u->manager->remove_ipc && u->uid > SYSTEM_UID_MAX) { + k = clean_ipc_by_uid(u->uid); if (k < 0) r = k; } diff --git a/src/shared/bus-unit-util.c b/src/shared/bus-unit-util.c index f9e12e0578..ab30afb527 100644 --- a/src/shared/bus-unit-util.c +++ b/src/shared/bus-unit-util.c @@ -204,7 +204,7 @@ int bus_append_unit_property_assignment(sd_bus_message *m, const char *assignmen "IgnoreSIGPIPE", "TTYVHangup", "TTYReset", "RemainAfterExit", "PrivateTmp", "PrivateDevices", "PrivateNetwork", "PrivateUsers", "NoNewPrivileges", "SyslogLevelPrefix", "Delegate", "RemainAfterElapse", "MemoryDenyWriteExecute", - "RestrictRealtime", "DynamicUser")) { + "RestrictRealtime", "DynamicUser", "RemoveIPC")) { r = parse_boolean(eq); if (r < 0) diff --git a/src/shared/clean-ipc.c b/src/shared/clean-ipc.c index 95686348c1..64f9b94641 100644 --- a/src/shared/clean-ipc.c +++ b/src/shared/clean-ipc.c @@ -41,8 +41,20 @@ #include "macro.h" #include "string-util.h" #include "strv.h" +#include "user-util.h" -static int clean_sysvipc_shm(uid_t delete_uid) { +static bool match_uid_gid(uid_t subject_uid, gid_t subject_gid, uid_t delete_uid, gid_t delete_gid) { + + if (uid_is_valid(delete_uid) && subject_uid == delete_uid) + return true; + + if (gid_is_valid(delete_gid) && subject_gid == delete_gid) + return true; + + return false; +} + +static int clean_sysvipc_shm(uid_t delete_uid, gid_t delete_gid) { _cleanup_fclose_ FILE *f = NULL; char line[LINE_MAX]; bool first = true; @@ -77,7 +89,7 @@ static int clean_sysvipc_shm(uid_t delete_uid) { if (n_attached > 0) continue; - if (uid != delete_uid) + if (!match_uid_gid(uid, gid, delete_uid, delete_gid)) continue; if (shmctl(shmid, IPC_RMID, NULL) < 0) { @@ -98,7 +110,7 @@ fail: return log_warning_errno(errno, "Failed to read /proc/sysvipc/shm: %m"); } -static int clean_sysvipc_sem(uid_t delete_uid) { +static int clean_sysvipc_sem(uid_t delete_uid, gid_t delete_gid) { _cleanup_fclose_ FILE *f = NULL; char line[LINE_MAX]; bool first = true; @@ -128,7 +140,7 @@ static int clean_sysvipc_sem(uid_t delete_uid) { &semid, &uid, &gid, &cuid, &cgid) != 5) continue; - if (uid != delete_uid) + if (!match_uid_gid(uid, gid, delete_uid, delete_gid)) continue; if (semctl(semid, 0, IPC_RMID) < 0) { @@ -149,7 +161,7 @@ fail: return log_warning_errno(errno, "Failed to read /proc/sysvipc/sem: %m"); } -static int clean_sysvipc_msg(uid_t delete_uid) { +static int clean_sysvipc_msg(uid_t delete_uid, gid_t delete_gid) { _cleanup_fclose_ FILE *f = NULL; char line[LINE_MAX]; bool first = true; @@ -180,7 +192,7 @@ static int clean_sysvipc_msg(uid_t delete_uid) { &msgid, &cpid, &lpid, &uid, &gid, &cuid, &cgid) != 7) continue; - if (uid != delete_uid) + if (!match_uid_gid(uid, gid, delete_uid, delete_gid)) continue; if (msgctl(msgid, IPC_RMID, NULL) < 0) { @@ -201,7 +213,7 @@ fail: return log_warning_errno(errno, "Failed to read /proc/sysvipc/msg: %m"); } -static int clean_posix_shm_internal(DIR *dir, uid_t uid) { +static int clean_posix_shm_internal(DIR *dir, uid_t uid, gid_t gid) { struct dirent *de; int ret = 0, r; @@ -221,7 +233,7 @@ static int clean_posix_shm_internal(DIR *dir, uid_t uid) { continue; } - if (st.st_uid != uid) + if (!match_uid_gid(st.st_uid, st.st_gid, uid, gid)) continue; if (S_ISDIR(st.st_mode)) { @@ -232,7 +244,7 @@ static int clean_posix_shm_internal(DIR *dir, uid_t uid) { if (errno != ENOENT) ret = log_warning_errno(errno, "Failed to enter shared memory directory %s: %m", de->d_name); } else { - r = clean_posix_shm_internal(kid, uid); + r = clean_posix_shm_internal(kid, uid, gid); if (r < 0) ret = r; } @@ -262,7 +274,7 @@ fail: return log_warning_errno(errno, "Failed to read /dev/shm: %m"); } -static int clean_posix_shm(uid_t uid) { +static int clean_posix_shm(uid_t uid, gid_t gid) { _cleanup_closedir_ DIR *dir = NULL; dir = opendir("/dev/shm"); @@ -273,10 +285,10 @@ static int clean_posix_shm(uid_t uid) { return log_warning_errno(errno, "Failed to open /dev/shm: %m"); } - return clean_posix_shm_internal(dir, uid); + return clean_posix_shm_internal(dir, uid, gid); } -static int clean_posix_mq(uid_t uid) { +static int clean_posix_mq(uid_t uid, gid_t gid) { _cleanup_closedir_ DIR *dir = NULL; struct dirent *de; int ret = 0; @@ -306,7 +318,7 @@ static int clean_posix_mq(uid_t uid) { continue; } - if (st.st_uid != uid) + if (!match_uid_gid(st.st_uid, st.st_gid, uid, gid)) continue; fn[0] = '/'; @@ -328,32 +340,44 @@ fail: return log_warning_errno(errno, "Failed to read /dev/mqueue: %m"); } -int clean_ipc(uid_t uid) { +int clean_ipc(uid_t uid, gid_t gid) { int ret = 0, r; - /* Refuse to clean IPC of the root and system users */ - if (uid <= SYSTEM_UID_MAX) + /* Anything to do? */ + if (!uid_is_valid(uid) && !gid_is_valid(gid)) return 0; - r = clean_sysvipc_shm(uid); + /* Refuse to clean IPC of the root user */ + if (uid == 0 && gid == 0) + return 0; + + r = clean_sysvipc_shm(uid, gid); if (r < 0) ret = r; - r = clean_sysvipc_sem(uid); + r = clean_sysvipc_sem(uid, gid); if (r < 0) ret = r; - r = clean_sysvipc_msg(uid); + r = clean_sysvipc_msg(uid, gid); if (r < 0) ret = r; - r = clean_posix_shm(uid); + r = clean_posix_shm(uid, gid); if (r < 0) ret = r; - r = clean_posix_mq(uid); + r = clean_posix_mq(uid, gid); if (r < 0) ret = r; return ret; } + +int clean_ipc_by_uid(uid_t uid) { + return clean_ipc(uid, GID_INVALID); +} + +int clean_ipc_by_gid(gid_t gid) { + return clean_ipc(UID_INVALID, gid); +} diff --git a/src/shared/clean-ipc.h b/src/shared/clean-ipc.h index 44a83afcf7..6ca57f44fd 100644 --- a/src/shared/clean-ipc.h +++ b/src/shared/clean-ipc.h @@ -21,4 +21,6 @@ #include -int clean_ipc(uid_t uid); +int clean_ipc(uid_t uid, gid_t gid); +int clean_ipc_by_uid(uid_t uid); +int clean_ipc_by_gid(gid_t gid); diff --git a/src/test/test-ipcrm.c b/src/test/test-ipcrm.c index c5bcaf47bb..551eba7215 100644 --- a/src/test/test-ipcrm.c +++ b/src/test/test-ipcrm.c @@ -32,5 +32,5 @@ int main(int argc, char *argv[]) { return EXIT_FAILURE; } - return clean_ipc(uid) < 0 ? EXIT_FAILURE : EXIT_SUCCESS; + return clean_ipc_by_uid(uid) < 0 ? EXIT_FAILURE : EXIT_SUCCESS; } -- cgit v1.2.3-54-g00ecf From fd63e712b2025d235ce4bfbb512fada10e2690b5 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Tue, 2 Aug 2016 12:28:51 +0200 Subject: core: bypass dynamic user lookups from dbus-daemon dbus-daemon does NSS name look-ups in order to enforce its bus policy. This might dead-lock if an NSS module use wants to use D-Bus for the look-up itself, like our nss-systemd does. Let's work around this by bypassing bus communication in the NSS module if we run inside of dbus-daemon. To make this work we keep a bit of extra state in /run/systemd/dynamic-uid/ so that we don't have to consult the bus, but can still resolve the names. Note that the normal codepath continues to be via the bus, so that resolving works from all mount namespaces and is subject to authentication, as before. This is a bit dirty, but not too dirty, as dbus daemon is kinda special anyway for PID 1. --- src/core/dynamic-user.c | 51 +++++++- src/core/execute.c | 15 ++- src/nss-systemd/nss-systemd.c | 262 ++++++++++++++++++++++++++++-------------- 3 files changed, 235 insertions(+), 93 deletions(-) (limited to 'src/core/execute.c') diff --git a/src/core/dynamic-user.c b/src/core/dynamic-user.c index 8035bee231..185d0e5f00 100644 --- a/src/core/dynamic-user.c +++ b/src/core/dynamic-user.c @@ -150,6 +150,42 @@ int dynamic_user_acquire(Manager *m, const char *name, DynamicUser** ret) { return 1; } +static int make_uid_symlinks(uid_t uid, const char *name, bool b) { + + char path1[strlen("/run/systemd/dynamic-uid/direct:") + DECIMAL_STR_MAX(uid_t) + 1]; + const char *path2; + int r = 0; + + /* Add direct additional symlinks for direct lookups of dynamic UIDs and their names by userspace code. The + * only reason we have this is because dbus-daemon cannot use D-Bus for resolving users and groups (since it + * would be its own client then). We hence keep these world-readable symlinks in place, so that the + * unprivileged dbus user can read the mappings when it needs them via these symlinks instead of having to go + * via the bus. Ideally, we'd use the lock files we keep for this anyway, but we can't since we use BSD locks + * on them and as those may be taken by any user with read access we can't make them world-readable. */ + + xsprintf(path1, "/run/systemd/dynamic-uid/direct:" UID_FMT, uid); + if (unlink(path1) < 0) { + if (errno != ENOENT) + r = -errno; + } + if (b) { + if (symlink(name, path1) < 0) + r = -errno; + } + + path2 = strjoina("/run/systemd/dynamic-uid/direct:", name); + if (unlink(path2) < 0) { + if (errno != ENOENT) + r = -errno; + } + if (b) { + if (symlink(path1 + strlen("/run/systemd/dynamic-uid/direct:"), path2) < 0) + r = -errno; + } + + return r; +} + static int pick_uid(const char *name, uid_t *ret_uid) { static const uint8_t hash_key[] = { @@ -223,6 +259,7 @@ static int pick_uid(const char *name, uid_t *ret_uid) { } (void) ftruncate(lock_fd, l); + (void) make_uid_symlinks(candidate, name, true); /* also add direct lookup symlinks */ *ret_uid = candidate; r = lock_fd; @@ -324,14 +361,16 @@ static int dynamic_user_push(DynamicUser *d, uid_t uid, int lock_fd) { return 0; } -static void unlink_uid_lock(int lock_fd, uid_t uid) { +static void unlink_uid_lock(int lock_fd, uid_t uid, const char *name) { char lock_path[strlen("/run/systemd/dynamic-uid/") + DECIMAL_STR_MAX(uid_t) + 1]; if (lock_fd < 0) return; xsprintf(lock_path, "/run/systemd/dynamic-uid/" UID_FMT, uid); - (void) unlink_noerrno(lock_path); + (void) unlink(lock_path); + + (void) make_uid_symlinks(uid, name, false); /* remove direct lookup symlinks */ } int dynamic_user_realize(DynamicUser *d, uid_t *ret) { @@ -399,7 +438,7 @@ int dynamic_user_realize(DynamicUser *d, uid_t *ret) { /* So, we found a working UID/lock combination. Let's see if we actually still need it. */ if (lockf(d->storage_socket[0], F_LOCK, 0) < 0) { - unlink_uid_lock(uid_lock_fd, uid); + unlink_uid_lock(uid_lock_fd, uid, d->name); return -errno; } @@ -407,7 +446,7 @@ int dynamic_user_realize(DynamicUser *d, uid_t *ret) { if (r < 0) { if (r != -EAGAIN) { /* OK, something bad happened, let's get rid of the bits we acquired. */ - unlink_uid_lock(uid_lock_fd, uid); + unlink_uid_lock(uid_lock_fd, uid, d->name); goto finish; } @@ -416,7 +455,7 @@ int dynamic_user_realize(DynamicUser *d, uid_t *ret) { /* Hmm, so as it appears there's now something stored in the storage socket. Throw away what we * acquired, and use what's stored now. */ - unlink_uid_lock(uid_lock_fd, uid); + unlink_uid_lock(uid_lock_fd, uid, d->name); safe_close(uid_lock_fd); uid = new_uid; @@ -513,7 +552,7 @@ static int dynamic_user_close(DynamicUser *d) { goto finish; /* This dynamic user was realized and dynamically allocated. In this case, let's remove the lock file. */ - unlink_uid_lock(lock_fd, uid); + unlink_uid_lock(lock_fd, uid, d->name); r = 1; finish: diff --git a/src/core/execute.c b/src/core/execute.c index 4c786a2e33..0af8eb5a02 100644 --- a/src/core/execute.c +++ b/src/core/execute.c @@ -91,6 +91,7 @@ #include "selinux-util.h" #include "signal-util.h" #include "smack-util.h" +#include "special.h" #include "string-table.h" #include "string-util.h" #include "strv.h" @@ -1384,6 +1385,7 @@ static void do_idle_pipe_dance(int idle_pipe[4]) { } static int build_environment( + Unit *u, const ExecContext *c, const ExecParameters *p, unsigned n_fds, @@ -1401,7 +1403,7 @@ static int build_environment( assert(c); assert(ret); - our_env = new0(char*, 12); + our_env = new0(char*, 13); if (!our_env) return -ENOMEM; @@ -1436,6 +1438,16 @@ static int build_environment( our_env[n_env++] = x; } + /* If this is D-Bus, tell the nss-systemd module, since it relies on being able to use D-Bus look up dynamic + * users via PID 1, possibly dead-locking the dbus daemon. This way it will not use D-Bus to resolve names, but + * check the database directly. */ + if (unit_has_name(u, SPECIAL_DBUS_SERVICE)) { + x = strdup("SYSTEMD_NSS_BYPASS_BUS=1"); + if (!x) + return -ENOMEM; + our_env[n_env++] = x; + } + if (home) { x = strappend("HOME=", home); if (!x) @@ -2100,6 +2112,7 @@ static int exec_child( } r = build_environment( + unit, context, params, n_fds, diff --git a/src/nss-systemd/nss-systemd.c b/src/nss-systemd/nss-systemd.c index 7078c0c50c..17d04e958d 100644 --- a/src/nss-systemd/nss-systemd.c +++ b/src/nss-systemd/nss-systemd.c @@ -21,11 +21,14 @@ #include "sd-bus.h" +#include "alloc-util.h" #include "bus-common-errors.h" #include "env-util.h" +#include "fs-util.h" #include "macro.h" #include "nss-util.h" #include "signal-util.h" +#include "stdio-util.h" #include "string-util.h" #include "user-util.h" #include "util.h" @@ -75,15 +78,50 @@ static const struct group nobody_group = { NSS_GETPW_PROTOTYPES(systemd); NSS_GETGR_PROTOTYPES(systemd); +static int direct_lookup_name(const char *name, uid_t *ret) { + _cleanup_free_ char *s = NULL; + const char *path; + int r; + + assert(name); + + /* Normally, we go via the bus to resolve names. That has the benefit that it is available from any mount + * namespace and subject to proper authentication. However, there's one problem: if our module is called from + * dbus-daemon itself we really can't use D-Bus to communicate. In this case, resort to a client-side hack, + * and look for the dynamic names directly. This is pretty ugly, but breaks the cyclic dependency. */ + + path = strjoina("/run/systemd/dynamic-uid/direct:", name); + r = readlink_malloc(path, &s); + if (r < 0) + return r; + + return parse_uid(s, ret); +} + +static int direct_lookup_uid(uid_t uid, char **ret) { + char path[strlen("/run/systemd/dynamic-uid/direct:") + DECIMAL_STR_MAX(uid_t) + 1], *s; + int r; + + xsprintf(path, "/run/systemd/dynamic-uid/direct:" UID_FMT, uid); + + r = readlink_malloc(path, &s); + if (r < 0) + return r; + if (!valid_user_group_name(s)) { /* extra safety check */ + free(s); + return -EINVAL; + } + + *ret = s; + return 0; +} + enum nss_status _nss_systemd_getpwnam_r( const char *name, struct passwd *pwd, char *buffer, size_t buflen, int *errnop) { - _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; - _cleanup_(sd_bus_message_unrefp) sd_bus_message* reply = NULL; - _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL; uint32_t translated; size_t l; int r; @@ -114,30 +152,45 @@ enum nss_status _nss_systemd_getpwnam_r( if (getenv_bool("SYSTEMD_NSS_DYNAMIC_BYPASS") > 0) goto not_found; - r = sd_bus_open_system(&bus); - if (r < 0) - goto fail; + if (getenv_bool("SYSTEMD_NSS_BYPASS_BUS") > 0) { - r = sd_bus_call_method(bus, - "org.freedesktop.systemd1", - "/org/freedesktop/systemd1", - "org.freedesktop.systemd1.Manager", - "LookupDynamicUserByName", - &error, - &reply, - "s", - name); - if (r < 0) { - if (sd_bus_error_has_name(&error, BUS_ERROR_NO_SUCH_DYNAMIC_USER)) + /* Access the dynamic UID allocation directly if we are called from dbus-daemon, see above. */ + r = direct_lookup_name(name, (uid_t*) &translated); + if (r == -ENOENT) goto not_found; - - goto fail; + if (r < 0) + goto fail; + + } else { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + _cleanup_(sd_bus_message_unrefp) sd_bus_message* reply = NULL; + _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL; + + r = sd_bus_open_system(&bus); + if (r < 0) + goto fail; + + r = sd_bus_call_method(bus, + "org.freedesktop.systemd1", + "/org/freedesktop/systemd1", + "org.freedesktop.systemd1.Manager", + "LookupDynamicUserByName", + &error, + &reply, + "s", + name); + if (r < 0) { + if (sd_bus_error_has_name(&error, BUS_ERROR_NO_SUCH_DYNAMIC_USER)) + goto not_found; + + goto fail; + } + + r = sd_bus_message_read(reply, "u", &translated); + if (r < 0) + goto fail; } - r = sd_bus_message_read(reply, "u", &translated); - if (r < 0) - goto fail; - l = strlen(name); if (buflen < l+1) { *errnop = ENOMEM; @@ -175,6 +228,7 @@ enum nss_status _nss_systemd_getpwuid_r( _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; _cleanup_(sd_bus_message_unrefp) sd_bus_message* reply = NULL; _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL; + _cleanup_free_ char *direct = NULL; const char *translated; size_t l; int r; @@ -204,30 +258,42 @@ enum nss_status _nss_systemd_getpwuid_r( if (getenv_bool("SYSTEMD_NSS_DYNAMIC_BYPASS") > 0) goto not_found; - r = sd_bus_open_system(&bus); - if (r < 0) - goto fail; + if (getenv_bool("SYSTEMD_NSS_BYPASS_BUS") > 0) { - r = sd_bus_call_method(bus, - "org.freedesktop.systemd1", - "/org/freedesktop/systemd1", - "org.freedesktop.systemd1.Manager", - "LookupDynamicUserByUID", - &error, - &reply, - "u", - (uint32_t) uid); - if (r < 0) { - if (sd_bus_error_has_name(&error, BUS_ERROR_NO_SUCH_DYNAMIC_USER)) + r = direct_lookup_uid(uid, &direct); + if (r == -ENOENT) goto not_found; - - goto fail; + if (r < 0) + goto fail; + + translated = direct; + + } else { + r = sd_bus_open_system(&bus); + if (r < 0) + goto fail; + + r = sd_bus_call_method(bus, + "org.freedesktop.systemd1", + "/org/freedesktop/systemd1", + "org.freedesktop.systemd1.Manager", + "LookupDynamicUserByUID", + &error, + &reply, + "u", + (uint32_t) uid); + if (r < 0) { + if (sd_bus_error_has_name(&error, BUS_ERROR_NO_SUCH_DYNAMIC_USER)) + goto not_found; + + goto fail; + } + + r = sd_bus_message_read(reply, "s", &translated); + if (r < 0) + goto fail; } - r = sd_bus_message_read(reply, "s", &translated); - if (r < 0) - goto fail; - l = strlen(translated) + 1; if (buflen < l) { *errnop = ENOMEM; @@ -262,9 +328,6 @@ enum nss_status _nss_systemd_getgrnam_r( char *buffer, size_t buflen, int *errnop) { - _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; - _cleanup_(sd_bus_message_unrefp) sd_bus_message* reply = NULL; - _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL; uint32_t translated; size_t l; int r; @@ -294,30 +357,45 @@ enum nss_status _nss_systemd_getgrnam_r( if (getenv_bool("SYSTEMD_NSS_DYNAMIC_BYPASS") > 0) goto not_found; - r = sd_bus_open_system(&bus); - if (r < 0) - goto fail; + if (getenv_bool("SYSTEMD_NSS_BYPASS_BUS") > 0) { - r = sd_bus_call_method(bus, - "org.freedesktop.systemd1", - "/org/freedesktop/systemd1", - "org.freedesktop.systemd1.Manager", - "LookupDynamicUserByName", - &error, - &reply, - "s", - name); - if (r < 0) { - if (sd_bus_error_has_name(&error, BUS_ERROR_NO_SUCH_DYNAMIC_USER)) + /* Access the dynamic GID allocation directly if we are called from dbus-daemon, see above. */ + r = direct_lookup_name(name, (uid_t*) &translated); + if (r == -ENOENT) goto not_found; - - goto fail; + if (r < 0) + goto fail; + } else { + + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + _cleanup_(sd_bus_message_unrefp) sd_bus_message* reply = NULL; + _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL; + + r = sd_bus_open_system(&bus); + if (r < 0) + goto fail; + + r = sd_bus_call_method(bus, + "org.freedesktop.systemd1", + "/org/freedesktop/systemd1", + "org.freedesktop.systemd1.Manager", + "LookupDynamicUserByName", + &error, + &reply, + "s", + name); + if (r < 0) { + if (sd_bus_error_has_name(&error, BUS_ERROR_NO_SUCH_DYNAMIC_USER)) + goto not_found; + + goto fail; + } + + r = sd_bus_message_read(reply, "u", &translated); + if (r < 0) + goto fail; } - r = sd_bus_message_read(reply, "u", &translated); - if (r < 0) - goto fail; - l = sizeof(char*) + strlen(name) + 1; if (buflen < l) { *errnop = ENOMEM; @@ -353,6 +431,7 @@ enum nss_status _nss_systemd_getgrgid_r( _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; _cleanup_(sd_bus_message_unrefp) sd_bus_message* reply = NULL; _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL; + _cleanup_free_ char *direct = NULL; const char *translated; size_t l; int r; @@ -382,30 +461,41 @@ enum nss_status _nss_systemd_getgrgid_r( if (getenv_bool("SYSTEMD_NSS_DYNAMIC_BYPASS") > 0) goto not_found; - r = sd_bus_open_system(&bus); - if (r < 0) - goto fail; + if (getenv_bool("SYSTEMD_NSS_BYPASS_BUS") > 0) { - r = sd_bus_call_method(bus, - "org.freedesktop.systemd1", - "/org/freedesktop/systemd1", - "org.freedesktop.systemd1.Manager", - "LookupDynamicUserByUID", - &error, - &reply, - "u", - (uint32_t) gid); - if (r < 0) { - if (sd_bus_error_has_name(&error, BUS_ERROR_NO_SUCH_DYNAMIC_USER)) + r = direct_lookup_uid(gid, &direct); + if (r == -ENOENT) goto not_found; - - goto fail; + if (r < 0) + goto fail; + + translated = direct; + } else { + r = sd_bus_open_system(&bus); + if (r < 0) + goto fail; + + r = sd_bus_call_method(bus, + "org.freedesktop.systemd1", + "/org/freedesktop/systemd1", + "org.freedesktop.systemd1.Manager", + "LookupDynamicUserByUID", + &error, + &reply, + "u", + (uint32_t) gid); + if (r < 0) { + if (sd_bus_error_has_name(&error, BUS_ERROR_NO_SUCH_DYNAMIC_USER)) + goto not_found; + + goto fail; + } + + r = sd_bus_message_read(reply, "s", &translated); + if (r < 0) + goto fail; } - r = sd_bus_message_read(reply, "s", &translated); - if (r < 0) - goto fail; - l = sizeof(char*) + strlen(translated) + 1; if (buflen < l) { *errnop = ENOMEM; -- cgit v1.2.3-54-g00ecf From 83f12b27d14853e7c89a326f7cd31a6c739d378e Mon Sep 17 00:00:00 2001 From: Felipe Sateler Date: Mon, 22 Aug 2016 16:40:58 -0300 Subject: core: do not fail at step SECCOMP if there is no kernel support (#4004) Fixes #3882 --- src/core/execute.c | 38 ++++++++++++++++++++++++++++++-------- src/core/main.c | 6 ++++++ src/shared/seccomp-util.c | 10 ++++++++++ src/shared/seccomp-util.h | 2 ++ src/test/test-execute.c | 11 ++++++++++- 5 files changed, 58 insertions(+), 9 deletions(-) (limited to 'src/core/execute.c') diff --git a/src/core/execute.c b/src/core/execute.c index 0af8eb5a02..55f15d7e49 100644 --- a/src/core/execute.c +++ b/src/core/execute.c @@ -1074,7 +1074,17 @@ static void rename_process_from_path(const char *path) { #ifdef HAVE_SECCOMP -static int apply_seccomp(const ExecContext *c) { +static bool skip_seccomp_unavailable(const Unit* u, const char* msg) { + if (!is_seccomp_available()) { + log_open(); + log_unit_debug(u, "SECCOMP not detected in the kernel, skipping %s", msg); + log_close(); + return true; + } + return false; +} + +static int apply_seccomp(const Unit* u, const ExecContext *c) { uint32_t negative_action, action; scmp_filter_ctx *seccomp; Iterator i; @@ -1083,6 +1093,9 @@ static int apply_seccomp(const ExecContext *c) { assert(c); + if (skip_seccomp_unavailable(u, "syscall filtering")) + return 0; + negative_action = c->syscall_errno == 0 ? SCMP_ACT_KILL : SCMP_ACT_ERRNO(c->syscall_errno); seccomp = seccomp_init(c->syscall_whitelist ? negative_action : SCMP_ACT_ALLOW); @@ -1123,13 +1136,16 @@ finish: return r; } -static int apply_address_families(const ExecContext *c) { +static int apply_address_families(const Unit* u, const ExecContext *c) { scmp_filter_ctx *seccomp; Iterator i; int r; assert(c); + if (skip_seccomp_unavailable(u, "RestrictAddressFamilies=")) + return 0; + seccomp = seccomp_init(SCMP_ACT_ALLOW); if (!seccomp) return -ENOMEM; @@ -1244,12 +1260,15 @@ finish: return r; } -static int apply_memory_deny_write_execute(const ExecContext *c) { +static int apply_memory_deny_write_execute(const Unit* u, const ExecContext *c) { scmp_filter_ctx *seccomp; int r; assert(c); + if (skip_seccomp_unavailable(u, "MemoryDenyWriteExecute=")) + return 0; + seccomp = seccomp_init(SCMP_ACT_ALLOW); if (!seccomp) return -ENOMEM; @@ -1283,7 +1302,7 @@ finish: return r; } -static int apply_restrict_realtime(const ExecContext *c) { +static int apply_restrict_realtime(const Unit* u, const ExecContext *c) { static const int permitted_policies[] = { SCHED_OTHER, SCHED_BATCH, @@ -1296,6 +1315,9 @@ static int apply_restrict_realtime(const ExecContext *c) { assert(c); + if (skip_seccomp_unavailable(u, "RestrictRealtime=")) + return 0; + seccomp = seccomp_init(SCMP_ACT_ALLOW); if (!seccomp) return -ENOMEM; @@ -2403,7 +2425,7 @@ static int exec_child( #ifdef HAVE_SECCOMP if (use_address_families) { - r = apply_address_families(context); + r = apply_address_families(unit, context); if (r < 0) { *exit_status = EXIT_ADDRESS_FAMILIES; return r; @@ -2411,7 +2433,7 @@ static int exec_child( } if (context->memory_deny_write_execute) { - r = apply_memory_deny_write_execute(context); + r = apply_memory_deny_write_execute(unit, context); if (r < 0) { *exit_status = EXIT_SECCOMP; return r; @@ -2419,7 +2441,7 @@ static int exec_child( } if (context->restrict_realtime) { - r = apply_restrict_realtime(context); + r = apply_restrict_realtime(unit, context); if (r < 0) { *exit_status = EXIT_SECCOMP; return r; @@ -2427,7 +2449,7 @@ static int exec_child( } if (use_syscall_filter) { - r = apply_seccomp(context); + r = apply_seccomp(unit, context); if (r < 0) { *exit_status = EXIT_SECCOMP; return r; diff --git a/src/core/main.c b/src/core/main.c index 125cfb28f0..7d8322ebd8 100644 --- a/src/core/main.c +++ b/src/core/main.c @@ -72,6 +72,9 @@ #include "process-util.h" #include "raw-clone.h" #include "rlimit-util.h" +#ifdef HAVE_SECCOMP +#include "seccomp-util.h" +#endif #include "selinux-setup.h" #include "selinux-util.h" #include "signal-util.h" @@ -1186,6 +1189,9 @@ static int enforce_syscall_archs(Set *archs) { void *id; int r; + if (!is_seccomp_available()) + return 0; + seccomp = seccomp_init(SCMP_ACT_ALLOW); if (!seccomp) return log_oom(); diff --git a/src/shared/seccomp-util.c b/src/shared/seccomp-util.c index 8656d112b8..4667f508c7 100644 --- a/src/shared/seccomp-util.c +++ b/src/shared/seccomp-util.c @@ -21,6 +21,8 @@ #include #include +#include "alloc-util.h" +#include "fileio.h" #include "macro.h" #include "seccomp-util.h" #include "string-util.h" @@ -89,6 +91,14 @@ int seccomp_add_secondary_archs(scmp_filter_ctx *c) { } +bool is_seccomp_available(void) { + _cleanup_free_ char* field = NULL; + static int cached_enabled = -1; + if (cached_enabled < 0) + cached_enabled = get_proc_field("/proc/self/status", "Seccomp", "\n", &field) == 0; + return cached_enabled; +} + const SystemCallFilterSet syscall_filter_sets[] = { { /* Clock */ diff --git a/src/shared/seccomp-util.h b/src/shared/seccomp-util.h index be33eecb85..cca7c17912 100644 --- a/src/shared/seccomp-util.h +++ b/src/shared/seccomp-util.h @@ -27,6 +27,8 @@ int seccomp_arch_from_string(const char *n, uint32_t *ret); int seccomp_add_secondary_archs(scmp_filter_ctx *c); +bool is_seccomp_available(void); + typedef struct SystemCallFilterSet { const char *set_name; const char *value; diff --git a/src/test/test-execute.c b/src/test/test-execute.c index 1d24115b5c..05ec1d2eb1 100644 --- a/src/test/test-execute.c +++ b/src/test/test-execute.c @@ -30,6 +30,9 @@ #include "mkdir.h" #include "path-util.h" #include "rm-rf.h" +#ifdef HAVE_SECCOMP +#include "seccomp-util.h" +#endif #include "test-helper.h" #include "unit.h" #include "util.h" @@ -132,21 +135,27 @@ static void test_exec_privatedevices(Manager *m) { static void test_exec_systemcallfilter(Manager *m) { #ifdef HAVE_SECCOMP + if (!is_seccomp_available()) + return; test(m, "exec-systemcallfilter-not-failing.service", 0, CLD_EXITED); test(m, "exec-systemcallfilter-not-failing2.service", 0, CLD_EXITED); test(m, "exec-systemcallfilter-failing.service", SIGSYS, CLD_KILLED); test(m, "exec-systemcallfilter-failing2.service", SIGSYS, CLD_KILLED); + #endif } static void test_exec_systemcallerrornumber(Manager *m) { #ifdef HAVE_SECCOMP - test(m, "exec-systemcallerrornumber.service", 1, CLD_EXITED); + if (is_seccomp_available()) + test(m, "exec-systemcallerrornumber.service", 1, CLD_EXITED); #endif } static void test_exec_systemcall_system_mode_with_user(Manager *m) { #ifdef HAVE_SECCOMP + if (!is_seccomp_available()) + return; if (getpwnam("nobody")) test(m, "exec-systemcallfilter-system-user.service", 0, CLD_EXITED); else if (getpwnam("nfsnobody")) -- cgit v1.2.3-54-g00ecf From d347d9029c7ec6b30eaaab93649105d935061b55 Mon Sep 17 00:00:00 2001 From: Felipe Sateler Date: Wed, 31 Aug 2016 10:00:35 -0300 Subject: seccomp: also detect if seccomp filtering is enabled In https://github.com/systemd/systemd/pull/4004 , a runtime detection method for seccomp was added. However, it does not detect the case where CONFIG_SECCOMP=y but CONFIG_SECCOMP_FILTER=n. This is possible if the architecture does not support filtering yet. Add a check for that case too. While at it, change get_proc_field usage to use PR_GET_SECCOMP prctl, as that should save a few system calls and (unnecessary) allocations. Previously, reading of /proc/self/stat was done as recommended by prctl(2) as safer. However, given that we need to do the prctl call anyway, lets skip opening, reading and parsing the file. Code for checking inspired by https://outflux.net/teach-seccomp/autodetect.html --- src/core/execute.c | 2 +- src/shared/seccomp-util.c | 19 +++++++++++++++---- 2 files changed, 16 insertions(+), 5 deletions(-) (limited to 'src/core/execute.c') diff --git a/src/core/execute.c b/src/core/execute.c index 55f15d7e49..2026137721 100644 --- a/src/core/execute.c +++ b/src/core/execute.c @@ -1077,7 +1077,7 @@ static void rename_process_from_path(const char *path) { static bool skip_seccomp_unavailable(const Unit* u, const char* msg) { if (!is_seccomp_available()) { log_open(); - log_unit_debug(u, "SECCOMP not detected in the kernel, skipping %s", msg); + log_unit_debug(u, "SECCOMP features not detected in the kernel, skipping %s", msg); log_close(); return true; } diff --git a/src/shared/seccomp-util.c b/src/shared/seccomp-util.c index 6c489284d1..2f42381fc1 100644 --- a/src/shared/seccomp-util.c +++ b/src/shared/seccomp-util.c @@ -20,9 +20,9 @@ #include #include #include +#include +#include -#include "alloc-util.h" -#include "fileio.h" #include "macro.h" #include "seccomp-util.h" #include "string-util.h" @@ -91,11 +91,22 @@ int seccomp_add_secondary_archs(scmp_filter_ctx *c) { } +static bool is_basic_seccomp_available(void) { + int r; + r = prctl(PR_GET_SECCOMP, 0, 0, 0, 0); + return r >= 0; +} + +static bool is_seccomp_filter_available(void) { + int r; + r = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, NULL, 0, 0); + return r < 0 && errno == EFAULT; +} + bool is_seccomp_available(void) { - _cleanup_free_ char* field = NULL; static int cached_enabled = -1; if (cached_enabled < 0) - cached_enabled = get_proc_field("/proc/self/status", "Seccomp", "\n", &field) == 0; + cached_enabled = is_basic_seccomp_available() && is_seccomp_filter_available(); return cached_enabled; } -- cgit v1.2.3-54-g00ecf From 72246c2a654ead7f7ee6e7799161e2e46dc0b84b Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Mon, 22 Aug 2016 19:01:14 +0200 Subject: core: enforce seccomp for secondary archs too, for all rules Let's make sure that all our rules apply to all archs the local kernel supports. --- src/core/execute.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'src/core/execute.c') diff --git a/src/core/execute.c b/src/core/execute.c index 2026137721..ee734e8445 100644 --- a/src/core/execute.c +++ b/src/core/execute.c @@ -1273,6 +1273,10 @@ static int apply_memory_deny_write_execute(const Unit* u, const ExecContext *c) if (!seccomp) return -ENOMEM; + r = seccomp_add_secondary_archs(seccomp); + if (r < 0) + goto finish; + r = seccomp_rule_add( seccomp, SCMP_ACT_ERRNO(EPERM), @@ -1322,6 +1326,10 @@ static int apply_restrict_realtime(const Unit* u, const ExecContext *c) { if (!seccomp) return -ENOMEM; + r = seccomp_add_secondary_archs(seccomp); + if (r < 0) + goto finish; + /* Determine the highest policy constant we want to allow */ for (i = 0; i < ELEMENTSOF(permitted_policies); i++) if (permitted_policies[i] > max_policy) -- cgit v1.2.3-54-g00ecf From 59eeb84ba65483c5543d1bc840c2ac75642ef638 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Mon, 22 Aug 2016 18:43:59 +0200 Subject: core: add two new service settings ProtectKernelTunables= and ProtectControlGroups= If enabled, these will block write access to /sys, /proc/sys and /proc/sys/fs/cgroup. --- man/systemd.exec.xml | 20 +++++++ src/core/dbus-execute.c | 9 ++- src/core/execute.c | 100 ++++++++++++++++++++++++++++++---- src/core/execute.h | 2 + src/core/load-fragment-gperf.gperf.m4 | 2 + src/core/namespace.c | 36 ++++++++++-- src/core/namespace.h | 2 + src/shared/bus-unit-util.c | 2 +- src/test/test-ns.c | 2 + 9 files changed, 159 insertions(+), 16 deletions(-) (limited to 'src/core/execute.c') diff --git a/man/systemd.exec.xml b/man/systemd.exec.xml index bcedebd5bb..07128b489e 100644 --- a/man/systemd.exec.xml +++ b/man/systemd.exec.xml @@ -1059,6 +1059,26 @@ Defaults to off. + + ProtectKernelTunables= + + Takes a boolean argument. If true, kernel variables accessible through + /proc/sys and /sys will be made read-only to all processes of the + unit. Usually, tunable kernel variables should only be written at boot-time, with the + sysctl.d5 mechanism. Almost + no services need to write to these at runtime; it is hence recommended to turn this on for most + services. Defaults to off. + + + + ProtectControlGroups= + + Takes a boolean argument. If true, the Linux Control Groups ("cgroups") hierarchies accessible + through /sys/fs/cgroup will be made read-only to all processes of the unit. Except for + container managers no services should require write access to the control groups hierarchies; it is hence + recommended to turn this on for most services. Defaults to off. + + MountFlags= diff --git a/src/core/dbus-execute.c b/src/core/dbus-execute.c index 7e33a2d201..eec4500c8c 100644 --- a/src/core/dbus-execute.c +++ b/src/core/dbus-execute.c @@ -707,6 +707,8 @@ const sd_bus_vtable bus_exec_vtable[] = { SD_BUS_PROPERTY("MountFlags", "t", bus_property_get_ulong, offsetof(ExecContext, mount_flags), SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("PrivateTmp", "b", bus_property_get_bool, offsetof(ExecContext, private_tmp), SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("PrivateDevices", "b", bus_property_get_bool, offsetof(ExecContext, private_devices), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("ProtectKernelTunables", "b", bus_property_get_bool, offsetof(ExecContext, protect_kernel_tunables), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("ProtectControlGroups", "b", bus_property_get_bool, offsetof(ExecContext, protect_control_groups), SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("PrivateNetwork", "b", bus_property_get_bool, offsetof(ExecContext, private_network), SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("PrivateUsers", "b", bus_property_get_bool, offsetof(ExecContext, private_users), SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("ProtectHome", "s", bus_property_get_protect_home, offsetof(ExecContext, protect_home), SD_BUS_VTABLE_PROPERTY_CONST), @@ -1072,7 +1074,8 @@ int bus_exec_context_set_transient_property( "IgnoreSIGPIPE", "TTYVHangup", "TTYReset", "PrivateTmp", "PrivateDevices", "PrivateNetwork", "PrivateUsers", "NoNewPrivileges", "SyslogLevelPrefix", "MemoryDenyWriteExecute", - "RestrictRealtime", "DynamicUser", "RemoveIPC")) { + "RestrictRealtime", "DynamicUser", "RemoveIPC", "ProtectKernelTunables", + "ProtectControlGroups")) { int b; r = sd_bus_message_read(message, "b", &b); @@ -1106,6 +1109,10 @@ int bus_exec_context_set_transient_property( c->dynamic_user = b; else if (streq(name, "RemoveIPC")) c->remove_ipc = b; + else if (streq(name, "ProtectKernelTunables")) + c->protect_kernel_tunables = b; + else if (streq(name, "ProtectControlGroups")) + c->protect_control_groups = b; unit_write_drop_in_private_format(u, mode, name, "%s=%s", name, yes_no(b)); } diff --git a/src/core/execute.c b/src/core/execute.c index ee734e8445..609b69a859 100644 --- a/src/core/execute.c +++ b/src/core/execute.c @@ -1383,6 +1383,45 @@ finish: return r; } +static int apply_protect_sysctl(Unit *u, const ExecContext *c) { + scmp_filter_ctx *seccomp; + int r; + + assert(c); + + /* Turn off the legacy sysctl() system call. Many distributions turn this off while building the kernel, but + * let's protect even those systems where this is left on in the kernel. */ + + if (skip_seccomp_unavailable(u, "ProtectKernelTunables=")) + return 0; + + seccomp = seccomp_init(SCMP_ACT_ALLOW); + if (!seccomp) + return -ENOMEM; + + r = seccomp_add_secondary_archs(seccomp); + if (r < 0) + goto finish; + + r = seccomp_rule_add( + seccomp, + SCMP_ACT_ERRNO(EPERM), + SCMP_SYS(_sysctl), + 0); + if (r < 0) + goto finish; + + r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0); + if (r < 0) + goto finish; + + r = seccomp_load(seccomp); + +finish: + seccomp_release(seccomp); + return r; +} + #endif static void do_idle_pipe_dance(int idle_pipe[4]) { @@ -1589,7 +1628,9 @@ static bool exec_needs_mount_namespace( if (context->private_devices || context->protect_system != PROTECT_SYSTEM_NO || - context->protect_home != PROTECT_HOME_NO) + context->protect_home != PROTECT_HOME_NO || + context->protect_kernel_tunables || + context->protect_control_groups) return true; return false; @@ -1804,6 +1845,37 @@ static int close_remaining_fds( return close_all_fds(dont_close, n_dont_close); } +static bool context_has_address_families(const ExecContext *c) { + assert(c); + + return c->address_families_whitelist || + !set_isempty(c->address_families); +} + +static bool context_has_syscall_filters(const ExecContext *c) { + assert(c); + + return c->syscall_whitelist || + !set_isempty(c->syscall_filter) || + !set_isempty(c->syscall_archs); +} + +static bool context_has_no_new_privileges(const ExecContext *c) { + assert(c); + + if (c->no_new_privileges) + return true; + + if (have_effective_cap(CAP_SYS_ADMIN)) /* if we are privileged, we don't need NNP */ + return false; + + return context_has_address_families(c) || /* we need NNP if we have any form of seccomp and are unprivileged */ + c->memory_deny_write_execute || + c->restrict_realtime || + c->protect_kernel_tunables || + context_has_syscall_filters(c); +} + static int send_user_lookup( Unit *unit, int user_lookup_fd, @@ -2255,6 +2327,8 @@ static int exec_child( tmp, var, context->private_devices, + context->protect_kernel_tunables, + context->protect_control_groups, context->protect_home, context->protect_system, context->mount_flags); @@ -2343,11 +2417,6 @@ static int exec_child( if ((params->flags & EXEC_APPLY_PERMISSIONS) && !command->privileged) { - bool use_address_families = context->address_families_whitelist || - !set_isempty(context->address_families); - bool use_syscall_filter = context->syscall_whitelist || - !set_isempty(context->syscall_filter) || - !set_isempty(context->syscall_archs); int secure_bits = context->secure_bits; for (i = 0; i < _RLIMIT_MAX; i++) { @@ -2424,15 +2493,14 @@ static int exec_child( return -errno; } - if (context->no_new_privileges || - (!have_effective_cap(CAP_SYS_ADMIN) && (use_address_families || context->memory_deny_write_execute || context->restrict_realtime || use_syscall_filter))) + if (context_has_no_new_privileges(context)) if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0) { *exit_status = EXIT_NO_NEW_PRIVILEGES; return -errno; } #ifdef HAVE_SECCOMP - if (use_address_families) { + if (context_has_address_families(context)) { r = apply_address_families(unit, context); if (r < 0) { *exit_status = EXIT_ADDRESS_FAMILIES; @@ -2456,7 +2524,15 @@ static int exec_child( } } - if (use_syscall_filter) { + if (context->protect_kernel_tunables) { + r = apply_protect_sysctl(unit, context); + if (r < 0) { + *exit_status = EXIT_SECCOMP; + return r; + } + } + + if (context_has_syscall_filters(context)) { r = apply_seccomp(unit, context); if (r < 0) { *exit_status = EXIT_SECCOMP; @@ -2888,6 +2964,8 @@ void exec_context_dump(ExecContext *c, FILE* f, const char *prefix) { "%sNonBlocking: %s\n" "%sPrivateTmp: %s\n" "%sPrivateDevices: %s\n" + "%sProtectKernelTunables: %s\n" + "%sProtectControlGroups: %s\n" "%sPrivateNetwork: %s\n" "%sPrivateUsers: %s\n" "%sProtectHome: %s\n" @@ -2901,6 +2979,8 @@ void exec_context_dump(ExecContext *c, FILE* f, const char *prefix) { prefix, yes_no(c->non_blocking), prefix, yes_no(c->private_tmp), prefix, yes_no(c->private_devices), + prefix, yes_no(c->protect_kernel_tunables), + prefix, yes_no(c->protect_control_groups), prefix, yes_no(c->private_network), prefix, yes_no(c->private_users), prefix, protect_home_to_string(c->protect_home), diff --git a/src/core/execute.h b/src/core/execute.h index 6082c42aba..449180c903 100644 --- a/src/core/execute.h +++ b/src/core/execute.h @@ -174,6 +174,8 @@ struct ExecContext { bool private_users; ProtectSystem protect_system; ProtectHome protect_home; + bool protect_kernel_tunables; + bool protect_control_groups; bool no_new_privileges; diff --git a/src/core/load-fragment-gperf.gperf.m4 b/src/core/load-fragment-gperf.gperf.m4 index 2e6c965aec..c49c1d6732 100644 --- a/src/core/load-fragment-gperf.gperf.m4 +++ b/src/core/load-fragment-gperf.gperf.m4 @@ -89,6 +89,8 @@ $1.ReadOnlyPaths, config_parse_namespace_path_strv, 0, $1.InaccessiblePaths, config_parse_namespace_path_strv, 0, offsetof($1, exec_context.inaccessible_paths) $1.PrivateTmp, config_parse_bool, 0, offsetof($1, exec_context.private_tmp) $1.PrivateDevices, config_parse_bool, 0, offsetof($1, exec_context.private_devices) +$1.ProtectKernelTunables, config_parse_bool, 0, offsetof($1, exec_context.protect_kernel_tunables) +$1.ProtectControlGroups, config_parse_bool, 0, offsetof($1, exec_context.protect_control_groups) $1.PrivateNetwork, config_parse_bool, 0, offsetof($1, exec_context.private_network) $1.PrivateUsers, config_parse_bool, 0, offsetof($1, exec_context.private_users) $1.ProtectSystem, config_parse_protect_system, 0, offsetof($1, exec_context) diff --git a/src/core/namespace.c b/src/core/namespace.c index 52a2505d94..f2768aeb28 100644 --- a/src/core/namespace.c +++ b/src/core/namespace.c @@ -53,7 +53,7 @@ typedef enum MountMode { PRIVATE_TMP, PRIVATE_VAR_TMP, PRIVATE_DEV, - READWRITE + READWRITE, } MountMode; typedef struct BindMount { @@ -366,6 +366,8 @@ int setup_namespace( const char* tmp_dir, const char* var_tmp_dir, bool private_dev, + bool protect_sysctl, + bool protect_cgroups, ProtectHome protect_home, ProtectSystem protect_system, unsigned long mount_flags) { @@ -385,6 +387,8 @@ int setup_namespace( strv_length(read_only_paths) + strv_length(inaccessible_paths) + private_dev + + (protect_sysctl ? 3 : 0) + + (protect_cgroups != protect_sysctl) + (protect_home != PROTECT_HOME_NO ? 3 : 0) + (protect_system != PROTECT_SYSTEM_NO ? 2 : 0) + (protect_system == PROTECT_SYSTEM_FULL ? 1 : 0); @@ -421,6 +425,27 @@ int setup_namespace( m++; } + if (protect_sysctl) { + m->path = prefix_roota(root_directory, "/proc/sys"); + m->mode = READONLY; + m++; + + m->path = prefix_roota(root_directory, "/proc/sysrq-trigger"); + m->mode = READONLY; + m->ignore = true; /* Not always compiled into the kernel */ + m++; + + m->path = prefix_roota(root_directory, "/sys"); + m->mode = READONLY; + m++; + } + + if (protect_cgroups != protect_sysctl) { + m->path = prefix_roota(root_directory, "/sys/fs/cgroup"); + m->mode = protect_cgroups ? READONLY : READWRITE; + m++; + } + if (protect_home != PROTECT_HOME_NO) { const char *home_dir, *run_user_dir, *root_dir; @@ -505,9 +530,12 @@ int setup_namespace( fail: if (n > 0) { - for (m = mounts; m < mounts + n; ++m) - if (m->done) - (void) umount2(m->path, MNT_DETACH); + for (m = mounts; m < mounts + n; ++m) { + if (!m->done) + continue; + + (void) umount2(m->path, MNT_DETACH); + } } return r; diff --git a/src/core/namespace.h b/src/core/namespace.h index 1aedf5f208..3845336287 100644 --- a/src/core/namespace.h +++ b/src/core/namespace.h @@ -46,6 +46,8 @@ int setup_namespace(const char *chroot, const char *tmp_dir, const char *var_tmp_dir, bool private_dev, + bool protect_sysctl, + bool protect_cgroups, ProtectHome protect_home, ProtectSystem protect_system, unsigned long mount_flags); diff --git a/src/shared/bus-unit-util.c b/src/shared/bus-unit-util.c index feb4a06737..c6bd2f145c 100644 --- a/src/shared/bus-unit-util.c +++ b/src/shared/bus-unit-util.c @@ -204,7 +204,7 @@ int bus_append_unit_property_assignment(sd_bus_message *m, const char *assignmen "IgnoreSIGPIPE", "TTYVHangup", "TTYReset", "RemainAfterExit", "PrivateTmp", "PrivateDevices", "PrivateNetwork", "PrivateUsers", "NoNewPrivileges", "SyslogLevelPrefix", "Delegate", "RemainAfterElapse", "MemoryDenyWriteExecute", - "RestrictRealtime", "DynamicUser", "RemoveIPC")) { + "RestrictRealtime", "DynamicUser", "RemoveIPC", "ProtectKernelTunables", "ProtectControlGroups")) { r = parse_boolean(eq); if (r < 0) diff --git a/src/test/test-ns.c b/src/test/test-ns.c index 9248f2987c..05f243c75c 100644 --- a/src/test/test-ns.c +++ b/src/test/test-ns.c @@ -69,6 +69,8 @@ int main(int argc, char *argv[]) { tmp_dir, var_tmp_dir, true, + true, + true, PROTECT_HOME_NO, PROTECT_SYSTEM_NO, 0); -- cgit v1.2.3-54-g00ecf From 07689d5d2c07ee434437de5e39bf0abaa772818b Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Thu, 25 Aug 2016 10:12:57 +0200 Subject: execute: split out creation of runtime dirs into its own functions --- src/core/execute.c | 57 +++++++++++++++++++++++++++++++++--------------------- 1 file changed, 35 insertions(+), 22 deletions(-) (limited to 'src/core/execute.c') diff --git a/src/core/execute.c b/src/core/execute.c index 609b69a859..3877293b4f 100644 --- a/src/core/execute.c +++ b/src/core/execute.c @@ -1789,6 +1789,37 @@ static int setup_private_users(uid_t uid, gid_t gid) { return 0; } +static int setup_runtime_directory( + const ExecContext *context, + const ExecParameters *params, + uid_t uid, + gid_t gid) { + + char **rt; + int r; + + assert(context); + assert(params); + + STRV_FOREACH(rt, context->runtime_directory) { + _cleanup_free_ char *p; + + p = strjoin(params->runtime_prefix, "/", *rt, NULL); + if (!p) + return -ENOMEM; + + r = mkdir_p_label(p, context->runtime_directory_mode); + if (r < 0) + return r; + + r = chmod_and_chown(p, context->runtime_directory_mode, uid, gid); + if (r < 0) + return r; + } + + return 0; +} + static void append_socket_pair(int *array, unsigned *n, int pair[2]) { assert(array); assert(n); @@ -2188,28 +2219,10 @@ static int exec_child( } if (!strv_isempty(context->runtime_directory) && params->runtime_prefix) { - char **rt; - - STRV_FOREACH(rt, context->runtime_directory) { - _cleanup_free_ char *p; - - p = strjoin(params->runtime_prefix, "/", *rt, NULL); - if (!p) { - *exit_status = EXIT_RUNTIME_DIRECTORY; - return -ENOMEM; - } - - r = mkdir_p_label(p, context->runtime_directory_mode); - if (r < 0) { - *exit_status = EXIT_RUNTIME_DIRECTORY; - return r; - } - - r = chmod_and_chown(p, context->runtime_directory_mode, uid, gid); - if (r < 0) { - *exit_status = EXIT_RUNTIME_DIRECTORY; - return r; - } + r = setup_runtime_directory(context, params, uid, gid); + if (r < 0) { + *exit_status = EXIT_RUNTIME_DIRECTORY; + return r; } } -- cgit v1.2.3-54-g00ecf From be39ccf3a0d4d15324af1de4d8552a1d65f40808 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Thu, 25 Aug 2016 10:24:10 +0200 Subject: execute: move suppression of HOME=/ and SHELL=/bin/nologin into user-util.c This adds a new call get_user_creds_clean(), which is just like get_user_creds() but returns NULL in the home/shell parameters if they contain no useful information. This code previously lived in execute.c, but by generalizing this we can reuse it in run.c. --- src/basic/user-util.c | 32 +++++++++++++++++++++++++++++++- src/basic/user-util.h | 1 + src/core/execute.c | 14 +++----------- src/run/run.c | 18 +++++++++++------- 4 files changed, 46 insertions(+), 19 deletions(-) (limited to 'src/core/execute.c') diff --git a/src/basic/user-util.c b/src/basic/user-util.c index 122d9a0c7c..0522bce1d1 100644 --- a/src/basic/user-util.c +++ b/src/basic/user-util.c @@ -31,14 +31,15 @@ #include #include -#include "missing.h" #include "alloc-util.h" #include "fd-util.h" #include "formats-util.h" #include "macro.h" +#include "missing.h" #include "parse-util.h" #include "path-util.h" #include "string-util.h" +#include "strv.h" #include "user-util.h" #include "utf8.h" @@ -175,6 +176,35 @@ int get_user_creds( return 0; } +int get_user_creds_clean( + const char **username, + uid_t *uid, gid_t *gid, + const char **home, + const char **shell) { + + int r; + + /* Like get_user_creds(), but resets home/shell to NULL if they don't contain anything relevant. */ + + r = get_user_creds(username, uid, gid, home, shell); + if (r < 0) + return r; + + if (shell && + (isempty(*shell) || PATH_IN_SET(*shell, + "/bin/nologin", + "/sbin/nologin", + "/usr/bin/nologin", + "/usr/sbin/nologin"))) + *shell = NULL; + + if (home && + (isempty(*home) || path_equal(*home, "/"))) + *home = NULL; + + return 0; +} + int get_group_creds(const char **groupname, gid_t *gid) { struct group *g; gid_t id; diff --git a/src/basic/user-util.h b/src/basic/user-util.h index f569363811..6c61f63cae 100644 --- a/src/basic/user-util.h +++ b/src/basic/user-util.h @@ -40,6 +40,7 @@ char* getlogname_malloc(void); char* getusername_malloc(void); int get_user_creds(const char **username, uid_t *uid, gid_t *gid, const char **home, const char **shell); +int get_user_creds_clean(const char **username, uid_t *uid, gid_t *gid, const char **home, const char **shell); int get_group_creds(const char **groupname, gid_t *gid); char* uid_to_name(uid_t uid); diff --git a/src/core/execute.c b/src/core/execute.c index 3877293b4f..c7a3ea39e7 100644 --- a/src/core/execute.c +++ b/src/core/execute.c @@ -2051,22 +2051,14 @@ static int exec_child( } else { if (context->user) { username = context->user; - r = get_user_creds(&username, &uid, &gid, &home, &shell); + r = get_user_creds_clean(&username, &uid, &gid, &home, &shell); if (r < 0) { *exit_status = EXIT_USER; return r; } - /* Don't set $HOME or $SHELL if they are are not particularly enlightening anyway. */ - if (isempty(home) || path_equal(home, "/")) - home = NULL; - - if (isempty(shell) || PATH_IN_SET(shell, - "/bin/nologin", - "/sbin/nologin", - "/usr/bin/nologin", - "/usr/sbin/nologin")) - shell = NULL; + /* Note that we don't set $HOME or $SHELL if they are are not particularly enlightening anyway + * (i.e. are "/" or "/bin/nologin"). */ } if (context->group) { diff --git a/src/run/run.c b/src/run/run.c index 2dd229868c..81b53fdfab 100644 --- a/src/run/run.c +++ b/src/run/run.c @@ -1168,17 +1168,21 @@ static int start_transient_scope( uid_t uid; gid_t gid; - r = get_user_creds(&arg_exec_user, &uid, &gid, &home, &shell); + r = get_user_creds_clean(&arg_exec_user, &uid, &gid, &home, &shell); if (r < 0) return log_error_errno(r, "Failed to resolve user %s: %m", arg_exec_user); - r = strv_extendf(&user_env, "HOME=%s", home); - if (r < 0) - return log_oom(); + if (home) { + r = strv_extendf(&user_env, "HOME=%s", home); + if (r < 0) + return log_oom(); + } - r = strv_extendf(&user_env, "SHELL=%s", shell); - if (r < 0) - return log_oom(); + if (shell) { + r = strv_extendf(&user_env, "SHELL=%s", shell); + if (r < 0) + return log_oom(); + } r = strv_extendf(&user_env, "USER=%s", arg_exec_user); if (r < 0) -- cgit v1.2.3-54-g00ecf From 3fbe8dbe41ad662d7cae0525f6fd62a66d2c5ec5 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Thu, 25 Aug 2016 10:42:38 +0200 Subject: execute: if RuntimeDirectory= is set, it should be writable Implicitly make all dirs set with RuntimeDirectory= writable, as the concept otherwise makes no sense. --- src/core/execute.c | 48 ++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 46 insertions(+), 2 deletions(-) (limited to 'src/core/execute.c') diff --git a/src/core/execute.c b/src/core/execute.c index c7a3ea39e7..20e74ec8a6 100644 --- a/src/core/execute.c +++ b/src/core/execute.c @@ -1820,6 +1820,44 @@ static int setup_runtime_directory( return 0; } +static int compile_read_write_paths( + const ExecContext *context, + const ExecParameters *params, + char ***ret) { + + _cleanup_strv_free_ char **l = NULL; + char **rt; + + /* Compile the list of writable paths. This is the combination of the explicitly configured paths, plus all + * runtime directories. */ + + if (strv_isempty(context->read_write_paths) && + strv_isempty(context->runtime_directory)) { + *ret = NULL; /* NOP if neither is set */ + return 0; + } + + l = strv_copy(context->read_write_paths); + if (!l) + return -ENOMEM; + + STRV_FOREACH(rt, context->runtime_directory) { + char *s; + + s = strjoin(params->runtime_prefix, "/", *rt, NULL); + if (!s) + return -ENOMEM; + + if (strv_consume(&l, s) < 0) + return -ENOMEM; + } + + *ret = l; + l = NULL; + + return 0; +} + static void append_socket_pair(int *array, unsigned *n, int pair[2]) { assert(array); assert(n); @@ -2307,8 +2345,8 @@ static int exec_child( } needs_mount_namespace = exec_needs_mount_namespace(context, params, runtime); - if (needs_mount_namespace) { + _cleanup_free_ char **rw = NULL; char *tmp = NULL, *var = NULL; /* The runtime struct only contains the parent @@ -2324,9 +2362,15 @@ static int exec_child( var = strjoina(runtime->var_tmp_dir, "/tmp"); } + r = compile_read_write_paths(context, params, &rw); + if (r < 0) { + *exit_status = EXIT_NAMESPACE; + return r; + } + r = setup_namespace( (params->flags & EXEC_APPLY_CHROOT) ? context->root_directory : NULL, - context->read_write_paths, + rw, context->read_only_paths, context->inaccessible_paths, tmp, -- cgit v1.2.3-54-g00ecf From 096424d1230e0a0339735c51b43949809e972430 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Thu, 25 Aug 2016 17:29:12 +0200 Subject: execute: drop group priviliges only after setting up namespace If PrivateDevices=yes is set, the namespace code creates device nodes in /dev that should be owned by the host's root, hence let's make sure we set up the namespace before dropping group privileges. --- src/core/execute.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) (limited to 'src/core/execute.c') diff --git a/src/core/execute.c b/src/core/execute.c index 20e74ec8a6..ae251b2a4c 100644 --- a/src/core/execute.c +++ b/src/core/execute.c @@ -2291,14 +2291,9 @@ static int exec_child( } accum_env = strv_env_clean(accum_env); - umask(context->umask); + (void) umask(context->umask); if ((params->flags & EXEC_APPLY_PERMISSIONS) && !command->privileged) { - r = enforce_groups(context, username, gid); - if (r < 0) { - *exit_status = EXIT_GROUP; - return r; - } #ifdef HAVE_SMACK if (context->smack_process_label) { r = mac_smack_apply_pid(0, context->smack_process_label); @@ -2395,6 +2390,14 @@ static int exec_child( } } + if ((params->flags & EXEC_APPLY_PERMISSIONS) && !command->privileged) { + r = enforce_groups(context, username, gid); + if (r < 0) { + *exit_status = EXIT_GROUP; + return r; + } + } + if (context->working_directory_home) wd = home; else if (context->working_directory) -- cgit v1.2.3-54-g00ecf From ba128bb809cc59ca60db65f0c09bd7f48876fa83 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Fri, 26 Aug 2016 16:39:04 +0200 Subject: execute: filter low-level I/O syscalls if PrivateDevices= is set If device access is restricted via PrivateDevices=, let's also block the various low-level I/O syscalls at the same time, so that we know that the minimal set of devices in our virtualized /dev are really everything the unit can access. --- src/core/execute.c | 65 +++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 64 insertions(+), 1 deletion(-) (limited to 'src/core/execute.c') diff --git a/src/core/execute.c b/src/core/execute.c index ae251b2a4c..a20e9ea829 100644 --- a/src/core/execute.c +++ b/src/core/execute.c @@ -1422,12 +1422,67 @@ finish: return r; } +static int apply_private_devices(Unit *u, const ExecContext *c) { + + static const int device_syscalls[] = { + SCMP_SYS(ioperm), + SCMP_SYS(iopl), + SCMP_SYS(pciconfig_iobase), + SCMP_SYS(pciconfig_read), + SCMP_SYS(pciconfig_write), +#ifdef __NR_s390_pci_mmio_read + SCMP_SYS(s390_pci_mmio_read), +#endif +#ifdef __NR_s390_pci_mmio_write + SCMP_SYS(s390_pci_mmio_write), +#endif + }; + + scmp_filter_ctx *seccomp; + unsigned i; + int r; + + assert(c); + + /* If PrivateDevices= is set, also turn off iopl and friends. */ + + if (skip_seccomp_unavailable(u, "PrivateDevices=")) + return 0; + + seccomp = seccomp_init(SCMP_ACT_ALLOW); + if (!seccomp) + return -ENOMEM; + + r = seccomp_add_secondary_archs(seccomp); + if (r < 0) + goto finish; + + for (i = 0; i < ELEMENTSOF(device_syscalls); i++) { + r = seccomp_rule_add( + seccomp, + SCMP_ACT_ERRNO(EPERM), + device_syscalls[i], + 0); + if (r < 0) + goto finish; + } + + r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0); + if (r < 0) + goto finish; + + r = seccomp_load(seccomp); + +finish: + seccomp_release(seccomp); + return r; +} + #endif static void do_idle_pipe_dance(int idle_pipe[4]) { assert(idle_pipe); - idle_pipe[1] = safe_close(idle_pipe[1]); idle_pipe[2] = safe_close(idle_pipe[2]); @@ -2584,6 +2639,14 @@ static int exec_child( } } + if (context->private_devices) { + r = apply_private_devices(unit, context); + if (r < 0) { + *exit_status = EXIT_SECCOMP; + return r; + } + } + if (context_has_syscall_filters(context)) { r = apply_seccomp(unit, context); if (r < 0) { -- cgit v1.2.3-54-g00ecf From cefc33aee299fa214f093d3d1b4c171ac3b30dde Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Fri, 26 Aug 2016 17:40:42 +0200 Subject: execute: move SMACK setup code into its own function While we are at it, move PAM code #ifdeffery into setup_pam() to simplify the main execution logic a bit. --- src/core/execute.c | 74 ++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 47 insertions(+), 27 deletions(-) (limited to 'src/core/execute.c') diff --git a/src/core/execute.c b/src/core/execute.c index a20e9ea829..0488ba2ca9 100644 --- a/src/core/execute.c +++ b/src/core/execute.c @@ -837,6 +837,8 @@ static int null_conv( return PAM_CONV_ERR; } +#endif + static int setup_pam( const char *name, const char *user, @@ -845,6 +847,8 @@ static int setup_pam( char ***env, int fds[], unsigned n_fds) { +#ifdef HAVE_PAM + static const struct pam_conv conv = { .conv = null_conv, .appdata_ptr = NULL @@ -1038,8 +1042,10 @@ fail: closelog(); return r; -} +#else + return 0; #endif +} static void rename_process_from_path(const char *path) { char process_name[11]; @@ -1875,6 +1881,42 @@ static int setup_runtime_directory( return 0; } +static int setup_smack( + const ExecContext *context, + const ExecCommand *command) { + +#ifdef HAVE_SMACK + int r; + + assert(context); + assert(command); + + if (!mac_smack_use()) + return 0; + + if (context->smack_process_label) { + r = mac_smack_apply_pid(0, context->smack_process_label); + if (r < 0) + return r; + } +#ifdef SMACK_DEFAULT_PROCESS_LABEL + else { + _cleanup_free_ char *exec_label = NULL; + + r = mac_smack_read(command->path, SMACK_ATTR_EXEC, &exec_label); + if (r < 0 && r != -ENODATA && r != -EOPNOTSUPP) + return r; + + r = mac_smack_apply_pid(0, exec_label ? : SMACK_DEFAULT_PROCESS_LABEL); + if (r < 0) + return r; + } +#endif +#endif + + return 0; +} + static int compile_read_write_paths( const ExecContext *context, const ExecParameters *params, @@ -2349,33 +2391,12 @@ static int exec_child( (void) umask(context->umask); if ((params->flags & EXEC_APPLY_PERMISSIONS) && !command->privileged) { -#ifdef HAVE_SMACK - if (context->smack_process_label) { - r = mac_smack_apply_pid(0, context->smack_process_label); - if (r < 0) { - *exit_status = EXIT_SMACK_PROCESS_LABEL; - return r; - } + r = setup_smack(context, command); + if (r < 0) { + *exit_status = EXIT_SMACK_PROCESS_LABEL; + return r; } -#ifdef SMACK_DEFAULT_PROCESS_LABEL - else { - _cleanup_free_ char *exec_label = NULL; - r = mac_smack_read(command->path, SMACK_ATTR_EXEC, &exec_label); - if (r < 0 && r != -ENODATA && r != -EOPNOTSUPP) { - *exit_status = EXIT_SMACK_PROCESS_LABEL; - return r; - } - - r = mac_smack_apply_pid(0, exec_label ? : SMACK_DEFAULT_PROCESS_LABEL); - if (r < 0) { - *exit_status = EXIT_SMACK_PROCESS_LABEL; - return r; - } - } -#endif -#endif -#ifdef HAVE_PAM if (context->pam_name && username) { r = setup_pam(context->pam_name, username, uid, context->tty_path, &accum_env, fds, n_fds); if (r < 0) { @@ -2383,7 +2404,6 @@ static int exec_child( return r; } } -#endif } if (context->private_network && runtime && runtime->netns_storage_socket[0] >= 0) { -- cgit v1.2.3-54-g00ecf From 8f81a5f61bcf745bae3acad599d7a9da686643e3 Mon Sep 17 00:00:00 2001 From: Djalal Harouni Date: Sun, 25 Sep 2016 12:52:27 +0200 Subject: core: Use @raw-io syscall group to filter I/O syscalls when PrivateDevices= is set Instead of having a local syscall list, use the @raw-io group which contains the same set of syscalls to filter. --- man/systemd.exec.xml | 6 ++++-- src/core/execute.c | 55 +++++++++++++++++++++++++++++++++------------------- 2 files changed, 39 insertions(+), 22 deletions(-) (limited to 'src/core/execute.c') diff --git a/man/systemd.exec.xml b/man/systemd.exec.xml index f19e7f6ee9..f70e5c36d4 100644 --- a/man/systemd.exec.xml +++ b/man/systemd.exec.xml @@ -933,8 +933,10 @@ /dev/random (as well as the pseudo TTY subsystem) to it, but no physical devices such as /dev/sda, system memory /dev/mem, system ports /dev/port and others. This is useful to securely turn off physical device access by the - executed process. Defaults to false. Enabling this option will also remove CAP_MKNOD from - the capability bounding set for the unit (see above), and set DevicePolicy=closed (see + executed process. Defaults to false. Enabling this option will install a system call filter to block low-level + I/O system calls that are grouped in the @raw-io set, will also remove + CAP_MKNOD from the capability bounding set for the unit (see above), and set + DevicePolicy=closed (see systemd.resource-control5 for details). Note that using this setting will disconnect propagation of mounts from the service to the host (propagation in the opposite direction continues to work). This means that this setting may not be used for diff --git a/src/core/execute.c b/src/core/execute.c index 0488ba2ca9..3da7ef3be6 100644 --- a/src/core/execute.c +++ b/src/core/execute.c @@ -1429,28 +1429,15 @@ finish: } static int apply_private_devices(Unit *u, const ExecContext *c) { - - static const int device_syscalls[] = { - SCMP_SYS(ioperm), - SCMP_SYS(iopl), - SCMP_SYS(pciconfig_iobase), - SCMP_SYS(pciconfig_read), - SCMP_SYS(pciconfig_write), -#ifdef __NR_s390_pci_mmio_read - SCMP_SYS(s390_pci_mmio_read), -#endif -#ifdef __NR_s390_pci_mmio_write - SCMP_SYS(s390_pci_mmio_write), -#endif - }; - + const SystemCallFilterSet *set; scmp_filter_ctx *seccomp; - unsigned i; + const char *sys; + bool syscalls_found = false; int r; assert(c); - /* If PrivateDevices= is set, also turn off iopl and friends. */ + /* If PrivateDevices= is set, also turn off iopl and all @raw-io syscalls. */ if (skip_seccomp_unavailable(u, "PrivateDevices=")) return 0; @@ -1463,12 +1450,40 @@ static int apply_private_devices(Unit *u, const ExecContext *c) { if (r < 0) goto finish; - for (i = 0; i < ELEMENTSOF(device_syscalls); i++) { + for (set = syscall_filter_sets; set->set_name; set++) + if (streq(set->set_name, "@raw-io")) { + syscalls_found = true; + break; + } + + /* We should never fail here */ + if (!syscalls_found) { + r = -EOPNOTSUPP; + goto finish; + } + + NULSTR_FOREACH(sys, set->value) { + int id; + bool add = true; + +#ifndef __NR_s390_pci_mmio_read + if (streq(sys, "s390_pci_mmio_read")) + add = false; +#endif +#ifndef __NR_s390_pci_mmio_write + if (streq(sys, "s390_pci_mmio_write")) + add = false; +#endif + + if (!add) + continue; + + id = seccomp_syscall_resolve_name(sys); + r = seccomp_rule_add( seccomp, SCMP_ACT_ERRNO(EPERM), - device_syscalls[i], - 0); + id, 0); if (r < 0) goto finish; } -- cgit v1.2.3-54-g00ecf From 629ff674ac48653703440ab16998adb03391f986 Mon Sep 17 00:00:00 2001 From: Stefan Schweter Date: Sun, 2 Oct 2016 19:37:21 +0200 Subject: tree-wide: remove consecutive duplicate words in comments --- src/basic/escape.c | 2 +- src/basic/util.c | 2 +- src/core/dynamic-user.c | 2 +- src/core/execute.c | 2 +- src/hostname/hostnamectl.c | 2 +- src/login/logind-session.c | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) (limited to 'src/core/execute.c') diff --git a/src/basic/escape.c b/src/basic/escape.c index 01daf11ce7..4a1ec4505e 100644 --- a/src/basic/escape.c +++ b/src/basic/escape.c @@ -333,7 +333,7 @@ int cunescape_length_with_prefix(const char *s, size_t length, const char *prefi assert(remaining > 0); if (*f != '\\') { - /* A literal literal, copy verbatim */ + /* A literal, copy verbatim */ *(t++) = *f; continue; } diff --git a/src/basic/util.c b/src/basic/util.c index 9d66d28eb7..ec7939dc83 100644 --- a/src/basic/util.c +++ b/src/basic/util.c @@ -467,7 +467,7 @@ bool in_initrd(void) { * 2. the root file system must be a memory file system * * The second check is extra paranoia, since misdetecting an - * initrd can have bad bad consequences due the initrd + * initrd can have bad consequences due the initrd * emptying when transititioning to the main systemd. */ diff --git a/src/core/dynamic-user.c b/src/core/dynamic-user.c index 310aaa94e1..1043da3eb7 100644 --- a/src/core/dynamic-user.c +++ b/src/core/dynamic-user.c @@ -233,7 +233,7 @@ static int pick_uid(const char *name, uid_t *ret_uid) { if (st.st_nlink > 0) break; - /* Oh, bummer, we got got the lock, but the file was unlinked between the time we opened it and + /* Oh, bummer, we got the lock, but the file was unlinked between the time we opened it and * got the lock. Close it, and try again. */ lock_fd = safe_close(lock_fd); } diff --git a/src/core/execute.c b/src/core/execute.c index 3da7ef3be6..82d8c978c1 100644 --- a/src/core/execute.c +++ b/src/core/execute.c @@ -2207,7 +2207,7 @@ static int exec_child( return r; } - /* Note that we don't set $HOME or $SHELL if they are are not particularly enlightening anyway + /* Note that we don't set $HOME or $SHELL if they are not particularly enlightening anyway * (i.e. are "/" or "/bin/nologin"). */ } diff --git a/src/hostname/hostnamectl.c b/src/hostname/hostnamectl.c index 4795324667..07c57fb567 100644 --- a/src/hostname/hostnamectl.c +++ b/src/hostname/hostnamectl.c @@ -278,7 +278,7 @@ static int set_hostname(sd_bus *bus, char **args, unsigned n) { /* Now that we set the pretty hostname, let's clean up the parameter and use that as static * hostname. If the hostname was already valid as static hostname, this will only chop off the trailing * dot if there is one. If it was not valid, then it will be made fully valid by truncating, dropping - * multiple dots, and and dropping weird chars. Note that we clean the name up only if we also are + * multiple dots, and dropping weird chars. Note that we clean the name up only if we also are * supposed to set the pretty name. If the pretty name is not being set we assume the user knows what * he does and pass the name as-is. */ h = strdup(hostname); diff --git a/src/login/logind-session.c b/src/login/logind-session.c index b6da237397..ba1bcc2630 100644 --- a/src/login/logind-session.c +++ b/src/login/logind-session.c @@ -611,7 +611,7 @@ static int session_stop_scope(Session *s, bool force) { return 0; /* Let's always abandon the scope first. This tells systemd that we are not interested anymore, and everything - * that is left in in the scope is "left-over". Informing systemd about this has the benefit that it will log + * that is left in the scope is "left-over". Informing systemd about this has the benefit that it will log * when killing any processes left after this point. */ r = manager_abandon_scope(s->manager, s->scope, &error); if (r < 0) -- cgit v1.2.3-54-g00ecf From 36d854780c01d589e5da1fc6e94f46aa41f7016f Mon Sep 17 00:00:00 2001 From: Giuseppe Scrivano Date: Wed, 28 Sep 2016 18:37:39 +0200 Subject: core: do not fail in a container if we can't use setgroups It might be blocked through /proc/PID/setgroups --- src/basic/capability-util.c | 3 ++- src/basic/user-util.c | 27 ++++++++++++++++++++++++++- src/basic/user-util.h | 2 ++ src/core/execute.c | 2 +- 4 files changed, 31 insertions(+), 3 deletions(-) (limited to 'src/core/execute.c') diff --git a/src/basic/capability-util.c b/src/basic/capability-util.c index d4c5bd6937..f8db6e0212 100644 --- a/src/basic/capability-util.c +++ b/src/basic/capability-util.c @@ -31,6 +31,7 @@ #include "log.h" #include "macro.h" #include "parse-util.h" +#include "user-util.h" #include "util.h" int have_effective_cap(int value) { @@ -295,7 +296,7 @@ int drop_privileges(uid_t uid, gid_t gid, uint64_t keep_capabilities) { if (setresgid(gid, gid, gid) < 0) return log_error_errno(errno, "Failed to change group ID: %m"); - if (setgroups(0, NULL) < 0) + if (maybe_setgroups(0, NULL) < 0) return log_error_errno(errno, "Failed to drop auxiliary groups list: %m"); /* Ensure we keep the permitted caps across the setresuid() */ diff --git a/src/basic/user-util.c b/src/basic/user-util.c index 0522bce1d1..16496fccfa 100644 --- a/src/basic/user-util.c +++ b/src/basic/user-util.c @@ -33,6 +33,7 @@ #include "alloc-util.h" #include "fd-util.h" +#include "fileio.h" #include "formats-util.h" #include "macro.h" #include "missing.h" @@ -460,7 +461,7 @@ int get_shell(char **_s) { int reset_uid_gid(void) { - if (setgroups(0, NULL) < 0) + if (maybe_setgroups(0, NULL) < 0) return -errno; if (setresgid(0, 0, 0) < 0) @@ -602,3 +603,27 @@ bool valid_home(const char *p) { return true; } + +int maybe_setgroups(size_t size, const gid_t *list) { + static int cached_can_setgroups = -1; + /* check if setgroups is allowed before we try to drop all the auxiliary groups */ + if (size == 0) { + if (cached_can_setgroups < 0) { + _cleanup_free_ char *setgroups_content = NULL; + int r = read_one_line_file("/proc/self/setgroups", &setgroups_content); + if (r < 0 && errno != ENOENT) + return r; + if (r < 0) { + /* old kernels don't have /proc/self/setgroups, so assume we can use setgroups */ + cached_can_setgroups = true; + } else { + cached_can_setgroups = streq(setgroups_content, "allow"); + if (!cached_can_setgroups) + log_debug("skip setgroups, /proc/self/setgroups is set to 'deny'"); + } + } + if (!cached_can_setgroups) + return 0; + } + return setgroups(size, list); +} diff --git a/src/basic/user-util.h b/src/basic/user-util.h index 6c61f63cae..dfea561bde 100644 --- a/src/basic/user-util.h +++ b/src/basic/user-util.h @@ -86,3 +86,5 @@ bool valid_user_group_name(const char *u); bool valid_user_group_name_or_id(const char *u); bool valid_gecos(const char *d); bool valid_home(const char *p); + +int maybe_setgroups(size_t size, const gid_t *list); diff --git a/src/core/execute.c b/src/core/execute.c index 82d8c978c1..019ff8490b 100644 --- a/src/core/execute.c +++ b/src/core/execute.c @@ -781,7 +781,7 @@ static int enforce_groups(const ExecContext *context, const char *username, gid_ k++; } - if (setgroups(k, gids) < 0) { + if (maybe_setgroups(k, gids) < 0) { free(gids); return -errno; } -- cgit v1.2.3-54-g00ecf From 2d6fce8d7c397fe915230b728cb92aa749245e43 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Thu, 6 Oct 2016 16:03:01 +0200 Subject: core: leave PAM stub process around with GIDs updated In the process execution code of PID 1, before 096424d1230e0a0339735c51b43949809e972430 the GID settings where changed before invoking PAM, and the UID settings after. After the change both changes are made after the PAM session hooks are run. When invoking PAM we fork once, and leave a stub process around which will invoke the PAM session end hooks when the session goes away. This code previously was dropping the remaining privs (which were precisely the UID). Fix this code to do this correctly again, by really dropping them else (i.e. the GID as well). While we are at it, also fix error logging of this code. Fixes: #4238 --- src/core/execute.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'src/core/execute.c') diff --git a/src/core/execute.c b/src/core/execute.c index 019ff8490b..e4a23ac169 100644 --- a/src/core/execute.c +++ b/src/core/execute.c @@ -843,6 +843,7 @@ static int setup_pam( const char *name, const char *user, uid_t uid, + gid_t gid, const char *tty, char ***env, int fds[], unsigned n_fds) { @@ -948,8 +949,13 @@ static int setup_pam( * and this will make PR_SET_PDEATHSIG work in most cases. * If this fails, ignore the error - but expect sd-pam threads * to fail to exit normally */ + + if (maybe_setgroups(0, NULL) < 0) + log_warning_errno(errno, "Failed to setgroups() in sd-pam: %m"); + if (setresgid(gid, gid, gid) < 0) + log_warning_errno(errno, "Failed to setresgid() in sd-pam: %m"); if (setresuid(uid, uid, uid) < 0) - log_error_errno(r, "Error: Failed to setresuid() in sd-pam: %m"); + log_warning_errno(errno, "Failed to setresuid() in sd-pam: %m"); (void) ignore_signals(SIGPIPE, -1); @@ -2413,7 +2419,7 @@ static int exec_child( } if (context->pam_name && username) { - r = setup_pam(context->pam_name, username, uid, context->tty_path, &accum_env, fds, n_fds); + r = setup_pam(context->pam_name, username, uid, gid, context->tty_path, &accum_env, fds, n_fds); if (r < 0) { *exit_status = EXIT_PAM; return r; -- cgit v1.2.3-54-g00ecf From 97f0e76f18d322d29bcfbc4ab6bb9cd67a1cdd54 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Thu, 6 Oct 2016 17:54:12 +0200 Subject: user-util: rework maybe_setgroups() a bit Let's drop the caching of the setgroups /proc field for now. While there's a strict regime in place when it changes states, let's better not cache it since we cannot really be sure we follow that regime correctly. More importantly however, this is not in performance sensitive code, and there's no indication the cache is really beneficial, hence let's drop the caching and make things a bit simpler. Also, while we are at it, rework the error handling a bit, and always return negative errno-style error codes, following our usual coding style. This has the benefit that we can sensible hanld read_one_line_file() errors, without having to updat errno explicitly. --- src/basic/capability-util.c | 5 +++-- src/basic/user-util.c | 49 ++++++++++++++++++++++++++------------------- src/core/execute.c | 10 +++++---- 3 files changed, 37 insertions(+), 27 deletions(-) (limited to 'src/core/execute.c') diff --git a/src/basic/capability-util.c b/src/basic/capability-util.c index f8db6e0212..c3de20a0e8 100644 --- a/src/basic/capability-util.c +++ b/src/basic/capability-util.c @@ -296,8 +296,9 @@ int drop_privileges(uid_t uid, gid_t gid, uint64_t keep_capabilities) { if (setresgid(gid, gid, gid) < 0) return log_error_errno(errno, "Failed to change group ID: %m"); - if (maybe_setgroups(0, NULL) < 0) - return log_error_errno(errno, "Failed to drop auxiliary groups list: %m"); + r = maybe_setgroups(0, NULL); + if (r < 0) + return log_error_errno(r, "Failed to drop auxiliary groups list: %m"); /* Ensure we keep the permitted caps across the setresuid() */ if (prctl(PR_SET_KEEPCAPS, 1) < 0) diff --git a/src/basic/user-util.c b/src/basic/user-util.c index 16496fccfa..de6c93056e 100644 --- a/src/basic/user-util.c +++ b/src/basic/user-util.c @@ -460,9 +460,11 @@ int get_shell(char **_s) { } int reset_uid_gid(void) { + int r; - if (maybe_setgroups(0, NULL) < 0) - return -errno; + r = maybe_setgroups(0, NULL); + if (r < 0) + return r; if (setresgid(0, 0, 0) < 0) return -errno; @@ -605,25 +607,30 @@ bool valid_home(const char *p) { } int maybe_setgroups(size_t size, const gid_t *list) { - static int cached_can_setgroups = -1; - /* check if setgroups is allowed before we try to drop all the auxiliary groups */ - if (size == 0) { - if (cached_can_setgroups < 0) { - _cleanup_free_ char *setgroups_content = NULL; - int r = read_one_line_file("/proc/self/setgroups", &setgroups_content); - if (r < 0 && errno != ENOENT) - return r; - if (r < 0) { - /* old kernels don't have /proc/self/setgroups, so assume we can use setgroups */ - cached_can_setgroups = true; - } else { - cached_can_setgroups = streq(setgroups_content, "allow"); - if (!cached_can_setgroups) - log_debug("skip setgroups, /proc/self/setgroups is set to 'deny'"); - } - } - if (!cached_can_setgroups) + int r; + + /* Check if setgroups is allowed before we try to drop all the auxiliary groups */ + if (size == 0) { /* Dropping all aux groups? */ + _cleanup_free_ char *setgroups_content = NULL; + bool can_setgroups; + + r = read_one_line_file("/proc/self/setgroups", &setgroups_content); + if (r == -ENOENT) + /* Old kernels don't have /proc/self/setgroups, so assume we can use setgroups */ + can_setgroups = true; + else if (r < 0) + return r; + else + can_setgroups = streq(setgroups_content, "allow"); + + if (!can_setgroups) { + log_debug("Skipping setgroups(), /proc/self/setgroups is set to 'deny'"); return 0; + } } - return setgroups(size, list); + + if (setgroups(size, list) < 0) + return -errno; + + return 0; } diff --git a/src/core/execute.c b/src/core/execute.c index e4a23ac169..d5c4e60796 100644 --- a/src/core/execute.c +++ b/src/core/execute.c @@ -781,9 +781,10 @@ static int enforce_groups(const ExecContext *context, const char *username, gid_ k++; } - if (maybe_setgroups(k, gids) < 0) { + r = maybe_setgroups(k, gids); + if (r < 0) { free(gids); - return -errno; + return r; } free(gids); @@ -950,8 +951,9 @@ static int setup_pam( * If this fails, ignore the error - but expect sd-pam threads * to fail to exit normally */ - if (maybe_setgroups(0, NULL) < 0) - log_warning_errno(errno, "Failed to setgroups() in sd-pam: %m"); + r = maybe_setgroups(0, NULL); + if (r < 0) + log_warning_errno(r, "Failed to setgroups() in sd-pam: %m"); if (setresgid(gid, gid, gid) < 0) log_warning_errno(errno, "Failed to setresgid() in sd-pam: %m"); if (setresuid(uid, uid, uid) < 0) -- cgit v1.2.3-54-g00ecf From 4b58153dd22172d817055d2a09a0cdf3f4bd9db3 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Tue, 30 Aug 2016 23:18:46 +0200 Subject: core: add "invocation ID" concept to service manager This adds a new invocation ID concept to the service manager. The invocation ID identifies each runtime cycle of a unit uniquely. A new randomized 128bit ID is generated each time a unit moves from and inactive to an activating or active state. The primary usecase for this concept is to connect the runtime data PID 1 maintains about a service with the offline data the journal stores about it. Previously we'd use the unit name plus start/stop times, which however is highly racy since the journal will generally process log data after the service already ended. The "invocation ID" kinda matches the "boot ID" concept of the Linux kernel, except that it applies to an individual unit instead of the whole system. The invocation ID is passed to the activated processes as environment variable. It is additionally stored as extended attribute on the cgroup of the unit. The latter is used by journald to automatically retrieve it for each log logged message and attach it to the log entry. The environment variable is very easily accessible, even for unprivileged services. OTOH the extended attribute is only accessible to privileged processes (this is because cgroupfs only supports the "trusted." xattr namespace, not "user."). The environment variable may be altered by services, the extended attribute may not be, hence is the better choice for the journal. Note that reading the invocation ID off the extended attribute from journald is racy, similar to the way reading the unit name for a logging process is. This patch adds APIs to read the invocation ID to sd-id128: sd_id128_get_invocation() may be used in a similar fashion to sd_id128_get_boot(). PID1's own logging is updated to always include the invocation ID when it logs information about a unit. A new bus call GetUnitByInvocationID() is added that allows retrieving a bus path to a unit by its invocation ID. The bus path is built using the invocation ID, thus providing a path for referring to a unit that is valid only for the current runtime cycleof it. Outlook for the future: should the kernel eventually allow passing of cgroup information along AF_UNIX/SOCK_DGRAM messages via a unique cgroup id, then we can alter the invocation ID to be generated as hash from that rather than entirely randomly. This way we can derive the invocation race-freely from the messages. --- Makefile-man.am | 5 ++ man/sd_id128_get_machine.xml | 34 +++++++----- man/systemd.exec.xml | 10 ++++ src/basic/cgroup-util.c | 38 +++++++++++++ src/basic/cgroup-util.h | 3 ++ src/basic/log.c | 45 +++++++++------- src/basic/log.h | 10 ++-- src/core/automount.c | 4 ++ src/core/busname.c | 4 ++ src/core/cgroup.c | 21 ++++++++ src/core/dbus-manager.c | 59 +++++++++++++++++++++ src/core/dbus-unit.c | 1 + src/core/device.c | 4 ++ src/core/execute.c | 10 +++- src/core/manager.c | 24 ++++++++- src/core/manager.h | 4 ++ src/core/mount.c | 11 ++-- src/core/org.freedesktop.systemd1.conf | 4 ++ src/core/path.c | 4 ++ src/core/scope.c | 4 ++ src/core/service.c | 4 ++ src/core/slice.c | 5 ++ src/core/socket.c | 5 +- src/core/swap.c | 5 ++ src/core/target.c | 5 ++ src/core/timer.c | 6 ++- src/core/unit.c | 88 ++++++++++++++++++++++++++++++- src/core/unit.h | 10 +++- src/journal/journald-server.c | 60 ++++++++++++++++++--- src/journal/journald-server.h | 2 +- src/libsystemd/libsystemd.sym | 1 + src/libsystemd/sd-bus/bus-common-errors.c | 1 + src/libsystemd/sd-bus/bus-common-errors.h | 1 + src/libsystemd/sd-id128/id128-util.c | 13 +++++ src/libsystemd/sd-id128/id128-util.h | 6 +++ src/libsystemd/sd-id128/sd-id128.c | 22 ++++++++ src/network/networkd-link.h | 2 +- src/network/networkd-netdev.h | 2 +- src/shared/bus-util.c | 2 +- src/systemd/sd-id128.h | 2 +- 40 files changed, 486 insertions(+), 55 deletions(-) (limited to 'src/core/execute.c') diff --git a/Makefile-man.am b/Makefile-man.am index ef5077cc5a..5760878fe3 100644 --- a/Makefile-man.am +++ b/Makefile-man.am @@ -395,6 +395,7 @@ MANPAGES_ALIAS += \ man/sd_id128_equal.3 \ man/sd_id128_from_string.3 \ man/sd_id128_get_boot.3 \ + man/sd_id128_get_invocation.3 \ man/sd_id128_t.3 \ man/sd_is_mq.3 \ man/sd_is_socket.3 \ @@ -745,6 +746,7 @@ man/sd_event_unrefp.3: man/sd_event_new.3 man/sd_id128_equal.3: man/sd-id128.3 man/sd_id128_from_string.3: man/sd_id128_to_string.3 man/sd_id128_get_boot.3: man/sd_id128_get_machine.3 +man/sd_id128_get_invocation.3: man/sd_id128_get_machine.3 man/sd_id128_t.3: man/sd-id128.3 man/sd_is_mq.3: man/sd_is_fifo.3 man/sd_is_socket.3: man/sd_is_fifo.3 @@ -1519,6 +1521,9 @@ man/sd_id128_from_string.html: man/sd_id128_to_string.html man/sd_id128_get_boot.html: man/sd_id128_get_machine.html $(html-alias) +man/sd_id128_get_invocation.html: man/sd_id128_get_machine.html + $(html-alias) + man/sd_id128_t.html: man/sd-id128.html $(html-alias) diff --git a/man/sd_id128_get_machine.xml b/man/sd_id128_get_machine.xml index 2ad1f8f728..9a86c24aed 100644 --- a/man/sd_id128_get_machine.xml +++ b/man/sd_id128_get_machine.xml @@ -45,6 +45,7 @@ sd_id128_get_machine sd_id128_get_boot + sd_id128_get_invocation Retrieve 128-bit IDs @@ -62,6 +63,11 @@ sd_id128_t *ret + + int sd_id128_get_invocation + sd_id128_t *ret + + @@ -83,11 +89,15 @@ for more information. This function also internally caches the returned ID to make this call a cheap operation. - Note that sd_id128_get_boot() always - returns a UUID v4 compatible ID. - sd_id128_get_machine() will also return a - UUID v4-compatible ID on new installations but might not on older. - It is possible to convert the machine ID into a UUID v4-compatible + sd_id128_get_invocation() returns the invocation ID of the currently executed + service. In its current implementation, this reads and parses the $INVOCATION_ID environment + variable that the service manager sets when activating a service, see + systemd.exec5 for details. The + ID is cached internally. In future a different mechanism to determine the invocation ID may be added. + + Note that sd_id128_get_boot() and sd_id128_get_invocation() always + return UUID v4 compatible IDs. sd_id128_get_machine() will also return a UUID v4-compatible + ID on new installations but might not on older. It is possible to convert the machine ID into a UUID v4-compatible one. For more information, see machine-id5. @@ -107,11 +117,10 @@ Notes - The sd_id128_get_machine() and - sd_id128_get_boot() interfaces are available - as a shared library, which can be compiled and linked to with the - libsystemd pkg-config1 - file. + The sd_id128_get_machine(), sd_id128_get_boot() and + sd_id128_get_invocation() interfaces are available as a shared library, which can be compiled + and linked to with the libsystemd pkg-config1 file. @@ -121,8 +130,9 @@ systemd1, sd-id1283, machine-id5, - random4, - sd_id128_randomize3 + systemd.exec5, + sd_id128_randomize3, + random4 diff --git a/man/systemd.exec.xml b/man/systemd.exec.xml index 5e6787338d..c73ccaa493 100644 --- a/man/systemd.exec.xml +++ b/man/systemd.exec.xml @@ -1513,6 +1513,16 @@ + + $INVOCATION_ID + + Contains a randomized, unique 128bit ID identifying each runtime cycle of the unit, formatted + as 32 character hexadecimal string. A new ID is assigned each time the unit changes from an inactive state into + an activating or active state, and may be used to identify this specific runtime cycle, in particular in data + stored offline, such as the journal. The same ID is passed to all processes run as part of the + unit. + + $XDG_RUNTIME_DIR diff --git a/src/basic/cgroup-util.c b/src/basic/cgroup-util.c index 7675ab0299..37e6928a46 100644 --- a/src/basic/cgroup-util.c +++ b/src/basic/cgroup-util.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include "alloc-util.h" @@ -883,6 +884,43 @@ int cg_set_task_access( return 0; } +int cg_set_xattr(const char *controller, const char *path, const char *name, const void *value, size_t size, int flags) { + _cleanup_free_ char *fs = NULL; + int r; + + assert(path); + assert(name); + assert(value || size <= 0); + + r = cg_get_path(controller, path, NULL, &fs); + if (r < 0) + return r; + + if (setxattr(fs, name, value, size, flags) < 0) + return -errno; + + return 0; +} + +int cg_get_xattr(const char *controller, const char *path, const char *name, void *value, size_t size) { + _cleanup_free_ char *fs = NULL; + ssize_t n; + int r; + + assert(path); + assert(name); + + r = cg_get_path(controller, path, NULL, &fs); + if (r < 0) + return r; + + n = getxattr(fs, name, value, size); + if (n < 0) + return -errno; + + return (int) n; +} + int cg_pid_get_path(const char *controller, pid_t pid, char **path) { _cleanup_fclose_ FILE *f = NULL; char line[LINE_MAX]; diff --git a/src/basic/cgroup-util.h b/src/basic/cgroup-util.h index 1a61c7ad22..7529c9719e 100644 --- a/src/basic/cgroup-util.h +++ b/src/basic/cgroup-util.h @@ -185,6 +185,9 @@ int cg_get_keyed_attribute(const char *controller, const char *path, const char int cg_set_group_access(const char *controller, const char *path, mode_t mode, uid_t uid, gid_t gid); int cg_set_task_access(const char *controller, const char *path, mode_t mode, uid_t uid, gid_t gid); +int cg_set_xattr(const char *controller, const char *path, const char *name, const void *value, size_t size, int flags); +int cg_get_xattr(const char *controller, const char *path, const char *name, void *value, size_t size); + int cg_install_release_agent(const char *controller, const char *agent); int cg_uninstall_release_agent(const char *controller); diff --git a/src/basic/log.c b/src/basic/log.c index 6a8dad311d..bd6c96c4f8 100644 --- a/src/basic/log.c +++ b/src/basic/log.c @@ -330,8 +330,6 @@ static int write_to_console( const char *file, int line, const char *func, - const char *object_field, - const char *object, const char *buffer) { char location[256], prefix[1 + DECIMAL_STR_MAX(int) + 2]; @@ -390,8 +388,6 @@ static int write_to_syslog( const char *file, int line, const char *func, - const char *object_field, - const char *object, const char *buffer) { char header_priority[2 + DECIMAL_STR_MAX(int) + 1], @@ -453,8 +449,6 @@ static int write_to_kmsg( const char *file, int line, const char *func, - const char *object_field, - const char *object, const char *buffer) { char header_priority[2 + DECIMAL_STR_MAX(int) + 1], @@ -485,7 +479,8 @@ static int log_do_header( int level, int error, const char *file, int line, const char *func, - const char *object_field, const char *object) { + const char *object_field, const char *object, + const char *extra_field, const char *extra) { snprintf(header, size, "PRIORITY=%i\n" @@ -495,6 +490,7 @@ static int log_do_header( "%s%s%s" "%s%.*i%s" "%s%s%s" + "%s%s%s" "SYSLOG_IDENTIFIER=%s\n", LOG_PRI(level), LOG_FAC(level), @@ -513,6 +509,9 @@ static int log_do_header( isempty(object) ? "" : object_field, isempty(object) ? "" : object, isempty(object) ? "" : "\n", + isempty(extra) ? "" : extra_field, + isempty(extra) ? "" : extra, + isempty(extra) ? "" : "\n", program_invocation_short_name); return 0; @@ -526,6 +525,8 @@ static int write_to_journal( const char *func, const char *object_field, const char *object, + const char *extra_field, + const char *extra, const char *buffer) { char header[LINE_MAX]; @@ -535,7 +536,7 @@ static int write_to_journal( if (journal_fd < 0) return 0; - log_do_header(header, sizeof(header), level, error, file, line, func, object_field, object); + log_do_header(header, sizeof(header), level, error, file, line, func, object_field, object, extra_field, extra); IOVEC_SET_STRING(iovec[0], header); IOVEC_SET_STRING(iovec[1], "MESSAGE="); @@ -559,6 +560,8 @@ static int log_dispatch( const char *func, const char *object_field, const char *object, + const char *extra, + const char *extra_field, char *buffer) { assert(buffer); @@ -589,7 +592,7 @@ static int log_dispatch( log_target == LOG_TARGET_JOURNAL_OR_KMSG || log_target == LOG_TARGET_JOURNAL) { - k = write_to_journal(level, error, file, line, func, object_field, object, buffer); + k = write_to_journal(level, error, file, line, func, object_field, object, extra_field, extra, buffer); if (k < 0) { if (k != -EAGAIN) log_close_journal(); @@ -600,7 +603,7 @@ static int log_dispatch( if (log_target == LOG_TARGET_SYSLOG_OR_KMSG || log_target == LOG_TARGET_SYSLOG) { - k = write_to_syslog(level, error, file, line, func, object_field, object, buffer); + k = write_to_syslog(level, error, file, line, func, buffer); if (k < 0) { if (k != -EAGAIN) log_close_syslog(); @@ -615,7 +618,7 @@ static int log_dispatch( log_target == LOG_TARGET_JOURNAL_OR_KMSG || log_target == LOG_TARGET_KMSG)) { - k = write_to_kmsg(level, error, file, line, func, object_field, object, buffer); + k = write_to_kmsg(level, error, file, line, func, buffer); if (k < 0) { log_close_kmsg(); log_open_console(); @@ -623,7 +626,7 @@ static int log_dispatch( } if (k <= 0) - (void) write_to_console(level, error, file, line, func, object_field, object, buffer); + (void) write_to_console(level, error, file, line, func, buffer); buffer = e; } while (buffer); @@ -649,7 +652,7 @@ int log_dump_internal( if (_likely_(LOG_PRI(level) > log_max_level)) return -error; - return log_dispatch(level, error, file, line, func, NULL, NULL, buffer); + return log_dispatch(level, error, file, line, func, NULL, NULL, NULL, NULL, buffer); } int log_internalv( @@ -676,7 +679,7 @@ int log_internalv( vsnprintf(buffer, sizeof(buffer), format, ap); - return log_dispatch(level, error, file, line, func, NULL, NULL, buffer); + return log_dispatch(level, error, file, line, func, NULL, NULL, NULL, NULL, buffer); } int log_internal( @@ -705,6 +708,8 @@ int log_object_internalv( const char *func, const char *object_field, const char *object, + const char *extra_field, + const char *extra, const char *format, va_list ap) { @@ -738,7 +743,7 @@ int log_object_internalv( vsnprintf(b, l, format, ap); - return log_dispatch(level, error, file, line, func, object_field, object, buffer); + return log_dispatch(level, error, file, line, func, object_field, object, extra_field, extra, buffer); } int log_object_internal( @@ -749,13 +754,15 @@ int log_object_internal( const char *func, const char *object_field, const char *object, + const char *extra_field, + const char *extra, const char *format, ...) { va_list ap; int r; va_start(ap, format); - r = log_object_internalv(level, error, file, line, func, object_field, object, format, ap); + r = log_object_internalv(level, error, file, line, func, object_field, object, extra_field, extra, format, ap); va_end(ap); return r; @@ -780,7 +787,7 @@ static void log_assert( log_abort_msg = buffer; - log_dispatch(level, 0, file, line, func, NULL, NULL, buffer); + log_dispatch(level, 0, file, line, func, NULL, NULL, NULL, NULL, buffer); } noreturn void log_assert_failed(const char *text, const char *file, int line, const char *func) { @@ -888,7 +895,7 @@ int log_struct_internal( bool fallback = false; /* If the journal is available do structured logging */ - log_do_header(header, sizeof(header), level, error, file, line, func, NULL, NULL); + log_do_header(header, sizeof(header), level, error, file, line, func, NULL, NULL, NULL, NULL); IOVEC_SET_STRING(iovec[n++], header); va_start(ap, format); @@ -935,7 +942,7 @@ int log_struct_internal( if (!found) return -error; - return log_dispatch(level, error, file, line, func, NULL, NULL, buf + 8); + return log_dispatch(level, error, file, line, func, NULL, NULL, NULL, NULL, buf + 8); } int log_set_target_from_string(const char *e) { diff --git a/src/basic/log.h b/src/basic/log.h index b6356228d9..2afee20bb5 100644 --- a/src/basic/log.h +++ b/src/basic/log.h @@ -100,18 +100,22 @@ int log_object_internal( const char *func, const char *object_field, const char *object, - const char *format, ...) _printf_(8,9); + const char *extra_field, + const char *extra, + const char *format, ...) _printf_(10,11); int log_object_internalv( int level, int error, - const char*file, + const char *file, int line, const char *func, const char *object_field, const char *object, + const char *extra_field, + const char *extra, const char *format, - va_list ap) _printf_(8,0); + va_list ap) _printf_(9,0); int log_struct_internal( int level, diff --git a/src/core/automount.c b/src/core/automount.c index bdc0e06965..7d7a0a6e46 100644 --- a/src/core/automount.c +++ b/src/core/automount.c @@ -800,6 +800,10 @@ static int automount_start(Unit *u) { return r; } + r = unit_acquire_invocation_id(u); + if (r < 0) + return r; + a->result = AUTOMOUNT_SUCCESS; automount_enter_waiting(a); return 1; diff --git a/src/core/busname.c b/src/core/busname.c index 7952cd31aa..63c7dde0bd 100644 --- a/src/core/busname.c +++ b/src/core/busname.c @@ -639,6 +639,10 @@ static int busname_start(Unit *u) { return r; } + r = unit_acquire_invocation_id(u); + if (r < 0) + return r; + n->result = BUSNAME_SUCCESS; busname_enter_making(n); diff --git a/src/core/cgroup.c b/src/core/cgroup.c index 7873f88785..20bdbc39d0 100644 --- a/src/core/cgroup.c +++ b/src/core/cgroup.c @@ -1361,6 +1361,26 @@ int unit_attach_pids_to_cgroup(Unit *u) { return 0; } +static void cgroup_xattr_apply(Unit *u) { + char ids[SD_ID128_STRING_MAX]; + int r; + + assert(u); + + if (!MANAGER_IS_SYSTEM(u->manager)) + return; + + if (sd_id128_is_null(u->invocation_id)) + return; + + r = cg_set_xattr(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, + "trusted.invocation_id", + sd_id128_to_string(u->invocation_id, ids), 32, + 0); + if (r < 0) + log_unit_warning_errno(u, r, "Failed to set invocation ID on control group %s, ignoring: %m", u->cgroup_path); +} + static bool unit_has_mask_realized(Unit *u, CGroupMask target_mask, CGroupMask enable_mask) { assert(u); @@ -1404,6 +1424,7 @@ static int unit_realize_cgroup_now(Unit *u, ManagerState state) { /* Finally, apply the necessary attributes. */ cgroup_context_apply(u, target_mask, state); + cgroup_xattr_apply(u); return 0; } diff --git a/src/core/dbus-manager.c b/src/core/dbus-manager.c index ea7ced2fd0..12eb55cb7f 100644 --- a/src/core/dbus-manager.c +++ b/src/core/dbus-manager.c @@ -464,6 +464,64 @@ static int method_get_unit_by_pid(sd_bus_message *message, void *userdata, sd_bu return sd_bus_reply_method_return(message, "o", path); } +static int method_get_unit_by_invocation_id(sd_bus_message *message, void *userdata, sd_bus_error *error) { + _cleanup_free_ char *path = NULL; + Manager *m = userdata; + sd_id128_t id; + const void *a; + Unit *u; + size_t sz; + int r; + + assert(message); + assert(m); + + /* Anyone can call this method */ + + r = sd_bus_message_read_array(message, 'y', &a, &sz); + if (r < 0) + return r; + if (sz == 0) + id = SD_ID128_NULL; + else if (sz == 16) + memcpy(&id, a, sz); + else + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid invocation ID"); + + if (sd_id128_is_null(id)) { + _cleanup_(sd_bus_creds_unrefp) sd_bus_creds *creds = NULL; + pid_t pid; + + r = sd_bus_query_sender_creds(message, SD_BUS_CREDS_PID, &creds); + if (r < 0) + return r; + + r = sd_bus_creds_get_pid(creds, &pid); + if (r < 0) + return r; + + u = manager_get_unit_by_pid(m, pid); + if (!u) + return sd_bus_error_setf(error, BUS_ERROR_NO_SUCH_UNIT, "Client " PID_FMT " not member of any unit.", pid); + } else { + u = hashmap_get(m->units_by_invocation_id, &id); + if (!u) + return sd_bus_error_setf(error, BUS_ERROR_NO_UNIT_FOR_INVOCATION_ID, "No unit with the specified invocation ID " SD_ID128_FORMAT_STR " known.", SD_ID128_FORMAT_VAL(id)); + } + + r = mac_selinux_unit_access_check(u, message, "status", error); + if (r < 0) + return r; + + /* So here's a special trick: the bus path we return actually references the unit by its invocation ID instead + * of the unit name. This means it stays valid only as long as the invocation ID stays the same. */ + path = unit_dbus_path_invocation_id(u); + if (!path) + return -ENOMEM; + + return sd_bus_reply_method_return(message, "o", path); +} + static int method_load_unit(sd_bus_message *message, void *userdata, sd_bus_error *error) { _cleanup_free_ char *path = NULL; Manager *m = userdata; @@ -2254,6 +2312,7 @@ const sd_bus_vtable bus_manager_vtable[] = { SD_BUS_METHOD("GetUnit", "s", "o", method_get_unit, SD_BUS_VTABLE_UNPRIVILEGED), SD_BUS_METHOD("GetUnitByPID", "u", "o", method_get_unit_by_pid, SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD("GetUnitByInvocationID", "ay", "o", method_get_unit_by_invocation_id, SD_BUS_VTABLE_UNPRIVILEGED), SD_BUS_METHOD("LoadUnit", "s", "o", method_load_unit, SD_BUS_VTABLE_UNPRIVILEGED), SD_BUS_METHOD("StartUnit", "ss", "o", method_start_unit, SD_BUS_VTABLE_UNPRIVILEGED), SD_BUS_METHOD("StartUnitReplace", "sss", "o", method_start_unit_replace, SD_BUS_VTABLE_UNPRIVILEGED), diff --git a/src/core/dbus-unit.c b/src/core/dbus-unit.c index 5020dfba4b..245912fc0f 100644 --- a/src/core/dbus-unit.c +++ b/src/core/dbus-unit.c @@ -764,6 +764,7 @@ const sd_bus_vtable bus_unit_vtable[] = { SD_BUS_PROPERTY("StartLimitBurst", "u", bus_property_get_unsigned, offsetof(Unit, start_limit.burst), SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("StartLimitAction", "s", property_get_failure_action, offsetof(Unit, start_limit_action), SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("RebootArgument", "s", NULL, offsetof(Unit, reboot_arg), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("InvocationID", "ay", bus_property_get_id128, offsetof(Unit, invocation_id), 0), SD_BUS_METHOD("Start", "s", "o", method_start, SD_BUS_VTABLE_UNPRIVILEGED), SD_BUS_METHOD("Stop", "s", "o", method_stop, SD_BUS_VTABLE_UNPRIVILEGED), diff --git a/src/core/device.c b/src/core/device.c index 16e56efcc3..8a3e888e5e 100644 --- a/src/core/device.c +++ b/src/core/device.c @@ -464,6 +464,10 @@ static void device_update_found_one(Device *d, bool add, DeviceFound found, bool if (!now) return; + /* Didn't exist before, but does now? if so, generate a new invocation ID for it */ + if (previous == DEVICE_NOT_FOUND && d->found != DEVICE_NOT_FOUND) + (void) unit_acquire_invocation_id(UNIT(d)); + if (d->found & DEVICE_FOUND_UDEV) /* When the device is known to udev we consider it * plugged. */ diff --git a/src/core/execute.c b/src/core/execute.c index d5c4e60796..7079aeed6e 100644 --- a/src/core/execute.c +++ b/src/core/execute.c @@ -1553,10 +1553,11 @@ static int build_environment( unsigned n_env = 0; char *x; + assert(u); assert(c); assert(ret); - our_env = new0(char*, 13); + our_env = new0(char*, 14); if (!our_env) return -ENOMEM; @@ -1627,6 +1628,13 @@ static int build_environment( our_env[n_env++] = x; } + if (!sd_id128_is_null(u->invocation_id)) { + if (asprintf(&x, "INVOCATION_ID=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(u->invocation_id)) < 0) + return -ENOMEM; + + our_env[n_env++] = x; + } + if (exec_context_needs_term(c)) { const char *tty_path, *term = NULL; diff --git a/src/core/manager.c b/src/core/manager.c index c1dce62a18..3569249788 100644 --- a/src/core/manager.c +++ b/src/core/manager.c @@ -522,6 +522,7 @@ static void manager_clean_environment(Manager *m) { "LISTEN_FDNAMES", "WATCHDOG_PID", "WATCHDOG_USEC", + "INVOCATION_ID", NULL); } @@ -582,9 +583,15 @@ int manager_new(UnitFileScope scope, bool test_run, Manager **_m) { if (MANAGER_IS_SYSTEM(m)) { m->unit_log_field = "UNIT="; m->unit_log_format_string = "UNIT=%s"; + + m->invocation_log_field = "INVOCATION_ID="; + m->invocation_log_format_string = "INVOCATION_ID=" SD_ID128_FORMAT_STR; } else { m->unit_log_field = "USER_UNIT="; m->unit_log_format_string = "USER_UNIT=%s"; + + m->invocation_log_field = "USER_INVOCATION_ID="; + m->invocation_log_format_string = "USER_INVOCATION_ID=" SD_ID128_FORMAT_STR; } m->idle_pipe[0] = m->idle_pipe[1] = m->idle_pipe[2] = m->idle_pipe[3] = -1; @@ -1062,6 +1069,7 @@ Manager* manager_free(Manager *m) { hashmap_free(m->dynamic_users); hashmap_free(m->units); + hashmap_free(m->units_by_invocation_id); hashmap_free(m->jobs); hashmap_free(m->watch_pids1); hashmap_free(m->watch_pids2); @@ -2268,6 +2276,7 @@ int manager_loop(Manager *m) { int manager_load_unit_from_dbus_path(Manager *m, const char *s, sd_bus_error *e, Unit **_u) { _cleanup_free_ char *n = NULL; + sd_id128_t invocation_id; Unit *u; int r; @@ -2279,12 +2288,25 @@ int manager_load_unit_from_dbus_path(Manager *m, const char *s, sd_bus_error *e, if (r < 0) return r; + /* Permit addressing units by invocation ID: if the passed bus path is suffixed by a 128bit ID then we use it + * as invocation ID. */ + r = sd_id128_from_string(n, &invocation_id); + if (r >= 0) { + u = hashmap_get(m->units_by_invocation_id, &invocation_id); + if (u) { + *_u = u; + return 0; + } + + return sd_bus_error_setf(e, BUS_ERROR_NO_UNIT_FOR_INVOCATION_ID, "No unit with the specified invocation ID " SD_ID128_FORMAT_STR " known.", SD_ID128_FORMAT_VAL(invocation_id)); + } + + /* If this didn't work, we use the suffix as unit name. */ r = manager_load_unit(m, n, NULL, e, &u); if (r < 0) return r; *_u = u; - return 0; } diff --git a/src/core/manager.h b/src/core/manager.h index 495440b446..29fe14e10b 100644 --- a/src/core/manager.h +++ b/src/core/manager.h @@ -89,6 +89,7 @@ struct Manager { /* Active jobs and units */ Hashmap *units; /* name string => Unit object n:1 */ + Hashmap *units_by_invocation_id; Hashmap *jobs; /* job id => Job object 1:1 */ /* To make it easy to iterate through the units of a specific @@ -319,6 +320,9 @@ struct Manager { const char *unit_log_field; const char *unit_log_format_string; + const char *invocation_log_field; + const char *invocation_log_format_string; + int first_boot; /* tri-state */ }; diff --git a/src/core/mount.c b/src/core/mount.c index 04025b83b9..436c0e1029 100644 --- a/src/core/mount.c +++ b/src/core/mount.c @@ -1005,6 +1005,10 @@ static int mount_start(Unit *u) { return r; } + r = unit_acquire_invocation_id(u); + if (r < 0) + return r; + m->result = MOUNT_SUCCESS; m->reload_result = MOUNT_SUCCESS; m->reset_cpu_usage = true; @@ -1742,9 +1746,10 @@ static int mount_dispatch_io(sd_event_source *source, int fd, uint32_t revents, case MOUNT_DEAD: case MOUNT_FAILED: - /* This has just been mounted by - * somebody else, follow the state - * change. */ + + /* This has just been mounted by somebody else, follow the state change, but let's + * generate a new invocation ID for this implicitly and automatically. */ + (void) unit_acquire_invocation_id(UNIT(mount)); mount_enter_mounted(mount, MOUNT_SUCCESS); break; diff --git a/src/core/org.freedesktop.systemd1.conf b/src/core/org.freedesktop.systemd1.conf index 647e5f736c..6caa15b0b8 100644 --- a/src/core/org.freedesktop.systemd1.conf +++ b/src/core/org.freedesktop.systemd1.conf @@ -52,6 +52,10 @@ send_interface="org.freedesktop.systemd1.Manager" send_member="GetUnitByPID"/> + + diff --git a/src/core/path.c b/src/core/path.c index 10f9b06974..83f794be89 100644 --- a/src/core/path.c +++ b/src/core/path.c @@ -577,6 +577,10 @@ static int path_start(Unit *u) { return r; } + r = unit_acquire_invocation_id(u); + if (r < 0) + return r; + path_mkdir(p); p->result = PATH_SUCCESS; diff --git a/src/core/scope.c b/src/core/scope.c index 65fa65493b..e7583f6d89 100644 --- a/src/core/scope.c +++ b/src/core/scope.c @@ -298,6 +298,10 @@ static int scope_start(Unit *u) { if (!u->transient && !MANAGER_IS_RELOADING(u->manager)) return -ENOENT; + r = unit_acquire_invocation_id(u); + if (r < 0) + return r; + (void) unit_realize_cgroup(u); (void) unit_reset_cpu_usage(u); diff --git a/src/core/service.c b/src/core/service.c index 99a70395fc..8ce25c494c 100644 --- a/src/core/service.c +++ b/src/core/service.c @@ -2033,6 +2033,10 @@ static int service_start(Unit *u) { return r; } + r = unit_acquire_invocation_id(u); + if (r < 0) + return r; + s->result = SERVICE_SUCCESS; s->reload_result = SERVICE_SUCCESS; s->main_pid_known = false; diff --git a/src/core/slice.c b/src/core/slice.c index c7700b8857..03fe797f27 100644 --- a/src/core/slice.c +++ b/src/core/slice.c @@ -187,10 +187,15 @@ static void slice_dump(Unit *u, FILE *f, const char *prefix) { static int slice_start(Unit *u) { Slice *t = SLICE(u); + int r; assert(t); assert(t->state == SLICE_DEAD); + r = unit_acquire_invocation_id(u); + if (r < 0) + return r; + (void) unit_realize_cgroup(u); (void) unit_reset_cpu_usage(u); diff --git a/src/core/socket.c b/src/core/socket.c index b9032fa5c9..ae8a1f751f 100644 --- a/src/core/socket.c +++ b/src/core/socket.c @@ -2354,11 +2354,14 @@ static int socket_start(Unit *u) { return r; } + r = unit_acquire_invocation_id(u); + if (r < 0) + return r; + s->result = SOCKET_SUCCESS; s->reset_cpu_usage = true; socket_enter_start_pre(s); - return 1; } diff --git a/src/core/swap.c b/src/core/swap.c index fb222b6858..0333eaefb8 100644 --- a/src/core/swap.c +++ b/src/core/swap.c @@ -861,6 +861,10 @@ static int swap_start(Unit *u) { return r; } + r = unit_acquire_invocation_id(u); + if (r < 0) + return r; + s->result = SWAP_SUCCESS; s->reset_cpu_usage = true; @@ -1189,6 +1193,7 @@ static int swap_dispatch_io(sd_event_source *source, int fd, uint32_t revents, v case SWAP_DEAD: case SWAP_FAILED: + (void) unit_acquire_invocation_id(UNIT(swap)); swap_enter_active(swap, SWAP_SUCCESS); break; diff --git a/src/core/target.c b/src/core/target.c index 61a91aad07..765c1f3fa4 100644 --- a/src/core/target.c +++ b/src/core/target.c @@ -124,10 +124,15 @@ static void target_dump(Unit *u, FILE *f, const char *prefix) { static int target_start(Unit *u) { Target *t = TARGET(u); + int r; assert(t); assert(t->state == TARGET_DEAD); + r = unit_acquire_invocation_id(u); + if (r < 0) + return r; + target_set_state(t, TARGET_ACTIVE); return 1; } diff --git a/src/core/timer.c b/src/core/timer.c index e2b43f02f8..9538059c13 100644 --- a/src/core/timer.c +++ b/src/core/timer.c @@ -616,6 +616,10 @@ static int timer_start(Unit *u) { return r; } + r = unit_acquire_invocation_id(u); + if (r < 0) + return r; + t->last_trigger = DUAL_TIMESTAMP_NULL; /* Reenable all timers that depend on unit activation time */ @@ -632,7 +636,7 @@ static int timer_start(Unit *u) { /* The timer has never run before, * make sure a stamp file exists. */ - touch_file(t->stamp_path, true, USEC_INFINITY, UID_INVALID, GID_INVALID, MODE_INVALID); + (void) touch_file(t->stamp_path, true, USEC_INFINITY, UID_INVALID, GID_INVALID, MODE_INVALID); } t->result = TIMER_SUCCESS; diff --git a/src/core/unit.c b/src/core/unit.c index 693f75c928..690f7f7dd9 100644 --- a/src/core/unit.c +++ b/src/core/unit.c @@ -37,6 +37,7 @@ #include "execute.h" #include "fileio-label.h" #include "formats-util.h" +#include "id128-util.h" #include "load-dropin.h" #include "load-fragment.h" #include "log.h" @@ -521,6 +522,9 @@ void unit_free(Unit *u) { SET_FOREACH(t, u->names, i) hashmap_remove_value(u->manager->units, t, u); + if (!sd_id128_is_null(u->invocation_id)) + hashmap_remove_value(u->manager->units_by_invocation_id, &u->invocation_id, u); + if (u->job) { Job *j = u->job; job_uninstall(j); @@ -953,6 +957,10 @@ void unit_dump(Unit *u, FILE *f, const char *prefix) { SET_FOREACH(t, u->names, i) fprintf(f, "%s\tName: %s\n", prefix, t); + if (!sd_id128_is_null(u->invocation_id)) + fprintf(f, "%s\tInvocation ID: " SD_ID128_FORMAT_STR "\n", + prefix, SD_ID128_FORMAT_VAL(u->invocation_id)); + STRV_FOREACH(j, u->documentation) fprintf(f, "%s\tDocumentation: %s\n", prefix, *j); @@ -1054,7 +1062,6 @@ void unit_dump(Unit *u, FILE *f, const char *prefix) { if (u->nop_job) job_dump(u->nop_job, f, prefix2); - } /* Common implementation for multiple backends */ @@ -2392,6 +2399,15 @@ char *unit_dbus_path(Unit *u) { return unit_dbus_path_from_name(u->id); } +char *unit_dbus_path_invocation_id(Unit *u) { + assert(u); + + if (sd_id128_is_null(u->invocation_id)) + return NULL; + + return unit_dbus_path_from_name(u->invocation_id_string); +} + int unit_set_slice(Unit *u, Unit *slice) { assert(u); assert(slice); @@ -2640,6 +2656,9 @@ int unit_serialize(Unit *u, FILE *f, FDSet *fds, bool serialize_jobs) { if (gid_is_valid(u->ref_gid)) unit_serialize_item_format(u, f, "ref-gid", GID_FMT, u->ref_gid); + if (!sd_id128_is_null(u->invocation_id)) + unit_serialize_item_format(u, f, "invocation-id", SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(u->invocation_id)); + bus_track_serialize(u->bus_track, f, "ref"); if (serialize_jobs) { @@ -2914,6 +2933,19 @@ int unit_deserialize(Unit *u, FILE *f, FDSet *fds) { if (r < 0) log_oom(); + continue; + } else if (streq(l, "invocation-id")) { + sd_id128_t id; + + r = sd_id128_from_string(v, &id); + if (r < 0) + log_unit_debug(u, "Failed to parse invocation id %s, ignoring.", v); + else { + r = unit_set_invocation_id(u, id); + if (r < 0) + log_unit_warning_errno(u, r, "Failed to set invocation ID for unit: %m"); + } + continue; } @@ -4153,3 +4185,57 @@ void unit_notify_user_lookup(Unit *u, uid_t uid, gid_t gid) { if (r > 0) bus_unit_send_change_signal(u); } + +int unit_set_invocation_id(Unit *u, sd_id128_t id) { + int r; + + assert(u); + + /* Set the invocation ID for this unit. If we cannot, this will not roll back, but reset the whole thing. */ + + if (sd_id128_equal(u->invocation_id, id)) + return 0; + + if (!sd_id128_is_null(u->invocation_id)) + (void) hashmap_remove_value(u->manager->units_by_invocation_id, &u->invocation_id, u); + + if (sd_id128_is_null(id)) { + r = 0; + goto reset; + } + + r = hashmap_ensure_allocated(&u->manager->units_by_invocation_id, &id128_hash_ops); + if (r < 0) + goto reset; + + u->invocation_id = id; + sd_id128_to_string(id, u->invocation_id_string); + + r = hashmap_put(u->manager->units_by_invocation_id, &u->invocation_id, u); + if (r < 0) + goto reset; + + return 0; + +reset: + u->invocation_id = SD_ID128_NULL; + u->invocation_id_string[0] = 0; + return r; +} + +int unit_acquire_invocation_id(Unit *u) { + sd_id128_t id; + int r; + + assert(u); + + r = sd_id128_randomize(&id); + if (r < 0) + return log_unit_error_errno(u, r, "Failed to generate invocation ID for unit: %m"); + + r = unit_set_invocation_id(u, id); + if (r < 0) + return log_unit_error_errno(u, r, "Failed to set invocation ID for unit: %m"); + + return 0; +} diff --git a/src/core/unit.h b/src/core/unit.h index 3584c16d8c..a8dd3e602c 100644 --- a/src/core/unit.h +++ b/src/core/unit.h @@ -207,6 +207,10 @@ struct Unit { /* How to start OnFailure units */ JobMode on_failure_job_mode; + /* The current invocation ID */ + sd_id128_t invocation_id; + char invocation_id_string[SD_ID128_STRING_MAX]; /* useful when logging */ + /* Garbage collect us we nobody wants or requires us anymore */ bool stop_when_unneeded; @@ -546,6 +550,7 @@ bool unit_job_is_applicable(Unit *u, JobType j); int set_unit_path(const char *p); char *unit_dbus_path(Unit *u); +char *unit_dbus_path_invocation_id(Unit *u); int unit_load_related_unit(Unit *u, const char *type, Unit **_found); @@ -643,12 +648,15 @@ void unit_unref_uid_gid(Unit *u, bool destroy_now); void unit_notify_user_lookup(Unit *u, uid_t uid, gid_t gid); +int unit_set_invocation_id(Unit *u, sd_id128_t id); +int unit_acquire_invocation_id(Unit *u); + /* Macros which append UNIT= or USER_UNIT= to the message */ #define log_unit_full(unit, level, error, ...) \ ({ \ const Unit *_u = (unit); \ - _u ? log_object_internal(level, error, __FILE__, __LINE__, __func__, _u->manager->unit_log_field, _u->id, ##__VA_ARGS__) : \ + _u ? log_object_internal(level, error, __FILE__, __LINE__, __func__, _u->manager->unit_log_field, _u->id, _u->manager->invocation_log_field, _u->invocation_id_string, ##__VA_ARGS__) : \ log_internal(level, error, __FILE__, __LINE__, __func__, ##__VA_ARGS__); \ }) diff --git a/src/journal/journald-server.c b/src/journal/journald-server.c index 015b74b203..f01cf1d937 100644 --- a/src/journal/journald-server.c +++ b/src/journal/journald-server.c @@ -44,6 +44,7 @@ #include "fs-util.h" #include "hashmap.h" #include "hostname-util.h" +#include "id128-util.h" #include "io-util.h" #include "journal-authenticate.h" #include "journal-file.h" @@ -56,6 +57,7 @@ #include "journald-server.h" #include "journald-stream.h" #include "journald-syslog.h" +#include "log.h" #include "missing.h" #include "mkdir.h" #include "parse-util.h" @@ -69,7 +71,6 @@ #include "string-table.h" #include "string-util.h" #include "user-util.h" -#include "log.h" #define USER_JOURNALS_MAX 1024 @@ -675,6 +676,44 @@ static void write_to_journal(Server *s, uid_t uid, struct iovec *iovec, unsigned server_schedule_sync(s, priority); } +static int get_invocation_id(const char *cgroup_root, const char *slice, const char *unit, char **ret) { + _cleanup_free_ char *escaped = NULL, *slice_path = NULL, *p = NULL; + char *copy, ids[SD_ID128_STRING_MAX]; + int r; + + /* Read the invocation ID of a unit off a unit. It's stored in the "trusted.invocation_id" extended attribute + * on the cgroup path. */ + + r = cg_slice_to_path(slice, &slice_path); + if (r < 0) + return r; + + escaped = cg_escape(unit); + if (!escaped) + return -ENOMEM; + + p = strjoin(cgroup_root, "/", slice_path, "/", escaped, NULL); + if (!p) + return -ENOMEM; + + r = cg_get_xattr(SYSTEMD_CGROUP_CONTROLLER, p, "trusted.invocation_id", ids, 32); + if (r < 0) + return r; + if (r != 32) + return -EINVAL; + ids[32] = 0; + + if (!id128_is_valid(ids)) + return -EINVAL; + + copy = strdup(ids); + if (!copy) + return -ENOMEM; + + *ret = copy; + return 0; +} + static void dispatch_message_real( Server *s, struct iovec *iovec, unsigned n, unsigned m, @@ -771,6 +810,7 @@ static void dispatch_message_real( r = cg_pid_get_path_shifted(ucred->pid, s->cgroup_root, &c); if (r >= 0) { + _cleanup_free_ char *raw_unit = NULL, *raw_slice = NULL; char *session = NULL; x = strjoina("_SYSTEMD_CGROUP=", c); @@ -790,9 +830,8 @@ static void dispatch_message_real( IOVEC_SET_STRING(iovec[n++], owner_uid); } - if (cg_path_get_unit(c, &t) >= 0) { - x = strjoina("_SYSTEMD_UNIT=", t); - free(t); + if (cg_path_get_unit(c, &raw_unit) >= 0) { + x = strjoina("_SYSTEMD_UNIT=", raw_unit); IOVEC_SET_STRING(iovec[n++], x); } else if (unit_id && !session) { x = strjoina("_SYSTEMD_UNIT=", unit_id); @@ -808,9 +847,8 @@ static void dispatch_message_real( IOVEC_SET_STRING(iovec[n++], x); } - if (cg_path_get_slice(c, &t) >= 0) { - x = strjoina("_SYSTEMD_SLICE=", t); - free(t); + if (cg_path_get_slice(c, &raw_slice) >= 0) { + x = strjoina("_SYSTEMD_SLICE=", raw_slice); IOVEC_SET_STRING(iovec[n++], x); } @@ -820,6 +858,14 @@ static void dispatch_message_real( IOVEC_SET_STRING(iovec[n++], x); } + if (raw_slice && raw_unit) { + if (get_invocation_id(s->cgroup_root, raw_slice, raw_unit, &t) >= 0) { + x = strjoina("_SYSTEMD_INVOCATION_ID=", t); + free(t); + IOVEC_SET_STRING(iovec[n++], x); + } + } + free(c); } else if (unit_id) { x = strjoina("_SYSTEMD_UNIT=", unit_id); diff --git a/src/journal/journald-server.h b/src/journal/journald-server.h index 784d24833d..dfb5724794 100644 --- a/src/journal/journald-server.h +++ b/src/journal/journald-server.h @@ -153,7 +153,7 @@ struct Server { #define SERVER_MACHINE_ID(s) ((s)->machine_id_field + strlen("_MACHINE_ID=")) -#define N_IOVEC_META_FIELDS 21 +#define N_IOVEC_META_FIELDS 22 #define N_IOVEC_KERNEL_FIELDS 64 #define N_IOVEC_UDEV_FIELDS 32 #define N_IOVEC_OBJECT_FIELDS 14 diff --git a/src/libsystemd/libsystemd.sym b/src/libsystemd/libsystemd.sym index 70ea347361..d48ef6bbe2 100644 --- a/src/libsystemd/libsystemd.sym +++ b/src/libsystemd/libsystemd.sym @@ -509,4 +509,5 @@ global: sd_bus_track_count_sender; sd_bus_set_exit_on_disconnect; sd_bus_get_exit_on_disconnect; + sd_id128_get_invocation; } LIBSYSTEMD_231; diff --git a/src/libsystemd/sd-bus/bus-common-errors.c b/src/libsystemd/sd-bus/bus-common-errors.c index 9cc28ed564..d2a826bf6e 100644 --- a/src/libsystemd/sd-bus/bus-common-errors.c +++ b/src/libsystemd/sd-bus/bus-common-errors.c @@ -27,6 +27,7 @@ BUS_ERROR_MAP_ELF_REGISTER const sd_bus_error_map bus_common_errors[] = { SD_BUS_ERROR_MAP(BUS_ERROR_NO_SUCH_UNIT, ENOENT), SD_BUS_ERROR_MAP(BUS_ERROR_NO_UNIT_FOR_PID, ESRCH), + SD_BUS_ERROR_MAP(BUS_ERROR_NO_UNIT_FOR_INVOCATION_ID, ENOENT), SD_BUS_ERROR_MAP(BUS_ERROR_UNIT_EXISTS, EEXIST), SD_BUS_ERROR_MAP(BUS_ERROR_LOAD_FAILED, EIO), SD_BUS_ERROR_MAP(BUS_ERROR_JOB_FAILED, EREMOTEIO), diff --git a/src/libsystemd/sd-bus/bus-common-errors.h b/src/libsystemd/sd-bus/bus-common-errors.h index 5df21c8926..525b79fa77 100644 --- a/src/libsystemd/sd-bus/bus-common-errors.h +++ b/src/libsystemd/sd-bus/bus-common-errors.h @@ -23,6 +23,7 @@ #define BUS_ERROR_NO_SUCH_UNIT "org.freedesktop.systemd1.NoSuchUnit" #define BUS_ERROR_NO_UNIT_FOR_PID "org.freedesktop.systemd1.NoUnitForPID" +#define BUS_ERROR_NO_UNIT_FOR_INVOCATION_ID "org.freedesktop.systemd1.NoUnitForInvocationID" #define BUS_ERROR_UNIT_EXISTS "org.freedesktop.systemd1.UnitExists" #define BUS_ERROR_LOAD_FAILED "org.freedesktop.systemd1.LoadFailed" #define BUS_ERROR_JOB_FAILED "org.freedesktop.systemd1.JobFailed" diff --git a/src/libsystemd/sd-id128/id128-util.c b/src/libsystemd/sd-id128/id128-util.c index c3f527d657..337eae24b4 100644 --- a/src/libsystemd/sd-id128/id128-util.c +++ b/src/libsystemd/sd-id128/id128-util.c @@ -192,3 +192,16 @@ int id128_write(const char *p, Id128Format f, sd_id128_t id, bool do_sync) { return id128_write_fd(fd, f, id, do_sync); } + +void id128_hash_func(const void *p, struct siphash *state) { + siphash24_compress(p, 16, state); +} + +int id128_compare_func(const void *a, const void *b) { + return memcmp(a, b, 16); +} + +const struct hash_ops id128_hash_ops = { + .hash = id128_hash_func, + .compare = id128_compare_func, +}; diff --git a/src/libsystemd/sd-id128/id128-util.h b/src/libsystemd/sd-id128/id128-util.h index 3ba59acbca..6b3855acbb 100644 --- a/src/libsystemd/sd-id128/id128-util.h +++ b/src/libsystemd/sd-id128/id128-util.h @@ -22,6 +22,8 @@ #include #include "sd-id128.h" + +#include "hash-funcs.h" #include "macro.h" char *id128_to_uuid_string(sd_id128_t id, char s[37]); @@ -43,3 +45,7 @@ int id128_read(const char *p, Id128Format f, sd_id128_t *ret); int id128_write_fd(int fd, Id128Format f, sd_id128_t id, bool do_sync); int id128_write(const char *p, Id128Format f, sd_id128_t id, bool do_sync); + +void id128_hash_func(const void *p, struct siphash *state); +int id128_compare_func(const void *a, const void *b) _pure_; +extern const struct hash_ops id128_hash_ops; diff --git a/src/libsystemd/sd-id128/sd-id128.c b/src/libsystemd/sd-id128/sd-id128.c index 9f47d04e61..d4450c70a0 100644 --- a/src/libsystemd/sd-id128/sd-id128.c +++ b/src/libsystemd/sd-id128/sd-id128.c @@ -129,6 +129,28 @@ _public_ int sd_id128_get_boot(sd_id128_t *ret) { return 0; } +_public_ int sd_id128_get_invocation(sd_id128_t *ret) { + static thread_local sd_id128_t saved_invocation_id = {}; + int r; + + assert_return(ret, -EINVAL); + + if (sd_id128_is_null(saved_invocation_id)) { + const char *e; + + e = secure_getenv("INVOCATION_ID"); + if (!e) + return -ENXIO; + + r = sd_id128_from_string(e, &saved_invocation_id); + if (r < 0) + return r; + } + + *ret = saved_invocation_id; + return 0; +} + static sd_id128_t make_v4_uuid(sd_id128_t id) { /* Stolen from generate_random_uuid() of drivers/char/random.c * in the kernel sources */ diff --git a/src/network/networkd-link.h b/src/network/networkd-link.h index 05b2a2b323..77f72d070e 100644 --- a/src/network/networkd-link.h +++ b/src/network/networkd-link.h @@ -187,7 +187,7 @@ DEFINE_TRIVIAL_CLEANUP_FUNC(Link*, link_unref); #define log_link_full(link, level, error, ...) \ ({ \ const Link *_l = (link); \ - _l ? log_object_internal(level, error, __FILE__, __LINE__, __func__, "INTERFACE=", _l->ifname, ##__VA_ARGS__) : \ + _l ? log_object_internal(level, error, __FILE__, __LINE__, __func__, "INTERFACE=", _l->ifname, NULL, NULL, ##__VA_ARGS__) : \ log_internal(level, error, __FILE__, __LINE__, __func__, ##__VA_ARGS__); \ }) \ diff --git a/src/network/networkd-netdev.h b/src/network/networkd-netdev.h index 31b55e2791..70ff947b99 100644 --- a/src/network/networkd-netdev.h +++ b/src/network/networkd-netdev.h @@ -182,7 +182,7 @@ const struct ConfigPerfItem* network_netdev_gperf_lookup(const char *key, unsign #define log_netdev_full(netdev, level, error, ...) \ ({ \ const NetDev *_n = (netdev); \ - _n ? log_object_internal(level, error, __FILE__, __LINE__, __func__, "INTERFACE=", _n->ifname, ##__VA_ARGS__) : \ + _n ? log_object_internal(level, error, __FILE__, __LINE__, __func__, "INTERFACE=", _n->ifname, NULL, NULL, ##__VA_ARGS__) : \ log_internal(level, error, __FILE__, __LINE__, __func__, ##__VA_ARGS__); \ }) diff --git a/src/shared/bus-util.c b/src/shared/bus-util.c index 64fcf9295f..bb90c89cc2 100644 --- a/src/shared/bus-util.c +++ b/src/shared/bus-util.c @@ -1338,7 +1338,7 @@ int bus_property_get_id128( if (sd_id128_is_null(*id)) /* Add an empty array if the ID is zero */ return sd_bus_message_append(reply, "ay", 0); else - return sd_bus_message_append_array(reply, 'b', id->bytes, 16); + return sd_bus_message_append_array(reply, 'y', id->bytes, 16); } #if __SIZEOF_SIZE_T__ != 8 diff --git a/src/systemd/sd-id128.h b/src/systemd/sd-id128.h index 4dff0b9b81..ee011b1861 100644 --- a/src/systemd/sd-id128.h +++ b/src/systemd/sd-id128.h @@ -45,8 +45,8 @@ int sd_id128_from_string(const char *s, sd_id128_t *ret); int sd_id128_randomize(sd_id128_t *ret); int sd_id128_get_machine(sd_id128_t *ret); - int sd_id128_get_boot(sd_id128_t *ret); +int sd_id128_get_invocation(sd_id128_t *ret); #define SD_ID128_MAKE(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15) \ ((const sd_id128_t) { .bytes = { 0x##v0, 0x##v1, 0x##v2, 0x##v3, 0x##v4, 0x##v5, 0x##v6, 0x##v7, \ -- cgit v1.2.3-54-g00ecf From e0d2adfde677d91b57dd63f6a3f00f4b86be9a64 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Tue, 11 Oct 2016 20:07:22 +0200 Subject: core: chown() any TTY used for stdin, not just when StandardInput=tty is used (#4347) If stdin is supplied as an fd for transient units (using the StandardInputFileDescriptor pseudo-property for transient units), then we should also fix up the TTY ownership, not just when we opened the TTY ourselves. This simply drops the explicit is_terminal_input()-based check. Note that chown_terminal() internally does a much more appropriate isatty()-based check anyway, hence we can drop this without replacement. Fixes: #4260 --- src/core/execute.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'src/core/execute.c') diff --git a/src/core/execute.c b/src/core/execute.c index 7079aeed6e..0c983f4953 100644 --- a/src/core/execute.c +++ b/src/core/execute.c @@ -2350,7 +2350,7 @@ static int exec_child( USER_PROCESS, username ? "root" : context->user); - if (context->user && is_terminal_input(context->std_input)) { + if (context->user) { r = chown_terminal(STDIN_FILENO, uid); if (r < 0) { *exit_status = EXIT_STDIN; -- cgit v1.2.3-54-g00ecf From 502d704e5ed2d288069471f4e3611115cde107d6 Mon Sep 17 00:00:00 2001 From: Djalal Harouni Date: Wed, 12 Oct 2016 13:31:21 +0200 Subject: core:sandbox: Add ProtectKernelModules= option This is useful to turn off explicit module load and unload operations on modular kernels. This option removes CAP_SYS_MODULE from the capability bounding set for the unit, and installs a system call filter to block module system calls. This option will not prevent the kernel from loading modules using the module auto-load feature which is a system wide operation. --- man/systemd.exec.xml | 17 ++++++++++++ src/core/dbus-execute.c | 5 +++- src/core/execute.c | 52 +++++++++++++++++++++++++++++++++++ src/core/execute.h | 1 + src/core/load-fragment-gperf.gperf.m4 | 1 + src/core/unit.c | 3 ++ src/shared/bus-unit-util.c | 3 +- 7 files changed, 80 insertions(+), 2 deletions(-) (limited to 'src/core/execute.c') diff --git a/man/systemd.exec.xml b/man/systemd.exec.xml index 986985ad35..3bea4976b3 100644 --- a/man/systemd.exec.xml +++ b/man/systemd.exec.xml @@ -1404,6 +1404,23 @@ logging. This does not affect commands prefixed with +. + + ProtectKernelModules= + + Takes a boolean argument. If true, explicit module loading will + be denied. This allows to turn off module load and unload operations on modular + kernels. It is recomended to turn this on for most services that do not need special + file systems or extra kernel modules to work. Default to off. Enabling this option + removes CAP_SYS_MODULE from the capability bounding set for + the unit, and installs a system call filter to block module system calls. + Note that limited automatic module loading due to user configuration or kernel + mapping tables might still happen as side effect of requested user operations, + both privileged and unprivileged. To disable module auto-load feature please see + sysctl.d5 + kernel.modules_disabled mechanism and + /proc/sys/kernel/modules_disabled documentation. + + Personality= diff --git a/src/core/dbus-execute.c b/src/core/dbus-execute.c index eec4500c8c..b8720d7d3d 100644 --- a/src/core/dbus-execute.c +++ b/src/core/dbus-execute.c @@ -708,6 +708,7 @@ const sd_bus_vtable bus_exec_vtable[] = { SD_BUS_PROPERTY("PrivateTmp", "b", bus_property_get_bool, offsetof(ExecContext, private_tmp), SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("PrivateDevices", "b", bus_property_get_bool, offsetof(ExecContext, private_devices), SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("ProtectKernelTunables", "b", bus_property_get_bool, offsetof(ExecContext, protect_kernel_tunables), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("ProtectKernelModules", "b", bus_property_get_bool, offsetof(ExecContext, protect_kernel_modules), SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("ProtectControlGroups", "b", bus_property_get_bool, offsetof(ExecContext, protect_control_groups), SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("PrivateNetwork", "b", bus_property_get_bool, offsetof(ExecContext, private_network), SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("PrivateUsers", "b", bus_property_get_bool, offsetof(ExecContext, private_users), SD_BUS_VTABLE_PROPERTY_CONST), @@ -1075,7 +1076,7 @@ int bus_exec_context_set_transient_property( "PrivateTmp", "PrivateDevices", "PrivateNetwork", "PrivateUsers", "NoNewPrivileges", "SyslogLevelPrefix", "MemoryDenyWriteExecute", "RestrictRealtime", "DynamicUser", "RemoveIPC", "ProtectKernelTunables", - "ProtectControlGroups")) { + "ProtectKernelModules", "ProtectControlGroups")) { int b; r = sd_bus_message_read(message, "b", &b); @@ -1111,6 +1112,8 @@ int bus_exec_context_set_transient_property( c->remove_ipc = b; else if (streq(name, "ProtectKernelTunables")) c->protect_kernel_tunables = b; + else if (streq(name, "ProtectKernelModules")) + c->protect_kernel_modules = b; else if (streq(name, "ProtectControlGroups")) c->protect_control_groups = b; diff --git a/src/core/execute.c b/src/core/execute.c index 0c983f4953..7a278b7d31 100644 --- a/src/core/execute.c +++ b/src/core/execute.c @@ -1436,6 +1436,50 @@ finish: return r; } +static int apply_protect_kernel_modules(Unit *u, const ExecContext *c) { + static const int module_syscalls[] = { + SCMP_SYS(delete_module), + SCMP_SYS(finit_module), + SCMP_SYS(init_module), + }; + + scmp_filter_ctx *seccomp; + unsigned i; + int r; + + assert(c); + + /* Turn of module syscalls on ProtectKernelModules=yes */ + + if (skip_seccomp_unavailable(u, "ProtectKernelModules=")) + return 0; + + seccomp = seccomp_init(SCMP_ACT_ALLOW); + if (!seccomp) + return -ENOMEM; + + r = seccomp_add_secondary_archs(seccomp); + if (r < 0) + goto finish; + + for (i = 0; i < ELEMENTSOF(module_syscalls); i++) { + r = seccomp_rule_add(seccomp, SCMP_ACT_ERRNO(EPERM), + module_syscalls[i], 0); + if (r < 0) + goto finish; + } + + r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0); + if (r < 0) + goto finish; + + r = seccomp_load(seccomp); + +finish: + seccomp_release(seccomp); + return r; +} + static int apply_private_devices(Unit *u, const ExecContext *c) { const SystemCallFilterSet *set; scmp_filter_ctx *seccomp; @@ -2690,6 +2734,14 @@ static int exec_child( } } + if (context->protect_kernel_modules) { + r = apply_protect_kernel_modules(unit, context); + if (r < 0) { + *exit_status = EXIT_SECCOMP; + return r; + } + } + if (context->private_devices) { r = apply_private_devices(unit, context); if (r < 0) { diff --git a/src/core/execute.h b/src/core/execute.h index 449180c903..1de439c3ad 100644 --- a/src/core/execute.h +++ b/src/core/execute.h @@ -175,6 +175,7 @@ struct ExecContext { ProtectSystem protect_system; ProtectHome protect_home; bool protect_kernel_tunables; + bool protect_kernel_modules; bool protect_control_groups; bool no_new_privileges; diff --git a/src/core/load-fragment-gperf.gperf.m4 b/src/core/load-fragment-gperf.gperf.m4 index c49c1d6732..a700d853cc 100644 --- a/src/core/load-fragment-gperf.gperf.m4 +++ b/src/core/load-fragment-gperf.gperf.m4 @@ -90,6 +90,7 @@ $1.InaccessiblePaths, config_parse_namespace_path_strv, 0, $1.PrivateTmp, config_parse_bool, 0, offsetof($1, exec_context.private_tmp) $1.PrivateDevices, config_parse_bool, 0, offsetof($1, exec_context.private_devices) $1.ProtectKernelTunables, config_parse_bool, 0, offsetof($1, exec_context.protect_kernel_tunables) +$1.ProtectKernelModules, config_parse_bool, 0, offsetof($1, exec_context.protect_kernel_modules) $1.ProtectControlGroups, config_parse_bool, 0, offsetof($1, exec_context.protect_control_groups) $1.PrivateNetwork, config_parse_bool, 0, offsetof($1, exec_context.private_network) $1.PrivateUsers, config_parse_bool, 0, offsetof($1, exec_context.private_users) diff --git a/src/core/unit.c b/src/core/unit.c index 690f7f7dd9..71f95c0b96 100644 --- a/src/core/unit.c +++ b/src/core/unit.c @@ -3401,6 +3401,9 @@ int unit_patch_contexts(Unit *u) { if (ec->private_devices) ec->capability_bounding_set &= ~(UINT64_C(1) << CAP_MKNOD); + if (ec->protect_kernel_modules) + ec->capability_bounding_set &= ~(UINT64_C(1) << CAP_SYS_MODULE); + if (ec->dynamic_user) { if (!ec->user) { r = user_from_unit_name(u, &ec->user); diff --git a/src/shared/bus-unit-util.c b/src/shared/bus-unit-util.c index a550a370b5..f639e0e832 100644 --- a/src/shared/bus-unit-util.c +++ b/src/shared/bus-unit-util.c @@ -204,7 +204,8 @@ int bus_append_unit_property_assignment(sd_bus_message *m, const char *assignmen "IgnoreSIGPIPE", "TTYVHangup", "TTYReset", "RemainAfterExit", "PrivateTmp", "PrivateDevices", "PrivateNetwork", "PrivateUsers", "NoNewPrivileges", "SyslogLevelPrefix", "Delegate", "RemainAfterElapse", "MemoryDenyWriteExecute", - "RestrictRealtime", "DynamicUser", "RemoveIPC", "ProtectKernelTunables", "ProtectControlGroups")) { + "RestrictRealtime", "DynamicUser", "RemoveIPC", "ProtectKernelTunables", + "ProtectKernelModules", "ProtectControlGroups")) { r = parse_boolean(eq); if (r < 0) -- cgit v1.2.3-54-g00ecf From c575770b75b6cd15684fbacd249147bf5fd6ead7 Mon Sep 17 00:00:00 2001 From: Djalal Harouni Date: Wed, 12 Oct 2016 14:11:16 +0200 Subject: core:sandbox: lets make /lib/modules/ inaccessible on ProtectKernelModules= Lets go further and make /lib/modules/ inaccessible for services that do not have business with modules, this is a minor improvment but it may help on setups with custom modules and they are limited... in regard of kernel auto-load feature. This change introduce NameSpaceInfo struct which we may embed later inside ExecContext but for now lets just reduce the argument number to setup_namespace() and merge ProtectKernelModules feature. --- man/systemd.exec.xml | 5 ++++- src/core/execute.c | 11 ++++++++--- src/core/namespace.c | 54 +++++++++++++++++++++++++++++++++++----------------- src/core/namespace.h | 14 +++++++++++--- src/test/test-ns.c | 12 +++++++++--- 5 files changed, 69 insertions(+), 27 deletions(-) (limited to 'src/core/execute.c') diff --git a/man/systemd.exec.xml b/man/systemd.exec.xml index 4a68695348..249fcb0363 100644 --- a/man/systemd.exec.xml +++ b/man/systemd.exec.xml @@ -1415,7 +1415,10 @@ kernels. It is recomended to turn this on for most services that do not need special file systems or extra kernel modules to work. Default to off. Enabling this option removes CAP_SYS_MODULE from the capability bounding set for - the unit, and installs a system call filter to block module system calls. + the unit, and installs a system call filter to block module system calls, + also /usr/lib/modules is made inaccessible. For this + setting the same restrictions regarding mount propagation and privileges + apply as for ReadOnlyPaths= and related calls, see above. Note that limited automatic module loading due to user configuration or kernel mapping tables might still happen as side effect of requested user operations, both privileged and unprivileged. To disable module auto-load feature please see diff --git a/src/core/execute.c b/src/core/execute.c index 7a278b7d31..dc078d96f0 100644 --- a/src/core/execute.c +++ b/src/core/execute.c @@ -1766,6 +1766,7 @@ static bool exec_needs_mount_namespace( context->protect_system != PROTECT_SYSTEM_NO || context->protect_home != PROTECT_HOME_NO || context->protect_kernel_tunables || + context->protect_kernel_modules || context->protect_control_groups) return true; @@ -2493,6 +2494,12 @@ static int exec_child( if (needs_mount_namespace) { _cleanup_free_ char **rw = NULL; char *tmp = NULL, *var = NULL; + NameSpaceInfo ns_info = { + .private_dev = context->private_devices, + .protect_control_groups = context->protect_control_groups, + .protect_kernel_tunables = context->protect_kernel_tunables, + .protect_kernel_modules = context->protect_kernel_modules, + }; /* The runtime struct only contains the parent * of the private /tmp, which is @@ -2515,14 +2522,12 @@ static int exec_child( r = setup_namespace( (params->flags & EXEC_APPLY_CHROOT) ? context->root_directory : NULL, + &ns_info, rw, context->read_only_paths, context->inaccessible_paths, tmp, var, - context->private_devices, - context->protect_kernel_tunables, - context->protect_control_groups, context->protect_home, context->protect_system, context->mount_flags); diff --git a/src/core/namespace.c b/src/core/namespace.c index 43a2f4ba6e..1195e9a854 100644 --- a/src/core/namespace.c +++ b/src/core/namespace.c @@ -97,6 +97,14 @@ static const TargetMount protect_kernel_tunables_table[] = { { "/sys/fs/cgroup", READWRITE, false }, /* READONLY is set by ProtectControlGroups= option */ }; +/* ProtectKernelModules= option */ +static const TargetMount protect_kernel_modules_table[] = { +#ifdef HAVE_SPLIT_USR + { "/lib/modules", INACCESSIBLE, true }, +#endif + { "/usr/lib/modules", INACCESSIBLE, true }, +}; + /* * ProtectHome=read-only table, protect $HOME and $XDG_RUNTIME_DIR and rest of * system should be protected by ProtectSystem= @@ -207,6 +215,13 @@ static int append_protect_kernel_tunables(BindMount **p, const char *root_direct ELEMENTSOF(protect_kernel_tunables_table)); } +static int append_protect_kernel_modules(BindMount **p, const char *root_directory) { + assert(p); + + return append_target_mounts(p, root_directory, protect_kernel_modules_table, + ELEMENTSOF(protect_kernel_modules_table)); +} + static int append_protect_home(BindMount **p, const char *root_directory, ProtectHome protect_home) { int r = 0; @@ -660,14 +675,12 @@ static int chase_all_symlinks(const char *root_directory, BindMount *m, unsigned } static unsigned namespace_calculate_mounts( + const NameSpaceInfo *ns_info, char** read_write_paths, char** read_only_paths, char** inaccessible_paths, const char* tmp_dir, const char* var_tmp_dir, - bool private_dev, - bool protect_sysctl, - bool protect_cgroups, ProtectHome protect_home, ProtectSystem protect_system) { @@ -690,22 +703,21 @@ static unsigned namespace_calculate_mounts( strv_length(read_write_paths) + strv_length(read_only_paths) + strv_length(inaccessible_paths) + - private_dev + - (protect_sysctl ? ELEMENTSOF(protect_kernel_tunables_table) : 0) + - (protect_cgroups ? 1 : 0) + + ns_info->private_dev + + (ns_info->protect_kernel_tunables ? ELEMENTSOF(protect_kernel_tunables_table) : 0) + + (ns_info->protect_control_groups ? 1 : 0) + + (ns_info->protect_kernel_modules ? ELEMENTSOF(protect_kernel_modules_table) : 0) + protect_home_cnt + protect_system_cnt; } int setup_namespace( const char* root_directory, + const NameSpaceInfo *ns_info, char** read_write_paths, char** read_only_paths, char** inaccessible_paths, const char* tmp_dir, const char* var_tmp_dir, - bool private_dev, - bool protect_sysctl, - bool protect_cgroups, ProtectHome protect_home, ProtectSystem protect_system, unsigned long mount_flags) { @@ -718,13 +730,12 @@ int setup_namespace( if (mount_flags == 0) mount_flags = MS_SHARED; - n = namespace_calculate_mounts(read_write_paths, + n = namespace_calculate_mounts(ns_info, + read_write_paths, read_only_paths, inaccessible_paths, tmp_dir, var_tmp_dir, - private_dev, protect_sysctl, - protect_cgroups, protect_home, - protect_system); + protect_home, protect_system); /* Set mount slave mode */ if (root_directory || n > 0) @@ -756,16 +767,25 @@ int setup_namespace( m++; } - if (private_dev) { + if (ns_info->private_dev) { m->path = prefix_roota(root_directory, "/dev"); m->mode = PRIVATE_DEV; m++; } - if (protect_sysctl) - append_protect_kernel_tunables(&m, root_directory); + if (ns_info->protect_kernel_tunables) { + r = append_protect_kernel_tunables(&m, root_directory); + if (r < 0) + return r; + } + + if (ns_info->protect_kernel_modules) { + r = append_protect_kernel_modules(&m, root_directory); + if (r < 0) + return r; + } - if (protect_cgroups) { + if (ns_info->protect_control_groups) { m->path = prefix_roota(root_directory, "/sys/fs/cgroup"); m->mode = READONLY; m++; diff --git a/src/core/namespace.h b/src/core/namespace.h index 6505bcc499..6310638e9a 100644 --- a/src/core/namespace.h +++ b/src/core/namespace.h @@ -4,6 +4,7 @@ This file is part of systemd. Copyright 2010 Lennart Poettering + Copyright 2016 Djalal Harouni systemd is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by @@ -19,6 +20,8 @@ along with systemd; If not, see . ***/ +typedef struct NameSpaceInfo NameSpaceInfo; + #include #include "macro.h" @@ -40,15 +43,20 @@ typedef enum ProtectSystem { _PROTECT_SYSTEM_INVALID = -1 } ProtectSystem; +struct NameSpaceInfo { + bool private_dev:1; + bool protect_control_groups:1; + bool protect_kernel_tunables:1; + bool protect_kernel_modules:1; +}; + int setup_namespace(const char *chroot, + const NameSpaceInfo *ns_info, char **read_write_paths, char **read_only_paths, char **inaccessible_paths, const char *tmp_dir, const char *var_tmp_dir, - bool private_dev, - bool protect_sysctl, - bool protect_cgroups, ProtectHome protect_home, ProtectSystem protect_system, unsigned long mount_flags); diff --git a/src/test/test-ns.c b/src/test/test-ns.c index c4d4da6d05..da7a8b0565 100644 --- a/src/test/test-ns.c +++ b/src/test/test-ns.c @@ -45,6 +45,14 @@ int main(int argc, char *argv[]) { "/home/lennart/projects", NULL }; + + static const NameSpaceInfo ns_info = { + .private_dev = true, + .protect_control_groups = true, + .protect_kernel_tunables = true, + .protect_kernel_modules = true, + }; + char *root_directory; char *projects_directory; int r; @@ -69,14 +77,12 @@ int main(int argc, char *argv[]) { log_info("Not chrooted"); r = setup_namespace(root_directory, + &ns_info, (char **) writable, (char **) readonly, (char **) inaccessible, tmp_dir, var_tmp_dir, - true, - true, - true, PROTECT_HOME_NO, PROTECT_SYSTEM_NO, 0); -- cgit v1.2.3-54-g00ecf From 4084e8fc8947566092fd4ee5a07405570fdbf84d Mon Sep 17 00:00:00 2001 From: Djalal Harouni Date: Sun, 9 Oct 2016 12:28:25 +0200 Subject: core: check protect_kernel_modules and private_devices in order to setup NNP --- src/core/execute.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'src/core/execute.c') diff --git a/src/core/execute.c b/src/core/execute.c index dc078d96f0..71439bc3c2 100644 --- a/src/core/execute.c +++ b/src/core/execute.c @@ -2115,6 +2115,8 @@ static bool context_has_no_new_privileges(const ExecContext *c) { c->memory_deny_write_execute || c->restrict_realtime || c->protect_kernel_tunables || + c->protect_kernel_modules || + c->private_devices || context_has_syscall_filters(c); } -- cgit v1.2.3-54-g00ecf From e66a2f658b182b1fe8e4bc46b384b9967abd2bf2 Mon Sep 17 00:00:00 2001 From: Djalal Harouni Date: Sun, 9 Oct 2016 12:31:51 +0200 Subject: core: make sure to dump ProtectKernelModules= value --- src/core/execute.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'src/core/execute.c') diff --git a/src/core/execute.c b/src/core/execute.c index 71439bc3c2..869522704a 100644 --- a/src/core/execute.c +++ b/src/core/execute.c @@ -3190,6 +3190,7 @@ void exec_context_dump(ExecContext *c, FILE* f, const char *prefix) { "%sPrivateTmp: %s\n" "%sPrivateDevices: %s\n" "%sProtectKernelTunables: %s\n" + "%sProtectKernelModules: %s\n" "%sProtectControlGroups: %s\n" "%sPrivateNetwork: %s\n" "%sPrivateUsers: %s\n" @@ -3205,6 +3206,7 @@ void exec_context_dump(ExecContext *c, FILE* f, const char *prefix) { prefix, yes_no(c->private_tmp), prefix, yes_no(c->private_devices), prefix, yes_no(c->protect_kernel_tunables), + prefix, yes_no(c->protect_kernel_modules), prefix, yes_no(c->protect_control_groups), prefix, yes_no(c->private_network), prefix, yes_no(c->private_users), -- cgit v1.2.3-54-g00ecf From 6b430fdb7c0c2c52ea69a7d56f23d739218b13d0 Mon Sep 17 00:00:00 2001 From: Zbigniew Jędrzejewski-Szmek Date: Sun, 16 Oct 2016 18:28:30 -0400 Subject: tree-wide: use mfree more --- coccinelle/mfree_return.cocci | 6 ++++++ src/basic/bitmap.c | 6 ++---- src/basic/env-util.c | 3 +-- src/basic/prioq.c | 4 +--- src/basic/replace-var.c | 3 +-- src/basic/strbuf.c | 3 +-- src/basic/string-util.c | 6 ++---- src/basic/strv.c | 9 +++------ src/core/dynamic-user.c | 4 +--- src/core/execute.c | 4 +--- src/core/manager.c | 3 +-- src/core/transaction.c | 6 ++---- src/core/unit.c | 6 ++---- src/cryptsetup/cryptsetup-generator.c | 9 +++------ src/import/curl-util.c | 4 +--- src/import/export-raw.c | 4 +--- src/import/export-tar.c | 4 +--- src/import/import-raw.c | 4 +--- src/import/import-tar.c | 4 +--- src/import/importd.c | 6 ++---- src/import/pull-job.c | 4 +--- src/import/pull-raw.c | 4 +--- src/import/pull-tar.c | 4 +--- src/journal-remote/journal-remote-write.c | 10 +++------- src/journal/journal-file.c | 3 +-- src/journal/mmap-cache.c | 6 ++---- src/journal/sd-journal.c | 9 +++------ src/libsystemd-network/ndisc-router.c | 3 +-- src/libsystemd-network/sd-dhcp-client.c | 4 +--- src/libsystemd-network/sd-dhcp-lease.c | 4 +--- src/libsystemd-network/sd-dhcp-server.c | 4 +--- src/libsystemd-network/sd-dhcp6-client.c | 4 +--- src/libsystemd-network/sd-dhcp6-lease.c | 4 +--- src/libsystemd-network/sd-ipv4acd.c | 4 +--- src/libsystemd-network/sd-ipv4ll.c | 4 +--- src/libsystemd-network/sd-lldp.c | 4 +--- src/libsystemd-network/sd-ndisc.c | 4 +--- src/libsystemd/sd-bus/bus-slot.c | 4 +--- src/libsystemd/sd-bus/bus-track.c | 8 ++------ src/libudev/libudev-list.c | 14 ++++++-------- src/libudev/libudev-monitor.c | 3 +-- src/login/logind-button.c | 9 +++------ src/login/logind-device.c | 9 +++------ src/login/logind-inhibit.c | 9 +++------ src/login/logind-seat.c | 9 +++------ src/login/logind-session.c | 12 ++++-------- src/machine/machine.c | 4 +--- src/machine/operation.c | 3 +-- src/network/networkd-wait-online-link.c | 3 +-- src/nspawn/nspawn-settings.c | 4 +--- src/resolve/resolved-dns-query.c | 8 ++------ src/resolve/resolved-dns-rr.c | 10 +++------- src/resolve/resolved-dns-scope.c | 4 +--- src/resolve/resolved-dns-search-domain.c | 4 +--- src/resolve/resolved-dns-server.c | 3 +-- src/resolve/resolved-dns-stream.c | 4 +--- src/resolve/resolved-dns-transaction.c | 3 +-- src/resolve/resolved-link.c | 6 ++---- src/resolve/resolved-manager.c | 4 +--- src/shared/machine-image.c | 3 +-- src/shared/ptyfwd.c | 3 +-- src/timesync/timesyncd-server.c | 7 ++----- src/udev/udev-ctrl.c | 3 +-- src/udev/udev-rules.c | 3 +-- 64 files changed, 105 insertions(+), 228 deletions(-) create mode 100644 coccinelle/mfree_return.cocci (limited to 'src/core/execute.c') diff --git a/coccinelle/mfree_return.cocci b/coccinelle/mfree_return.cocci new file mode 100644 index 0000000000..8119fe07f2 --- /dev/null +++ b/coccinelle/mfree_return.cocci @@ -0,0 +1,6 @@ +@@ +expression p; +@@ +- free(p); +- return NULL; ++ return mfree(p); diff --git a/src/basic/bitmap.c b/src/basic/bitmap.c index f4b12fc261..f6212e6151 100644 --- a/src/basic/bitmap.c +++ b/src/basic/bitmap.c @@ -58,10 +58,8 @@ Bitmap *bitmap_copy(Bitmap *b) { return NULL; ret->bitmaps = newdup(uint64_t, b->bitmaps, b->n_bitmaps); - if (!ret->bitmaps) { - free(ret); - return NULL; - } + if (!ret->bitmaps) + return mfree(ret); ret->n_bitmaps = ret->bitmaps_allocated = b->n_bitmaps; return ret; diff --git a/src/basic/env-util.c b/src/basic/env-util.c index 7f5fddb700..b74290d6fd 100644 --- a/src/basic/env-util.c +++ b/src/basic/env-util.c @@ -544,8 +544,7 @@ char *replace_env(const char *format, char **env) { return k; fail: - free(r); - return NULL; + return mfree(r); } char **replace_env_argv(char **argv, char **env) { diff --git a/src/basic/prioq.c b/src/basic/prioq.c index d2ec516d29..4570b8e4ba 100644 --- a/src/basic/prioq.c +++ b/src/basic/prioq.c @@ -62,9 +62,7 @@ Prioq* prioq_free(Prioq *q) { return NULL; free(q->items); - free(q); - - return NULL; + return mfree(q); } int prioq_ensure_allocated(Prioq **q, compare_func_t compare_func) { diff --git a/src/basic/replace-var.c b/src/basic/replace-var.c index 6a204b9ec3..0d21423a9c 100644 --- a/src/basic/replace-var.c +++ b/src/basic/replace-var.c @@ -107,6 +107,5 @@ char *replace_var(const char *text, char *(*lookup)(const char *variable, void*u return r; oom: - free(r); - return NULL; + return mfree(r); } diff --git a/src/basic/strbuf.c b/src/basic/strbuf.c index 4bef87d3c2..00aaf9e621 100644 --- a/src/basic/strbuf.c +++ b/src/basic/strbuf.c @@ -62,8 +62,7 @@ struct strbuf *strbuf_new(void) { err: free(str->buf); free(str->root); - free(str); - return NULL; + return mfree(str); } static void strbuf_node_cleanup(struct strbuf_node *node) { diff --git a/src/basic/string-util.c b/src/basic/string-util.c index dc7de5dab8..6b06e643c9 100644 --- a/src/basic/string-util.c +++ b/src/basic/string-util.c @@ -610,8 +610,7 @@ char *strreplace(const char *text, const char *old_string, const char *new_strin return r; oom: - free(r); - return NULL; + return mfree(r); } char *strip_tab_ansi(char **ibuf, size_t *_isz) { @@ -682,8 +681,7 @@ char *strip_tab_ansi(char **ibuf, size_t *_isz) { if (ferror(f)) { fclose(f); - free(obuf); - return NULL; + return mfree(obuf); } fclose(f); diff --git a/src/basic/strv.c b/src/basic/strv.c index 34e464d253..0eec868eed 100644 --- a/src/basic/strv.c +++ b/src/basic/strv.c @@ -87,8 +87,7 @@ void strv_clear(char **l) { char **strv_free(char **l) { strv_clear(l); - free(l); - return NULL; + return mfree(l); } char **strv_free_erase(char **l) { @@ -426,8 +425,7 @@ char *strv_join_quoted(char **l) { return buf; oom: - free(buf); - return NULL; + return mfree(buf); } int strv_push(char ***l, char *value) { @@ -869,8 +867,7 @@ char ***strv_free_free(char ***l) { for (i = l; *i; i++) strv_free(*i); - free(l); - return NULL; + return mfree(l); } char **strv_skip(char **l, size_t n) { diff --git a/src/core/dynamic-user.c b/src/core/dynamic-user.c index 1043da3eb7..e1846e1adb 100644 --- a/src/core/dynamic-user.c +++ b/src/core/dynamic-user.c @@ -42,9 +42,7 @@ static DynamicUser* dynamic_user_free(DynamicUser *d) { (void) hashmap_remove(d->manager->dynamic_users, d->name); safe_close_pair(d->storage_socket); - free(d); - - return NULL; + return mfree(d); } static int dynamic_user_add(Manager *m, const char *name, int storage_socket[2], DynamicUser **ret) { diff --git a/src/core/execute.c b/src/core/execute.c index 869522704a..59c2bd0dd2 100644 --- a/src/core/execute.c +++ b/src/core/execute.c @@ -3740,9 +3740,7 @@ ExecRuntime *exec_runtime_unref(ExecRuntime *r) { free(r->tmp_dir); free(r->var_tmp_dir); safe_close_pair(r->netns_storage_socket); - free(r); - - return NULL; + return mfree(r); } int exec_runtime_serialize(Unit *u, ExecRuntime *rt, FILE *f, FDSet *fds) { diff --git a/src/core/manager.c b/src/core/manager.c index 3569249788..50aae0d1ba 100644 --- a/src/core/manager.c +++ b/src/core/manager.c @@ -1119,8 +1119,7 @@ Manager* manager_free(Manager *m) { hashmap_free(m->uid_refs); hashmap_free(m->gid_refs); - free(m); - return NULL; + return mfree(m); } void manager_enumerate(Manager *m) { diff --git a/src/core/transaction.c b/src/core/transaction.c index 8370b864fb..e22e3b30c2 100644 --- a/src/core/transaction.c +++ b/src/core/transaction.c @@ -1085,10 +1085,8 @@ Transaction *transaction_new(bool irreversible) { return NULL; tr->jobs = hashmap_new(NULL); - if (!tr->jobs) { - free(tr); - return NULL; - } + if (!tr->jobs) + return mfree(tr); tr->irreversible = irreversible; diff --git a/src/core/unit.c b/src/core/unit.c index 67668bdc48..a6b8ecdcb7 100644 --- a/src/core/unit.c +++ b/src/core/unit.c @@ -88,10 +88,8 @@ Unit *unit_new(Manager *m, size_t size) { return NULL; u->names = set_new(&string_hash_ops); - if (!u->names) { - free(u); - return NULL; - } + if (!u->names) + return mfree(u); u->manager = m; u->type = _UNIT_TYPE_INVALID; diff --git a/src/cryptsetup/cryptsetup-generator.c b/src/cryptsetup/cryptsetup-generator.c index 8ac5ab730a..de0a3b6f9c 100644 --- a/src/cryptsetup/cryptsetup-generator.c +++ b/src/cryptsetup/cryptsetup-generator.c @@ -264,16 +264,13 @@ static crypto_device *get_crypto_device(const char *uuid) { d->keyfile = d->options = d->name = NULL; d->uuid = strdup(uuid); - if (!d->uuid) { - free(d); - return NULL; - } + if (!d->uuid) + return mfree(d); r = hashmap_put(arg_disks, d->uuid, d); if (r < 0) { free(d->uuid); - free(d); - return NULL; + return mfree(d); } } diff --git a/src/import/curl-util.c b/src/import/curl-util.c index 6990c47f48..734e1560e6 100644 --- a/src/import/curl-util.c +++ b/src/import/curl-util.c @@ -235,9 +235,7 @@ CurlGlue *curl_glue_unref(CurlGlue *g) { sd_event_source_unref(g->timer); sd_event_unref(g->event); - free(g); - - return NULL; + return mfree(g); } int curl_glue_new(CurlGlue **glue, sd_event *event) { diff --git a/src/import/export-raw.c b/src/import/export-raw.c index 6136b677dd..a3dbce1954 100644 --- a/src/import/export-raw.c +++ b/src/import/export-raw.c @@ -87,9 +87,7 @@ RawExport *raw_export_unref(RawExport *e) { free(e->buffer); free(e->path); - free(e); - - return NULL; + return mfree(e); } int raw_export_new( diff --git a/src/import/export-tar.c b/src/import/export-tar.c index d79c27f2d0..3bb6027431 100644 --- a/src/import/export-tar.c +++ b/src/import/export-tar.c @@ -91,9 +91,7 @@ TarExport *tar_export_unref(TarExport *e) { free(e->buffer); free(e->path); - free(e); - - return NULL; + return mfree(e); } int tar_export_new( diff --git a/src/import/import-raw.c b/src/import/import-raw.c index fd6b9f7703..29f3f896e5 100644 --- a/src/import/import-raw.c +++ b/src/import/import-raw.c @@ -100,9 +100,7 @@ RawImport* raw_import_unref(RawImport *i) { free(i->final_path); free(i->image_root); free(i->local); - free(i); - - return NULL; + return mfree(i); } int raw_import_new( diff --git a/src/import/import-tar.c b/src/import/import-tar.c index 8b81324fde..22f9b8c5ea 100644 --- a/src/import/import-tar.c +++ b/src/import/import-tar.c @@ -107,9 +107,7 @@ TarImport* tar_import_unref(TarImport *i) { free(i->final_path); free(i->image_root); free(i->local); - free(i); - - return NULL; + return mfree(i); } int tar_import_new( diff --git a/src/import/importd.c b/src/import/importd.c index 28b4302cb3..9d31a956a5 100644 --- a/src/import/importd.c +++ b/src/import/importd.c @@ -141,8 +141,7 @@ static Transfer *transfer_unref(Transfer *t) { safe_close(t->stdin_fd); safe_close(t->stdout_fd); - free(t); - return NULL; + return mfree(t); } DEFINE_TRIVIAL_CLEANUP_FUNC(Transfer*, transfer_unref); @@ -548,8 +547,7 @@ static Manager *manager_unref(Manager *m) { m->bus = sd_bus_flush_close_unref(m->bus); sd_event_unref(m->event); - free(m); - return NULL; + return mfree(m); } DEFINE_TRIVIAL_CLEANUP_FUNC(Manager*, manager_unref); diff --git a/src/import/pull-job.c b/src/import/pull-job.c index 6bcf35ef4e..e550df2c57 100644 --- a/src/import/pull-job.c +++ b/src/import/pull-job.c @@ -50,9 +50,7 @@ PullJob* pull_job_unref(PullJob *j) { free(j->payload); free(j->checksum); - free(j); - - return NULL; + return mfree(j); } static void pull_job_finish(PullJob *j, int ret) { diff --git a/src/import/pull-raw.c b/src/import/pull-raw.c index 8993402821..0cf410a5d9 100644 --- a/src/import/pull-raw.c +++ b/src/import/pull-raw.c @@ -110,9 +110,7 @@ RawPull* raw_pull_unref(RawPull *i) { free(i->settings_path); free(i->image_root); free(i->local); - free(i); - - return NULL; + return mfree(i); } int raw_pull_new( diff --git a/src/import/pull-tar.c b/src/import/pull-tar.c index 8c61c46f73..68e2397b02 100644 --- a/src/import/pull-tar.c +++ b/src/import/pull-tar.c @@ -114,9 +114,7 @@ TarPull* tar_pull_unref(TarPull *i) { free(i->settings_path); free(i->image_root); free(i->local); - free(i); - - return NULL; + return mfree(i); } int tar_pull_new( diff --git a/src/journal-remote/journal-remote-write.c b/src/journal-remote/journal-remote-write.c index 7bba52566e..8729372aa3 100644 --- a/src/journal-remote/journal-remote-write.c +++ b/src/journal-remote/journal-remote-write.c @@ -75,10 +75,8 @@ Writer* writer_new(RemoteServer *server) { memset(&w->metrics, 0xFF, sizeof(w->metrics)); w->mmap = mmap_cache_new(); - if (!w->mmap) { - free(w); - return NULL; - } + if (!w->mmap) + return mfree(w); w->n_ref = 1; w->server = server; @@ -103,9 +101,7 @@ Writer* writer_free(Writer *w) { if (w->mmap) mmap_cache_unref(w->mmap); - free(w); - - return NULL; + return mfree(w); } Writer* writer_unref(Writer *w) { diff --git a/src/journal/journal-file.c b/src/journal/journal-file.c index 49199b269f..d3e0214731 100644 --- a/src/journal/journal-file.c +++ b/src/journal/journal-file.c @@ -394,8 +394,7 @@ JournalFile* journal_file_close(JournalFile *f) { gcry_md_close(f->hmac); #endif - free(f); - return NULL; + return mfree(f); } void journal_file_close_set(Set *s) { diff --git a/src/journal/mmap-cache.c b/src/journal/mmap-cache.c index 293d27053a..d91247b524 100644 --- a/src/journal/mmap-cache.c +++ b/src/journal/mmap-cache.c @@ -325,10 +325,8 @@ static FileDescriptor* fd_add(MMapCache *m, int fd) { f->fd = fd; r = hashmap_put(m->fds, FD_TO_PTR(fd), f); - if (r < 0) { - free(f); - return NULL; - } + if (r < 0) + return mfree(f); return f; } diff --git a/src/journal/sd-journal.c b/src/journal/sd-journal.c index 98c8a47afe..f2f8546086 100644 --- a/src/journal/sd-journal.c +++ b/src/journal/sd-journal.c @@ -387,7 +387,7 @@ _public_ int sd_journal_add_disjunction(sd_journal *j) { } static char *match_make_string(Match *m) { - char *p, *r; + char *p = NULL, *r; Match *i; bool enclose = false; @@ -397,15 +397,12 @@ static char *match_make_string(Match *m) { if (m->type == MATCH_DISCRETE) return strndup(m->data, m->size); - p = NULL; LIST_FOREACH(matches, i, m->matches) { char *t, *k; t = match_make_string(i); - if (!t) { - free(p); - return NULL; - } + if (!t) + return mfree(p); if (p) { k = strjoin(p, m->type == MATCH_OR_TERM ? " OR " : " AND ", t, NULL); diff --git a/src/libsystemd-network/ndisc-router.c b/src/libsystemd-network/ndisc-router.c index d9950b638c..41ff2b353a 100644 --- a/src/libsystemd-network/ndisc-router.c +++ b/src/libsystemd-network/ndisc-router.c @@ -49,8 +49,7 @@ _public_ sd_ndisc_router* sd_ndisc_router_unref(sd_ndisc_router *rt) { if (rt->n_ref > 0) return NULL; - free(rt); - return NULL; + return mfree(rt); } sd_ndisc_router *ndisc_router_new(size_t raw_size) { diff --git a/src/libsystemd-network/sd-dhcp-client.c b/src/libsystemd-network/sd-dhcp-client.c index 179e5950bd..5ccb23922c 100644 --- a/src/libsystemd-network/sd-dhcp-client.c +++ b/src/libsystemd-network/sd-dhcp-client.c @@ -1873,9 +1873,7 @@ sd_dhcp_client *sd_dhcp_client_unref(sd_dhcp_client *client) { free(client->req_opts); free(client->hostname); free(client->vendor_class_identifier); - free(client); - - return NULL; + return mfree(client); } int sd_dhcp_client_new(sd_dhcp_client **ret) { diff --git a/src/libsystemd-network/sd-dhcp-lease.c b/src/libsystemd-network/sd-dhcp-lease.c index ef50ed17a1..8387b185c0 100644 --- a/src/libsystemd-network/sd-dhcp-lease.c +++ b/src/libsystemd-network/sd-dhcp-lease.c @@ -282,9 +282,7 @@ sd_dhcp_lease *sd_dhcp_lease_unref(sd_dhcp_lease *lease) { free(lease->static_route); free(lease->client_id); free(lease->vendor_specific); - free(lease); - - return NULL; + return mfree(lease); } static int lease_parse_u32(const uint8_t *option, size_t len, uint32_t *ret, uint32_t min) { diff --git a/src/libsystemd-network/sd-dhcp-server.c b/src/libsystemd-network/sd-dhcp-server.c index 11ee2e252e..f16314a37f 100644 --- a/src/libsystemd-network/sd-dhcp-server.c +++ b/src/libsystemd-network/sd-dhcp-server.c @@ -178,9 +178,7 @@ sd_dhcp_server *sd_dhcp_server_unref(sd_dhcp_server *server) { hashmap_free(server->leases_by_client_id); free(server->bound_leases); - free(server); - - return NULL; + return mfree(server); } int sd_dhcp_server_new(sd_dhcp_server **ret, int ifindex) { diff --git a/src/libsystemd-network/sd-dhcp6-client.c b/src/libsystemd-network/sd-dhcp6-client.c index 463fde401c..e81215f7d7 100644 --- a/src/libsystemd-network/sd-dhcp6-client.c +++ b/src/libsystemd-network/sd-dhcp6-client.c @@ -1300,9 +1300,7 @@ sd_dhcp6_client *sd_dhcp6_client_unref(sd_dhcp6_client *client) { sd_dhcp6_client_detach_event(client); free(client->req_opts); - free(client); - - return NULL; + return mfree(client); } int sd_dhcp6_client_new(sd_dhcp6_client **ret) { diff --git a/src/libsystemd-network/sd-dhcp6-lease.c b/src/libsystemd-network/sd-dhcp6-lease.c index 5c10a6326a..ab59977a3f 100644 --- a/src/libsystemd-network/sd-dhcp6-lease.c +++ b/src/libsystemd-network/sd-dhcp6-lease.c @@ -389,9 +389,7 @@ sd_dhcp6_lease *sd_dhcp6_lease_unref(sd_dhcp6_lease *lease) { free(lease->ntp); lease->ntp_fqdn = strv_free(lease->ntp_fqdn); - free(lease); - - return NULL; + return mfree(lease); } int dhcp6_lease_new(sd_dhcp6_lease **ret) { diff --git a/src/libsystemd-network/sd-ipv4acd.c b/src/libsystemd-network/sd-ipv4acd.c index 662885840f..4dd343c101 100644 --- a/src/libsystemd-network/sd-ipv4acd.c +++ b/src/libsystemd-network/sd-ipv4acd.c @@ -135,9 +135,7 @@ sd_ipv4acd *sd_ipv4acd_unref(sd_ipv4acd *acd) { ipv4acd_reset(acd); sd_ipv4acd_detach_event(acd); - free(acd); - - return NULL; + return mfree(acd); } int sd_ipv4acd_new(sd_ipv4acd **ret) { diff --git a/src/libsystemd-network/sd-ipv4ll.c b/src/libsystemd-network/sd-ipv4ll.c index 5603a533a5..13209261f9 100644 --- a/src/libsystemd-network/sd-ipv4ll.c +++ b/src/libsystemd-network/sd-ipv4ll.c @@ -90,9 +90,7 @@ sd_ipv4ll *sd_ipv4ll_unref(sd_ipv4ll *ll) { return NULL; sd_ipv4acd_unref(ll->acd); - free(ll); - - return NULL; + return mfree(ll); } int sd_ipv4ll_new(sd_ipv4ll **ret) { diff --git a/src/libsystemd-network/sd-lldp.c b/src/libsystemd-network/sd-lldp.c index 0bd1e66aa0..0702241506 100644 --- a/src/libsystemd-network/sd-lldp.c +++ b/src/libsystemd-network/sd-lldp.c @@ -374,9 +374,7 @@ _public_ sd_lldp* sd_lldp_unref(sd_lldp *lldp) { hashmap_free(lldp->neighbor_by_id); prioq_free(lldp->neighbor_by_expiry); - free(lldp); - - return NULL; + return mfree(lldp); } _public_ int sd_lldp_new(sd_lldp **ret) { diff --git a/src/libsystemd-network/sd-ndisc.c b/src/libsystemd-network/sd-ndisc.c index 07b0d7f704..1d3be9b862 100644 --- a/src/libsystemd-network/sd-ndisc.c +++ b/src/libsystemd-network/sd-ndisc.c @@ -148,9 +148,7 @@ _public_ sd_ndisc *sd_ndisc_unref(sd_ndisc *nd) { ndisc_reset(nd); sd_ndisc_detach_event(nd); - free(nd); - - return NULL; + return mfree(nd); } _public_ int sd_ndisc_new(sd_ndisc **ret) { diff --git a/src/libsystemd/sd-bus/bus-slot.c b/src/libsystemd/sd-bus/bus-slot.c index 8e9074c7df..33590c31ac 100644 --- a/src/libsystemd/sd-bus/bus-slot.c +++ b/src/libsystemd/sd-bus/bus-slot.c @@ -212,9 +212,7 @@ _public_ sd_bus_slot* sd_bus_slot_unref(sd_bus_slot *slot) { bus_slot_disconnect(slot); free(slot->description); - free(slot); - - return NULL; + return mfree(slot); } _public_ sd_bus* sd_bus_slot_get_bus(sd_bus_slot *slot) { diff --git a/src/libsystemd/sd-bus/bus-track.c b/src/libsystemd/sd-bus/bus-track.c index 00e93a215f..4acaf24793 100644 --- a/src/libsystemd/sd-bus/bus-track.c +++ b/src/libsystemd/sd-bus/bus-track.c @@ -74,9 +74,7 @@ static struct track_item* track_item_free(struct track_item *i) { sd_bus_slot_unref(i->slot); free(i->name); - free(i); - - return NULL; + return mfree(i); } DEFINE_TRIVIAL_CLEANUP_FUNC(struct track_item*, track_item_free); @@ -206,9 +204,7 @@ _public_ sd_bus_track* sd_bus_track_unref(sd_bus_track *track) { bus_track_remove_from_queue(track); hashmap_free(track->names); sd_bus_unref(track->bus); - free(track); - - return NULL; + return mfree(track); } static int on_name_owner_changed(sd_bus_message *message, void *userdata, sd_bus_error *error) { diff --git a/src/libudev/libudev-list.c b/src/libudev/libudev-list.c index da496ed456..0d51322a15 100644 --- a/src/libudev/libudev-list.c +++ b/src/libudev/libudev-list.c @@ -166,17 +166,16 @@ struct udev_list_entry *udev_list_entry_add(struct udev_list *list, const char * entry = new0(struct udev_list_entry, 1); if (entry == NULL) return NULL; + entry->name = strdup(name); - if (entry->name == NULL) { - free(entry); - return NULL; - } + if (entry->name == NULL) + return mfree(entry); + if (value != NULL) { entry->value = strdup(value); if (entry->value == NULL) { free(entry->name); - free(entry); - return NULL; + return mfree(entry); } } @@ -193,8 +192,7 @@ struct udev_list_entry *udev_list_entry_add(struct udev_list *list, const char * if (entries == NULL) { free(entry->name); free(entry->value); - free(entry); - return NULL; + return mfree(entry); } list->entries = entries; list->entries_max += add; diff --git a/src/libudev/libudev-monitor.c b/src/libudev/libudev-monitor.c index 1f9d16c450..a1f2b33ad5 100644 --- a/src/libudev/libudev-monitor.c +++ b/src/libudev/libudev-monitor.c @@ -207,8 +207,7 @@ struct udev_monitor *udev_monitor_new_from_netlink_fd(struct udev *udev, const c udev_monitor->sock = socket(PF_NETLINK, SOCK_RAW|SOCK_CLOEXEC|SOCK_NONBLOCK, NETLINK_KOBJECT_UEVENT); if (udev_monitor->sock < 0) { log_debug_errno(errno, "error getting socket: %m"); - free(udev_monitor); - return NULL; + return mfree(udev_monitor); } } else { udev_monitor->bound = true; diff --git a/src/login/logind-button.c b/src/login/logind-button.c index baa6b7113c..90fb93bbaf 100644 --- a/src/login/logind-button.c +++ b/src/login/logind-button.c @@ -43,15 +43,12 @@ Button* button_new(Manager *m, const char *name) { return NULL; b->name = strdup(name); - if (!b->name) { - free(b); - return NULL; - } + if (!b->name) + return mfree(b); if (hashmap_put(m->buttons, b->name, b) < 0) { free(b->name); - free(b); - return NULL; + return mfree(b); } b->manager = m; diff --git a/src/login/logind-device.c b/src/login/logind-device.c index eb5edd1cd5..6537fa04bf 100644 --- a/src/login/logind-device.c +++ b/src/login/logind-device.c @@ -34,15 +34,12 @@ Device* device_new(Manager *m, const char *sysfs, bool master) { return NULL; d->sysfs = strdup(sysfs); - if (!d->sysfs) { - free(d); - return NULL; - } + if (!d->sysfs) + return mfree(d); if (hashmap_put(m->devices, d->sysfs, d) < 0) { free(d->sysfs); - free(d); - return NULL; + return mfree(d); } d->manager = m; diff --git a/src/login/logind-inhibit.c b/src/login/logind-inhibit.c index 6c78e0dddc..c93b24009b 100644 --- a/src/login/logind-inhibit.c +++ b/src/login/logind-inhibit.c @@ -45,17 +45,14 @@ Inhibitor* inhibitor_new(Manager *m, const char* id) { return NULL; i->state_file = strappend("/run/systemd/inhibit/", id); - if (!i->state_file) { - free(i); - return NULL; - } + if (!i->state_file) + return mfree(i); i->id = basename(i->state_file); if (hashmap_put(m->inhibitors, i->id, i) < 0) { free(i->state_file); - free(i); - return NULL; + return mfree(i); } i->manager = m; diff --git a/src/login/logind-seat.c b/src/login/logind-seat.c index b5192320e4..ecc7bd2e5b 100644 --- a/src/login/logind-seat.c +++ b/src/login/logind-seat.c @@ -48,18 +48,15 @@ Seat *seat_new(Manager *m, const char *id) { return NULL; s->state_file = strappend("/run/systemd/seats/", id); - if (!s->state_file) { - free(s); - return NULL; - } + if (!s->state_file) + return mfree(s); s->id = basename(s->state_file); s->manager = m; if (hashmap_put(m->seats, s->id, s) < 0) { free(s->state_file); - free(s); - return NULL; + return mfree(s); } return s; diff --git a/src/login/logind-session.c b/src/login/logind-session.c index ba1bcc2630..cbf035f706 100644 --- a/src/login/logind-session.c +++ b/src/login/logind-session.c @@ -62,16 +62,13 @@ Session* session_new(Manager *m, const char *id) { return NULL; s->state_file = strappend("/run/systemd/sessions/", id); - if (!s->state_file) { - free(s); - return NULL; - } + if (!s->state_file) + return mfree(s); s->devices = hashmap_new(&devt_hash_ops); if (!s->devices) { free(s->state_file); - free(s); - return NULL; + return mfree(s); } s->id = basename(s->state_file); @@ -79,8 +76,7 @@ Session* session_new(Manager *m, const char *id) { if (hashmap_put(m->sessions, s->id, s) < 0) { hashmap_free(s->devices); free(s->state_file); - free(s); - return NULL; + return mfree(s); } s->manager = m; diff --git a/src/machine/machine.c b/src/machine/machine.c index dd046d6563..a02b9d7575 100644 --- a/src/machine/machine.c +++ b/src/machine/machine.c @@ -80,9 +80,7 @@ Machine* machine_new(Manager *manager, MachineClass class, const char *name) { fail: free(m->state_file); free(m->name); - free(m); - - return NULL; + return mfree(m); } void machine_free(Machine *m) { diff --git a/src/machine/operation.c b/src/machine/operation.c index 2bf93cb493..c966d0d21c 100644 --- a/src/machine/operation.c +++ b/src/machine/operation.c @@ -147,6 +147,5 @@ Operation *operation_free(Operation *o) { if (o->machine) LIST_REMOVE(operations_by_machine, o->machine->operations, o); - free(o); - return NULL; + return mfree(o); } diff --git a/src/network/networkd-wait-online-link.c b/src/network/networkd-wait-online-link.c index 5727422e3d..e63ba07e90 100644 --- a/src/network/networkd-wait-online-link.c +++ b/src/network/networkd-wait-online-link.c @@ -77,8 +77,7 @@ Link *link_free(Link *l) { } free(l->ifname); - free(l); - return NULL; + return mfree(l); } int link_update_rtnl(Link *l, sd_netlink_message *m) { diff --git a/src/nspawn/nspawn-settings.c b/src/nspawn/nspawn-settings.c index 5f1522cfb6..09c8f070ba 100644 --- a/src/nspawn/nspawn-settings.c +++ b/src/nspawn/nspawn-settings.c @@ -101,9 +101,7 @@ Settings* settings_free(Settings *s) { expose_port_free_all(s->expose_ports); custom_mount_free_all(s->custom_mounts, s->n_custom_mounts); - free(s); - - return NULL; + return mfree(s); } bool settings_private_network(Settings *s) { diff --git a/src/resolve/resolved-dns-query.c b/src/resolve/resolved-dns-query.c index 53be18efc6..e03db4d003 100644 --- a/src/resolve/resolved-dns-query.c +++ b/src/resolve/resolved-dns-query.c @@ -83,9 +83,7 @@ DnsQueryCandidate* dns_query_candidate_free(DnsQueryCandidate *c) { if (c->scope) LIST_REMOVE(candidates_by_scope, c->scope->query_candidates, c); - free(c); - - return NULL; + return mfree(c); } static int dns_query_candidate_next_search_domain(DnsQueryCandidate *c) { @@ -421,9 +419,7 @@ DnsQuery *dns_query_free(DnsQuery *q) { q->manager->n_dns_queries--; } - free(q); - - return NULL; + return mfree(q); } int dns_query_new( diff --git a/src/resolve/resolved-dns-rr.c b/src/resolve/resolved-dns-rr.c index 5687588a7d..87e4abec6e 100644 --- a/src/resolve/resolved-dns-rr.c +++ b/src/resolve/resolved-dns-rr.c @@ -73,10 +73,8 @@ DnsResourceKey* dns_resource_key_new_redirect(const DnsResourceKey *key, const D return dns_resource_key_ref((DnsResourceKey*) key); k = dns_resource_key_new_consume(key->class, key->type, destination); - if (!k) { - free(destination); - return NULL; - } + if (!k) + return mfree(destination); return k; } @@ -513,9 +511,7 @@ DnsResourceRecord* dns_resource_record_unref(DnsResourceRecord *rr) { } free(rr->to_string); - free(rr); - - return NULL; + return mfree(rr); } int dns_resource_record_new_reverse(DnsResourceRecord **ret, int family, const union in_addr_union *address, const char *hostname) { diff --git a/src/resolve/resolved-dns-scope.c b/src/resolve/resolved-dns-scope.c index 03811ac8e7..8dbc7f623b 100644 --- a/src/resolve/resolved-dns-scope.c +++ b/src/resolve/resolved-dns-scope.c @@ -128,9 +128,7 @@ DnsScope* dns_scope_free(DnsScope *s) { dns_zone_flush(&s->zone); LIST_REMOVE(scopes, s->manager->dns_scopes, s); - free(s); - - return NULL; + return mfree(s); } DnsServer *dns_scope_get_dns_server(DnsScope *s) { diff --git a/src/resolve/resolved-dns-search-domain.c b/src/resolve/resolved-dns-search-domain.c index 732471027b..1386e6a17b 100644 --- a/src/resolve/resolved-dns-search-domain.c +++ b/src/resolve/resolved-dns-search-domain.c @@ -104,9 +104,7 @@ DnsSearchDomain* dns_search_domain_unref(DnsSearchDomain *d) { return NULL; free(d->name); - free(d); - - return NULL; + return mfree(d); } void dns_search_domain_unlink(DnsSearchDomain *d) { diff --git a/src/resolve/resolved-dns-server.c b/src/resolve/resolved-dns-server.c index 97cc8c0e09..7282848e35 100644 --- a/src/resolve/resolved-dns-server.c +++ b/src/resolve/resolved-dns-server.c @@ -139,8 +139,7 @@ DnsServer* dns_server_unref(DnsServer *s) { return NULL; free(s->server_string); - free(s); - return NULL; + return mfree(s); } void dns_server_unlink(DnsServer *s) { diff --git a/src/resolve/resolved-dns-stream.c b/src/resolve/resolved-dns-stream.c index dd0e0b90e3..878bae47dc 100644 --- a/src/resolve/resolved-dns-stream.c +++ b/src/resolve/resolved-dns-stream.c @@ -343,9 +343,7 @@ DnsStream *dns_stream_unref(DnsStream *s) { dns_packet_unref(s->write_packet); dns_packet_unref(s->read_packet); - free(s); - - return NULL; + return mfree(s); } DEFINE_TRIVIAL_CLEANUP_FUNC(DnsStream*, dns_stream_unref); diff --git a/src/resolve/resolved-dns-transaction.c b/src/resolve/resolved-dns-transaction.c index d455b6b1fa..2fce44ec8b 100644 --- a/src/resolve/resolved-dns-transaction.c +++ b/src/resolve/resolved-dns-transaction.c @@ -134,8 +134,7 @@ DnsTransaction* dns_transaction_free(DnsTransaction *t) { dns_answer_unref(t->validated_keys); dns_resource_key_unref(t->key); - free(t); - return NULL; + return mfree(t); } DEFINE_TRIVIAL_CLEANUP_FUNC(DnsTransaction*, dns_transaction_free); diff --git a/src/resolve/resolved-link.c b/src/resolve/resolved-link.c index ea4a007139..13e1f91192 100644 --- a/src/resolve/resolved-link.c +++ b/src/resolve/resolved-link.c @@ -101,8 +101,7 @@ Link *link_free(Link *l) { free(l->state_file); - free(l); - return NULL; + return mfree(l); } void link_allocate_scopes(Link *l) { @@ -698,8 +697,7 @@ LinkAddress *link_address_free(LinkAddress *a) { dns_resource_record_unref(a->llmnr_address_rr); dns_resource_record_unref(a->llmnr_ptr_rr); - free(a); - return NULL; + return mfree(a); } void link_address_add_rrs(LinkAddress *a, bool force_remove) { diff --git a/src/resolve/resolved-manager.c b/src/resolve/resolved-manager.c index 40f08e8044..0954641c20 100644 --- a/src/resolve/resolved-manager.c +++ b/src/resolve/resolved-manager.c @@ -630,9 +630,7 @@ Manager *manager_free(Manager *m) { dns_trust_anchor_flush(&m->trust_anchor); manager_etc_hosts_flush(m); - free(m); - - return NULL; + return mfree(m); } int manager_recv(Manager *m, int fd, DnsProtocol protocol, DnsPacket **ret) { diff --git a/src/shared/machine-image.c b/src/shared/machine-image.c index 529d89ee2a..060f8d50c7 100644 --- a/src/shared/machine-image.c +++ b/src/shared/machine-image.c @@ -62,8 +62,7 @@ Image *image_unref(Image *i) { free(i->name); free(i->path); - free(i); - return NULL; + return mfree(i); } static char **image_settings_path(Image *image) { diff --git a/src/shared/ptyfwd.c b/src/shared/ptyfwd.c index 24055e772b..293c6673fc 100644 --- a/src/shared/ptyfwd.c +++ b/src/shared/ptyfwd.c @@ -463,8 +463,7 @@ int pty_forward_new( PTYForward *pty_forward_free(PTYForward *f) { pty_forward_disconnect(f); - free(f); - return NULL; + return mfree(f); } int pty_forward_get_last_char(PTYForward *f, char *ch) { diff --git a/src/timesync/timesyncd-server.c b/src/timesync/timesyncd-server.c index 6bda86fe6e..57a7bf2c25 100644 --- a/src/timesync/timesyncd-server.c +++ b/src/timesync/timesyncd-server.c @@ -61,8 +61,7 @@ ServerAddress* server_address_free(ServerAddress *a) { manager_set_server_address(a->name->manager, NULL); } - free(a); - return NULL; + return mfree(a); } int server_name_new( @@ -137,9 +136,7 @@ ServerName *server_name_free(ServerName *n) { log_debug("Removed server %s.", n->string); free(n->string); - free(n); - - return NULL; + return mfree(n); } void server_name_flush_addresses(ServerName *n) { diff --git a/src/udev/udev-ctrl.c b/src/udev/udev-ctrl.c index f68a09d7a8..7717ac7924 100644 --- a/src/udev/udev-ctrl.c +++ b/src/udev/udev-ctrl.c @@ -211,8 +211,7 @@ struct udev_ctrl_connection *udev_ctrl_get_connection(struct udev_ctrl *uctrl) { err: if (conn->sock >= 0) close(conn->sock); - free(conn); - return NULL; + return mfree(conn); } struct udev_ctrl_connection *udev_ctrl_connection_ref(struct udev_ctrl_connection *conn) { diff --git a/src/udev/udev-rules.c b/src/udev/udev-rules.c index 26fa52cf6c..7619c8371b 100644 --- a/src/udev/udev-rules.c +++ b/src/udev/udev-rules.c @@ -1583,8 +1583,7 @@ struct udev_rules *udev_rules_unref(struct udev_rules *rules) { strbuf_cleanup(rules->strbuf); free(rules->uids); free(rules->gids); - free(rules); - return NULL; + return mfree(rules); } bool udev_rules_check_timestamp(struct udev_rules *rules) { -- cgit v1.2.3-54-g00ecf From 52c239d770d3ef955220c5ae72b852360da67c8b Mon Sep 17 00:00:00 2001 From: Luca Bruno Date: Tue, 18 Oct 2016 00:05:49 +0000 Subject: core/exec: add a named-descriptor option ("fd") for streams (#4179) This commit adds a `fd` option to `StandardInput=`, `StandardOutput=` and `StandardError=` properties in order to connect standard streams to externally named descriptors provided by some socket units. This option looks for a file descriptor named as the corresponding stream. Custom names can be specified, separated by a colon. If multiple name-matches exist, the first matching fd will be used. --- man/systemd.exec.xml | 46 +++++++++++++-- src/core/dbus-execute.c | 86 +++++++++++++++++++++++++++- src/core/execute.c | 86 +++++++++++++++++++++++++--- src/core/execute.h | 5 ++ src/core/load-fragment-gperf.gperf.m4 | 6 +- src/core/load-fragment.c | 104 ++++++++++++++++++++++++++++++++-- src/core/load-fragment.h | 2 + src/core/unit.c | 20 +++---- 8 files changed, 322 insertions(+), 33 deletions(-) (limited to 'src/core/execute.c') diff --git a/man/systemd.exec.xml b/man/systemd.exec.xml index a5a453031f..7453aa7bee 100644 --- a/man/systemd.exec.xml +++ b/man/systemd.exec.xml @@ -409,8 +409,9 @@ , , , - or - . + , + or + . If is selected, standard input will be connected to /dev/null, i.e. all @@ -448,6 +449,20 @@ inetd8 daemon. + The option connects + the input stream to a single file descriptor provided by a socket unit. + A custom named file descriptor can be specified as part of this option, + after a : (e.g. fd:foobar). + If no name is specified, stdin is assumed + (i.e. fd is equivalent to fd:stdin). + At least one socket unit defining such name must be explicitly provided via the + Sockets= option, and file descriptor name may differ + from the name of its containing socket unit. + If multiple matches are found, the first one will be used. + See FileDescriptorName= in + systemd.socket5 + for more details about named descriptors and ordering. + This setting defaults to . @@ -464,8 +479,9 @@ , , , - or - . + , + or + . duplicates the file descriptor of standard input for standard output. @@ -514,6 +530,20 @@ similar to the same option of StandardInput=. + The option connects + the output stream to a single file descriptor provided by a socket unit. + A custom named file descriptor can be specified as part of this option, + after a : (e.g. fd:foobar). + If no name is specified, stdout is assumed + (i.e. fd is equivalent to fd:stdout). + At least one socket unit defining such name must be explicitly provided via the + Sockets= option, and file descriptor name may differ + from the name of its containing socket unit. + If multiple matches are found, the first one will be used. + See FileDescriptorName= in + systemd.socket5 + for more details about named descriptors and ordering. + If the standard output (or error output, see below) of a unit is connected to the journal, syslog or the kernel log buffer, the unit will implicitly gain a dependency of type After= on systemd-journald.socket (also see the automatic dependencies section above). @@ -531,9 +561,13 @@ Controls where file descriptor 2 (STDERR) of the executed processes is connected to. The available options are identical to those of StandardOutput=, - with one exception: if set to the + with some exceptions: if set to the file descriptor used for standard output is duplicated for - standard error. This setting defaults to the value set with + standard error, while operates on the error + stream and will look by default for a descriptor named + stderr. + + This setting defaults to the value set with in systemd-system.conf5, which defaults to . Note that setting diff --git a/src/core/dbus-execute.c b/src/core/dbus-execute.c index b8720d7d3d..1a7f770db1 100644 --- a/src/core/dbus-execute.c +++ b/src/core/dbus-execute.c @@ -627,6 +627,53 @@ static int property_get_syslog_facility( return sd_bus_message_append(reply, "i", LOG_FAC(c->syslog_priority)); } +static int property_get_input_fdname( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + ExecContext *c = userdata; + const char *name; + + assert(bus); + assert(c); + assert(property); + assert(reply); + + name = exec_context_fdname(c, STDIN_FILENO); + + return sd_bus_message_append(reply, "s", name); +} + +static int property_get_output_fdname( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + ExecContext *c = userdata; + const char *name = NULL; + + assert(bus); + assert(c); + assert(property); + assert(reply); + + if (c->std_output == EXEC_OUTPUT_NAMED_FD && streq(property, "StandardOutputFileDescriptorName")) + name = exec_context_fdname(c, STDOUT_FILENO); + else if (c->std_error == EXEC_OUTPUT_NAMED_FD && streq(property, "StandardErrorFileDescriptorName")) + name = exec_context_fdname(c, STDERR_FILENO); + + return sd_bus_message_append(reply, "s", name); +} + const sd_bus_vtable bus_exec_vtable[] = { SD_BUS_VTABLE_START(0), SD_BUS_PROPERTY("Environment", "as", NULL, offsetof(ExecContext, environment), SD_BUS_VTABLE_PROPERTY_CONST), @@ -677,8 +724,11 @@ const sd_bus_vtable bus_exec_vtable[] = { SD_BUS_PROPERTY("CPUSchedulingResetOnFork", "b", bus_property_get_bool, offsetof(ExecContext, cpu_sched_reset_on_fork), SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("NonBlocking", "b", bus_property_get_bool, offsetof(ExecContext, non_blocking), SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("StandardInput", "s", property_get_exec_input, offsetof(ExecContext, std_input), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("StandardInputFileDescriptorName", "s", property_get_input_fdname, 0, SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("StandardOutput", "s", bus_property_get_exec_output, offsetof(ExecContext, std_output), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("StandardOutputFileDescriptorName", "s", property_get_output_fdname, 0, SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("StandardError", "s", bus_property_get_exec_output, offsetof(ExecContext, std_error), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("StandardErrorFileDescriptorName", "s", property_get_output_fdname, 0, SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("TTYPath", "s", NULL, offsetof(ExecContext, tty_path), SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("TTYReset", "b", bus_property_get_bool, offsetof(ExecContext, tty_reset), SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("TTYVHangup", "b", bus_property_get_bool, offsetof(ExecContext, tty_vhangup), SD_BUS_VTABLE_PROPERTY_CONST), @@ -1030,7 +1080,6 @@ int bus_exec_context_set_transient_property( return 1; - } else if (streq(name, "StandardOutput")) { const char *s; ExecOutput p; @@ -1071,6 +1120,41 @@ int bus_exec_context_set_transient_property( return 1; + } else if (STR_IN_SET(name, + "StandardInputFileDescriptorName", "StandardOutputFileDescriptorName", "StandardErrorFileDescriptorName")) { + const char *s; + + r = sd_bus_message_read(message, "s", &s); + if (r < 0) + return r; + + if (!fdname_is_valid(s)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid file descriptor name"); + + if (mode != UNIT_CHECK) { + if (streq(name, "StandardInputFileDescriptorName")) { + c->std_input = EXEC_INPUT_NAMED_FD; + r = free_and_strdup(&c->stdio_fdname[STDIN_FILENO], s); + if (r < 0) + return r; + unit_write_drop_in_private_format(u, mode, name, "StandardInput=fd:%s", s); + } else if (streq(name, "StandardOutputFileDescriptorName")) { + c->std_output = EXEC_OUTPUT_NAMED_FD; + r = free_and_strdup(&c->stdio_fdname[STDOUT_FILENO], s); + if (r < 0) + return r; + unit_write_drop_in_private_format(u, mode, name, "StandardOutput=fd:%s", s); + } else if (streq(name, "StandardErrorFileDescriptorName")) { + c->std_error = EXEC_OUTPUT_NAMED_FD; + r = free_and_strdup(&c->stdio_fdname[STDERR_FILENO], s); + if (r < 0) + return r; + unit_write_drop_in_private_format(u, mode, name, "StandardError=fd:%s", s); + } + } + + return 1; + } else if (STR_IN_SET(name, "IgnoreSIGPIPE", "TTYVHangup", "TTYReset", "PrivateTmp", "PrivateDevices", "PrivateNetwork", "PrivateUsers", diff --git a/src/core/execute.c b/src/core/execute.c index 59c2bd0dd2..1b7b4a928d 100644 --- a/src/core/execute.c +++ b/src/core/execute.c @@ -411,7 +411,8 @@ static int fixup_output(ExecOutput std_output, int socket_fd) { static int setup_input( const ExecContext *context, const ExecParameters *params, - int socket_fd) { + int socket_fd, + int named_iofds[3]) { ExecInput i; @@ -461,6 +462,10 @@ static int setup_input( case EXEC_INPUT_SOCKET: return dup2(socket_fd, STDIN_FILENO) < 0 ? -errno : STDIN_FILENO; + case EXEC_INPUT_NAMED_FD: + (void) fd_nonblock(named_iofds[STDIN_FILENO], false); + return dup2(named_iofds[STDIN_FILENO], STDIN_FILENO) < 0 ? -errno : STDIN_FILENO; + default: assert_not_reached("Unknown input type"); } @@ -472,6 +477,7 @@ static int setup_output( const ExecParameters *params, int fileno, int socket_fd, + int named_iofds[3], const char *ident, uid_t uid, gid_t gid, @@ -523,7 +529,7 @@ static int setup_output( return fileno; /* Duplicate from stdout if possible */ - if (e == o || e == EXEC_OUTPUT_INHERIT) + if ((e == o && e != EXEC_OUTPUT_NAMED_FD) || e == EXEC_OUTPUT_INHERIT) return dup2(STDOUT_FILENO, fileno) < 0 ? -errno : fileno; o = e; @@ -585,6 +591,10 @@ static int setup_output( assert(socket_fd >= 0); return dup2(socket_fd, fileno) < 0 ? -errno : fileno; + case EXEC_OUTPUT_NAMED_FD: + (void) fd_nonblock(named_iofds[fileno], false); + return dup2(named_iofds[fileno], fileno) < 0 ? -errno : fileno; + default: assert_not_reached("Unknown error type"); } @@ -2157,6 +2167,7 @@ static int exec_child( DynamicCreds *dcreds, char **argv, int socket_fd, + int named_iofds[3], int *fds, unsigned n_fds, char **files_env, int user_lookup_fd, @@ -2298,19 +2309,19 @@ static int exec_child( if (socket_fd >= 0) (void) fd_nonblock(socket_fd, false); - r = setup_input(context, params, socket_fd); + r = setup_input(context, params, socket_fd, named_iofds); if (r < 0) { *exit_status = EXIT_STDIN; return r; } - r = setup_output(unit, context, params, STDOUT_FILENO, socket_fd, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino); + r = setup_output(unit, context, params, STDOUT_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino); if (r < 0) { *exit_status = EXIT_STDOUT; return r; } - r = setup_output(unit, context, params, STDERR_FILENO, socket_fd, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino); + r = setup_output(unit, context, params, STDERR_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino); if (r < 0) { *exit_status = EXIT_STDERR; return r; @@ -2829,6 +2840,7 @@ int exec_spawn(Unit *unit, int *fds = NULL; unsigned n_fds = 0; _cleanup_free_ char *line = NULL; int socket_fd, r; + int named_iofds[3] = { -1, -1, -1 }; char **argv; pid_t pid; @@ -2855,6 +2867,10 @@ int exec_spawn(Unit *unit, n_fds = params->n_fds; } + r = exec_context_named_iofds(unit, context, params, named_iofds); + if (r < 0) + return log_unit_error_errno(unit, r, "Failed to load a named file descriptor: %m"); + r = exec_context_load_environment(unit, context, &files_env); if (r < 0) return log_unit_error_errno(unit, r, "Failed to load environment files: %m"); @@ -2884,6 +2900,7 @@ int exec_spawn(Unit *unit, dcreds, argv, socket_fd, + named_iofds, fds, n_fds, files_env, unit->manager->user_lookup_fds[1], @@ -2946,6 +2963,9 @@ void exec_context_done(ExecContext *c) { for (l = 0; l < ELEMENTSOF(c->rlimit); l++) c->rlimit[l] = mfree(c->rlimit[l]); + for (l = 0; l < 3; l++) + c->stdio_fdname[l] = mfree(c->stdio_fdname[l]); + c->working_directory = mfree(c->working_directory); c->root_directory = mfree(c->root_directory); c->tty_path = mfree(c->tty_path); @@ -3044,6 +3064,56 @@ static void invalid_env(const char *p, void *userdata) { log_unit_error(info->unit, "Ignoring invalid environment assignment '%s': %s", p, info->path); } +const char* exec_context_fdname(const ExecContext *c, int fd_index) { + assert(c); + + switch (fd_index) { + case STDIN_FILENO: + if (c->std_input != EXEC_INPUT_NAMED_FD) + return NULL; + return c->stdio_fdname[STDIN_FILENO] ?: "stdin"; + case STDOUT_FILENO: + if (c->std_output != EXEC_OUTPUT_NAMED_FD) + return NULL; + return c->stdio_fdname[STDOUT_FILENO] ?: "stdout"; + case STDERR_FILENO: + if (c->std_error != EXEC_OUTPUT_NAMED_FD) + return NULL; + return c->stdio_fdname[STDERR_FILENO] ?: "stderr"; + default: + return NULL; + } +} + +int exec_context_named_iofds(Unit *unit, const ExecContext *c, const ExecParameters *p, int named_iofds[3]) { + unsigned i, targets; + const char *stdio_fdname[3]; + + assert(c); + assert(p); + + targets = (c->std_input == EXEC_INPUT_NAMED_FD) + + (c->std_output == EXEC_OUTPUT_NAMED_FD) + + (c->std_error == EXEC_OUTPUT_NAMED_FD); + + for (i = 0; i < 3; i++) + stdio_fdname[i] = exec_context_fdname(c, i); + + for (i = 0; i < p->n_fds && targets > 0; i++) + if (named_iofds[STDIN_FILENO] < 0 && c->std_input == EXEC_INPUT_NAMED_FD && stdio_fdname[STDIN_FILENO] && streq(p->fd_names[i], stdio_fdname[STDIN_FILENO])) { + named_iofds[STDIN_FILENO] = p->fds[i]; + targets--; + } else if (named_iofds[STDOUT_FILENO] < 0 && c->std_output == EXEC_OUTPUT_NAMED_FD && stdio_fdname[STDOUT_FILENO] && streq(p->fd_names[i], stdio_fdname[STDOUT_FILENO])) { + named_iofds[STDOUT_FILENO] = p->fds[i]; + targets--; + } else if (named_iofds[STDERR_FILENO] < 0 && c->std_error == EXEC_OUTPUT_NAMED_FD && stdio_fdname[STDERR_FILENO] && streq(p->fd_names[i], stdio_fdname[STDERR_FILENO])) { + named_iofds[STDERR_FILENO] = p->fds[i]; + targets--; + } + + return (targets == 0 ? 0 : -ENOENT); +} + int exec_context_load_environment(Unit *unit, const ExecContext *c, char ***l) { char **i, **r = NULL; @@ -3896,7 +3966,8 @@ static const char* const exec_input_table[_EXEC_INPUT_MAX] = { [EXEC_INPUT_TTY] = "tty", [EXEC_INPUT_TTY_FORCE] = "tty-force", [EXEC_INPUT_TTY_FAIL] = "tty-fail", - [EXEC_INPUT_SOCKET] = "socket" + [EXEC_INPUT_SOCKET] = "socket", + [EXEC_INPUT_NAMED_FD] = "fd", }; DEFINE_STRING_TABLE_LOOKUP(exec_input, ExecInput); @@ -3911,7 +3982,8 @@ static const char* const exec_output_table[_EXEC_OUTPUT_MAX] = { [EXEC_OUTPUT_KMSG_AND_CONSOLE] = "kmsg+console", [EXEC_OUTPUT_JOURNAL] = "journal", [EXEC_OUTPUT_JOURNAL_AND_CONSOLE] = "journal+console", - [EXEC_OUTPUT_SOCKET] = "socket" + [EXEC_OUTPUT_SOCKET] = "socket", + [EXEC_OUTPUT_NAMED_FD] = "fd", }; DEFINE_STRING_TABLE_LOOKUP(exec_output, ExecOutput); diff --git a/src/core/execute.h b/src/core/execute.h index 1de439c3ad..c7d0f7761e 100644 --- a/src/core/execute.h +++ b/src/core/execute.h @@ -50,6 +50,7 @@ typedef enum ExecInput { EXEC_INPUT_TTY_FORCE, EXEC_INPUT_TTY_FAIL, EXEC_INPUT_SOCKET, + EXEC_INPUT_NAMED_FD, _EXEC_INPUT_MAX, _EXEC_INPUT_INVALID = -1 } ExecInput; @@ -65,6 +66,7 @@ typedef enum ExecOutput { EXEC_OUTPUT_JOURNAL, EXEC_OUTPUT_JOURNAL_AND_CONSOLE, EXEC_OUTPUT_SOCKET, + EXEC_OUTPUT_NAMED_FD, _EXEC_OUTPUT_MAX, _EXEC_OUTPUT_INVALID = -1 } ExecOutput; @@ -120,6 +122,7 @@ struct ExecContext { ExecInput std_input; ExecOutput std_output; ExecOutput std_error; + char *stdio_fdname[3]; nsec_t timer_slack_nsec; @@ -284,6 +287,8 @@ void exec_context_dump(ExecContext *c, FILE* f, const char *prefix); int exec_context_destroy_runtime_directory(ExecContext *c, const char *runtime_root); int exec_context_load_environment(Unit *unit, const ExecContext *c, char ***l); +int exec_context_named_iofds(Unit *unit, const ExecContext *c, const ExecParameters *p, int named_iofds[3]); +const char* exec_context_fdname(const ExecContext *c, int fd_index); bool exec_context_may_touch_console(ExecContext *c); bool exec_context_maintains_privileges(ExecContext *c); diff --git a/src/core/load-fragment-gperf.gperf.m4 b/src/core/load-fragment-gperf.gperf.m4 index a700d853cc..08c88b6b53 100644 --- a/src/core/load-fragment-gperf.gperf.m4 +++ b/src/core/load-fragment-gperf.gperf.m4 @@ -35,9 +35,9 @@ $1.Environment, config_parse_environ, 0, $1.EnvironmentFile, config_parse_unit_env_file, 0, offsetof($1, exec_context.environment_files) $1.PassEnvironment, config_parse_pass_environ, 0, offsetof($1, exec_context.pass_environment) $1.DynamicUser, config_parse_bool, 0, offsetof($1, exec_context.dynamic_user) -$1.StandardInput, config_parse_input, 0, offsetof($1, exec_context.std_input) -$1.StandardOutput, config_parse_output, 0, offsetof($1, exec_context.std_output) -$1.StandardError, config_parse_output, 0, offsetof($1, exec_context.std_error) +$1.StandardInput, config_parse_exec_input, 0, offsetof($1, exec_context) +$1.StandardOutput, config_parse_exec_output, 0, offsetof($1, exec_context) +$1.StandardError, config_parse_exec_output, 0, offsetof($1, exec_context) $1.TTYPath, config_parse_unit_path_printf, 0, offsetof($1, exec_context.tty_path) $1.TTYReset, config_parse_bool, 0, offsetof($1, exec_context.tty_reset) $1.TTYVHangup, config_parse_bool, 0, offsetof($1, exec_context.tty_vhangup) diff --git a/src/core/load-fragment.c b/src/core/load-fragment.c index 8cc7a8e765..a69f60097d 100644 --- a/src/core/load-fragment.c +++ b/src/core/load-fragment.c @@ -776,8 +776,104 @@ int config_parse_socket_bindtodevice( return 0; } -DEFINE_CONFIG_PARSE_ENUM(config_parse_output, exec_output, ExecOutput, "Failed to parse output specifier"); -DEFINE_CONFIG_PARSE_ENUM(config_parse_input, exec_input, ExecInput, "Failed to parse input specifier"); +DEFINE_CONFIG_PARSE_ENUM(config_parse_input, exec_input, ExecInput, "Failed to parse input literal specifier"); +DEFINE_CONFIG_PARSE_ENUM(config_parse_output, exec_output, ExecOutput, "Failed to parse output literal specifier"); + +int config_parse_exec_input(const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + ExecContext *c = data; + const char *name; + int r; + + assert(data); + assert(filename); + assert(line); + assert(rvalue); + + name = startswith(rvalue, "fd:"); + if (name) { + /* Strip prefix and validate fd name */ + if (!fdname_is_valid(name)) { + log_syntax(unit, LOG_ERR, filename, line, 0, "Invalid file descriptor name, ignoring: %s", name); + return 0; + } + c->std_input = EXEC_INPUT_NAMED_FD; + r = free_and_strdup(&c->stdio_fdname[STDIN_FILENO], name); + if (r < 0) + log_oom(); + return r; + } else { + ExecInput ei = exec_input_from_string(rvalue); + if (ei == _EXEC_INPUT_INVALID) + log_syntax(unit, LOG_ERR, filename, line, 0, "Failed to parse input specifier, ignoring: %s", rvalue); + else + c->std_input = ei; + return 0; + } +} + +int config_parse_exec_output(const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + ExecContext *c = data; + ExecOutput eo; + const char *name; + int r; + + assert(data); + assert(filename); + assert(line); + assert(lvalue); + assert(rvalue); + + name = startswith(rvalue, "fd:"); + if (name) { + /* Strip prefix and validate fd name */ + if (!fdname_is_valid(name)) { + log_syntax(unit, LOG_ERR, filename, line, 0, "Invalid file descriptor name, ignoring: %s", name); + return 0; + } + eo = EXEC_OUTPUT_NAMED_FD; + } else { + eo = exec_output_from_string(rvalue); + if (eo == _EXEC_OUTPUT_INVALID) { + log_syntax(unit, LOG_ERR, filename, line, 0, "Failed to parse output specifier, ignoring: %s", rvalue); + return 0; + } + } + + if (streq(lvalue, "StandardOutput")) { + c->std_output = eo; + r = free_and_strdup(&c->stdio_fdname[STDOUT_FILENO], name); + if (r < 0) + log_oom(); + return r; + } else if (streq(lvalue, "StandardError")) { + c->std_error = eo; + r = free_and_strdup(&c->stdio_fdname[STDERR_FILENO], name); + if (r < 0) + log_oom(); + return r; + } else { + log_syntax(unit, LOG_ERR, filename, line, 0, "Failed to parse output property, ignoring: %s", lvalue); + return 0; + } +} int config_parse_exec_io_class(const char *unit, const char *filename, @@ -4183,8 +4279,8 @@ void unit_dump_config_items(FILE *f) { { config_parse_exec_cpu_affinity, "CPUAFFINITY" }, { config_parse_mode, "MODE" }, { config_parse_unit_env_file, "FILE" }, - { config_parse_output, "OUTPUT" }, - { config_parse_input, "INPUT" }, + { config_parse_exec_output, "OUTPUT" }, + { config_parse_exec_input, "INPUT" }, { config_parse_log_facility, "FACILITY" }, { config_parse_log_level, "LEVEL" }, { config_parse_exec_secure_bits, "SECUREBITS" }, diff --git a/src/core/load-fragment.h b/src/core/load-fragment.h index 8b688740cf..6d1fe55bcd 100644 --- a/src/core/load-fragment.h +++ b/src/core/load-fragment.h @@ -45,7 +45,9 @@ int config_parse_service_timeout(const char *unit, const char *filename, unsigne int config_parse_service_type(const char *unit, const char *filename, unsigned line, const char *section, unsigned section_line, const char *lvalue, int ltype, const char *rvalue, void *data, void *userdata); int config_parse_service_restart(const char *unit, const char *filename, unsigned line, const char *section, unsigned section_line, const char *lvalue, int ltype, const char *rvalue, void *data, void *userdata); int config_parse_socket_bindtodevice(const char *unit, const char *filename, unsigned line, const char *section, unsigned section_line, const char *lvalue, int ltype, const char *rvalue, void *data, void *userdata); +int config_parse_exec_output(const char *unit, const char *filename, unsigned line, const char *section, unsigned section_line, const char *lvalue, int ltype, const char *rvalue, void *data, void *userdata); int config_parse_output(const char *unit, const char *filename, unsigned line, const char *section, unsigned section_line, const char *lvalue, int ltype, const char *rvalue, void *data, void *userdata); +int config_parse_exec_input(const char *unit, const char *filename, unsigned line, const char *section, unsigned section_line, const char *lvalue, int ltype, const char *rvalue, void *data, void *userdata); int config_parse_input(const char *unit, const char *filename, unsigned line, const char *section, unsigned section_line, const char *lvalue, int ltype, const char *rvalue, void *data, void *userdata); int config_parse_exec_io_class(const char *unit, const char *filename, unsigned line, const char *section, unsigned section_line, const char *lvalue, int ltype, const char *rvalue, void *data, void *userdata); int config_parse_exec_io_priority(const char *unit, const char *filename, unsigned line, const char *section, unsigned section_line, const char *lvalue, int ltype, const char *rvalue, void *data, void *userdata); diff --git a/src/core/unit.c b/src/core/unit.c index b24ca5aed8..2fa397bd41 100644 --- a/src/core/unit.c +++ b/src/core/unit.c @@ -858,18 +858,14 @@ int unit_add_exec_dependencies(Unit *u, ExecContext *c) { return r; } - if (c->std_output != EXEC_OUTPUT_KMSG && - c->std_output != EXEC_OUTPUT_SYSLOG && - c->std_output != EXEC_OUTPUT_JOURNAL && - c->std_output != EXEC_OUTPUT_KMSG_AND_CONSOLE && - c->std_output != EXEC_OUTPUT_SYSLOG_AND_CONSOLE && - c->std_output != EXEC_OUTPUT_JOURNAL_AND_CONSOLE && - c->std_error != EXEC_OUTPUT_KMSG && - c->std_error != EXEC_OUTPUT_SYSLOG && - c->std_error != EXEC_OUTPUT_JOURNAL && - c->std_error != EXEC_OUTPUT_KMSG_AND_CONSOLE && - c->std_error != EXEC_OUTPUT_JOURNAL_AND_CONSOLE && - c->std_error != EXEC_OUTPUT_SYSLOG_AND_CONSOLE) + if (!IN_SET(c->std_output, + EXEC_OUTPUT_JOURNAL, EXEC_OUTPUT_JOURNAL_AND_CONSOLE, + EXEC_OUTPUT_KMSG, EXEC_OUTPUT_KMSG_AND_CONSOLE, + EXEC_OUTPUT_SYSLOG, EXEC_OUTPUT_SYSLOG_AND_CONSOLE) && + !IN_SET(c->std_error, + EXEC_OUTPUT_JOURNAL, EXEC_OUTPUT_JOURNAL_AND_CONSOLE, + EXEC_OUTPUT_KMSG, EXEC_OUTPUT_KMSG_AND_CONSOLE, + EXEC_OUTPUT_SYSLOG, EXEC_OUTPUT_SYSLOG_AND_CONSOLE)) return 0; /* If syslog or kernel logging is requested, make sure our own -- cgit v1.2.3-54-g00ecf From 4d885bd326d958827d926512ecc5e0d21f6ad76b Mon Sep 17 00:00:00 2001 From: Djalal Harouni Date: Sun, 23 Oct 2016 23:24:14 +0200 Subject: core: first lookup and cache creds then apply them after namespace setup This fixes: https://github.com/systemd/systemd/issues/4357 Let's lookup and cache creds then apply them. We also switch from getgroups() to getgrouplist(). --- src/core/execute.c | 213 ++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 144 insertions(+), 69 deletions(-) (limited to 'src/core/execute.c') diff --git a/src/core/execute.c b/src/core/execute.c index 1b7b4a928d..874f035b2e 100644 --- a/src/core/execute.c +++ b/src/core/execute.c @@ -730,74 +730,146 @@ static int ask_for_confirmation(char *response, char **argv) { return r; } -static int enforce_groups(const ExecContext *context, const char *username, gid_t gid) { - bool keep_groups = false; +static int get_fixed_user(const ExecContext *c, const char **user, + uid_t *uid, gid_t *gid, + const char **home, const char **shell) { int r; + const char *name; - assert(context); + assert(c); - /* Lookup and set GID and supplementary group list. Here too - * we avoid NSS lookups for gid=0. */ + if (!c->user) + return 0; - if (context->group || username) { - /* First step, initialize groups from /etc/groups */ - if (username && gid != 0) { - if (initgroups(username, gid) < 0) - return -errno; + /* Note that we don't set $HOME or $SHELL if they are not particularly enlightening anyway + * (i.e. are "/" or "/bin/nologin"). */ - keep_groups = true; - } + name = c->user; + r = get_user_creds_clean(&name, uid, gid, home, shell); + if (r < 0) + return r; - /* Second step, set our gids */ - if (setresgid(gid, gid, gid) < 0) + *user = name; + return 0; +} + +static int get_fixed_group(const ExecContext *c, const char **group, gid_t *gid) { + int r; + const char *name; + + assert(c); + + if (!c->group) + return 0; + + name = c->group; + r = get_group_creds(&name, gid); + if (r < 0) + return r; + + *group = name; + return 0; +} + +static int get_fixed_supplementary_groups(const ExecContext *c, + const char *user, + const char *group, + gid_t gid, + gid_t **supplementary_gids, int *ngids) { + char **i; + int r, k = 0; + int ngroups_max; + bool keep_groups = false; + gid_t *groups = NULL; + _cleanup_free_ gid_t *l_gids = NULL; + + assert(c); + + if (!c->supplementary_groups) + return 0; + + /* + * If user is given, then lookup GID and supplementary group list. + * We avoid NSS lookups for gid=0. + */ + if (user && gid_is_valid(gid) && gid != 0) { + /* First step, initialize groups from /etc/groups */ + if (initgroups(user, gid) < 0) return -errno; + + keep_groups = true; } - if (context->supplementary_groups) { - int ngroups_max, k; - gid_t *gids; - char **i; + assert_se((ngroups_max = (int) sysconf(_SC_NGROUPS_MAX)) > 0); - /* Final step, initialize any manually set supplementary groups */ - assert_se((ngroups_max = (int) sysconf(_SC_NGROUPS_MAX)) > 0); + l_gids = new(gid_t, ngroups_max); + if (!l_gids) + return -ENOMEM; - if (!(gids = new(gid_t, ngroups_max))) - return -ENOMEM; + if (keep_groups) { + /* + * Lookup the list of groups that the user belongs to, we + * avoid NSS lookups here too for gid=0. + */ + k = ngroups_max; + if (getgrouplist(user, gid, l_gids, &k) < 0) + return -EINVAL; + } else + k = 0; - if (keep_groups) { - k = getgroups(ngroups_max, gids); - if (k < 0) { - free(gids); - return -errno; - } - } else - k = 0; + STRV_FOREACH(i, c->supplementary_groups) { + const char *g; - STRV_FOREACH(i, context->supplementary_groups) { - const char *g; + if (k >= ngroups_max) + return -E2BIG; - if (k >= ngroups_max) { - free(gids); - return -E2BIG; - } + g = *i; + r = get_group_creds(&g, l_gids+k); + if (r < 0) + return r; - g = *i; - r = get_group_creds(&g, gids+k); - if (r < 0) { - free(gids); - return r; - } + k++; + } - k++; - } + /* + * Sets ngids to zero to drop all supplementary groups, happens + * when we are under root and SupplementaryGroups= is empty. + */ + if (k == 0) { + *ngids = 0; + return 0; + } - r = maybe_setgroups(k, gids); - if (r < 0) { - free(gids); + /* Otherwise get the final list of supplementary groups */ + groups = memdup(l_gids, sizeof(gid_t) * k); + if (!groups) + return -ENOMEM; + + *supplementary_gids = groups; + *ngids = k; + + groups = NULL; + + return 0; +} + +static int enforce_groups(const ExecContext *context, gid_t gid, + gid_t *supplementary_gids, int ngids) { + int r; + + assert(context); + + /* Handle SupplementaryGroups= even if it is empty */ + if (context->supplementary_groups) { + r = maybe_setgroups(ngids, supplementary_gids); + if (r < 0) return r; - } + } - free(gids); + if (gid_is_valid(gid)) { + /* Then set our gids */ + if (setresgid(gid, gid, gid) < 0) + return -errno; } return 0; @@ -806,6 +878,9 @@ static int enforce_groups(const ExecContext *context, const char *username, gid_ static int enforce_user(const ExecContext *context, uid_t uid) { assert(context); + if (!uid_is_valid(uid)) + return 0; + /* Sets (but doesn't look up) the uid and make sure we keep the * capabilities while doing so. */ @@ -2175,13 +2250,15 @@ static int exec_child( _cleanup_strv_free_ char **our_env = NULL, **pass_env = NULL, **accum_env = NULL, **final_argv = NULL; _cleanup_free_ char *mac_selinux_context_net = NULL; - const char *username = NULL, *home = NULL, *shell = NULL, *wd; + _cleanup_free_ gid_t *supplementary_gids = NULL; + const char *username = NULL, *groupname = NULL; + const char *home = NULL, *shell = NULL, *wd; dev_t journal_stream_dev = 0; ino_t journal_stream_ino = 0; bool needs_mount_namespace; uid_t uid = UID_INVALID; gid_t gid = GID_INVALID; - int i, r; + int i, r, ngids = 0; assert(unit); assert(command); @@ -2273,26 +2350,23 @@ static int exec_child( username = dcreds->user->name; } else { - if (context->user) { - username = context->user; - r = get_user_creds_clean(&username, &uid, &gid, &home, &shell); - if (r < 0) { - *exit_status = EXIT_USER; - return r; - } - - /* Note that we don't set $HOME or $SHELL if they are not particularly enlightening anyway - * (i.e. are "/" or "/bin/nologin"). */ + r = get_fixed_user(context, &username, &uid, &gid, &home, &shell); + if (r < 0) { + *exit_status = EXIT_USER; + return r; } - if (context->group) { - const char *g = context->group; + r = get_fixed_group(context, &groupname, &gid); + if (r < 0) { + *exit_status = EXIT_GROUP; + return r; + } - r = get_group_creds(&g, &gid); - if (r < 0) { - *exit_status = EXIT_GROUP; - return r; - } + r = get_fixed_supplementary_groups(context, username, groupname, + gid, &supplementary_gids, &ngids); + if (r < 0) { + *exit_status = EXIT_GROUP; + return r; } } @@ -2558,8 +2632,9 @@ static int exec_child( } } + /* Drop group as early as possbile */ if ((params->flags & EXEC_APPLY_PERMISSIONS) && !command->privileged) { - r = enforce_groups(context, username, gid); + r = enforce_groups(context, gid, supplementary_gids, ngids); if (r < 0) { *exit_status = EXIT_GROUP; return r; -- cgit v1.2.3-54-g00ecf From 8b6903ad4d0dc94cd0098f453a4ea8ab24a4a3f7 Mon Sep 17 00:00:00 2001 From: Djalal Harouni Date: Fri, 21 Oct 2016 22:22:56 +0200 Subject: core: lets move the setup of working directory before group enforce This is minor but lets try to split and move bit by bit cgroups and portable environment setup before applying the security context. --- src/core/execute.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'src/core/execute.c') diff --git a/src/core/execute.c b/src/core/execute.c index 874f035b2e..a9b2b8f299 100644 --- a/src/core/execute.c +++ b/src/core/execute.c @@ -2632,6 +2632,13 @@ static int exec_child( } } + if (context->working_directory_home) + wd = home; + else if (context->working_directory) + wd = context->working_directory; + else + wd = "/"; + /* Drop group as early as possbile */ if ((params->flags & EXEC_APPLY_PERMISSIONS) && !command->privileged) { r = enforce_groups(context, gid, supplementary_gids, ngids); @@ -2641,13 +2648,6 @@ static int exec_child( } } - if (context->working_directory_home) - wd = home; - else if (context->working_directory) - wd = context->working_directory; - else - wd = "/"; - if (params->flags & EXEC_APPLY_CHROOT) { if (!needs_mount_namespace && context->root_directory) if (chroot(context->root_directory) < 0) { -- cgit v1.2.3-54-g00ecf From 366ddd252ed25397ead209228b48c5eef93ced2e Mon Sep 17 00:00:00 2001 From: Djalal Harouni Date: Mon, 24 Oct 2016 13:13:06 +0200 Subject: core: do not assert when sysconf(_SC_NGROUPS_MAX) fails (#4466) Remove the assert and check the return code of sysconf(_SC_NGROUPS_MAX). _SC_NGROUPS_MAX maps to NGROUPS_MAX which is defined in to 65536 these days. The value is a sysctl read-only /proc/sys/kernel/ngroups_max and the kernel assumes that it is always positive otherwise things may break. Follow this and support only positive values for all other case return either -errno or -EOPNOTSUPP. Now if there are systems that want to re-write NGROUPS_MAX then they should not pass SupplementaryGroups= in units even if it is empty, in this case nothing fails and we just ignore supplementary groups. However if SupplementaryGroups= is passed even if it is empty we have to assume that there will be groups manipulation from our side or the kernel and since the kernel always assumes that NGROUPS_MAX is positive, then follow that and support only positive values. --- src/core/execute.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) (limited to 'src/core/execute.c') diff --git a/src/core/execute.c b/src/core/execute.c index a9b2b8f299..53356c3c06 100644 --- a/src/core/execute.c +++ b/src/core/execute.c @@ -788,6 +788,19 @@ static int get_fixed_supplementary_groups(const ExecContext *c, if (!c->supplementary_groups) return 0; + /* + * If SupplementaryGroups= was passed then NGROUPS_MAX has to + * be positive, otherwise fail. + */ + errno = 0; + ngroups_max = (int) sysconf(_SC_NGROUPS_MAX); + if (ngroups_max <= 0) { + if (errno > 0) + return -errno; + else + return -EOPNOTSUPP; /* For all other values */ + } + /* * If user is given, then lookup GID and supplementary group list. * We avoid NSS lookups for gid=0. @@ -800,8 +813,6 @@ static int get_fixed_supplementary_groups(const ExecContext *c, keep_groups = true; } - assert_se((ngroups_max = (int) sysconf(_SC_NGROUPS_MAX)) > 0); - l_gids = new(gid_t, ngroups_max); if (!l_gids) return -ENOMEM; -- cgit v1.2.3-54-g00ecf From f673b62df67dfa67ddfa4f5a72d78ea3c002278f Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Fri, 21 Oct 2016 20:03:51 +0200 Subject: core: simplify skip_seccomp_unavailable() a bit Let's prefer early-exit over deep-indented if blocks. Not behavioural change. --- src/core/execute.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'src/core/execute.c') diff --git a/src/core/execute.c b/src/core/execute.c index 53356c3c06..b69297241b 100644 --- a/src/core/execute.c +++ b/src/core/execute.c @@ -1185,13 +1185,14 @@ static void rename_process_from_path(const char *path) { #ifdef HAVE_SECCOMP static bool skip_seccomp_unavailable(const Unit* u, const char* msg) { - if (!is_seccomp_available()) { - log_open(); - log_unit_debug(u, "SECCOMP features not detected in the kernel, skipping %s", msg); - log_close(); - return true; - } - return false; + + if (is_seccomp_available()) + return false; + + log_open(); + log_unit_debug(u, "SECCOMP features not detected in the kernel, skipping %s", msg); + log_close(); + return true; } static int apply_seccomp(const Unit* u, const ExecContext *c) { -- cgit v1.2.3-54-g00ecf From e0f3720e399573134657458f4c8bd20c68fc092a Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Fri, 21 Oct 2016 20:05:49 +0200 Subject: core: move misplaced comment to the right place --- src/core/execute.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'src/core/execute.c') diff --git a/src/core/execute.c b/src/core/execute.c index b69297241b..e63a12f934 100644 --- a/src/core/execute.c +++ b/src/core/execute.c @@ -1891,9 +1891,9 @@ static int setup_private_users(uid_t uid, gid_t gid) { asprintf(&uid_map, "0 0 1\n" /* Map root → root */ UID_FMT " " UID_FMT " 1\n", /* Map $UID → $UID */ - uid, uid); /* The case where the above is the same */ + uid, uid); else - uid_map = strdup("0 0 1\n"); + uid_map = strdup("0 0 1\n"); /* The case where the above is the same */ if (!uid_map) return -ENOMEM; -- cgit v1.2.3-54-g00ecf From 8130926d32d76193e98ba783ba932816f276bfad Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Fri, 21 Oct 2016 21:50:05 +0200 Subject: core: rework syscall filter set handling A variety of fixes: - rename the SystemCallFilterSet structure to SyscallFilterSet. So far the main instance of it (the syscall_filter_sets[] array) used to abbreviate "SystemCall" as "Syscall". Let's stick to one of the two syntaxes, and not mix and match too wildly. Let's pick the shorter name in this case, as it is sufficiently well established to not confuse hackers reading this. - Export explicit indexes into the syscall_filter_sets[] array via an enum. This way, code that wants to make use of a specific filter set, can index it directly via the enum, instead of having to search for it. This makes apply_private_devices() in particular a lot simpler. - Provide two new helper calls in seccomp-util.c: syscall_filter_set_find() to find a set by its name, seccomp_add_syscall_filter_set() to add a set to a seccomp object. - Update SystemCallFilter= parser to use extract_first_word(). Let's work on deprecating FOREACH_WORD_QUOTED(). - Simplify apply_private_devices() using this functionality --- src/core/execute.c | 41 +-------------- src/core/load-fragment.c | 54 ++++++++++--------- src/shared/seccomp-util.c | 128 ++++++++++++++++++++++++++++++++++------------ src/shared/seccomp-util.h | 32 ++++++++++-- 4 files changed, 155 insertions(+), 100 deletions(-) (limited to 'src/core/execute.c') diff --git a/src/core/execute.c b/src/core/execute.c index e63a12f934..18bb67cda9 100644 --- a/src/core/execute.c +++ b/src/core/execute.c @@ -1578,10 +1578,7 @@ finish: } static int apply_private_devices(Unit *u, const ExecContext *c) { - const SystemCallFilterSet *set; scmp_filter_ctx *seccomp; - const char *sys; - bool syscalls_found = false; int r; assert(c); @@ -1599,43 +1596,9 @@ static int apply_private_devices(Unit *u, const ExecContext *c) { if (r < 0) goto finish; - for (set = syscall_filter_sets; set->set_name; set++) - if (streq(set->set_name, "@raw-io")) { - syscalls_found = true; - break; - } - - /* We should never fail here */ - if (!syscalls_found) { - r = -EOPNOTSUPP; + r = seccomp_add_syscall_filter_set(seccomp, syscall_filter_sets + SYSCALL_FILTER_SET_RAW_IO, SCMP_ACT_ERRNO(EPERM)); + if (r < 0) goto finish; - } - - NULSTR_FOREACH(sys, set->value) { - int id; - bool add = true; - -#ifndef __NR_s390_pci_mmio_read - if (streq(sys, "s390_pci_mmio_read")) - add = false; -#endif -#ifndef __NR_s390_pci_mmio_write - if (streq(sys, "s390_pci_mmio_write")) - add = false; -#endif - - if (!add) - continue; - - id = seccomp_syscall_resolve_name(sys); - - r = seccomp_rule_add( - seccomp, - SCMP_ACT_ERRNO(EPERM), - id, 0); - if (r < 0) - goto finish; - } r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0); if (r < 0) diff --git a/src/core/load-fragment.c b/src/core/load-fragment.c index 6f68e23340..118b39c1cf 100644 --- a/src/core/load-fragment.c +++ b/src/core/load-fragment.c @@ -2618,6 +2618,7 @@ int config_parse_documentation(const char *unit, } #ifdef HAVE_SECCOMP + static int syscall_filter_parse_one( const char *unit, const char *filename, @@ -2628,27 +2629,29 @@ static int syscall_filter_parse_one( bool warn) { int r; - if (*t == '@') { - const SystemCallFilterSet *set; + if (t[0] == '@') { + const SyscallFilterSet *set; + const char *i; - for (set = syscall_filter_sets; set->set_name; set++) - if (streq(set->set_name, t)) { - const char *sys; + set = syscall_filter_set_find(t); + if (!set) { + if (warn) + log_syntax(unit, LOG_WARNING, filename, line, 0, "Don't know system call group, ignoring: %s", t); + return 0; + } - NULSTR_FOREACH(sys, set->value) { - r = syscall_filter_parse_one(unit, filename, line, c, invert, sys, false); - if (r < 0) - return r; - } - break; - } + NULSTR_FOREACH(i, set->value) { + r = syscall_filter_parse_one(unit, filename, line, c, invert, i, false); + if (r < 0) + return r; + } } else { int id; id = seccomp_syscall_resolve_name(t); if (id == __NR_SCMP_ERROR) { if (warn) - log_syntax(unit, LOG_ERR, filename, line, 0, "Failed to parse system call, ignoring: %s", t); + log_syntax(unit, LOG_WARNING, filename, line, 0, "Failed to parse system call, ignoring: %s", t); return 0; } @@ -2662,8 +2665,9 @@ static int syscall_filter_parse_one( if (r < 0) return log_oom(); } else - set_remove(c->syscall_filter, INT_TO_PTR(id + 1)); + (void) set_remove(c->syscall_filter, INT_TO_PTR(id + 1)); } + return 0; } @@ -2682,8 +2686,7 @@ int config_parse_syscall_filter( ExecContext *c = data; Unit *u = userdata; bool invert = false; - const char *word, *state; - size_t l; + const char *p; int r; assert(filename); @@ -2722,19 +2725,24 @@ int config_parse_syscall_filter( } } - FOREACH_WORD_QUOTED(word, l, rvalue, state) { - _cleanup_free_ char *t = NULL; + p = rvalue; + for (;;) { + _cleanup_free_ char *word = NULL; - t = strndup(word, l); - if (!t) + r = extract_first_word(&p, &word, NULL, 0); + if (r == 0) + break; + if (r == -ENOMEM) return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Invalid syntax, ignoring: %s", rvalue); + break; + } - r = syscall_filter_parse_one(unit, filename, line, c, invert, t, true); + r = syscall_filter_parse_one(unit, filename, line, c, invert, word, true); if (r < 0) return r; } - if (!isempty(state)) - log_syntax(unit, LOG_ERR, filename, line, 0, "Trailing garbage, ignoring."); /* Turn on NNP, but only if it wasn't configured explicitly * before, and only if we are in user mode. */ diff --git a/src/shared/seccomp-util.c b/src/shared/seccomp-util.c index 8116c7671f..1d51f3fd1f 100644 --- a/src/shared/seccomp-util.c +++ b/src/shared/seccomp-util.c @@ -26,6 +26,7 @@ #include "macro.h" #include "seccomp-util.h" #include "string-util.h" +#include "util.h" const char* seccomp_arch_to_string(uint32_t c) { @@ -132,28 +133,30 @@ bool is_seccomp_available(void) { return cached_enabled; } -const SystemCallFilterSet syscall_filter_sets[] = { - { +const SyscallFilterSet syscall_filter_sets[_SYSCALL_FILTER_SET_MAX] = { + [SYSCALL_FILTER_SET_CLOCK] = { /* Clock */ - .set_name = "@clock", + .name = "@clock", .value = "adjtimex\0" "clock_adjtime\0" "clock_settime\0" "settimeofday\0" "stime\0" - }, { + }, + [SYSCALL_FILTER_SET_CPU_EMULATION] = { /* CPU emulation calls */ - .set_name = "@cpu-emulation", + .name = "@cpu-emulation", .value = "modify_ldt\0" "subpage_prot\0" "switch_endian\0" "vm86\0" "vm86old\0" - }, { + }, + [SYSCALL_FILTER_SET_DEBUG] = { /* Debugging/Performance Monitoring/Tracing */ - .set_name = "@debug", + .name = "@debug", .value = "lookup_dcookie\0" "perf_event_open\0" @@ -161,11 +164,14 @@ const SystemCallFilterSet syscall_filter_sets[] = { "process_vm_writev\0" "ptrace\0" "rtas\0" +#ifdef __NR_s390_runtime_instr "s390_runtime_instr\0" +#endif "sys_debug_setcontext\0" - }, { + }, + [SYSCALL_FILTER_SET_DEFAULT] = { /* Default list */ - .set_name = "@default", + .name = "@default", .value = "execve\0" "exit\0" @@ -173,9 +179,10 @@ const SystemCallFilterSet syscall_filter_sets[] = { "getrlimit\0" /* make sure processes can query stack size and such */ "rt_sigreturn\0" "sigreturn\0" - }, { + }, + [SYSCALL_FILTER_SET_IO_EVENT] = { /* Event loop use */ - .set_name = "@io-event", + .name = "@io-event", .value = "_newselect\0" "epoll_create1\0" @@ -191,9 +198,10 @@ const SystemCallFilterSet syscall_filter_sets[] = { "ppoll\0" "pselect6\0" "select\0" - }, { + }, + [SYSCALL_FILTER_SET_IPC] = { /* Message queues, SYSV IPC or other IPC: unusual */ - .set_name = "@ipc", + .name = "@ipc", .value = "ipc\0" "mq_getsetattr\0" "mq_notify\0" @@ -215,23 +223,26 @@ const SystemCallFilterSet syscall_filter_sets[] = { "shmctl\0" "shmdt\0" "shmget\0" - }, { + }, + [SYSCALL_FILTER_SET_KEYRING] = { /* Keyring */ - .set_name = "@keyring", + .name = "@keyring", .value = "add_key\0" "keyctl\0" "request_key\0" - }, { + }, + [SYSCALL_FILTER_SET_MODULE] = { /* Kernel module control */ - .set_name = "@module", + .name = "@module", .value = "delete_module\0" "finit_module\0" "init_module\0" - }, { + }, + [SYSCALL_FILTER_SET_MOUNT] = { /* Mounting */ - .set_name = "@mount", + .name = "@mount", .value = "chroot\0" "mount\0" @@ -239,9 +250,10 @@ const SystemCallFilterSet syscall_filter_sets[] = { "pivot_root\0" "umount2\0" "umount\0" - }, { + }, + [SYSCALL_FILTER_SET_NETWORK_IO] = { /* Network or Unix socket IO, should not be needed if not network facing */ - .set_name = "@network-io", + .name = "@network-io", .value = "accept4\0" "accept\0" @@ -264,9 +276,10 @@ const SystemCallFilterSet syscall_filter_sets[] = { "socket\0" "socketcall\0" "socketpair\0" - }, { + }, + [SYSCALL_FILTER_SET_OBSOLETE] = { /* Unusual, obsolete or unimplemented, some unknown even to libseccomp */ - .set_name = "@obsolete", + .name = "@obsolete", .value = "_sysctl\0" "afs_syscall\0" @@ -292,9 +305,10 @@ const SystemCallFilterSet syscall_filter_sets[] = { "uselib\0" "ustat\0" "vserver\0" - }, { + }, + [SYSCALL_FILTER_SET_PRIVILEGED] = { /* Nice grab-bag of all system calls which need superuser capabilities */ - .set_name = "@privileged", + .name = "@privileged", .value = "@clock\0" "@module\0" @@ -333,9 +347,10 @@ const SystemCallFilterSet syscall_filter_sets[] = { "swapon\0" "sysctl\0" "vhangup\0" - }, { + }, + [SYSCALL_FILTER_SET_PROCESS] = { /* Process control, execution, namespaces */ - .set_name = "@process", + .name = "@process", .value = "arch_prctl\0" "clone\0" @@ -349,19 +364,66 @@ const SystemCallFilterSet syscall_filter_sets[] = { "tkill\0" "unshare\0" "vfork\0" - }, { + }, + [SYSCALL_FILTER_SET_RAW_IO] = { /* Raw I/O ports */ - .set_name = "@raw-io", + .name = "@raw-io", .value = "ioperm\0" "iopl\0" "pciconfig_iobase\0" "pciconfig_read\0" "pciconfig_write\0" +#ifdef __NR_s390_pci_mmio_read "s390_pci_mmio_read\0" +#endif +#ifdef __NR_s390_pci_mmio_write "s390_pci_mmio_write\0" - }, { - .set_name = NULL, - .value = NULL - } +#endif + }, }; + +const SyscallFilterSet *syscall_filter_set_find(const char *name) { + unsigned i; + + if (isempty(name) || name[0] != '@') + return NULL; + + for (i = 0; i < _SYSCALL_FILTER_SET_MAX; i++) + if (streq(syscall_filter_sets[i].name, name)) + return syscall_filter_sets + i; + + return NULL; +} + +int seccomp_add_syscall_filter_set(scmp_filter_ctx seccomp, const SyscallFilterSet *set, uint32_t action) { + const char *sys; + int r; + + assert(seccomp); + assert(set); + + NULSTR_FOREACH(sys, set->value) { + int id; + + if (sys[0] == '@') { + const SyscallFilterSet *other; + + other = syscall_filter_set_find(sys); + if (!other) + return -EINVAL; + + r = seccomp_add_syscall_filter_set(seccomp, other, action); + } else { + id = seccomp_syscall_resolve_name(sys); + if (id == __NR_SCMP_ERROR) + return -EINVAL; + + r = seccomp_rule_add(seccomp, action, id, 0); + } + if (r < 0) + return r; + } + + return 0; +} diff --git a/src/shared/seccomp-util.h b/src/shared/seccomp-util.h index cca7c17912..34fd49c122 100644 --- a/src/shared/seccomp-util.h +++ b/src/shared/seccomp-util.h @@ -29,9 +29,31 @@ int seccomp_add_secondary_archs(scmp_filter_ctx *c); bool is_seccomp_available(void); -typedef struct SystemCallFilterSet { - const char *set_name; +typedef struct SyscallFilterSet { + const char *name; const char *value; -} SystemCallFilterSet; - -extern const SystemCallFilterSet syscall_filter_sets[]; +} SyscallFilterSet; + +enum { + SYSCALL_FILTER_SET_CLOCK, + SYSCALL_FILTER_SET_CPU_EMULATION, + SYSCALL_FILTER_SET_DEBUG, + SYSCALL_FILTER_SET_DEFAULT, + SYSCALL_FILTER_SET_IO_EVENT, + SYSCALL_FILTER_SET_IPC, + SYSCALL_FILTER_SET_KEYRING, + SYSCALL_FILTER_SET_MODULE, + SYSCALL_FILTER_SET_MOUNT, + SYSCALL_FILTER_SET_NETWORK_IO, + SYSCALL_FILTER_SET_OBSOLETE, + SYSCALL_FILTER_SET_PRIVILEGED, + SYSCALL_FILTER_SET_PROCESS, + SYSCALL_FILTER_SET_RAW_IO, + _SYSCALL_FILTER_SET_MAX +}; + +extern const SyscallFilterSet syscall_filter_sets[]; + +const SyscallFilterSet *syscall_filter_set_find(const char *name); + +int seccomp_add_syscall_filter_set(scmp_filter_ctx seccomp, const SyscallFilterSet *set, uint32_t action); -- cgit v1.2.3-54-g00ecf From 25a8d8a0cb297f75b6b9fd3cc15747ba7f56031e Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Fri, 21 Oct 2016 20:12:33 +0200 Subject: core: rework apply_protect_kernel_modules() to use seccomp_add_syscall_filter_set() Let's simplify this call, by making use of the new infrastructure. This is actually more in line with Djalal's original patch but instead of search the filter set in the array by its name we can now use the set index and jump directly to it. --- src/core/execute.c | 18 +++++------------- 1 file changed, 5 insertions(+), 13 deletions(-) (limited to 'src/core/execute.c') diff --git a/src/core/execute.c b/src/core/execute.c index 18bb67cda9..f435a079c7 100644 --- a/src/core/execute.c +++ b/src/core/execute.c @@ -1534,19 +1534,14 @@ finish: } static int apply_protect_kernel_modules(Unit *u, const ExecContext *c) { - static const int module_syscalls[] = { - SCMP_SYS(delete_module), - SCMP_SYS(finit_module), - SCMP_SYS(init_module), - }; scmp_filter_ctx *seccomp; - unsigned i; + const char *sys; int r; assert(c); - /* Turn of module syscalls on ProtectKernelModules=yes */ + /* Turn off module syscalls on ProtectKernelModules=yes */ if (skip_seccomp_unavailable(u, "ProtectKernelModules=")) return 0; @@ -1559,12 +1554,9 @@ static int apply_protect_kernel_modules(Unit *u, const ExecContext *c) { if (r < 0) goto finish; - for (i = 0; i < ELEMENTSOF(module_syscalls); i++) { - r = seccomp_rule_add(seccomp, SCMP_ACT_ERRNO(EPERM), - module_syscalls[i], 0); - if (r < 0) - goto finish; - } + r = seccomp_add_syscall_filter_set(seccomp, syscall_filter_sets + SYSCALL_FILTER_SET_MODULE, SCMP_ACT_ERRNO(EPERM)); + if (r < 0) + goto finish; r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0); if (r < 0) -- cgit v1.2.3-54-g00ecf From 8d7b0c8fd780e88ab5a6d1d79e09e27247245bee Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Fri, 21 Oct 2016 20:28:05 +0200 Subject: seccomp: add new seccomp_init_conservative() helper This adds a new seccomp_init_conservative() helper call that is mostly just a wrapper around seccomp_init(), but turns off NNP and adds in all secondary archs, for best compatibility with everything else. Pretty much all of our code used the very same constructs for these three steps, hence unifying this in one small function makes things a lot shorter. This also changes incorrect usage of the "scmp_filter_ctx" type at various places. libseccomp defines it as typedef to "void*", i.e. it is a pointer type (pretty poor choice already!) that casts implicitly to and from all other pointer types (even poorer choice: you defined a confusing type now, and don't even gain any bit of type safety through it...). A lot of the code assumed the type would refer to a structure, and hence aded additional "*" here and there. Remove that. --- src/core/execute.c | 88 ++++++++++----------------------------------- src/nspawn/nspawn-seccomp.c | 18 ++-------- src/shared/seccomp-util.c | 30 ++++++++++++++-- src/shared/seccomp-util.h | 4 ++- 4 files changed, 53 insertions(+), 87 deletions(-) (limited to 'src/core/execute.c') diff --git a/src/core/execute.c b/src/core/execute.c index f435a079c7..668504c5cf 100644 --- a/src/core/execute.c +++ b/src/core/execute.c @@ -1197,7 +1197,7 @@ static bool skip_seccomp_unavailable(const Unit* u, const char* msg) { static int apply_seccomp(const Unit* u, const ExecContext *c) { uint32_t negative_action, action; - scmp_filter_ctx *seccomp; + scmp_filter_ctx seccomp; Iterator i; void *id; int r; @@ -1248,7 +1248,7 @@ finish: } static int apply_address_families(const Unit* u, const ExecContext *c) { - scmp_filter_ctx *seccomp; + scmp_filter_ctx seccomp; Iterator i; int r; @@ -1257,13 +1257,9 @@ static int apply_address_families(const Unit* u, const ExecContext *c) { if (skip_seccomp_unavailable(u, "RestrictAddressFamilies=")) return 0; - seccomp = seccomp_init(SCMP_ACT_ALLOW); - if (!seccomp) - return -ENOMEM; - - r = seccomp_add_secondary_archs(seccomp); + r = seccomp_init_conservative(&seccomp, SCMP_ACT_ALLOW); if (r < 0) - goto finish; + return r; if (c->address_families_whitelist) { int af, first = 0, last = 0; @@ -1360,10 +1356,6 @@ static int apply_address_families(const Unit* u, const ExecContext *c) { } } - r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0); - if (r < 0) - goto finish; - r = seccomp_load(seccomp); finish: @@ -1372,7 +1364,7 @@ finish: } static int apply_memory_deny_write_execute(const Unit* u, const ExecContext *c) { - scmp_filter_ctx *seccomp; + scmp_filter_ctx seccomp; int r; assert(c); @@ -1380,13 +1372,9 @@ static int apply_memory_deny_write_execute(const Unit* u, const ExecContext *c) if (skip_seccomp_unavailable(u, "MemoryDenyWriteExecute=")) return 0; - seccomp = seccomp_init(SCMP_ACT_ALLOW); - if (!seccomp) - return -ENOMEM; - - r = seccomp_add_secondary_archs(seccomp); + r = seccomp_init_conservative(&seccomp, SCMP_ACT_ALLOW); if (r < 0) - goto finish; + return r; r = seccomp_rule_add( seccomp, @@ -1406,10 +1394,6 @@ static int apply_memory_deny_write_execute(const Unit* u, const ExecContext *c) if (r < 0) goto finish; - r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0); - if (r < 0) - goto finish; - r = seccomp_load(seccomp); finish: @@ -1424,7 +1408,7 @@ static int apply_restrict_realtime(const Unit* u, const ExecContext *c) { SCHED_IDLE, }; - scmp_filter_ctx *seccomp; + scmp_filter_ctx seccomp; unsigned i; int r, p, max_policy = 0; @@ -1433,13 +1417,9 @@ static int apply_restrict_realtime(const Unit* u, const ExecContext *c) { if (skip_seccomp_unavailable(u, "RestrictRealtime=")) return 0; - seccomp = seccomp_init(SCMP_ACT_ALLOW); - if (!seccomp) - return -ENOMEM; - - r = seccomp_add_secondary_archs(seccomp); + r = seccomp_init_conservative(&seccomp, SCMP_ACT_ALLOW); if (r < 0) - goto finish; + return r; /* Determine the highest policy constant we want to allow */ for (i = 0; i < ELEMENTSOF(permitted_policies); i++) @@ -1483,10 +1463,6 @@ static int apply_restrict_realtime(const Unit* u, const ExecContext *c) { if (r < 0) goto finish; - r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0); - if (r < 0) - goto finish; - r = seccomp_load(seccomp); finish: @@ -1495,7 +1471,7 @@ finish: } static int apply_protect_sysctl(Unit *u, const ExecContext *c) { - scmp_filter_ctx *seccomp; + scmp_filter_ctx seccomp; int r; assert(c); @@ -1506,13 +1482,9 @@ static int apply_protect_sysctl(Unit *u, const ExecContext *c) { if (skip_seccomp_unavailable(u, "ProtectKernelTunables=")) return 0; - seccomp = seccomp_init(SCMP_ACT_ALLOW); - if (!seccomp) - return -ENOMEM; - - r = seccomp_add_secondary_archs(seccomp); + r = seccomp_init_conservative(&seccomp, SCMP_ACT_ALLOW); if (r < 0) - goto finish; + return r; r = seccomp_rule_add( seccomp, @@ -1522,10 +1494,6 @@ static int apply_protect_sysctl(Unit *u, const ExecContext *c) { if (r < 0) goto finish; - r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0); - if (r < 0) - goto finish; - r = seccomp_load(seccomp); finish: @@ -1534,9 +1502,7 @@ finish: } static int apply_protect_kernel_modules(Unit *u, const ExecContext *c) { - - scmp_filter_ctx *seccomp; - const char *sys; + scmp_filter_ctx seccomp; int r; assert(c); @@ -1546,22 +1512,14 @@ static int apply_protect_kernel_modules(Unit *u, const ExecContext *c) { if (skip_seccomp_unavailable(u, "ProtectKernelModules=")) return 0; - seccomp = seccomp_init(SCMP_ACT_ALLOW); - if (!seccomp) - return -ENOMEM; - - r = seccomp_add_secondary_archs(seccomp); + r = seccomp_init_conservative(&seccomp, SCMP_ACT_ALLOW); if (r < 0) - goto finish; + return r; r = seccomp_add_syscall_filter_set(seccomp, syscall_filter_sets + SYSCALL_FILTER_SET_MODULE, SCMP_ACT_ERRNO(EPERM)); if (r < 0) goto finish; - r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0); - if (r < 0) - goto finish; - r = seccomp_load(seccomp); finish: @@ -1570,7 +1528,7 @@ finish: } static int apply_private_devices(Unit *u, const ExecContext *c) { - scmp_filter_ctx *seccomp; + scmp_filter_ctx seccomp; int r; assert(c); @@ -1580,22 +1538,14 @@ static int apply_private_devices(Unit *u, const ExecContext *c) { if (skip_seccomp_unavailable(u, "PrivateDevices=")) return 0; - seccomp = seccomp_init(SCMP_ACT_ALLOW); - if (!seccomp) - return -ENOMEM; - - r = seccomp_add_secondary_archs(seccomp); + r = seccomp_init_conservative(&seccomp, SCMP_ACT_ALLOW); if (r < 0) - goto finish; + return r; r = seccomp_add_syscall_filter_set(seccomp, syscall_filter_sets + SYSCALL_FILTER_SET_RAW_IO, SCMP_ACT_ERRNO(EPERM)); if (r < 0) goto finish; - r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0); - if (r < 0) - goto finish; - r = seccomp_load(seccomp); finish: diff --git a/src/nspawn/nspawn-seccomp.c b/src/nspawn/nspawn-seccomp.c index 44a0b397ab..03a397d30c 100644 --- a/src/nspawn/nspawn-seccomp.c +++ b/src/nspawn/nspawn-seccomp.c @@ -135,15 +135,9 @@ int setup_seccomp(uint64_t cap_list_retain) { return 0; } - seccomp = seccomp_init(SCMP_ACT_ALLOW); - if (!seccomp) - return log_oom(); - - r = seccomp_add_secondary_archs(seccomp); - if (r < 0) { - log_error_errno(r, "Failed to add secondary archs to seccomp filter: %m"); - goto finish; - } + r = seccomp_init_conservative(&seccomp, SCMP_ACT_ALLOW); + if (r < 0) + return log_error_errno(r, "Failed to allocate seccomp object: %m"); r = seccomp_add_default_syscall_filter(seccomp, cap_list_retain); if (r < 0) @@ -171,12 +165,6 @@ int setup_seccomp(uint64_t cap_list_retain) { goto finish; } - r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0); - if (r < 0) { - log_error_errno(r, "Failed to unset NO_NEW_PRIVS: %m"); - goto finish; - } - r = seccomp_load(seccomp); if (r < 0) { log_error_errno(r, "Failed to install seccomp audit filter: %m"); diff --git a/src/shared/seccomp-util.c b/src/shared/seccomp-util.c index 1d51f3fd1f..0b9fa47c44 100644 --- a/src/shared/seccomp-util.c +++ b/src/shared/seccomp-util.c @@ -74,7 +74,34 @@ int seccomp_arch_from_string(const char *n, uint32_t *ret) { return 0; } -int seccomp_add_secondary_archs(scmp_filter_ctx *c) { +int seccomp_init_conservative(scmp_filter_ctx *ret, uint32_t default_action) { + scmp_filter_ctx seccomp; + int r; + + /* Much like seccomp_init(), but tries to be a bit more conservative in its defaults: all secondary archs are + * added by default, and NNP is turned off. */ + + seccomp = seccomp_init(default_action); + if (!seccomp) + return -ENOMEM; + + r = seccomp_add_secondary_archs(seccomp); + if (r < 0) + goto finish; + + r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0); + if (r < 0) + goto finish; + + *ret = seccomp; + return 0; + +finish: + seccomp_release(seccomp); + return r; +} + +int seccomp_add_secondary_archs(scmp_filter_ctx c) { #if defined(__i386__) || defined(__x86_64__) int r; @@ -111,7 +138,6 @@ int seccomp_add_secondary_archs(scmp_filter_ctx *c) { #endif return 0; - } static bool is_basic_seccomp_available(void) { diff --git a/src/shared/seccomp-util.h b/src/shared/seccomp-util.h index 34fd49c122..2de429a772 100644 --- a/src/shared/seccomp-util.h +++ b/src/shared/seccomp-util.h @@ -25,7 +25,9 @@ const char* seccomp_arch_to_string(uint32_t c); int seccomp_arch_from_string(const char *n, uint32_t *ret); -int seccomp_add_secondary_archs(scmp_filter_ctx *c); +int seccomp_init_conservative(scmp_filter_ctx *ret, uint32_t default_action); + +int seccomp_add_secondary_archs(scmp_filter_ctx c); bool is_seccomp_available(void); -- cgit v1.2.3-54-g00ecf From a3be2849b2570482757f83181b999febbfc7bbef Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Fri, 21 Oct 2016 21:18:46 +0200 Subject: seccomp: add new helper call seccomp_load_filter_set() This allows us to unify most of the code in apply_protect_kernel_modules() and apply_private_devices(). --- src/core/execute.c | 34 ++-------------------------------- src/shared/seccomp-util.c | 24 ++++++++++++++++++++++++ src/shared/seccomp-util.h | 2 ++ 3 files changed, 28 insertions(+), 32 deletions(-) (limited to 'src/core/execute.c') diff --git a/src/core/execute.c b/src/core/execute.c index 668504c5cf..5e7d7c25d7 100644 --- a/src/core/execute.c +++ b/src/core/execute.c @@ -1502,9 +1502,6 @@ finish: } static int apply_protect_kernel_modules(Unit *u, const ExecContext *c) { - scmp_filter_ctx seccomp; - int r; - assert(c); /* Turn off module syscalls on ProtectKernelModules=yes */ @@ -1512,25 +1509,10 @@ static int apply_protect_kernel_modules(Unit *u, const ExecContext *c) { if (skip_seccomp_unavailable(u, "ProtectKernelModules=")) return 0; - r = seccomp_init_conservative(&seccomp, SCMP_ACT_ALLOW); - if (r < 0) - return r; - - r = seccomp_add_syscall_filter_set(seccomp, syscall_filter_sets + SYSCALL_FILTER_SET_MODULE, SCMP_ACT_ERRNO(EPERM)); - if (r < 0) - goto finish; - - r = seccomp_load(seccomp); - -finish: - seccomp_release(seccomp); - return r; + return seccomp_load_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_MODULE, SCMP_ACT_ERRNO(EPERM)); } static int apply_private_devices(Unit *u, const ExecContext *c) { - scmp_filter_ctx seccomp; - int r; - assert(c); /* If PrivateDevices= is set, also turn off iopl and all @raw-io syscalls. */ @@ -1538,19 +1520,7 @@ static int apply_private_devices(Unit *u, const ExecContext *c) { if (skip_seccomp_unavailable(u, "PrivateDevices=")) return 0; - r = seccomp_init_conservative(&seccomp, SCMP_ACT_ALLOW); - if (r < 0) - return r; - - r = seccomp_add_syscall_filter_set(seccomp, syscall_filter_sets + SYSCALL_FILTER_SET_RAW_IO, SCMP_ACT_ERRNO(EPERM)); - if (r < 0) - goto finish; - - r = seccomp_load(seccomp); - -finish: - seccomp_release(seccomp); - return r; + return seccomp_load_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_RAW_IO, SCMP_ACT_ERRNO(EPERM)); } #endif diff --git a/src/shared/seccomp-util.c b/src/shared/seccomp-util.c index f1e9de05b2..6252cd16a6 100644 --- a/src/shared/seccomp-util.c +++ b/src/shared/seccomp-util.c @@ -452,3 +452,27 @@ int seccomp_add_syscall_filter_set(scmp_filter_ctx seccomp, const SyscallFilterS return 0; } + +int seccomp_load_filter_set(uint32_t default_action, const SyscallFilterSet *set, uint32_t action) { + scmp_filter_ctx seccomp; + int r; + + assert(set); + + /* The one-stop solution: allocate a seccomp object, add a filter to it, and apply it */ + + r = seccomp_init_conservative(&seccomp, default_action); + if (r < 0) + return r; + + r = seccomp_add_syscall_filter_set(seccomp, set, action); + if (r < 0) + goto finish; + + r = seccomp_load(seccomp); + +finish: + seccomp_release(seccomp); + return r; + +} diff --git a/src/shared/seccomp-util.h b/src/shared/seccomp-util.h index 2de429a772..667687b14f 100644 --- a/src/shared/seccomp-util.h +++ b/src/shared/seccomp-util.h @@ -59,3 +59,5 @@ extern const SyscallFilterSet syscall_filter_sets[]; const SyscallFilterSet *syscall_filter_set_find(const char *name); int seccomp_add_syscall_filter_set(scmp_filter_ctx seccomp, const SyscallFilterSet *set, uint32_t action); + +int seccomp_load_filter_set(uint32_t default_action, const SyscallFilterSet *set, uint32_t action); -- cgit v1.2.3-54-g00ecf From d2ffa389b8112282be1633bb4638f6f47e159299 Mon Sep 17 00:00:00 2001 From: Topi Miettinen Date: Wed, 26 Oct 2016 18:52:53 +0300 Subject: seccomp: also block shmat(..., SHM_EXEC) for MemoryDenyWriteExecute shmat(..., SHM_EXEC) can be used to create writable and executable memory, so let's block it when MemoryDenyWriteExecute is set. --- man/systemd.exec.xml | 11 +++++++---- src/core/execute.c | 11 +++++++++++ 2 files changed, 18 insertions(+), 4 deletions(-) (limited to 'src/core/execute.c') diff --git a/man/systemd.exec.xml b/man/systemd.exec.xml index dbe4594730..f9a15d8db0 100644 --- a/man/systemd.exec.xml +++ b/man/systemd.exec.xml @@ -1516,12 +1516,15 @@ MemoryDenyWriteExecute= Takes a boolean argument. If set, attempts to create memory mappings that are writable and - executable at the same time, or to change existing memory mappings to become executable are prohibited. + executable at the same time, or to change existing memory mappings to become executable, or mapping shared memory + segments as executable are prohibited. Specifically, a system call filter is added that rejects mmap2 - system calls with both PROT_EXEC and PROT_WRITE set - and mprotect2 - system calls with PROT_EXEC set. Note that this option is incompatible with programs + system calls with both PROT_EXEC and PROT_WRITE set, + mprotect2 + system calls with PROT_EXEC set and + shmat2 + system calls with SHM_EXEC set. Note that this option is incompatible with programs that generate program code dynamically at runtime, such as JIT execution engines, or programs compiled making use of the code "trampoline" feature of various C compilers. This option improves service security, as it makes harder for software exploits to change running code dynamically. diff --git a/src/core/execute.c b/src/core/execute.c index 5e7d7c25d7..7b42ac7bdc 100644 --- a/src/core/execute.c +++ b/src/core/execute.c @@ -29,8 +29,10 @@ #include #include #include +#include #include #include +#include #include #include #include @@ -1394,6 +1396,15 @@ static int apply_memory_deny_write_execute(const Unit* u, const ExecContext *c) if (r < 0) goto finish; + r = seccomp_rule_add( + seccomp, + SCMP_ACT_ERRNO(EPERM), + SCMP_SYS(shmat), + 1, + SCMP_A2(SCMP_CMP_MASKED_EQ, SHM_EXEC, SHM_EXEC)); + if (r < 0) + goto finish; + r = seccomp_load(seccomp); finish: -- cgit v1.2.3-54-g00ecf From 93c6bb51b6a3c80577fd823d0108c1dfcf01a0ce Mon Sep 17 00:00:00 2001 From: Djalal Harouni Date: Thu, 27 Oct 2016 09:20:18 +0200 Subject: core: move the code that setups namespaces on its own function --- src/core/execute.c | 101 ++++++++++++++++++++++++++++------------------------- 1 file changed, 54 insertions(+), 47 deletions(-) (limited to 'src/core/execute.c') diff --git a/src/core/execute.c b/src/core/execute.c index 5e7d7c25d7..2908ba8fa3 100644 --- a/src/core/execute.c +++ b/src/core/execute.c @@ -2003,6 +2003,59 @@ static int compile_read_write_paths( return 0; } +static int apply_mount_namespace(Unit *u, const ExecContext *context, + const ExecParameters *params, + ExecRuntime *runtime) { + int r; + _cleanup_free_ char **rw = NULL; + char *tmp = NULL, *var = NULL; + const char *root_dir = NULL; + NameSpaceInfo ns_info = { + .private_dev = context->private_devices, + .protect_control_groups = context->protect_control_groups, + .protect_kernel_tunables = context->protect_kernel_tunables, + .protect_kernel_modules = context->protect_kernel_modules, + }; + + /* The runtime struct only contains the parent of the private /tmp, + * which is non-accessible to world users. Inside of it there's a /tmp + * that is sticky, and that's the one we want to use here. */ + + if (context->private_tmp && runtime) { + if (runtime->tmp_dir) + tmp = strjoina(runtime->tmp_dir, "/tmp"); + if (runtime->var_tmp_dir) + var = strjoina(runtime->var_tmp_dir, "/tmp"); + } + + r = compile_read_write_paths(context, params, &rw); + if (r < 0) + return r; + + if (params->flags & EXEC_APPLY_CHROOT) + root_dir = context->root_directory; + + r = setup_namespace(root_dir, &ns_info, rw, + context->read_only_paths, + context->inaccessible_paths, + tmp, + var, + context->protect_home, + context->protect_system, + context->mount_flags); + + /* If we couldn't set up the namespace this is probably due to a + * missing capability. In this case, silently proceeed. */ + if (IN_SET(r, -EPERM, -EACCES)) { + log_open(); + log_unit_debug_errno(u, r, "Failed to set up namespace, assuming containerized execution, ignoring: %m"); + log_close(); + r = 0; + } + + return r; +} + static void append_socket_pair(int *array, unsigned *n, int pair[2]) { assert(array); assert(n); @@ -2466,57 +2519,11 @@ static int exec_child( needs_mount_namespace = exec_needs_mount_namespace(context, params, runtime); if (needs_mount_namespace) { - _cleanup_free_ char **rw = NULL; - char *tmp = NULL, *var = NULL; - NameSpaceInfo ns_info = { - .private_dev = context->private_devices, - .protect_control_groups = context->protect_control_groups, - .protect_kernel_tunables = context->protect_kernel_tunables, - .protect_kernel_modules = context->protect_kernel_modules, - }; - - /* The runtime struct only contains the parent - * of the private /tmp, which is - * non-accessible to world users. Inside of it - * there's a /tmp that is sticky, and that's - * the one we want to use here. */ - - if (context->private_tmp && runtime) { - if (runtime->tmp_dir) - tmp = strjoina(runtime->tmp_dir, "/tmp"); - if (runtime->var_tmp_dir) - var = strjoina(runtime->var_tmp_dir, "/tmp"); - } - - r = compile_read_write_paths(context, params, &rw); + r = apply_mount_namespace(unit, context, params, runtime); if (r < 0) { *exit_status = EXIT_NAMESPACE; return r; } - - r = setup_namespace( - (params->flags & EXEC_APPLY_CHROOT) ? context->root_directory : NULL, - &ns_info, - rw, - context->read_only_paths, - context->inaccessible_paths, - tmp, - var, - context->protect_home, - context->protect_system, - context->mount_flags); - - /* If we couldn't set up the namespace this is - * probably due to a missing capability. In this case, - * silently proceeed. */ - if (r == -EPERM || r == -EACCES) { - log_open(); - log_unit_debug_errno(unit, r, "Failed to set up namespace, assuming containerized execution, ignoring: %m"); - log_close(); - } else if (r < 0) { - *exit_status = EXIT_NAMESPACE; - return r; - } } if (context->working_directory_home) -- cgit v1.2.3-54-g00ecf From e7f1e7c6e2f91f3cad5eadfcc6ab9673caedb838 Mon Sep 17 00:00:00 2001 From: Djalal Harouni Date: Thu, 27 Oct 2016 09:21:44 +0200 Subject: core: move apply working directory code into its own apply_working_directory() --- src/core/execute.c | 52 +++++++++++++++++++++++++++++++--------------------- 1 file changed, 31 insertions(+), 21 deletions(-) (limited to 'src/core/execute.c') diff --git a/src/core/execute.c b/src/core/execute.c index 2908ba8fa3..642add0360 100644 --- a/src/core/execute.c +++ b/src/core/execute.c @@ -2056,6 +2056,33 @@ static int apply_mount_namespace(Unit *u, const ExecContext *context, return r; } +static int apply_working_directory(const ExecContext *context, + const ExecParameters *params, + const char *working_directory, + const bool needs_mount_ns) { + + if (params->flags & EXEC_APPLY_CHROOT) { + if (!needs_mount_ns && context->root_directory) + if (chroot(context->root_directory) < 0) + return -errno; + + if (chdir(working_directory) < 0 && + !context->working_directory_missing_ok) + return -errno; + + } else { + const char *d; + + d = strjoina(strempty(context->root_directory), "/", + strempty(working_directory)); + if (chdir(d) < 0 && + !context->working_directory_missing_ok) + return -errno; + } + + return 0; +} + static void append_socket_pair(int *array, unsigned *n, int pair[2]) { assert(array); assert(n); @@ -2542,27 +2569,10 @@ static int exec_child( } } - if (params->flags & EXEC_APPLY_CHROOT) { - if (!needs_mount_namespace && context->root_directory) - if (chroot(context->root_directory) < 0) { - *exit_status = EXIT_CHROOT; - return -errno; - } - - if (chdir(wd) < 0 && - !context->working_directory_missing_ok) { - *exit_status = EXIT_CHDIR; - return -errno; - } - } else { - const char *d; - - d = strjoina(strempty(context->root_directory), "/", strempty(wd)); - if (chdir(d) < 0 && - !context->working_directory_missing_ok) { - *exit_status = EXIT_CHDIR; - return -errno; - } + r = apply_working_directory(context, params, wd, needs_mount_namespace); + if (r < 0) { + *exit_status = EXIT_CHROOT; + return r; } #ifdef HAVE_SELINUX -- cgit v1.2.3-54-g00ecf From 2b3c1b9e9d7a09b1f974f8d702da8ebaeff036f6 Mon Sep 17 00:00:00 2001 From: Djalal Harouni Date: Thu, 27 Oct 2016 09:28:54 +0200 Subject: core: get the working directory value inside apply_working_directory() Improve apply_working_directory() and lets get the current working directory inside of it. --- src/core/execute.c | 43 +++++++++++++++++++++---------------------- 1 file changed, 21 insertions(+), 22 deletions(-) (limited to 'src/core/execute.c') diff --git a/src/core/execute.c b/src/core/execute.c index 642add0360..0b6fcc9ac7 100644 --- a/src/core/execute.c +++ b/src/core/execute.c @@ -2017,6 +2017,8 @@ static int apply_mount_namespace(Unit *u, const ExecContext *context, .protect_kernel_modules = context->protect_kernel_modules, }; + assert(context); + /* The runtime struct only contains the parent of the private /tmp, * which is non-accessible to world users. Inside of it there's a /tmp * that is sticky, and that's the one we want to use here. */ @@ -2058,27 +2060,31 @@ static int apply_mount_namespace(Unit *u, const ExecContext *context, static int apply_working_directory(const ExecContext *context, const ExecParameters *params, - const char *working_directory, + const char *home, const bool needs_mount_ns) { + const char *d; + const char *wd; + + assert(context); + + if (context->working_directory_home) + wd = home; + else if (context->working_directory) + wd = context->working_directory; + else + wd = "/"; if (params->flags & EXEC_APPLY_CHROOT) { if (!needs_mount_ns && context->root_directory) if (chroot(context->root_directory) < 0) return -errno; - if (chdir(working_directory) < 0 && - !context->working_directory_missing_ok) - return -errno; - - } else { - const char *d; + d = wd; + } else + d = strjoina(strempty(context->root_directory), "/", strempty(wd)); - d = strjoina(strempty(context->root_directory), "/", - strempty(working_directory)); - if (chdir(d) < 0 && - !context->working_directory_missing_ok) - return -errno; - } + if (chdir(d) < 0 && !context->working_directory_missing_ok) + return -errno; return 0; } @@ -2219,7 +2225,7 @@ static int exec_child( _cleanup_free_ char *mac_selinux_context_net = NULL; _cleanup_free_ gid_t *supplementary_gids = NULL; const char *username = NULL, *groupname = NULL; - const char *home = NULL, *shell = NULL, *wd; + const char *home = NULL, *shell = NULL; dev_t journal_stream_dev = 0; ino_t journal_stream_ino = 0; bool needs_mount_namespace; @@ -2553,13 +2559,6 @@ static int exec_child( } } - if (context->working_directory_home) - wd = home; - else if (context->working_directory) - wd = context->working_directory; - else - wd = "/"; - /* Drop group as early as possbile */ if ((params->flags & EXEC_APPLY_PERMISSIONS) && !command->privileged) { r = enforce_groups(context, gid, supplementary_gids, ngids); @@ -2569,7 +2568,7 @@ static int exec_child( } } - r = apply_working_directory(context, params, wd, needs_mount_namespace); + r = apply_working_directory(context, params, home, needs_mount_namespace); if (r < 0) { *exit_status = EXIT_CHROOT; return r; -- cgit v1.2.3-54-g00ecf From 50b3dfb9d64872025450aa63765206720be471d6 Mon Sep 17 00:00:00 2001 From: Djalal Harouni Date: Tue, 25 Oct 2016 16:24:35 +0200 Subject: core: lets apply working directory just after mount namespaces This makes applying groups after applying the working directory, this may allow some flexibility but at same it is not a big deal since we don't execute or do anything between applying working directory and droping groups. --- src/core/execute.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'src/core/execute.c') diff --git a/src/core/execute.c b/src/core/execute.c index 0b6fcc9ac7..a9e39f6fd7 100644 --- a/src/core/execute.c +++ b/src/core/execute.c @@ -2559,6 +2559,13 @@ static int exec_child( } } + /* Apply just after mount namespace setup */ + r = apply_working_directory(context, params, home, needs_mount_namespace); + if (r < 0) { + *exit_status = EXIT_CHROOT; + return r; + } + /* Drop group as early as possbile */ if ((params->flags & EXEC_APPLY_PERMISSIONS) && !command->privileged) { r = enforce_groups(context, gid, supplementary_gids, ngids); @@ -2568,12 +2575,6 @@ static int exec_child( } } - r = apply_working_directory(context, params, home, needs_mount_namespace); - if (r < 0) { - *exit_status = EXIT_CHROOT; - return r; - } - #ifdef HAVE_SELINUX if ((params->flags & EXEC_APPLY_PERMISSIONS) && mac_selinux_use() && -- cgit v1.2.3-54-g00ecf From 59e856c7d33e8260d0ac3a8e21318d6f0768d75b Mon Sep 17 00:00:00 2001 From: Djalal Harouni Date: Thu, 27 Oct 2016 09:39:20 +0200 Subject: core: make unit argument const for apply seccomp functions --- src/core/execute.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'src/core/execute.c') diff --git a/src/core/execute.c b/src/core/execute.c index a9e39f6fd7..7f343c4902 100644 --- a/src/core/execute.c +++ b/src/core/execute.c @@ -1470,7 +1470,7 @@ finish: return r; } -static int apply_protect_sysctl(Unit *u, const ExecContext *c) { +static int apply_protect_sysctl(const Unit *u, const ExecContext *c) { scmp_filter_ctx seccomp; int r; @@ -1501,7 +1501,7 @@ finish: return r; } -static int apply_protect_kernel_modules(Unit *u, const ExecContext *c) { +static int apply_protect_kernel_modules(const Unit *u, const ExecContext *c) { assert(c); /* Turn off module syscalls on ProtectKernelModules=yes */ @@ -1512,7 +1512,7 @@ static int apply_protect_kernel_modules(Unit *u, const ExecContext *c) { return seccomp_load_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_MODULE, SCMP_ACT_ERRNO(EPERM)); } -static int apply_private_devices(Unit *u, const ExecContext *c) { +static int apply_private_devices(const Unit *u, const ExecContext *c) { assert(c); /* If PrivateDevices= is set, also turn off iopl and all @raw-io syscalls. */ -- cgit v1.2.3-54-g00ecf From 5cd9cd3537d1afca85877103615e61e6c03e7079 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Tue, 25 Oct 2016 15:52:54 +0200 Subject: execute: apply seccomp filters after changing selinux/aa/smack contexts Seccomp is generally an unprivileged operation, changing security contexts is most likely associated with some form of policy. Moreover, while seccomp may influence our own flow of code quite a bit (much more than the security context change) make sure to apply the seccomp filters immediately before executing the binary to invoke. This also moves enforcement of NNP after the security context change, so that NNP cannot affect it anymore. (However, the security policy now has to permit the NNP change). This change has a good chance of breaking current SELinux/AA/SMACK setups, because the policy might not expect this change of behaviour. However, it's technically the better choice I think and should hence be applied. Fixes: #3993 --- src/core/execute.c | 70 ++++++++++++++++++++++++++++++------------------------ 1 file changed, 39 insertions(+), 31 deletions(-) (limited to 'src/core/execute.c') diff --git a/src/core/execute.c b/src/core/execute.c index ae9df41b99..c73e2e7220 100644 --- a/src/core/execute.c +++ b/src/core/execute.c @@ -2538,12 +2538,6 @@ static int exec_child( (void) umask(context->umask); if ((params->flags & EXEC_APPLY_PERMISSIONS) && !command->privileged) { - r = setup_smack(context, command); - if (r < 0) { - *exit_status = EXIT_SMACK_PROCESS_LABEL; - return r; - } - if (context->pam_name && username) { r = setup_pam(context->pam_name, username, uid, gid, context->tty_path, &accum_env, fds, n_fds); if (r < 0) { @@ -2693,6 +2687,41 @@ static int exec_child( } } + /* Apply the MAC contexts late, but before seccomp syscall filtering, as those should really be last to + * influence our own codepaths as little as possible. Moreover, applying MAC contexts usually requires + * syscalls that are subject to seccomp filtering, hence should probably be applied before the syscalls + * are restricted. */ + +#ifdef HAVE_SELINUX + if (mac_selinux_use()) { + char *exec_context = mac_selinux_context_net ?: context->selinux_context; + + if (exec_context) { + r = setexeccon(exec_context); + if (r < 0) { + *exit_status = EXIT_SELINUX_CONTEXT; + return r; + } + } + } +#endif + + r = setup_smack(context, command); + if (r < 0) { + *exit_status = EXIT_SMACK_PROCESS_LABEL; + return r; + } + +#ifdef HAVE_APPARMOR + if (context->apparmor_profile && mac_apparmor_use()) { + r = aa_change_onexec(context->apparmor_profile); + if (r < 0 && !context->apparmor_profile_ignore) { + *exit_status = EXIT_APPARMOR_PROFILE; + return -errno; + } + } +#endif + /* PR_GET_SECUREBITS is not privileged, while * PR_SET_SECUREBITS is. So to suppress * potential EPERMs we'll try not to call @@ -2758,6 +2787,8 @@ static int exec_child( } } + /* This really should remain the last step before the execve(), to make sure our own code is unaffected + * by the filter as little as possible. */ if (context_has_syscall_filters(context)) { r = apply_seccomp(unit, context); if (r < 0) { @@ -2766,30 +2797,6 @@ static int exec_child( } } #endif - -#ifdef HAVE_SELINUX - if (mac_selinux_use()) { - char *exec_context = mac_selinux_context_net ?: context->selinux_context; - - if (exec_context) { - r = setexeccon(exec_context); - if (r < 0) { - *exit_status = EXIT_SELINUX_CONTEXT; - return r; - } - } - } -#endif - -#ifdef HAVE_APPARMOR - if (context->apparmor_profile && mac_apparmor_use()) { - r = aa_change_onexec(context->apparmor_profile); - if (r < 0 && !context->apparmor_profile_ignore) { - *exit_status = EXIT_APPARMOR_PROFILE; - return -errno; - } - } -#endif } final_argv = replace_env_argv(argv, accum_env); @@ -3611,7 +3618,8 @@ char *exec_command_line(char **argv) { STRV_FOREACH(a, argv) k += strlen(*a)+3; - if (!(n = new(char, k))) + n = new(char, k); + if (!n) return NULL; p = n; -- cgit v1.2.3-54-g00ecf From bbeea271172a4664ce9a4a41a7fa3b1ca18dbedd Mon Sep 17 00:00:00 2001 From: Djalal Harouni Date: Wed, 2 Nov 2016 17:51:35 +0100 Subject: core: initialize groups list before checking SupplementaryGroups= of a unit (#4533) Always initialize the supplementary groups of caller before checking the unit SupplementaryGroups= option. Fixes https://github.com/systemd/systemd/issues/4531 --- src/core/execute.c | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) (limited to 'src/core/execute.c') diff --git a/src/core/execute.c b/src/core/execute.c index ae9df41b99..a1bd0c1238 100644 --- a/src/core/execute.c +++ b/src/core/execute.c @@ -787,6 +787,20 @@ static int get_fixed_supplementary_groups(const ExecContext *c, assert(c); + /* + * If user is given, then lookup GID and supplementary groups list. + * We avoid NSS lookups for gid=0. Also we have to initialize groups + * as early as possible so we keep the list of supplementary groups + * of the caller. + */ + if (user && gid_is_valid(gid) && gid != 0) { + /* First step, initialize groups from /etc/groups */ + if (initgroups(user, gid) < 0) + return -errno; + + keep_groups = true; + } + if (!c->supplementary_groups) return 0; @@ -803,18 +817,6 @@ static int get_fixed_supplementary_groups(const ExecContext *c, return -EOPNOTSUPP; /* For all other values */ } - /* - * If user is given, then lookup GID and supplementary group list. - * We avoid NSS lookups for gid=0. - */ - if (user && gid_is_valid(gid) && gid != 0) { - /* First step, initialize groups from /etc/groups */ - if (initgroups(user, gid) < 0) - return -errno; - - keep_groups = true; - } - l_gids = new(gid_t, ngroups_max); if (!l_gids) return -ENOMEM; @@ -2577,7 +2579,7 @@ static int exec_child( return r; } - /* Drop group as early as possbile */ + /* Drop groups as early as possbile */ if ((params->flags & EXEC_APPLY_PERMISSIONS) && !command->privileged) { r = enforce_groups(context, gid, supplementary_gids, ngids); if (r < 0) { -- cgit v1.2.3-54-g00ecf From cdc5d5c55e58ff9eeb6b2258c9fc3a416ee8b53f Mon Sep 17 00:00:00 2001 From: Djalal Harouni Date: Wed, 2 Nov 2016 22:42:40 +0100 Subject: core: intialize user aux groups and SupplementaryGroups= when DynamicUser= is set Make sure that when DynamicUser= is set that we intialize the user supplementary groups and that we also support SupplementaryGroups= Fixes: https://github.com/systemd/systemd/issues/4539 Thanks Evgeny Vereshchagin (@evverx) --- src/core/execute.c | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) (limited to 'src/core/execute.c') diff --git a/src/core/execute.c b/src/core/execute.c index 3f053602b5..f13ca30395 100644 --- a/src/core/execute.c +++ b/src/core/execute.c @@ -773,11 +773,9 @@ static int get_fixed_group(const ExecContext *c, const char **group, gid_t *gid) return 0; } -static int get_fixed_supplementary_groups(const ExecContext *c, - const char *user, - const char *group, - gid_t gid, - gid_t **supplementary_gids, int *ngids) { +static int get_supplementary_groups(const ExecContext *c, const char *user, + const char *group, gid_t gid, + gid_t **supplementary_gids, int *ngids) { char **i; int r, k = 0; int ngroups_max; @@ -790,8 +788,8 @@ static int get_fixed_supplementary_groups(const ExecContext *c, /* * If user is given, then lookup GID and supplementary groups list. * We avoid NSS lookups for gid=0. Also we have to initialize groups - * as early as possible so we keep the list of supplementary groups - * of the caller. + * here and as early as possible so we keep the list of supplementary + * groups of the caller. */ if (user && gid_is_valid(gid) && gid != 0) { /* First step, initialize groups from /etc/groups */ @@ -2347,13 +2345,14 @@ static int exec_child( *exit_status = EXIT_GROUP; return r; } + } - r = get_fixed_supplementary_groups(context, username, groupname, - gid, &supplementary_gids, &ngids); - if (r < 0) { - *exit_status = EXIT_GROUP; - return r; - } + /* Initialize user supplementary groups and get SupplementaryGroups= ones */ + r = get_supplementary_groups(context, username, groupname, gid, + &supplementary_gids, &ngids); + if (r < 0) { + *exit_status = EXIT_GROUP; + return r; } r = send_user_lookup(unit, user_lookup_fd, uid, gid); -- cgit v1.2.3-54-g00ecf From d9454c44bc1854a291f6c37dfce378c9b6067d0b Mon Sep 17 00:00:00 2001 From: Dave Reisner Date: Wed, 9 Nov 2016 08:00:26 -0500 Subject: disable RestrictAddressFamilies on i686 Shit's broke, yo. https://github.com/systemd/systemd/issues/4575 --- src/core/execute.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'src/core/execute.c') diff --git a/src/core/execute.c b/src/core/execute.c index f13ca30395..85ee82c3e1 100644 --- a/src/core/execute.c +++ b/src/core/execute.c @@ -1254,6 +1254,10 @@ static int apply_address_families(const Unit* u, const ExecContext *c) { Iterator i; int r; +#if defined(__i386__) + return 0; +#endif + assert(c); if (skip_seccomp_unavailable(u, "RestrictAddressFamilies=")) -- cgit v1.2.3-54-g00ecf