diff options
-rw-r--r-- | .mailmap | 15 | ||||
-rw-r--r-- | Makefile.am | 4 | ||||
-rw-r--r-- | NEWS | 38 | ||||
-rw-r--r-- | TODO | 5 | ||||
-rw-r--r-- | configure.ac | 2 | ||||
-rw-r--r-- | man/sd_notify.xml | 41 | ||||
-rw-r--r-- | man/systemctl.xml | 4 | ||||
-rw-r--r-- | man/systemd.exec.xml | 62 | ||||
-rw-r--r-- | man/systemd.service.xml | 4 | ||||
-rw-r--r-- | src/core/execute.c | 70 | ||||
-rw-r--r-- | src/core/job.c | 19 | ||||
-rw-r--r-- | src/core/service.c | 61 | ||||
-rw-r--r-- | src/core/unit.c | 6 | ||||
-rw-r--r-- | src/core/unit.h | 2 | ||||
-rw-r--r-- | src/shared/seccomp-util.c | 49 | ||||
-rw-r--r-- | src/shared/seccomp-util.h | 2 | ||||
-rw-r--r-- | src/systemctl/systemctl.c | 2 | ||||
-rw-r--r-- | src/test/test-unit-file.c | 2 | ||||
-rwxr-xr-x | test/TEST-04-JOURNAL/test-journal.sh | 8 |
19 files changed, 267 insertions, 129 deletions
@@ -74,3 +74,18 @@ Thomas H. P. Andersen <phomes@gmail.com> Michael Olbrich <m.olbrich@pengutronix.de> Douglas Christman <DouglasChristman@gmail.com> Alexander Kuleshov <kuleshovmail@gmail.com> <0xAX@users.noreply.github.com> +Andreas Henriksson <andreas@fatal.se> +Daniel Rusek <mail@asciiwolf.com> +Dennis Wassenberg <dennis.wassenberg@secunet.com> +Reid Price <reid.price@gmail.com> +Stefan Schweter <stefan@schweter.it> +Seraphime Kirkovski <kirkseraph@gmail.com> +Bart Rulon <barron@lexmark.com> +Richard W.M. Jones <rjones@redhat.com> +Roman Stingler <coolx67@gmx.at> +Michael Hoy <rimmington@gmail.com> +Tiago Levit <liamgliam@gmail.com> +Eric Cook <llua@users.noreply.github.com> +Lukáš Nykrýn <lnykryn@redhat.com> +Heikki Kemppainen <heikki.kemppainen@nokia.com> +Hendrik Brueckner <hbrueckner@users.noreply.github.com> diff --git a/Makefile.am b/Makefile.am index a707d4a899..34c2f60330 100644 --- a/Makefile.am +++ b/Makefile.am @@ -42,9 +42,9 @@ LIBUDEV_CURRENT=7 LIBUDEV_REVISION=5 LIBUDEV_AGE=6 -LIBSYSTEMD_CURRENT=16 +LIBSYSTEMD_CURRENT=17 LIBSYSTEMD_REVISION=0 -LIBSYSTEMD_AGE=16 +LIBSYSTEMD_AGE=17 # Dirs of external packages dbuspolicydir=@dbuspolicydir@ @@ -311,6 +311,44 @@ CHANGES WITH 232 in spe * "systemctl is-enabled --full" will now show by which symlinks a unit file is enabled in the unit dependency tree. + * Support for VeraCrypt encrypted partitions has been added to the + "cryptsetup" logic and /etc/crypttab. + + * systemd-detect-virt gained support for a new --private-users switch + that checks whether the invoking processes are running inside a user + namespace. Similar, a new special value "private-users" for the + existing ConditionVirtualization= setting has been added, permitting + skipping of specific units in user namespace environments. + + Contributions from: Alban Crequy, Alexander Kuleshov, Alfie John, + Andreas Henriksson, Andrew Jeddeloh, Balázs Úr, Bart Rulon, Benjamin + Richter, Ben Gamari, Ben Harris, Brian J. Murrell, Christian Brauner, + Christian Rebischke, Clinton Roy, Colin Walters, Cristian Rodríguez, + Daniel Hahler, Daniel Mack, Daniel Maixner, Daniel Rusek, Dan Dedrick, + Davide Cavalca, David Herrmann, David Michael, Dennis Wassenberg, + Djalal Harouni, Dongsu Park, Douglas Christman, Elias Probst, Eric + Cook, Erik Karlsson, Evgeny Vereshchagin, Felipe Sateler, Felix Zhang, + Franck Bui, George Hilliard, Giuseppe Scrivano, HATAYAMA Daisuke, + Heikki Kemppainen, Hendrik Brueckner, hi117, Ismo Puustinen, Ivan + Shapovalov, Jakub Filak, Jakub Wilk, Jan Synacek, Jason Kölker, + Jean-Sébastien Bour, Jiří Pírko, Jonathan Boulle, Jorge Niedbalski, + Keith Busch, kristbaum, Kyle Russell, Lans Zhang, Lennart Poettering, + Leonardo Brondani Schenkel, Lucas Werkmeister, Luca Bruno, Lukáš + Nykrýn, Maciek Borzecki, Mantas Mikulėnas, Marc-Antoine Perennou, + Marcel Holtmann, Marcos Mello, Martin Ejdestig, Martin Pitt, Matej + Habrnal, Maxime de Roucy, Michael Biebl, Michael Chapman, Michael Hoy, + Michael Olbrich, Michael Pope, Michal Sekletar, Michal Soltys, Mike + Gilbert, Nick Owens, Patrik Flykt, Paweł Szewczyk, Peter Hutterer, + Piotr Drąg, Reid Price, Richard W.M. Jones, Roman Stingler, Ronny + Chevalier, Seraphime Kirkovski, Stefan Schweter, Steve Muir, Susant + Sahani, Tejun Heo, Thomas Blume, Thomas H. P. Andersen, Tiago Levit, + Tobias Jungel, Tomáš Janoušek, Topi Miettinen, Torstein Husebø, Umut + Tezduyar Lindskog, Vito Caputo, WaLyong Cho, Wilhelm Schuster, Yann + E. MORIN, Yi EungJun, Yuki Inoguchi, Yu Watanabe, Zbigniew + Jędrzejewski-Szmek, Zeal Jagannatha + + — Santa Fe, 2016-11-XX + CHANGES WITH 231: * In service units the various ExecXYZ= settings have been extended @@ -32,6 +32,11 @@ Janitorial Clean-ups: Features: +* drop nss-myhostname in favour of nss-resolve? + +* drop internal dlopen() based nss-dns fallback in nss-resolve, and rely on the + external nsswitch.conf based one + * add a percentage syntax for TimeoutStopSec=, e.g. TimeoutStopSec=150%, and then use that for the setting used in user@.service. It should be understood relative to the configured default value. diff --git a/configure.ac b/configure.ac index 0e87adc38f..0b10fc7de7 100644 --- a/configure.ac +++ b/configure.ac @@ -20,7 +20,7 @@ AC_PREREQ([2.64]) AC_INIT([systemd], - [231], + [232], [http://github.com/systemd/systemd/issues], [systemd], [http://www.freedesktop.org/wiki/Software/systemd]) diff --git a/man/sd_notify.xml b/man/sd_notify.xml index 025fbec6c1..94542b80b8 100644 --- a/man/sd_notify.xml +++ b/man/sd_notify.xml @@ -205,28 +205,25 @@ <varlistentry> <term>FDSTORE=1</term> - <listitem><para>Stores additional file descriptors in the - service manager. File descriptors sent this way will be - maintained per-service by the service manager and be passed - again using the usual file descriptor passing logic on the - next invocation of the service (see - <citerefentry><refentrytitle>sd_listen_fds</refentrytitle><manvolnum>3</manvolnum></citerefentry>). - This is useful for implementing service restart schemes where - services serialize their state to <filename>/run</filename>, - push their file descriptors to the system manager, and are - then restarted, retrieving their state again via socket - passing and <filename>/run</filename>. Note that the service - manager will accept messages for a service only if - <varname>FileDescriptorStoreMax=</varname> is set to non-zero - for it (defaults to zero). See - <citerefentry><refentrytitle>systemd.service</refentrytitle><manvolnum>5</manvolnum></citerefentry> - for details. Multiple arrays of file descriptors may be sent - in separate messages, in which case the arrays are combined. - Note that the service manager removes duplicate file - descriptors before passing them to the service. Use - <function>sd_pid_notify_with_fds()</function> to send messages - with <literal>FDSTORE=1</literal>, see - below.</para></listitem> + <listitem><para>Stores additional file descriptors in the service manager. File + descriptors sent this way will be maintained per-service by the service manager + and will be passed again using the usual file descriptor passing logic on the next + invocation of the service, see + <citerefentry><refentrytitle>sd_listen_fds</refentrytitle><manvolnum>3</manvolnum></citerefentry>. + This is useful for implementing service restart schemes where services serialize + their state to <filename>/run</filename>, push their file descriptors to the + system manager, and are then restarted, retrieving their state again via socket + passing and <filename>/run</filename>. Note that the service manager will accept + messages for a service only if <varname>FileDescriptorStoreMax=</varname> is set + to non-zero for it (defaults to zero, see + <citerefentry><refentrytitle>systemd.service</refentrytitle><manvolnum>5</manvolnum></citerefentry>). + File descriptors must be pollable, see + <citerefentry><refentrytitle>epoll_ctl</refentrytitle><manvolnum>2</manvolnum></citerefentry>. + Multiple arrays of file descriptors may be sent in separate messages, in which + case the arrays are combined. Note that the service manager removes duplicate + file descriptors before passing them to the service. Use + <function>sd_pid_notify_with_fds()</function> to send messages with + <literal>FDSTORE=1</literal>, see below.</para></listitem> </varlistentry> <varlistentry> diff --git a/man/systemctl.xml b/man/systemctl.xml index ae3296d132..dfa00e0c03 100644 --- a/man/systemctl.xml +++ b/man/systemctl.xml @@ -877,8 +877,8 @@ kobject-uevent 1 systemd-udevd-kernel.socket systemd-udevd.service <para>Show properties of one or more units, jobs, or the manager itself. If no argument is specified, properties of the manager will be shown. If a unit name is specified, - properties of the unit is shown, and if a job ID is - specified, properties of the job is shown. By default, empty + properties of the unit are shown, and if a job ID is + specified, properties of the job are shown. By default, empty properties are suppressed. Use <option>--all</option> to show those too. To select specific properties to show, use <option>--property=</option>. This command is intended to be diff --git a/man/systemd.exec.xml b/man/systemd.exec.xml index 11029ca186..3c350df11f 100644 --- a/man/systemd.exec.xml +++ b/man/systemd.exec.xml @@ -1255,30 +1255,28 @@ <varlistentry> <term><varname>SystemCallFilter=</varname></term> - <listitem><para>Takes a space-separated list of system call - names. If this setting is used, all system calls executed by - the unit processes except for the listed ones will result in - immediate process termination with the - <constant>SIGSYS</constant> signal (whitelisting). If the - first character of the list is <literal>~</literal>, the - effect is inverted: only the listed system calls will result - in immediate process termination (blacklisting). If running in - user mode, or in system mode, but without the - <constant>CAP_SYS_ADMIN</constant> capability (e.g. setting - <varname>User=nobody</varname>), - <varname>NoNewPrivileges=yes</varname> is implied. This - feature makes use of the Secure Computing Mode 2 interfaces of - the kernel ('seccomp filtering') and is useful for enforcing a - minimal sandboxing environment. Note that the - <function>execve</function>, - <function>rt_sigreturn</function>, - <function>sigreturn</function>, - <function>exit_group</function>, <function>exit</function> - system calls are implicitly whitelisted and do not need to be - listed explicitly. This option may be specified more than once, - in which case the filter masks are merged. If the empty string - is assigned, the filter is reset, all prior assignments will - have no effect. This does not affect commands prefixed with <literal>+</literal>.</para> + <listitem><para>Takes a space-separated list of system call names. If this setting is used, all system calls + executed by the unit processes except for the listed ones will result in immediate process termination with the + <constant>SIGSYS</constant> signal (whitelisting). If the first character of the list is <literal>~</literal>, + the effect is inverted: only the listed system calls will result in immediate process termination + (blacklisting). If running in user mode, or in system mode, but without the <constant>CAP_SYS_ADMIN</constant> + capability (e.g. setting <varname>User=nobody</varname>), <varname>NoNewPrivileges=yes</varname> is + implied. This feature makes use of the Secure Computing Mode 2 interfaces of the kernel ('seccomp filtering') + and is useful for enforcing a minimal sandboxing environment. Note that the <function>execve</function>, + <function>exit</function>, <function>exit_group</function>, <function>getrlimit</function>, + <function>rt_sigreturn</function>, <function>sigreturn</function> system calls and the system calls for + querying time and sleeping are implicitly whitelisted and do not need to be listed explicitly. This option may + be specified more than once, in which case the filter masks are merged. If the empty string is assigned, the + filter is reset, all prior assignments will have no effect. This does not affect commands prefixed with + <literal>+</literal>.</para> + + <para>Note that strict system call filters may impact execution and error handling code paths of the service + invocation. Specifically, access to the <function>execve</function> system call is required for the execution + of the service binary — if it is blocked service invocation will necessarily fail. Also, if execution of the + service binary fails for some reason (for example: missing service executable), the error handling logic might + require access to an additional set of system calls in order to process and log this failure correctly. It + might be necessary to temporarily disable system call filters in order to simplify debugging of such + failures.</para> <para>If you specify both types of this option (i.e. whitelisting and blacklisting), the first encountered will @@ -1312,6 +1310,10 @@ </thead> <tbody> <row> + <entry>@basic-io</entry> + <entry>System calls for basic I/O: reading, writing, seeking, file descriptor duplication and closing (<citerefentry project='man-pages'><refentrytitle>read</refentrytitle><manvolnum>2</manvolnum></citerefentry>, <citerefentry project='man-pages'><refentrytitle>write</refentrytitle><manvolnum>2</manvolnum></citerefentry>, and related calls)</entry> + </row> + <row> <entry>@clock</entry> <entry>System calls for changing the system clock (<citerefentry project='man-pages'><refentrytitle>adjtimex</refentrytitle><manvolnum>2</manvolnum></citerefentry>, <citerefentry project='man-pages'><refentrytitle>settimeofday</refentrytitle><manvolnum>2</manvolnum></citerefentry>, and related calls)</entry> </row> @@ -1329,7 +1331,7 @@ </row> <row> <entry>@ipc</entry> - <entry>SysV IPC, POSIX Message Queues or other IPC (<citerefentry project='man-pages'><refentrytitle>mq_overview</refentrytitle><manvolnum>7</manvolnum></citerefentry>, <citerefentry project='man-pages'><refentrytitle>svipc</refentrytitle><manvolnum>7</manvolnum></citerefentry>)</entry> + <entry>Pipes, SysV IPC, POSIX Message Queues and other IPC (<citerefentry project='man-pages'><refentrytitle>mq_overview</refentrytitle><manvolnum>7</manvolnum></citerefentry>, <citerefentry project='man-pages'><refentrytitle>svipc</refentrytitle><manvolnum>7</manvolnum></citerefentry>)</entry> </row> <row> <entry>@keyring</entry> @@ -1357,17 +1359,21 @@ </row> <row> <entry>@process</entry> - <entry>Process control, execution, namespaces (<citerefentry project='man-pages'><refentrytitle>execve</refentrytitle><manvolnum>2</manvolnum></citerefentry>, <citerefentry project='man-pages'><refentrytitle>kill</refentrytitle><manvolnum>2</manvolnum></citerefentry>, <citerefentry project='man-pages'><refentrytitle>namespaces</refentrytitle><manvolnum>7</manvolnum></citerefentry>, …</entry> + <entry>Process control, execution, namespaces (<citerefentry project='man-pages'><refentrytitle>clone</refentrytitle><manvolnum>2</manvolnum></citerefentry>, <citerefentry project='man-pages'><refentrytitle>kill</refentrytitle><manvolnum>2</manvolnum></citerefentry>, <citerefentry project='man-pages'><refentrytitle>namespaces</refentrytitle><manvolnum>7</manvolnum></citerefentry>, …</entry> </row> <row> <entry>@raw-io</entry> - <entry>Raw I/O port access (<citerefentry project='man-pages'><refentrytitle>ioperm</refentrytitle><manvolnum>2</manvolnum></citerefentry>, <citerefentry project='man-pages'><refentrytitle>iopl</refentrytitle><manvolnum>2</manvolnum></citerefentry>, <function>pciconfig_read()</function>, …</entry> + <entry>Raw I/O port access (<citerefentry project='man-pages'><refentrytitle>ioperm</refentrytitle><manvolnum>2</manvolnum></citerefentry>, <citerefentry project='man-pages'><refentrytitle>iopl</refentrytitle><manvolnum>2</manvolnum></citerefentry>, <function>pciconfig_read()</function>, …)</entry> + </row> + <row> + <entry>@resources</entry> + <entry>System calls for changing resource limits, memory and scheduling parameters (<citerefentry project='man-pages'><refentrytitle>setrlimit</refentrytitle><manvolnum>2</manvolnum></citerefentry>, <citerefentry project='man-pages'><refentrytitle>setpriority</refentrytitle><manvolnum>2</manvolnum></citerefentry>, …)</entry> </row> </tbody> </tgroup> </table> - Note, that as new system calls are added to the kernel, additional system calls might be added to the groups + Note that as new system calls are added to the kernel, additional system calls might be added to the groups above, so the contents of the sets may change between systemd versions.</para> <para>It is recommended to combine the file system namespacing related options with diff --git a/man/systemd.service.xml b/man/systemd.service.xml index 90b1312603..5c65957bda 100644 --- a/man/systemd.service.xml +++ b/man/systemd.service.xml @@ -852,13 +852,13 @@ serialized to <filename>/run</filename> and the file descriptors passed to the service manager, to allow restarts without losing state. Defaults to 0, i.e. no file descriptors - may be stored in the service manager by default. All file + may be stored in the service manager. All file descriptors passed to the service manager from a specific service are passed back to the service's main process on the next service restart. Any file descriptors passed to the service manager are automatically closed when POLLHUP or POLLERR is seen on them, or when the service is fully stopped - and no job queued or being executed for it.</para></listitem> + and no job is queued or being executed for it.</para></listitem> </varlistentry> <varlistentry> diff --git a/src/core/execute.c b/src/core/execute.c index a1bd0c1238..3f053602b5 100644 --- a/src/core/execute.c +++ b/src/core/execute.c @@ -2540,12 +2540,6 @@ static int exec_child( (void) umask(context->umask); if ((params->flags & EXEC_APPLY_PERMISSIONS) && !command->privileged) { - r = setup_smack(context, command); - if (r < 0) { - *exit_status = EXIT_SMACK_PROCESS_LABEL; - return r; - } - if (context->pam_name && username) { r = setup_pam(context->pam_name, username, uid, gid, context->tty_path, &accum_env, fds, n_fds); if (r < 0) { @@ -2695,6 +2689,41 @@ static int exec_child( } } + /* Apply the MAC contexts late, but before seccomp syscall filtering, as those should really be last to + * influence our own codepaths as little as possible. Moreover, applying MAC contexts usually requires + * syscalls that are subject to seccomp filtering, hence should probably be applied before the syscalls + * are restricted. */ + +#ifdef HAVE_SELINUX + if (mac_selinux_use()) { + char *exec_context = mac_selinux_context_net ?: context->selinux_context; + + if (exec_context) { + r = setexeccon(exec_context); + if (r < 0) { + *exit_status = EXIT_SELINUX_CONTEXT; + return r; + } + } + } +#endif + + r = setup_smack(context, command); + if (r < 0) { + *exit_status = EXIT_SMACK_PROCESS_LABEL; + return r; + } + +#ifdef HAVE_APPARMOR + if (context->apparmor_profile && mac_apparmor_use()) { + r = aa_change_onexec(context->apparmor_profile); + if (r < 0 && !context->apparmor_profile_ignore) { + *exit_status = EXIT_APPARMOR_PROFILE; + return -errno; + } + } +#endif + /* PR_GET_SECUREBITS is not privileged, while * PR_SET_SECUREBITS is. So to suppress * potential EPERMs we'll try not to call @@ -2760,6 +2789,8 @@ static int exec_child( } } + /* This really should remain the last step before the execve(), to make sure our own code is unaffected + * by the filter as little as possible. */ if (context_has_syscall_filters(context)) { r = apply_seccomp(unit, context); if (r < 0) { @@ -2768,30 +2799,6 @@ static int exec_child( } } #endif - -#ifdef HAVE_SELINUX - if (mac_selinux_use()) { - char *exec_context = mac_selinux_context_net ?: context->selinux_context; - - if (exec_context) { - r = setexeccon(exec_context); - if (r < 0) { - *exit_status = EXIT_SELINUX_CONTEXT; - return r; - } - } - } -#endif - -#ifdef HAVE_APPARMOR - if (context->apparmor_profile && mac_apparmor_use()) { - r = aa_change_onexec(context->apparmor_profile); - if (r < 0 && !context->apparmor_profile_ignore) { - *exit_status = EXIT_APPARMOR_PROFILE; - return -errno; - } - } -#endif } final_argv = replace_env_argv(argv, accum_env); @@ -3613,7 +3620,8 @@ char *exec_command_line(char **argv) { STRV_FOREACH(a, argv) k += strlen(*a)+3; - if (!(n = new(char, k))) + n = new(char, k); + if (!n) return NULL; p = n; diff --git a/src/core/job.c b/src/core/job.c index 3ecc8a1a73..ac6910a906 100644 --- a/src/core/job.c +++ b/src/core/job.c @@ -690,16 +690,16 @@ _pure_ static const char *job_get_status_message_format(Unit *u, JobType t, JobR } static void job_print_status_message(Unit *u, JobType t, JobResult result) { - static struct { + static const struct { const char *color, *word; } const statuses[_JOB_RESULT_MAX] = { - [JOB_DONE] = {ANSI_GREEN, " OK "}, - [JOB_TIMEOUT] = {ANSI_HIGHLIGHT_RED, " TIME "}, - [JOB_FAILED] = {ANSI_HIGHLIGHT_RED, "FAILED"}, - [JOB_DEPENDENCY] = {ANSI_HIGHLIGHT_YELLOW, "DEPEND"}, - [JOB_SKIPPED] = {ANSI_HIGHLIGHT, " INFO "}, - [JOB_ASSERT] = {ANSI_HIGHLIGHT_YELLOW, "ASSERT"}, - [JOB_UNSUPPORTED] = {ANSI_HIGHLIGHT_YELLOW, "UNSUPP"}, + [JOB_DONE] = { ANSI_GREEN, " OK " }, + [JOB_TIMEOUT] = { ANSI_HIGHLIGHT_RED, " TIME " }, + [JOB_FAILED] = { ANSI_HIGHLIGHT_RED, "FAILED" }, + [JOB_DEPENDENCY] = { ANSI_HIGHLIGHT_YELLOW, "DEPEND" }, + [JOB_SKIPPED] = { ANSI_HIGHLIGHT, " INFO " }, + [JOB_ASSERT] = { ANSI_HIGHLIGHT_YELLOW, "ASSERT" }, + [JOB_UNSUPPORTED] = { ANSI_HIGHLIGHT_YELLOW, "UNSUPP" }, }; const char *format; @@ -767,8 +767,9 @@ static void job_log_status_message(Unit *u, JobType t, JobResult result) { if (!format) return; + /* The description might be longer than the buffer, but that's OK, we'll just truncate it here */ DISABLE_WARNING_FORMAT_NONLITERAL; - xsprintf(buf, format, unit_description(u)); + snprintf(buf, sizeof(buf), format, unit_description(u)); REENABLE_WARNING; switch (t) { diff --git a/src/core/service.c b/src/core/service.c index ee4f4983fc..a7274a758f 100644 --- a/src/core/service.c +++ b/src/core/service.c @@ -289,7 +289,17 @@ static void service_fd_store_unlink(ServiceFDStore *fs) { free(fs); } -static void service_release_resources(Unit *u) { +static void service_release_fd_store(Service *s) { + assert(s); + + log_unit_debug(UNIT(s), "Releasing all stored fds"); + while (s->fd_store) + service_fd_store_unlink(s->fd_store); + + assert(s->n_fd_store == 0); +} + +static void service_release_resources(Unit *u, bool inactive) { Service *s = SERVICE(u); assert(s); @@ -297,16 +307,14 @@ static void service_release_resources(Unit *u) { if (!s->fd_store && s->stdin_fd < 0 && s->stdout_fd < 0 && s->stderr_fd < 0) return; - log_unit_debug(u, "Releasing all resources."); + log_unit_debug(u, "Releasing resources."); s->stdin_fd = safe_close(s->stdin_fd); s->stdout_fd = safe_close(s->stdout_fd); s->stderr_fd = safe_close(s->stderr_fd); - while (s->fd_store) - service_fd_store_unlink(s->fd_store); - - assert(s->n_fd_store == 0); + if (inactive) + service_release_fd_store(s); } static void service_done(Unit *u) { @@ -350,7 +358,7 @@ static void service_done(Unit *u) { s->timer_event_source = sd_event_source_unref(s->timer_event_source); - service_release_resources(u); + service_release_resources(u, true); } static int on_fd_store_io(sd_event_source *e, int fd, uint32_t revents, void *userdata) { @@ -360,6 +368,10 @@ static int on_fd_store_io(sd_event_source *e, int fd, uint32_t revents, void *us assert(fs); /* If we get either EPOLLHUP or EPOLLERR, it's time to remove this entry from the fd store */ + log_unit_debug(UNIT(fs->service), + "Received %s on stored fd %d (%s), closing.", + revents & EPOLLERR ? "EPOLLERR" : "EPOLLHUP", + fs->fd, strna(fs->fdname)); service_fd_store_unlink(fs); return 0; } @@ -368,20 +380,23 @@ static int service_add_fd_store(Service *s, int fd, const char *name) { ServiceFDStore *fs; int r; + /* fd is always consumed if we return >= 0 */ + assert(s); assert(fd >= 0); if (s->n_fd_store >= s->n_fd_store_max) - return 0; + return -EXFULL; /* Our store is full. + * Use this errno rather than E[NM]FILE to distinguish from + * the case where systemd itself hits the file limit. */ LIST_FOREACH(fd_store, fs, s->fd_store) { r = same_fd(fs->fd, fd); if (r < 0) return r; if (r > 0) { - /* Already included */ safe_close(fd); - return 1; + return 0; /* fd already included */ } } @@ -409,7 +424,7 @@ static int service_add_fd_store(Service *s, int fd, const char *name) { LIST_PREPEND(fd_store, s->fd_store, fs); s->n_fd_store++; - return 1; + return 1; /* fd newly stored */ } static int service_add_fd_store_set(Service *s, FDSet *fds, const char *name) { @@ -417,10 +432,7 @@ static int service_add_fd_store_set(Service *s, FDSet *fds, const char *name) { assert(s); - if (fdset_size(fds) <= 0) - return 0; - - while (s->n_fd_store < s->n_fd_store_max) { + while (fdset_size(fds) > 0) { _cleanup_close_ int fd = -1; fd = fdset_steal_first(fds); @@ -428,17 +440,17 @@ static int service_add_fd_store_set(Service *s, FDSet *fds, const char *name) { break; r = service_add_fd_store(s, fd, name); + if (r == -EXFULL) + return log_unit_warning_errno(UNIT(s), r, + "Cannot store more fds than FileDescriptorStoreMax=%u, closing remaining.", + s->n_fd_store_max); if (r < 0) - return log_unit_error_errno(UNIT(s), r, "Couldn't add fd to fd store: %m"); - if (r > 0) { - log_unit_debug(UNIT(s), "Added fd to fd store."); - fd = -1; - } + return log_unit_error_errno(UNIT(s), r, "Failed to add fd to store: %m"); + if (r > 0) + log_unit_debug(UNIT(s), "Added fd %u (%s) to fd store.", fd, strna(name)); + fd = -1; } - if (fdset_size(fds) > 0) - log_unit_warning(UNIT(s), "Tried to store more fds than FileDescriptorStoreMax=%u allows, closing remaining.", s->n_fd_store_max); - return 0; } @@ -1225,6 +1237,7 @@ static int service_spawn( return r; n_fds = r; + log_unit_debug(UNIT(s), "Passing %i fds to service", n_fds); } r = service_arm_timer(s, usec_add(now(CLOCK_MONOTONIC), timeout)); @@ -2336,7 +2349,7 @@ static int service_deserialize_item(Unit *u, const char *key, const char *value, r = service_add_fd_store(s, fd, t); if (r < 0) log_unit_error_errno(u, r, "Failed to add fd to store: %m"); - else if (r > 0) + else fdset_remove(fds, fd); } diff --git a/src/core/unit.c b/src/core/unit.c index 068d319206..463e6d6a62 100644 --- a/src/core/unit.c +++ b/src/core/unit.c @@ -320,6 +320,7 @@ int unit_set_description(Unit *u, const char *description) { bool unit_check_gc(Unit *u) { UnitActiveState state; + bool inactive; assert(u); if (u->job) @@ -329,16 +330,17 @@ bool unit_check_gc(Unit *u) { return true; state = unit_active_state(u); + inactive = state == UNIT_INACTIVE; /* If the unit is inactive and failed and no job is queued for * it, then release its runtime resources */ if (UNIT_IS_INACTIVE_OR_FAILED(state) && UNIT_VTABLE(u)->release_resources) - UNIT_VTABLE(u)->release_resources(u); + UNIT_VTABLE(u)->release_resources(u, inactive); /* But we keep the unit object around for longer when it is * referenced or configured to not be gc'ed */ - if (state != UNIT_INACTIVE) + if (!inactive) return true; if (u->perpetual) diff --git a/src/core/unit.h b/src/core/unit.h index 899cd62c92..991543664b 100644 --- a/src/core/unit.h +++ b/src/core/unit.h @@ -373,7 +373,7 @@ struct UnitVTable { /* When the unit is not running and no job for it queued we * shall release its runtime resources */ - void (*release_resources)(Unit *u); + void (*release_resources)(Unit *u, bool inactive); /* Invoked on every child that died */ void (*sigchld_event)(Unit *u, pid_t pid, int code, int status); diff --git a/src/shared/seccomp-util.c b/src/shared/seccomp-util.c index 1cbbb9d757..c9b24f1065 100644 --- a/src/shared/seccomp-util.c +++ b/src/shared/seccomp-util.c @@ -217,6 +217,24 @@ bool is_seccomp_available(void) { } const SyscallFilterSet syscall_filter_sets[_SYSCALL_FILTER_SET_MAX] = { + [SYSCALL_FILTER_SET_BASIC_IO] = { + /* Basic IO */ + .name = "@basic-io", + .value = + "close\0" + "dup2\0" + "dup3\0" + "dup\0" + "lseek\0" + "pread64\0" + "preadv\0" + "pwrite64\0" + "pwritev\0" + "read\0" + "readv\0" + "write\0" + "writev\0" + }, [SYSCALL_FILTER_SET_CLOCK] = { /* Clock */ .name = "@clock", @@ -253,15 +271,22 @@ const SyscallFilterSet syscall_filter_sets[_SYSCALL_FILTER_SET_MAX] = { "sys_debug_setcontext\0" }, [SYSCALL_FILTER_SET_DEFAULT] = { - /* Default list */ + /* Default list: the most basic of operations */ .name = "@default", .value = + "clock_getres\0" + "clock_gettime\0" + "clock_nanosleep\0" "execve\0" "exit\0" "exit_group\0" "getrlimit\0" /* make sure processes can query stack size and such */ + "gettimeofday\0" + "nanosleep\0" + "pause\0" "rt_sigreturn\0" "sigreturn\0" + "time\0" }, [SYSCALL_FILTER_SET_IO_EVENT] = { /* Event loop use */ @@ -283,9 +308,10 @@ const SyscallFilterSet syscall_filter_sets[_SYSCALL_FILTER_SET_MAX] = { "select\0" }, [SYSCALL_FILTER_SET_IPC] = { - /* Message queues, SYSV IPC or other IPC: unusual */ + /* Message queues, SYSV IPC or other IPC */ .name = "@ipc", .value = "ipc\0" + "memfd_create\0" "mq_getsetattr\0" "mq_notify\0" "mq_open\0" @@ -296,6 +322,8 @@ const SyscallFilterSet syscall_filter_sets[_SYSCALL_FILTER_SET_MAX] = { "msgget\0" "msgrcv\0" "msgsnd\0" + "pipe2\0" + "pipe\0" "process_vm_readv\0" "process_vm_writev\0" "semctl\0" @@ -436,7 +464,6 @@ const SyscallFilterSet syscall_filter_sets[_SYSCALL_FILTER_SET_MAX] = { .value = "arch_prctl\0" "clone\0" - "execve\0" "execveat\0" "fork\0" "kill\0" @@ -463,6 +490,22 @@ const SyscallFilterSet syscall_filter_sets[_SYSCALL_FILTER_SET_MAX] = { "s390_pci_mmio_write\0" #endif }, + [SYSCALL_FILTER_SET_RESOURCES] = { + /* Alter resource settings */ + .name = "@resources", + .value = + "sched_setparam\0" + "sched_setscheduler\0" + "sched_setaffinity\0" + "setpriority\0" + "setrlimit\0" + "set_mempolicy\0" + "migrate_pages\0" + "move_pages\0" + "mbind\0" + "sched_setattr\0" + "prlimit64\0" + }, }; const SyscallFilterSet *syscall_filter_set_find(const char *name) { diff --git a/src/shared/seccomp-util.h b/src/shared/seccomp-util.h index 8050fc6fbf..8e209efef2 100644 --- a/src/shared/seccomp-util.h +++ b/src/shared/seccomp-util.h @@ -38,6 +38,7 @@ typedef struct SyscallFilterSet { } SyscallFilterSet; enum { + SYSCALL_FILTER_SET_BASIC_IO, SYSCALL_FILTER_SET_CLOCK, SYSCALL_FILTER_SET_CPU_EMULATION, SYSCALL_FILTER_SET_DEBUG, @@ -52,6 +53,7 @@ enum { SYSCALL_FILTER_SET_PRIVILEGED, SYSCALL_FILTER_SET_PROCESS, SYSCALL_FILTER_SET_RAW_IO, + SYSCALL_FILTER_SET_RESOURCES, _SYSCALL_FILTER_SET_MAX }; diff --git a/src/systemctl/systemctl.c b/src/systemctl/systemctl.c index fb5a539bc6..dd3b931cd6 100644 --- a/src/systemctl/systemctl.c +++ b/src/systemctl/systemctl.c @@ -5277,7 +5277,7 @@ static int cat(int argc, char *argv[], void *userdata) { else puts(""); - if (need_daemon_reload(bus, *name)) + if (need_daemon_reload(bus, *name) > 0) /* ignore errors (<0), this is informational output */ fprintf(stderr, "%s# Warning: %s changed on disk, the version systemd has loaded is outdated.\n" "%s# This output shows the current version of the unit's original fragment and drop-in files.\n" diff --git a/src/test/test-unit-file.c b/src/test/test-unit-file.c index 7ef087a2e3..12f48bf435 100644 --- a/src/test/test-unit-file.c +++ b/src/test/test-unit-file.c @@ -589,7 +589,7 @@ static void test_install_printf(void) { assert_se(specifier_machine_id('m', NULL, NULL, &mid) >= 0 && mid); assert_se(specifier_boot_id('b', NULL, NULL, &bid) >= 0 && bid); assert_se((host = gethostname_malloc())); - assert_se((user = getusername_malloc())); + assert_se((user = uid_to_name(getuid()))); assert_se(asprintf(&uid, UID_FMT, getuid()) >= 0); #define expect(src, pattern, result) \ diff --git a/test/TEST-04-JOURNAL/test-journal.sh b/test/TEST-04-JOURNAL/test-journal.sh index 6646eccfa7..493ff00ce0 100755 --- a/test/TEST-04-JOURNAL/test-journal.sh +++ b/test/TEST-04-JOURNAL/test-journal.sh @@ -59,4 +59,12 @@ sleep 3 systemctl stop forever-print-hola [[ ! -f "/i-lose-my-logs" ]] +# https://github.com/systemd/systemd/issues/4408 +rm -f /i-lose-my-logs +systemctl start forever-print-hola +sleep 3 +systemctl kill --signal=SIGKILL systemd-journald +sleep 3 +[[ ! -f "/i-lose-my-logs" ]] + touch /testok |