diff options
80 files changed, 4465 insertions, 2633 deletions
diff --git a/Makefile-man.am b/Makefile-man.am index ab68c81b1d..3b8038611b 100644 --- a/Makefile-man.am +++ b/Makefile-man.am @@ -130,6 +130,7 @@ MANPAGES += \ man/systemd.kill.5 \ man/systemd.link.5 \ man/systemd.mount.5 \ + man/systemd.nspawn.5 \ man/systemd.path.5 \ man/systemd.preset.5 \ man/systemd.resource-control.5 \ @@ -2382,6 +2383,7 @@ EXTRA_DIST += \ man/systemd.mount.xml \ man/systemd.netdev.xml \ man/systemd.network.xml \ + man/systemd.nspawn.xml \ man/systemd.path.xml \ man/systemd.preset.xml \ man/systemd.resource-control.xml \ diff --git a/Makefile.am b/Makefile.am index 003308e580..8e0448f433 100644 --- a/Makefile.am +++ b/Makefile.am @@ -42,9 +42,9 @@ LIBUDEV_CURRENT=7 LIBUDEV_REVISION=4 LIBUDEV_AGE=6 -LIBSYSTEMD_CURRENT=10 -LIBSYSTEMD_REVISION=2 -LIBSYSTEMD_AGE=10 +LIBSYSTEMD_CURRENT=11 +LIBSYSTEMD_REVISION=0 +LIBSYSTEMD_AGE=11 # The following four libraries only exist for compatibility reasons, # their version info should not be bumped anymore @@ -155,6 +155,7 @@ noinst_PROGRAMS = TESTS = endif udevlibexec_PROGRAMS = +gperf_gperf_sources = in_files = $(filter %.in,$(EXTRA_DIST)) in_in_files = $(filter %.in.in, $(in_files)) @@ -220,6 +221,7 @@ AM_CPPFLAGS = \ -I $(top_builddir)/src/journal \ -I $(top_srcdir)/src/timedate \ -I $(top_srcdir)/src/timesync \ + -I $(top_srcdir)/src/nspawn \ -I $(top_srcdir)/src/resolve \ -I $(top_builddir)/src/resolve \ -I $(top_srcdir)/src/systemd \ @@ -2776,11 +2778,31 @@ systemd_cgtop_LDADD = \ # ------------------------------------------------------------------------------ systemd_nspawn_SOURCES = \ src/nspawn/nspawn.c \ + src/nspawn/nspawn-settings.c \ + src/nspawn/nspawn-settings.h \ + src/nspawn/nspawn-mount.c \ + src/nspawn/nspawn-mount.h \ + src/nspawn/nspawn-network.c \ + src/nspawn/nspawn-network.h \ + src/nspawn/nspawn-expose-ports.c \ + src/nspawn/nspawn-expose-ports.h \ + src/nspawn/nspawn-cgroup.c \ + src/nspawn/nspawn-cgroup.h \ + src/nspawn/nspawn-register.c \ + src/nspawn/nspawn-register.h \ + src/nspawn/nspawn-setuid.c \ + src/nspawn/nspawn-setuid.h \ src/core/mount-setup.c \ src/core/mount-setup.h \ src/core/loopback-setup.c \ src/core/loopback-setup.h +nodist_systemd_nspawn_SOURCES = \ + src/nspawn/nspawn-gperf.c + +gperf_gperf_sources += \ + src/nspawn/nspawn-gperf.gperf + systemd_nspawn_CFLAGS = \ $(AM_CFLAGS) \ $(BLKID_CFLAGS) \ @@ -3486,7 +3508,7 @@ nodist_libudev_core_la_SOURCES = \ src/udev/keyboard-keys-to-name.h \ src/udev/net/link-config-gperf.c -gperf_gperf_sources = \ +gperf_gperf_sources += \ src/udev/net/link-config-gperf.gperf libudev_core_la_CFLAGS = \ @@ -87,8 +87,8 @@ CHANGES WITH 226: * systemd-nspawn's --bind= and --bind-ro= options have been extended to allow creation of non-recursive bind mounts. - * libsystemd gained two new calls sd_pid_get_cgroup() an - sd_peer_get_cgroup() which returns the control group path of + * libsystemd gained two new calls sd_pid_get_cgroup() and + sd_peer_get_cgroup() which return the control group path of a process or peer of a connected AF_UNIX socket. This function call is particularly useful when implementing delegated subtrees support in the control group hierarchy. @@ -103,14 +103,22 @@ CHANGES WITH 226: powerful PolicyKit policies, that make decisions depending on these parameters. - Contributions from: Cristian Rodríguez, Daniel Mack, David Herrmann, - Eugene Yakubovich, Evgeny Vereshchagin, Filipe Brandenburger, Jan - Alexander Steffens (heftig), Jan Synacek, Kay Sievers, Lennart - Poettering, Mangix, Marcel Holtmann, Martin Pitt, Michal Sekletar, Peter - Hutterer, Piotr Drąg, reverendhomer, Robin Hack, Susant Sahani, Sylvain - Pasche, Thomas Hindoe Paaboel Andersen, Tom Gundersen - - -- Berlin, 2015-09-XX + * nspawn learnt support for .nspawn settings files, that may + accompany the image files or directories of containers, and + may contain additional settings for the container. This is + an alternative to configuring container parameters via the + nspawn command line. + + Contributions from: Cristian Rodríguez, Daniel Mack, David + Herrmann, Eugene Yakubovich, Evgeny Vereshchagin, Filipe + Brandenburger, Hans de Goede, Jan Alexander Steffens, Jan + Synacek, Kay Sievers, Lennart Poettering, Mangix, Marcel + Holtmann, Martin Pitt, Michael Biebl, Michael Chapman, Michal + Sekletar, Peter Hutterer, Piotr Drąg, reverendhomer, Robin + Hack, Susant Sahani, Sylvain Pasche, Thomas Hindoe Paaboel + Andersen, Tom Gundersen, Torstein Husebø + + -- Berlin, 2015-09-08 CHANGES WITH 225: @@ -26,8 +26,6 @@ External: Features: -* sd-event: somehow handle multiple queued incoming signals - * sd-event: maybe add support for inotify events * PID 1 should send out sd_notify("WATCHDOG=1") messages (for usage in the --user mode, and when run via nspawn) @@ -205,8 +203,6 @@ Features: * "machinectl list-images" should show os-release data, as well as machine-info data (including deployment level) -* nspawn: when start a container "foobar" look for its configuration in a file "foobar.nspawn" in /etc/systemd/nspawn/ as well as next to the actualy directory or image to boot - * Port various tools to make use of verbs.[ch], where applicable * "machinectl history" diff --git a/configure.ac b/configure.ac index a7070916fe..2024939ad0 100644 --- a/configure.ac +++ b/configure.ac @@ -20,7 +20,7 @@ AC_PREREQ([2.64]) AC_INIT([systemd], - [225], + [226], [http://github.com/systemd/systemd/issues], [systemd], [http://www.freedesktop.org/wiki/Software/systemd]) diff --git a/man/os-release.xml b/man/os-release.xml index 4ca2e59706..d2e2598204 100644 --- a/man/os-release.xml +++ b/man/os-release.xml @@ -214,10 +214,11 @@ <varlistentry> <term><varname>CPE_NAME=</varname></term> - <listitem><para>A CPE name for the operating system, following - the <ulink url="https://cpe.mitre.org/specification/">Common + <listitem><para>A CPE name for the operating system, in URI + binding syntax, following the + <ulink url="http://scap.nist.gov/specifications/cpe/">Common Platform Enumeration Specification</ulink> as proposed by the - MITRE Corporation. This field is optional. Example: + NIST. This field is optional. Example: <literal>CPE_NAME="cpe:/o:fedoraproject:fedora:17"</literal> </para></listitem> </varlistentry> diff --git a/man/systemctl.xml b/man/systemctl.xml index 20d143741b..37ba4ab6de 100644 --- a/man/systemctl.xml +++ b/man/systemctl.xml @@ -918,6 +918,7 @@ kobject-uevent 1 systemd-udevd-kernel.socket systemd-udevd.service <varname>RequiresOverridable=</varname>, <varname>Requisite=</varname>, <varname>RequisiteOverridable=</varname>, + <varname>ConsistsOf=</varname>, <varname>Wants=</varname>, <varname>BindsTo=</varname> dependencies. If no unit is specified, <filename>default.target</filename> is implied.</para> diff --git a/man/systemd-gpt-auto-generator.xml b/man/systemd-gpt-auto-generator.xml index 27ec72c986..f569ea3cde 100644 --- a/man/systemd-gpt-auto-generator.xml +++ b/man/systemd-gpt-auto-generator.xml @@ -69,7 +69,7 @@ units are explicitly configured (for example, listed in <citerefentry project='man-pages'><refentrytitle>fstab</refentrytitle><manvolnum>5</manvolnum></citerefentry>), - the units this generator creates are overriden, but additional + the units this generator creates are overridden, but additional automatic dependencies might be created.</para> <para>This generator will only look for root partitions on the diff --git a/man/systemd-nspawn.xml b/man/systemd-nspawn.xml index 6165fe1357..bc5dacd98f 100644 --- a/man/systemd-nspawn.xml +++ b/man/systemd-nspawn.xml @@ -1,4 +1,4 @@ -<?xml version='1.0'?> <!--*-nxml-*--> +<?xml version='1.0'?> <!--*- Mode: nxml; nxml-child-indent: 2; indent-tabs-mode: nil -*--> <!DOCTYPE refentry PUBLIC "-//OASIS//DTD DocBook XML V4.2//EN" "http://www.oasis-open.org/docbook/xml/4.2/docbookx.dtd"> @@ -748,34 +748,86 @@ </varlistentry> <varlistentry> - <term><option>--volatile</option><replaceable>=MODE</replaceable></term> + <term><option>--volatile</option></term> + <term><option>--volatile=</option><replaceable>MODE</replaceable></term> <listitem><para>Boots the container in volatile mode. When no mode parameter is passed or when mode is specified as - <literal>yes</literal> full volatile mode is enabled. This + <option>yes</option> full volatile mode is enabled. This means the root directory is mounted as mostly unpopulated <literal>tmpfs</literal> instance, and <filename>/usr</filename> from the OS tree is mounted into it, read-only (the system thus starts up with read-only OS resources, but pristine state and configuration, any changes to the either are lost on shutdown). When the mode parameter - is specified as <literal>state</literal> the OS tree is + is specified as <option>state</option> the OS tree is mounted read-only, but <filename>/var</filename> is mounted as <literal>tmpfs</literal> instance into it (the system thus starts up with read-only OS resources and configuration, but pristine state, any changes to the latter are lost on shutdown). When the mode parameter is specified as - <literal>no</literal> (the default) the whole OS tree is made + <option>no</option> (the default) the whole OS tree is made available writable.</para> - <para>Note that setting this to <literal>yes</literal> or - <literal>state</literal> will only work correctly with + <para>Note that setting this to <option>yes</option> or + <option>state</option> will only work correctly with operating systems in the container that can boot up with only <filename>/usr</filename> mounted, and are able to populate <filename>/var</filename> automatically, as needed.</para></listitem> </varlistentry> + <varlistentry> + <term><option>--settings=</option><replaceable>MODE</replaceable></term> + + <listitem><para>Controls whether + <command>systemd-nspawn</command> shall search for and use + additional per-container settings from + <filename>.nspawn</filename> files. Takes a boolean or the + special values <option>override</option> or + <option>trusted</option>.</para> + + <para>If enabled (the default) a settings file named after the + machine (as specified with the <option>--machine=</option> + setting, or derived from the directory or image file name) + with the suffix <filename>.nspawn</filename> is searched in + <filename>/etc/systemd/nspawn/</filename> and + <filename>/run/systemd/nspawn/</filename>. If it is found + there, its settings are read and used. If it is not found + there it is subsequently searched in the same directory as the + image file or in the immediate parent of the root directory of + the container. In this case, if the file is found its settings + will be also read and used, but potentially unsafe settings + are ignored. Note that in both these cases settings on the + command line take precedence over the corresponding settings + from loaded <filename>.nspawn</filename> files, if both are + specified. Unsafe settings are considered all settings that + elevate the container's privileges or grant access to + additional resources such as files or directories of the + host. For details about the format and contents of + <filename>.nspawn</filename> files consult + <citerefentry><refentrytitle>systemd.nspawn</refentrytitle><manvolnum>5</manvolnum></citerefentry>.</para> + + <para>If this option is set to <option>override</option> the + file is searched, read and used the same way, however the order of + precedence is reversed: settings read from the + <filename>.nspawn</filename> file will take precedence over + the corresponding command line options, if both are + specified.</para> + + <para>If this option is set to <option>trusted</option> the + file is searched, read and used the same way, but regardless + if found in <filename>/etc/systemd/nspawn/</filename>, + <filename>/run/systemd/nspawn/</filename> or next to the image + file or container root directory, all settings will take + effect, however command line arguments still take precedence + over corresponding settings.</para> + + <para>If disabled no <filename>.nspawn</filename> file is read + and no settings except the ones on the command line are in + effect.</para></listitem> + </varlistentry> + <xi:include href="standard-options.xml" xpointer="help" /> <xi:include href="standard-options.xml" xpointer="version" /> </variablelist> @@ -859,6 +911,7 @@ <title>See Also</title> <para> <citerefentry><refentrytitle>systemd</refentrytitle><manvolnum>1</manvolnum></citerefentry>, + <citerefentry><refentrytitle>systemd.nspawn</refentrytitle><manvolnum>5</manvolnum></citerefentry>, <citerefentry project='man-pages'><refentrytitle>chroot</refentrytitle><manvolnum>1</manvolnum></citerefentry>, <citerefentry project='mankier'><refentrytitle>dnf</refentrytitle><manvolnum>8</manvolnum></citerefentry>, <citerefentry project='die-net'><refentrytitle>yum</refentrytitle><manvolnum>8</manvolnum></citerefentry>, diff --git a/man/systemd.nspawn.xml b/man/systemd.nspawn.xml new file mode 100644 index 0000000000..7bfafb424f --- /dev/null +++ b/man/systemd.nspawn.xml @@ -0,0 +1,383 @@ +<?xml version='1.0'?> <!--*- Mode: nxml; nxml-child-indent: 2; indent-tabs-mode: nil -*--> +<!DOCTYPE refentry PUBLIC "-//OASIS//DTD DocBook XML V4.2//EN" + "http://www.oasis-open.org/docbook/xml/4.2/docbookx.dtd" [ +<!ENTITY % entities SYSTEM "custom-entities.ent" > +%entities; +]> + +<!-- + This file is part of systemd. + + Copyright 2015 Lennart Poettering + + systemd is free software; you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published by + the Free Software Foundation; either version 2.1 of the License, or + (at your option) any later version. + + systemd is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with systemd; If not, see <http://www.gnu.org/licenses/>. +--> + +<refentry id="systemd.nspawn"> + + <refentryinfo> + <title>systemd.nspawn</title> + <productname>systemd</productname> + + <authorgroup> + <author> + <contrib>Developer</contrib> + <firstname>Lennart</firstname> + <surname>Poettering</surname> + <email>lennart@poettering.net</email> + </author> + </authorgroup> + </refentryinfo> + + <refmeta> + <refentrytitle>systemd.nspawn</refentrytitle> + <manvolnum>5</manvolnum> + </refmeta> + + <refnamediv> + <refname>systemd.nspawn</refname> + <refpurpose>Container settings</refpurpose> + </refnamediv> + + <refsynopsisdiv> + <para><filename>/etc/systemd/nspawn/<replaceable>machine</replaceable>.nspawn</filename></para> + <para><filename>/run/systemd/nspawn/<replaceable>machine</replaceable>.nspawn</filename></para> + <para><filename>/var/lib/machines/<replaceable>machine</replaceable>.nspawn</filename></para> + </refsynopsisdiv> + + <refsect1> + <title>Description</title> + + <para>An nspawn container settings file (suffix + <filename>.nspawn</filename>) encodes additional runtime + information about a local container, and is searched, read and + used by + <citerefentry><refentrytitle>systemd-nspawn</refentrytitle><manvolnum>1</manvolnum></citerefentry> + when starting a container. Files of this type are named after the + containers they define settings for. They are optional, and only + required for containers whose execution environment shall differ + from the defaults. Files of this type mostly contain settings that + may also be set on the <command>systemd-nspawn</command> command + line, and make it easier to persistently attach specific settings + to specific containers. The syntax of these files is inspired by + <filename>.desktop</filename> files following the <ulink + url="http://standards.freedesktop.org/desktop-entry-spec/latest/">XDG + Desktop Entry Specification</ulink>, which are in turn inspired by + Microsoft Windows <filename>.ini</filename> files.</para> + + <para>Boolean arguments used in these settings files can be + written in various formats. For positive settings the strings + <option>1</option>, <option>yes</option>, <option>true</option> + and <option>on</option> are equivalent. For negative settings, the + strings <option>0</option>, <option>no</option>, + <option>false</option> and <option>off</option> are + equivalent.</para> + + <para>Empty lines and lines starting with # or ; are + ignored. This may be used for commenting. Lines ending + in a backslash are concatenated with the following + line while reading and the backslash is replaced by a + space character. This may be used to wrap long lines.</para> + + </refsect1> + + <refsect1> + <title><filename>.nspawn</filename> File Discovery</title> + + <para>Files are searched by appending the + <filename>.nspawn</filename> suffix to the machine name of the + container, as specified with the <option>--machine=</option> + switch of <command>systemd-nspawn</command>, or derived from the + directory or image file name. This file is first searched in + <filename>/etc/systemd/nspawn/</filename> and + <filename>/run/systemd/nspawn/</filename>. If found in these + directories its settings are read and all of them take full effect + (but are possibly overridden by corresponding command line + arguments). If not found the file will then be searched next to + the image file or in the immediate parent of the root directory of + the container. If the file is found there only a subset of the + settings will take effect however. All settings that possibly + elevate privileges or grant additional access to resources of the + host (such as files or directories) are ignored. To which options + this applies is documented below.</para> + + <para>Persistent settings file created and maintained by the + administrator (and thus trusted) should be placed in + <filename>/etc/systemd/nspawn/</filename>, while automatically + downloaded (and thus potentially untrusted) settings files are + placed in <filename>/var/lib/machines/</filename> instead (next to + the container images), where their security impact is limited. In + order to add privileged settings to <filename>.nspawn</filename> + files acquired from the image vendor it is recommended to copy the + settings files into <filename>/etc/systemd/nspawn/</filename> and + edit them there, so that the privileged options become + available. The precise algorithm how the files are searched and + interpreted may be configured with + <command>systemd-nspawn</command>'s <option>--settings=</option> + switch, see + <citerefentry><refentrytitle>systemd-nspawn</refentrytitle><manvolnum>1</manvolnum></citerefentry> + for details.</para> + </refsect1> + + <refsect1> + <title>[Exec] Section Options</title> + + <para>Settings files may include an <literal>[Exec]</literal> + section, which carries various execution parameters:</para> + + <variablelist> + + <varlistentry> + <term><varname>Boot=</varname></term> + + <listitem><para>Takes a boolean argument, defaults to off. If + enabled <command>systemd-nspawn</command> will automatically + search for an <filename>init</filename> executable and invoke + it. In this case the specified parameters using + <varname>Parameters=</varname> are passed as additional + arguments to the <filename>init</filename> process. This + setting corresponds to the <option>--boot</option> switch on + the <command>systemd-nspawn</command> command + line. </para></listitem> + </varlistentry> + + <varlistentry> + <term><varname>Parameters=</varname></term> + + <listitem><para>Takes a space separated list of + arguments. This is either a command line, beginning with the + binary name to execute, or – if <varname>Boot=</varname> is + enabled – the list of arguments to pass to the init + process. This setting corresponds to the command line + parameters passed on the <command>systemd-nspawn</command> + command line.</para></listitem> + </varlistentry> + + <varlistentry> + <term><varname>Environment=</varname></term> + + <listitem><para>Takes an environment variable assignment + consisting of key and value, separated by + <literal>=</literal>. Sets an environment variable for the + main process invoked in the container. This setting may be + used multiple times to set multiple environment variables. It + corresponds to the <option>--setenv=</option> command line + switch.</para></listitem> + </varlistentry> + + <varlistentry> + <term><varname>User=</varname></term> + + <listitem><para>Takes a UNIX user name. Specifies the user + name to invoke the main process of the container as. This user + must be known in the container's user database. This + corresponds to the <option>--user=</option> command line + switch.</para></listitem> + </varlistentry> + + <varlistentry> + <term><varname>Capability=</varname></term> + <term><varname>DropCapability=</varname></term> + + <listitem><para>Takes a space separated list of Linux process + capabilities (see + <citerefentry><refentrytitle>capabilities</refentrytitle><manvolnum>7</manvolnum></citerefentry> + for details). The <varname>Capability=</varname> setting + specifies additional capabilities to pass on top of the + default set of capabilities. The + <varname>DropCapability=</varname> setting specifies + capabilities to drop from the default set. These settings + correspond to the <option>--capability=</option> and + <option>--drop-capability=</option> command line + switches. Note that <varname>Capability=</varname> is a + privileged setting, and only takes effect in + <filename>.nspawn</filename> files in + <filename>/etc/systemd/nspawn/</filename> and + <filename>/run/system/nspawn/</filename> (see above). On the + other hand <varname>DropCapability=</varname> takes effect in + all cases.</para></listitem> + </varlistentry> + + <varlistentry> + <term><varname>Personality=</varname></term> + + <listitem><para>Configures the kernel personality for the + container. This is equivalent to the + <option>--personality=</option> switch.</para></listitem> + </varlistentry> + + <varlistentry> + <term><varname>MachineID=</varname></term> + + <listitem><para>Configures the 128bit machine ID (UUID) to pass to + the container. This is equivalent to the + <option>--uuid=</option> command line switch. This option is + privileged (see above). </para></listitem> + </varlistentry> + </variablelist> + </refsect1> + + <refsect1> + <title>[Files] Section Options</title> + + <para>Settings files may include a <literal>[Files]</literal> + section, which carries various parameters configuring the file + system of the container:</para> + + <variablelist> + + <varlistentry> + <term><varname>ReadOnly=</varname></term> + + <listitem><para>Takes a boolean argument, defaults to off. If + specified the container will be run with a read-only file + system. This setting corresponds to the + <option>--read-only</option> command line + switch.</para></listitem> + </varlistentry> + + <varlistentry> + <term><varname>Volatile=</varname></term> + + <listitem><para>Takes a boolean argument, or the special value + <literal>state</literal>. This configures whether to run the + container with volatile state and/or configuration. This + option is equivalent to <option>--volatile=</option>, see + <citerefentry><refentrytitle>systemd-nspawn</refentrytitle><manvolnum>1</manvolnum></citerefentry> + for details about the specific options + supported.</para></listitem> + </varlistentry> + + <varlistentry> + <term><varname>Bind=</varname></term> + <term><varname>BindReadOnly=</varname></term> + + <listitem><para>Adds a bind mount from the host into the + container. Takes a single path, a pair of two paths separated + by a colon, or a triplet of two paths plus an option string + separated by colons. This option may be used multiple times to + configure multiple bind mounts. This option is equivalent to + the command line switches <option>--bind=</option> and + <option>--bind-ro=</option>, see + <citerefentry><refentrytitle>systemd-nspawn</refentrytitle><manvolnum>1</manvolnum></citerefentry> + for details about the specific options supported. This setting + is privileged (see above).</para></listitem> + </varlistentry> + + <varlistentry> + <term><varname>TemporaryFileSystem=</varname></term> + + <listitem><para>Adds a <literal>tmpfs</literal> mount to the + container. Takes a path or a pair of path and option string, + separated by a colon. This option may be used multiple times to + configure multiple <literal>tmpfs</literal> mounts. This + option is equivalent to the command line switch + <option>--tmpfs=</option>, see + <citerefentry><refentrytitle>systemd-nspawn</refentrytitle><manvolnum>1</manvolnum></citerefentry> + for details about the specific options supported. This setting + is privileged (see above).</para></listitem> + </varlistentry> + </variablelist> + </refsect1> + + <refsect1> + <title>[Network] Section Options</title> + + <para>Settings files may include a <literal>[Network]</literal> + section, which carries various parameters configuring the network + connectivity of the container:</para> + + <variablelist> + + <varlistentry> + <term><varname>Private=</varname></term> + + <listitem><para>Takes a boolean argument, defaults to off. If + enabled the container will run in its own network namespace + and not share network interfaces and configuration with the + host. This setting corresponds to the + <option>--private-network</option> command line + switch.</para></listitem> + </varlistentry> + + <varlistentry> + <term><varname>VirtualEthernet=</varname></term> + + <listitem><para>Takes a boolean argument. Configures whether + to create a virtual ethernet connection + (<literal>veth</literal>) between host and the container. This + setting implies <varname>Private=yes</varname>. This setting + corresponds to the <option>--network-veth</option> command + line switch. This option is privileged (see + above).</para></listitem> + </varlistentry> + + <varlistentry> + <term><varname>Interface=</varname></term> + + <listitem><para>Takes a space separated list of interfaces to + add to the container. This option corresponds to the + <option>--network-interface=</option> command line switch and + implies <varname>Private=yes</varname>. This option is + privileged (see above).</para></listitem> + </varlistentry> + + <varlistentry> + <term><varname>MACVLAN=</varname></term> + <term><varname>IPVLAN=</varname></term> + + <listitem><para>Takes a space separated list of interfaces to + add MACLVAN or IPVLAN interfaces to, which are then added to + the container. These options correspond to the + <option>--network-macvlan=</option> and + <option>--network-ipvlan=</option> command line switches and + imply <varname>Private=yes</varname>. These options are + privileged (see above).</para></listitem> + </varlistentry> + + <varlistentry> + <term><varname>Bridge=</varname></term> + + <listitem><para>Takes an interface name. This setting implies + <varname>VirtualEthernet=yes</varname> and + <varname>Private=yes</varname> and has the effect that the + host side of the created virtual Ethernet link is connected to + the specified bridge interface. This option corresponds to the + <option>--network-bridge=</option> command line switch. This + option is privileged (see above).</para></listitem> + </varlistentry> + + <varlistentry> + <term><varname>Port=</varname></term> + + <listitem><para>Exposes a TCP or UDP port of the container on + the host. This option corresponds to the + <option>--port=</option> command line switch, see + <citerefentry><refentrytitle>systemd-nspawn</refentrytitle><manvolnum>1</manvolnum></citerefentry> + for the precise syntax of the argument this option takes. This + option is privileged (see above).</para></listitem> + </varlistentry> + </variablelist> + </refsect1> + + <refsect1> + <title>See Also</title> + <para> + <citerefentry><refentrytitle>systemd</refentrytitle><manvolnum>1</manvolnum></citerefentry>, + <citerefentry><refentrytitle>systemd-nspawn</refentrytitle><manvolnum>1</manvolnum></citerefentry>, + <citerefentry><refentrytitle>systemd.directives</refentrytitle><manvolnum>7</manvolnum></citerefentry> + </para> + </refsect1> + +</refentry> diff --git a/man/systemd.unit.xml b/man/systemd.unit.xml index 407f6d32eb..ea58580bba 100644 --- a/man/systemd.unit.xml +++ b/man/systemd.unit.xml @@ -256,7 +256,7 @@ </refsect1> <refsect1> - <title>Unit Load Path</title> + <title>Unit File Load Path</title> <para>Unit files are loaded from a set of paths determined during compilation, described in the two tables below. Unit files found diff --git a/shell-completion/bash/systemd-analyze b/shell-completion/bash/systemd-analyze index 00947029c6..7a5f46ba1d 100644 --- a/shell-completion/bash/systemd-analyze +++ b/shell-completion/bash/systemd-analyze @@ -35,8 +35,8 @@ _systemd_analyze() { local cur=${COMP_WORDS[COMP_CWORD]} prev=${COMP_WORDS[COMP_CWORD-1]} local -A OPTS=( - [STANDALONE]='--help --version --system --user --from-pattern --to-pattern --order --require --no-pager' - [ARG]='-H --host -M --machine --fuzz --man' + [STANDALONE]='--help --version --system --user --order --require --no-pager --man' + [ARG]='-H --host -M --machine --fuzz --from-pattern --to-pattern ' ) local -A VERBS=( @@ -102,7 +102,7 @@ _systemd_analyze() { elif __contains_word "$verb" ${VERBS[VERIFY]}; then if [[ $cur = -* ]]; then - comps='--help --version --system --user --no-man' + comps='--help --version --system --user --man' else comps=$( compgen -A file -- "$cur" ) compopt -o filenames diff --git a/src/basic/cgroup-util.c b/src/basic/cgroup-util.c index a298b29382..f661a54d99 100644 --- a/src/basic/cgroup-util.c +++ b/src/basic/cgroup-util.c @@ -2211,7 +2211,7 @@ static const char *cgroup_controller_table[_CGROUP_CONTROLLER_MAX] = { [CGROUP_CONTROLLER_CPUACCT] = "cpuacct", [CGROUP_CONTROLLER_BLKIO] = "blkio", [CGROUP_CONTROLLER_MEMORY] = "memory", - [CGROUP_CONTROLLER_DEVICE] = "device", + [CGROUP_CONTROLLER_DEVICE] = "devices", }; DEFINE_STRING_TABLE_LOOKUP(cgroup_controller, CGroupController); diff --git a/src/basic/conf-files.h b/src/basic/conf-files.h index 3169a907f1..d8aebc5e5b 100644 --- a/src/basic/conf-files.h +++ b/src/basic/conf-files.h @@ -22,7 +22,6 @@ along with systemd; If not, see <http://www.gnu.org/licenses/>. ***/ - -int conf_files_list(char ***strv, const char *suffix, const char *root, const char *dir, ...); -int conf_files_list_strv(char ***strv, const char *suffix, const char *root, const char* const* dirs); -int conf_files_list_nulstr(char ***strv, const char *suffix, const char *root, const char *dirs); +int conf_files_list(char ***ret, const char *suffix, const char *root, const char *dir, ...); +int conf_files_list_strv(char ***ret, const char *suffix, const char *root, const char* const* dirs); +int conf_files_list_nulstr(char ***ret, const char *suffix, const char *root, const char *dirs); diff --git a/src/basic/util.c b/src/basic/util.c index 86aacad307..3e41e6ad41 100644 --- a/src/basic/util.c +++ b/src/basic/util.c @@ -4817,7 +4817,7 @@ int shall_restore_state(void) { int proc_cmdline(char **ret) { assert(ret); - if (detect_container(NULL) > 0) + if (detect_container() > 0) return get_process_cmdline(1, 0, false, ret); else return read_one_line_file("/proc/cmdline", ret); diff --git a/src/basic/virt.c b/src/basic/virt.c index 4a4bebd528..1fc6c1baba 100644 --- a/src/basic/virt.c +++ b/src/basic/virt.c @@ -28,25 +28,24 @@ #include "virt.h" #include "fileio.h" -static int detect_vm_cpuid(const char **_id) { +static int detect_vm_cpuid(void) { /* Both CPUID and DMI are x86 specific interfaces... */ #if defined(__i386__) || defined(__x86_64__) - static const char cpuid_vendor_table[] = - "XenVMMXenVMM\0" "xen\0" - "KVMKVMKVM\0" "kvm\0" + static const struct { + const char *cpuid; + int id; + } cpuid_vendor_table[] = { + { "XenVMMXenVMM", VIRTUALIZATION_XEN }, + { "KVMKVMKVM", VIRTUALIZATION_KVM }, /* http://kb.vmware.com/selfservice/microsites/search.do?language=en_US&cmd=displayKC&externalId=1009458 */ - "VMwareVMware\0" "vmware\0" + { "VMwareVMware", VIRTUALIZATION_VMWARE }, /* http://msdn.microsoft.com/en-us/library/ff542428.aspx */ - "Microsoft Hv\0" "microsoft\0"; + { "Microsoft Hv", VIRTUALIZATION_MICROSOFT }, + }; uint32_t eax, ecx; - union { - uint32_t sig32[3]; - char text[13]; - } sig = {}; - const char *j, *k; bool hypervisor; /* http://lwn.net/Articles/301888/ */ @@ -74,6 +73,11 @@ static int detect_vm_cpuid(const char **_id) { hypervisor = !!(ecx & 0x80000000U); if (hypervisor) { + union { + uint32_t sig32[3]; + char text[13]; + } sig = {}; + unsigned j; /* There is a hypervisor, see what it is */ eax = 0x40000000U; @@ -88,57 +92,54 @@ static int detect_vm_cpuid(const char **_id) { : "0" (eax) ); - NULSTR_FOREACH_PAIR(j, k, cpuid_vendor_table) - if (streq(sig.text, j)) { - *_id = k; - return 1; - } + for (j = 0; j < ELEMENTSOF(cpuid_vendor_table); j ++) + if (streq(sig.text, cpuid_vendor_table[j].cpuid)) + return cpuid_vendor_table[j].id; - *_id = "other"; - return 0; + return VIRTUALIZATION_VM_OTHER; } #endif - return 0; + return VIRTUALIZATION_NONE; } -static int detect_vm_devicetree(const char **_id) { +static int detect_vm_device_tree(void) { #if defined(__arm__) || defined(__aarch64__) || defined(__powerpc__) || defined(__powerpc64__) _cleanup_free_ char *hvtype = NULL; int r; r = read_one_line_file("/proc/device-tree/hypervisor/compatible", &hvtype); - if (r >= 0) { - if (streq(hvtype, "linux,kvm")) { - *_id = "kvm"; - return 1; - } else if (strstr(hvtype, "xen")) { - *_id = "xen"; - return 1; - } - } else if (r == -ENOENT) { + if (r == -ENOENT) { _cleanup_closedir_ DIR *dir = NULL; struct dirent *dent; dir = opendir("/proc/device-tree"); if (!dir) { if (errno == ENOENT) - return 0; + return VIRTUALIZATION_NONE; return -errno; } - FOREACH_DIRENT(dent, dir, return -errno) { - if (strstr(dent->d_name, "fw-cfg")) { - *_id = "qemu"; - return 1; - } - } - } + FOREACH_DIRENT(dent, dir, return -errno) + if (strstr(dent->d_name, "fw-cfg")) + return VIRTUALIZATION_QEMU; + + return VIRTUALIZATION_NONE; + } else if (r < 0) + return r; + + if (streq(hvtype, "linux,kvm")) + return VIRTUALIZATION_KVM; + else if (strstr(hvtype, "xen")) + return VIRTUALIZATION_XEN; + else + return VIRTUALIZATION_VM_OTHER; +#else + return VIRTUALIZATION_NONE; #endif - return 0; } -static int detect_vm_dmi(const char **_id) { +static int detect_vm_dmi(void) { /* Both CPUID and DMI are x86 specific interfaces... */ #if defined(__i386__) || defined(__x86_64__) @@ -149,188 +150,195 @@ static int detect_vm_dmi(const char **_id) { "/sys/class/dmi/id/bios_vendor" }; - static const char dmi_vendor_table[] = - "QEMU\0" "qemu\0" + static const struct { + const char *vendor; + int id; + } dmi_vendor_table[] = { + { "QEMU", VIRTUALIZATION_QEMU }, /* http://kb.vmware.com/selfservice/microsites/search.do?language=en_US&cmd=displayKC&externalId=1009458 */ - "VMware\0" "vmware\0" - "VMW\0" "vmware\0" - "innotek GmbH\0" "oracle\0" - "Xen\0" "xen\0" - "Bochs\0" "bochs\0" - "Parallels\0" "parallels\0"; + { "VMware", VIRTUALIZATION_VMWARE }, + { "VMW", VIRTUALIZATION_VMWARE }, + { "innotek GmbH", VIRTUALIZATION_ORACLE }, + { "Xen", VIRTUALIZATION_XEN }, + { "Bochs", VIRTUALIZATION_BOCHS }, + { "Parallels", VIRTUALIZATION_PARALLELS }, + }; unsigned i; + int r; for (i = 0; i < ELEMENTSOF(dmi_vendors); i++) { _cleanup_free_ char *s = NULL; - const char *j, *k; - int r; + unsigned j; r = read_one_line_file(dmi_vendors[i], &s); if (r < 0) { - if (r != -ENOENT) - return r; + if (r == -ENOENT) + continue; - continue; + return r; } - NULSTR_FOREACH_PAIR(j, k, dmi_vendor_table) - if (startswith(s, j)) { - *_id = k; - return 1; - } + for (j = 0; j < ELEMENTSOF(dmi_vendor_table); j++) + if (startswith(s, dmi_vendor_table[j].vendor)) + return dmi_vendor_table[j].id; } #endif - return 0; + return VIRTUALIZATION_NONE; } -/* Returns a short identifier for the various VM implementations */ -int detect_vm(const char **id) { - _cleanup_free_ char *domcap = NULL, *cpuinfo_contents = NULL; - static thread_local int cached_found = -1; - static thread_local const char *cached_id = NULL; - const char *_id = NULL, *_id_cpuid = NULL; +static int detect_vm_xen(void) { + _cleanup_free_ char *domcap = NULL; + char *cap, *i; int r; - if (_likely_(cached_found >= 0)) { + r = read_one_line_file("/proc/xen/capabilities", &domcap); + if (r == -ENOENT) + return VIRTUALIZATION_NONE; - if (id) - *id = cached_id; + i = domcap; + while ((cap = strsep(&i, ","))) + if (streq(cap, "control_d")) + break; - return cached_found; - } - - /* Try xen capabilities file first, if not found try high-level hypervisor sysfs file: - * - * https://bugs.freedesktop.org/show_bug.cgi?id=77271 */ - r = read_one_line_file("/proc/xen/capabilities", &domcap); - if (r >= 0) { - char *cap, *i = domcap; + return cap ? VIRTUALIZATION_NONE : VIRTUALIZATION_XEN; +} - while ((cap = strsep(&i, ","))) - if (streq(cap, "control_d")) - break; +static int detect_vm_hypervisor(void) { + _cleanup_free_ char *hvtype = NULL; + int r; - if (!cap) { - _id = "xen"; - r = 1; - } + r = read_one_line_file("/sys/hypervisor/type", &hvtype); + if (r == -ENOENT) + return VIRTUALIZATION_NONE; + if (r < 0) + return r; - goto finish; + if (streq(hvtype, "xen")) + return VIRTUALIZATION_XEN; + else + return VIRTUALIZATION_VM_OTHER; +} - } else if (r == -ENOENT) { - _cleanup_free_ char *hvtype = NULL; +static int detect_vm_uml(void) { + _cleanup_free_ char *cpuinfo_contents = NULL; + int r; - r = read_one_line_file("/sys/hypervisor/type", &hvtype); - if (r >= 0) { - if (streq(hvtype, "xen")) { - _id = "xen"; - r = 1; - goto finish; - } - } else if (r != -ENOENT) - return r; - } else + /* Detect User-Mode Linux by reading /proc/cpuinfo */ + r = read_full_file("/proc/cpuinfo", &cpuinfo_contents, NULL); + if (r < 0) return r; + if (strstr(cpuinfo_contents, "\nvendor_id\t: User Mode Linux\n")) + return VIRTUALIZATION_UML; - /* this will set _id to "other" and return 0 for unknown hypervisors */ - r = detect_vm_cpuid(&_id); + return VIRTUALIZATION_NONE; +} - /* finish when found a known hypervisor other than kvm */ - if (r < 0 || (r > 0 && !streq(_id, "kvm"))) - goto finish; +static int detect_vm_zvm(void) { - _id_cpuid = _id; +#if defined(__s390__) + _cleanup_free_ char *t = NULL; + int r; - r = detect_vm_dmi(&_id); + r = get_status_field("/proc/sysinfo", "VM00 Control Program:", &t); + if (r == -ENOENT) + return VIRTUALIZATION_NONE; + if (r < 0) + return r; - /* kvm with and without Virtualbox */ - /* Parallels exports KVMKVMKVM leaf */ - if (streq_ptr(_id_cpuid, "kvm")) { - if (r > 0 && (streq(_id, "oracle") || streq(_id, "parallels"))) - goto finish; + if (streq(t, "z/VM")) + return VIRTUALIZATION_ZVM; + else + return VIRTUALIZATION_KVM; +#else + return VIRTUALIZATION_NONE; +#endif +} - _id = _id_cpuid; - r = 1; - goto finish; - } +/* Returns a short identifier for the various VM implementations */ +int detect_vm(void) { + static thread_local int cached_found = _VIRTUALIZATION_INVALID; + int r; - /* information from dmi */ - if (r != 0) - goto finish; + if (cached_found >= 0) + return cached_found; - r = detect_vm_devicetree(&_id); - if (r != 0) + /* Try xen capabilities file first, if not found try + * high-level hypervisor sysfs file: + * + * https://bugs.freedesktop.org/show_bug.cgi?id=77271 */ + + r = detect_vm_xen(); + if (r < 0) + return r; + if (r != VIRTUALIZATION_NONE) goto finish; - if (_id) { - /* "other" */ - r = 1; + r = detect_vm_dmi(); + if (r < 0) + return r; + if (r != VIRTUALIZATION_NONE) goto finish; - } - /* Detect User-Mode Linux by reading /proc/cpuinfo */ - r = read_full_file("/proc/cpuinfo", &cpuinfo_contents, NULL); + r = detect_vm_cpuid(); if (r < 0) return r; - if (strstr(cpuinfo_contents, "\nvendor_id\t: User Mode Linux\n")) { - _id = "uml"; - r = 1; + if (r != VIRTUALIZATION_NONE) goto finish; - } -#if defined(__s390__) - { - _cleanup_free_ char *t = NULL; + r = detect_vm_hypervisor(); + if (r < 0) + return r; + if (r != VIRTUALIZATION_NONE) + goto finish; - r = get_status_field("/proc/sysinfo", "VM00 Control Program:", &t); - if (r >= 0) { - if (streq(t, "z/VM")) - _id = "zvm"; - else - _id = "kvm"; - r = 1; + r = detect_vm_device_tree(); + if (r < 0) + return r; + if (r != VIRTUALIZATION_NONE) + goto finish; - goto finish; - } - } -#endif + r = detect_vm_uml(); + if (r < 0) + return r; + if (r != VIRTUALIZATION_NONE) + goto finish; - r = 0; + r = detect_vm_zvm(); + if (r < 0) + return r; finish: cached_found = r; - - cached_id = _id; - if (id) - *id = _id; - return r; } -int detect_container(const char **id) { +int detect_container(void) { - static thread_local int cached_found = -1; - static thread_local const char *cached_id = NULL; + static const struct { + const char *value; + int id; + } value_table[] = { + { "lxc", VIRTUALIZATION_LXC }, + { "lxc-libvirt", VIRTUALIZATION_LXC_LIBVIRT }, + { "systemd-nspawn", VIRTUALIZATION_SYSTEMD_NSPAWN }, + { "docker", VIRTUALIZATION_DOCKER }, + }; + static thread_local int cached_found = _VIRTUALIZATION_INVALID; _cleanup_free_ char *m = NULL; - const char *_id = NULL, *e = NULL; + const char *e = NULL; + unsigned j; int r; - if (_likely_(cached_found >= 0)) { - - if (id) - *id = cached_id; - + if (cached_found >= 0) return cached_found; - } /* /proc/vz exists in container and outside of the container, * /proc/bc only outside of the container. */ if (access("/proc/vz", F_OK) >= 0 && access("/proc/bc", F_OK) < 0) { - _id = "openvz"; - r = 1; + r = VIRTUALIZATION_OPENVZ; goto finish; } @@ -340,7 +348,7 @@ int detect_container(const char **id) { e = getenv("container"); if (isempty(e)) { - r = 0; + r = VIRTUALIZATION_NONE; goto finish; } } else { @@ -369,7 +377,7 @@ int detect_container(const char **id) { * as /proc/1/environ is only readable * with privileges. */ - r = 0; + r = VIRTUALIZATION_NONE; goto finish; } } @@ -379,46 +387,49 @@ int detect_container(const char **id) { e = m; } - /* We only recognize a selected few here, since we want to - * enforce a redacted namespace */ - if (streq(e, "lxc")) - _id ="lxc"; - else if (streq(e, "lxc-libvirt")) - _id = "lxc-libvirt"; - else if (streq(e, "systemd-nspawn")) - _id = "systemd-nspawn"; - else if (streq(e, "docker")) - _id = "docker"; - else - _id = "other"; + for (j = 0; j < ELEMENTSOF(value_table); j++) + if (streq(e, value_table[j].value)) { + r = value_table[j].id; + goto finish; + } - r = 1; + r = VIRTUALIZATION_NONE; finish: cached_found = r; - - cached_id = _id; - if (id) - *id = _id; - return r; } -/* Returns a short identifier for the various VM/container implementations */ -int detect_virtualization(const char **id) { +int detect_virtualization(void) { int r; - r = detect_container(id); - if (r < 0) - return r; - if (r > 0) - return VIRTUALIZATION_CONTAINER; - - r = detect_vm(id); - if (r < 0) + r = detect_container(); + if (r != 0) return r; - if (r > 0) - return VIRTUALIZATION_VM; - return VIRTUALIZATION_NONE; + return detect_vm(); } + +static const char *const virtualization_table[_VIRTUALIZATION_MAX] = { + [VIRTUALIZATION_NONE] = "none", + [VIRTUALIZATION_KVM] = "kvm", + [VIRTUALIZATION_QEMU] = "qemu", + [VIRTUALIZATION_BOCHS] = "bochs", + [VIRTUALIZATION_XEN] = "xen", + [VIRTUALIZATION_UML] = "uml", + [VIRTUALIZATION_VMWARE] = "vmware", + [VIRTUALIZATION_ORACLE] = "oracle", + [VIRTUALIZATION_MICROSOFT] = "microsoft", + [VIRTUALIZATION_ZVM] = "zvm", + [VIRTUALIZATION_PARALLELS] = "parallels", + [VIRTUALIZATION_VM_OTHER] = "vm-other", + + [VIRTUALIZATION_SYSTEMD_NSPAWN] = "systemd-nspawn", + [VIRTUALIZATION_LXC_LIBVIRT] = "lxc-libvirt", + [VIRTUALIZATION_LXC] = "lxc", + [VIRTUALIZATION_OPENVZ] = "openvz", + [VIRTUALIZATION_DOCKER] = "docker", + [VIRTUALIZATION_CONTAINER_OTHER] = "container-other", +}; + +DEFINE_STRING_TABLE_LOOKUP(virtualization, int); diff --git a/src/basic/virt.h b/src/basic/virt.h index 7194ab2bf7..449e069901 100644 --- a/src/basic/virt.h +++ b/src/basic/virt.h @@ -21,15 +21,51 @@ along with systemd; If not, see <http://www.gnu.org/licenses/>. ***/ -int detect_vm(const char **id); -int detect_container(const char **id); +#include <stdbool.h> + +#include "macro.h" enum { VIRTUALIZATION_NONE = 0, - VIRTUALIZATION_VM, - VIRTUALIZATION_CONTAINER, + + VIRTUALIZATION_VM_FIRST, + VIRTUALIZATION_KVM = VIRTUALIZATION_VM_FIRST, + VIRTUALIZATION_QEMU, + VIRTUALIZATION_BOCHS, + VIRTUALIZATION_XEN, + VIRTUALIZATION_UML, + VIRTUALIZATION_VMWARE, + VIRTUALIZATION_ORACLE, + VIRTUALIZATION_MICROSOFT, + VIRTUALIZATION_ZVM, + VIRTUALIZATION_PARALLELS, + VIRTUALIZATION_VM_OTHER, + VIRTUALIZATION_VM_LAST = VIRTUALIZATION_VM_OTHER, + + VIRTUALIZATION_CONTAINER_FIRST, + VIRTUALIZATION_SYSTEMD_NSPAWN = VIRTUALIZATION_CONTAINER_FIRST, + VIRTUALIZATION_LXC_LIBVIRT, + VIRTUALIZATION_LXC, + VIRTUALIZATION_OPENVZ, + VIRTUALIZATION_DOCKER, + VIRTUALIZATION_CONTAINER_OTHER, + VIRTUALIZATION_CONTAINER_LAST = VIRTUALIZATION_CONTAINER_OTHER, + _VIRTUALIZATION_MAX, _VIRTUALIZATION_INVALID = -1 }; -int detect_virtualization(const char **id); +static inline bool VIRTUALIZATION_IS_VM(int x) { + return x >= VIRTUALIZATION_VM_FIRST && x <= VIRTUALIZATION_VM_LAST; +} + +static inline bool VIRTUALIZATION_IS_CONTAINER(int x) { + return x >= VIRTUALIZATION_CONTAINER_FIRST && x <= VIRTUALIZATION_CONTAINER_LAST; +} + +int detect_vm(void); +int detect_container(void); +int detect_virtualization(void); + +const char *virtualization_to_string(int v) _const_; +int virtualization_from_string(const char *s) _pure_; diff --git a/src/core/dbus-manager.c b/src/core/dbus-manager.c index 0b365391ec..4e5d67fc19 100644 --- a/src/core/dbus-manager.c +++ b/src/core/dbus-manager.c @@ -81,14 +81,10 @@ static int property_get_virtualization( void *userdata, sd_bus_error *error) { - const char *id = NULL; - assert(bus); assert(reply); - detect_virtualization(&id); - - return sd_bus_message_append(reply, "s", id); + return sd_bus_message_append(reply, "s", virtualization_to_string(detect_virtualization())); } static int property_get_architecture( diff --git a/src/core/job.c b/src/core/job.c index 15f5cc0cc9..2a35d1e2de 100644 --- a/src/core/job.c +++ b/src/core/job.c @@ -1137,7 +1137,7 @@ void job_shutdown_magic(Job *j) { /* In case messages on console has been disabled on boot */ j->unit->manager->no_console_output = false; - if (detect_container(NULL) > 0) + if (detect_container() > 0) return; asynchronous_sync(); diff --git a/src/core/load-fragment-gperf.gperf.m4 b/src/core/load-fragment-gperf.gperf.m4 index 60b97722db..edd55b9e45 100644 --- a/src/core/load-fragment-gperf.gperf.m4 +++ b/src/core/load-fragment-gperf.gperf.m4 @@ -106,7 +106,7 @@ m4_define(`KILL_CONTEXT_CONFIG_ITEMS', `$1.SendSIGKILL, config_parse_bool, 0, offsetof($1, kill_context.send_sigkill) $1.SendSIGHUP, config_parse_bool, 0, offsetof($1, kill_context.send_sighup) $1.KillMode, config_parse_kill_mode, 0, offsetof($1, kill_context.kill_mode) -$1.KillSignal, config_parse_kill_signal, 0, offsetof($1, kill_context.kill_signal)' +$1.KillSignal, config_parse_signal, 0, offsetof($1, kill_context.kill_signal)' )m4_dnl m4_define(`CGROUP_CONTEXT_CONFIG_ITEMS', `$1.Slice, config_parse_unit_slice, 0, 0 diff --git a/src/core/load-fragment.c b/src/core/load-fragment.c index 745291c5c6..542bf8eb26 100644 --- a/src/core/load-fragment.c +++ b/src/core/load-fragment.c @@ -1048,7 +1048,7 @@ int config_parse_bounding_set(const char *unit, cap = capability_from_name(t); if (cap < 0) { - log_syntax(unit, LOG_ERR, filename, line, errno, "Failed to parse capability in bounding set, ignoring: %s", t); + log_syntax(unit, LOG_ERR, filename, line, EINVAL, "Failed to parse capability in bounding set, ignoring: %s", t); continue; } @@ -1143,39 +1143,8 @@ int config_parse_sysv_priority(const char *unit, #endif DEFINE_CONFIG_PARSE_ENUM(config_parse_exec_utmp_mode, exec_utmp_mode, ExecUtmpMode, "Failed to parse utmp mode"); - DEFINE_CONFIG_PARSE_ENUM(config_parse_kill_mode, kill_mode, KillMode, "Failed to parse kill mode"); -int config_parse_kill_signal(const char *unit, - const char *filename, - unsigned line, - const char *section, - unsigned section_line, - const char *lvalue, - int ltype, - const char *rvalue, - void *data, - void *userdata) { - - int *sig = data; - int r; - - assert(filename); - assert(lvalue); - assert(rvalue); - assert(sig); - - r = signal_from_string_try_harder(rvalue); - if (r <= 0) { - log_syntax(unit, LOG_ERR, filename, line, -r, - "Failed to parse kill signal, ignoring: %s", rvalue); - return 0; - } - - *sig = r; - return 0; -} - int config_parse_exec_mount_flags(const char *unit, const char *filename, unsigned line, @@ -3021,36 +2990,6 @@ int config_parse_job_mode_isolate( return 0; } -int config_parse_personality( - const char *unit, - const char *filename, - unsigned line, - const char *section, - unsigned section_line, - const char *lvalue, - int ltype, - const char *rvalue, - void *data, - void *userdata) { - - unsigned long *personality = data, p; - - assert(filename); - assert(lvalue); - assert(rvalue); - assert(personality); - - p = personality_from_string(rvalue); - if (p == PERSONALITY_INVALID) { - log_syntax(unit, LOG_ERR, filename, line, EINVAL, - "Failed to parse personality, ignoring: %s", rvalue); - return 0; - } - - *personality = p; - return 0; -} - int config_parse_runtime_directory( const char *unit, const char *filename, @@ -3720,7 +3659,7 @@ void unit_dump_config_items(FILE *f) { { config_parse_sysv_priority, "SYSVPRIORITY" }, #endif { config_parse_kill_mode, "KILLMODE" }, - { config_parse_kill_signal, "SIGNAL" }, + { config_parse_signal, "SIGNAL" }, { config_parse_socket_listen, "SOCKET [...]" }, { config_parse_socket_bind, "SOCKETBIND" }, { config_parse_socket_bindtodevice, "NETWORKINTERFACE" }, diff --git a/src/core/load-fragment.h b/src/core/load-fragment.h index fcca2b0221..1d128716c4 100644 --- a/src/core/load-fragment.h +++ b/src/core/load-fragment.h @@ -92,7 +92,6 @@ int config_parse_blockio_bandwidth(const char *unit, const char *filename, unsig int config_parse_job_mode(const char *unit, const char *filename, unsigned line, const char *section, unsigned section_line, const char *lvalue, int ltype, const char *rvalue, void *data, void *userdata); int config_parse_job_mode_isolate(const char *unit, const char *filename, unsigned line, const char *section, unsigned section_line, const char *lvalue, int ltype, const char *rvalue, void *data, void *userdata); int config_parse_exec_selinux_context(const char *unit, const char *filename, unsigned line, const char *section, unsigned section_line, const char *lvalue, int ltype, const char *rvalue, void *data, void *userdata); -int config_parse_personality(const char *unit, const char *filename, unsigned line, const char *section, unsigned section_line, const char *lvalue, int ltype, const char *rvalue, void *data, void *userdata); int config_parse_exec_apparmor_profile(const char *unit, const char *filename, unsigned line, const char *section, unsigned section_line, const char *lvalue, int ltype, const char *rvalue, void *data, void *userdata); int config_parse_exec_smack_process_label(const char *unit, const char *filename, unsigned line, const char *section, unsigned section_line, const char *lvalue, int ltype, const char *rvalue, void *data, void *userdata); int config_parse_address_families(const char *unit, const char *filename, unsigned line, const char *section, unsigned section_line, const char *lvalue, int ltype, const char *rvalue, void *data, void *userdata); diff --git a/src/core/locale-setup.c b/src/core/locale-setup.c index 108072ca9f..6961c26674 100644 --- a/src/core/locale-setup.c +++ b/src/core/locale-setup.c @@ -35,7 +35,7 @@ int locale_setup(char ***environment) { char *variables[_VARIABLE_LC_MAX] = {}; int r = 0, i; - if (detect_container(NULL) <= 0) { + if (detect_container() <= 0) { r = parse_env_file("/proc/cmdline", WHITESPACE, "locale.LANG", &variables[VARIABLE_LANG], "locale.LANGUAGE", &variables[VARIABLE_LANGUAGE], diff --git a/src/core/machine-id-setup.c b/src/core/machine-id-setup.c index 2d5ae3b3b9..8f682c6d10 100644 --- a/src/core/machine-id-setup.c +++ b/src/core/machine-id-setup.c @@ -108,7 +108,7 @@ static int generate_machine_id(char id[34], const char *root) { unsigned char *p; sd_id128_t buf; char *q; - const char *vm_id, *dbus_machine_id; + const char *dbus_machine_id; assert(id); @@ -133,8 +133,8 @@ static int generate_machine_id(char id[34], const char *root) { /* If that didn't work, see if we are running in a container, * and a machine ID was passed in via $container_uuid the way * libvirt/LXC does it */ - r = detect_container(NULL); - if (r > 0) { + + if (detect_container() > 0) { _cleanup_free_ char *e = NULL; r = getenv_for_pid(1, "container_uuid", &e); @@ -146,26 +146,24 @@ static int generate_machine_id(char id[34], const char *root) { } } - } else { + } else if (detect_vm() == VIRTUALIZATION_KVM) { + /* If we are not running in a container, see if we are * running in qemu/kvm and a machine ID was passed in * via -uuid on the qemu/kvm command line */ - r = detect_vm(&vm_id); - if (r > 0 && streq(vm_id, "kvm")) { - char uuid[36]; + char uuid[36]; - fd = open("/sys/class/dmi/id/product_uuid", O_RDONLY|O_CLOEXEC|O_NOCTTY|O_NOFOLLOW); - if (fd >= 0) { - r = loop_read_exact(fd, uuid, 36, false); - safe_close(fd); + fd = open("/sys/class/dmi/id/product_uuid", O_RDONLY|O_CLOEXEC|O_NOCTTY|O_NOFOLLOW); + if (fd >= 0) { + r = loop_read_exact(fd, uuid, 36, false); + safe_close(fd); + if (r >= 0) { + r = shorten_uuid(id, uuid); if (r >= 0) { - r = shorten_uuid(id, uuid); - if (r >= 0) { - log_info("Initializing machine ID from KVM UUID."); - return 0; - } + log_info("Initializing machine ID from KVM UUID."); + return 0; } } } diff --git a/src/core/main.c b/src/core/main.c index 4cd2b08c38..fe8f1924bd 100644 --- a/src/core/main.c +++ b/src/core/main.c @@ -374,7 +374,7 @@ static int parse_proc_cmdline_item(const char *key, const char *value) { /* Note that log_parse_environment() handles 'debug' * too, and sets the log level to LOG_DEBUG. */ - if (detect_container(NULL) > 0) + if (detect_container() > 0) log_set_target(LOG_TARGET_CONSOLE); } else if (!in_initrd() && !value) { @@ -1297,7 +1297,7 @@ int main(int argc, char *argv[]) { if (getpid() == 1) umask(0); - if (getpid() == 1 && detect_container(NULL) <= 0) { + if (getpid() == 1 && detect_container() <= 0) { /* Running outside of a container as PID 1 */ arg_running_as = MANAGER_SYSTEM; @@ -1551,14 +1551,14 @@ int main(int argc, char *argv[]) { } if (arg_running_as == MANAGER_SYSTEM) { - const char *virtualization = NULL; + int v; log_info(PACKAGE_STRING " running in %ssystem mode. (" SYSTEMD_FEATURES ")", arg_action == ACTION_TEST ? "test " : "" ); - detect_virtualization(&virtualization); - if (virtualization) - log_info("Detected virtualization %s.", virtualization); + v = detect_virtualization(); + if (v > 0) + log_info("Detected virtualization %s.", virtualization_to_string(v)); write_container_id(); @@ -2046,7 +2046,7 @@ finish: /* Avoid the creation of new processes forked by the * kernel; at this point, we will not listen to the * signals anyway */ - if (detect_container(NULL) <= 0) + if (detect_container() <= 0) (void) cg_uninstall_release_agent(SYSTEMD_CGROUP_CONTROLLER); execve(SYSTEMD_SHUTDOWN_BINARY_PATH, (char **) command_line, env_block); diff --git a/src/core/manager.c b/src/core/manager.c index fc10ddb5d9..56f2c92feb 100644 --- a/src/core/manager.c +++ b/src/core/manager.c @@ -554,7 +554,7 @@ int manager_new(ManagerRunningAs running_as, bool test_run, Manager **_m) { return -ENOMEM; #ifdef ENABLE_EFI - if (running_as == MANAGER_SYSTEM && detect_container(NULL) <= 0) + if (running_as == MANAGER_SYSTEM && detect_container() <= 0) boot_timestamps(&m->userspace_timestamp, &m->firmware_timestamp, &m->loader_timestamp); #endif @@ -2156,7 +2156,7 @@ void manager_send_unit_plymouth(Manager *m, Unit *u) { if (m->running_as != MANAGER_SYSTEM) return; - if (detect_container(NULL) > 0) + if (detect_container() > 0) return; if (u->type != UNIT_SERVICE && @@ -2613,7 +2613,7 @@ static void manager_notify_finished(Manager *m) { if (m->test_run) return; - if (m->running_as == MANAGER_SYSTEM && detect_container(NULL) <= 0) { + if (m->running_as == MANAGER_SYSTEM && detect_container() <= 0) { /* Note that m->kernel_usec.monotonic is always at 0, * and m->firmware_usec.monotonic and diff --git a/src/core/mount-setup.c b/src/core/mount-setup.c index c6f3569915..e84f80b61b 100644 --- a/src/core/mount-setup.c +++ b/src/core/mount-setup.c @@ -164,7 +164,7 @@ static int mount_one(const MountPoint *p, bool relabel) { return 0; /* Skip securityfs in a container */ - if (!(p->mode & MNT_IN_CONTAINER) && detect_container(NULL) > 0) + if (!(p->mode & MNT_IN_CONTAINER) && detect_container() > 0) return 0; /* The access mode here doesn't really matter too much, since @@ -385,7 +385,7 @@ int mount_setup(bool loaded_policy) { * nspawn and the container tools work out of the box. If * specific setups need other settings they can reset the * propagation mode to private if needed. */ - if (detect_container(NULL) <= 0) + if (detect_container() <= 0) if (mount(NULL, "/", NULL, MS_REC|MS_SHARED, NULL) < 0) log_warning_errno(errno, "Failed to set up the root directory for shared mount propagation: %m"); diff --git a/src/core/shutdown.c b/src/core/shutdown.c index aba16b4689..8a6fd25f31 100644 --- a/src/core/shutdown.c +++ b/src/core/shutdown.c @@ -202,7 +202,7 @@ int main(int argc, char *argv[]) { log_info("Sending SIGKILL to remaining processes..."); broadcast_signal(SIGKILL, true, false); - in_container = detect_container(NULL) > 0; + in_container = detect_container() > 0; need_umount = !in_container; need_swapoff = !in_container; diff --git a/src/core/swap.c b/src/core/swap.c index 4f3ddc9f04..2f462e339d 100644 --- a/src/core/swap.c +++ b/src/core/swap.c @@ -215,7 +215,7 @@ static int swap_add_default_dependencies(Swap *s) { if (UNIT(s)->manager->running_as != MANAGER_SYSTEM) return 0; - if (detect_container(NULL) > 0) + if (detect_container() > 0) return 0; return unit_add_two_dependencies_by_name(UNIT(s), UNIT_BEFORE, UNIT_CONFLICTS, SPECIAL_UMOUNT_TARGET, NULL, true); @@ -824,7 +824,7 @@ static int swap_start(Unit *u) { assert(s->state == SWAP_DEAD || s->state == SWAP_FAILED); - if (detect_container(NULL) > 0) + if (detect_container() > 0) return -EPERM; /* If there's a job for another swap unit for the same node @@ -857,7 +857,7 @@ static int swap_stop(Unit *u) { s->state == SWAP_ACTIVATING_DONE || s->state == SWAP_ACTIVE); - if (detect_container(NULL) > 0) + if (detect_container() > 0) return -EPERM; swap_enter_deactivating(s); @@ -1404,7 +1404,7 @@ static bool swap_supported(void) { if (supported < 0) supported = access("/proc/swaps", F_OK) >= 0 && - detect_container(NULL) <= 0; + detect_container() <= 0; return supported; } diff --git a/src/core/umount.c b/src/core/umount.c index d59b5d0ffb..22dbe67259 100644 --- a/src/core/umount.c +++ b/src/core/umount.c @@ -368,7 +368,7 @@ static int mount_points_list_umount(MountPoint **head, bool *changed, bool log_e read-only mount anything as that brings no real benefits, but might confuse the host, as we remount the superblock here, not the bind mound. */ - if (detect_container(NULL) <= 0) { + if (detect_container() <= 0) { /* We always try to remount directories * read-only first, before we go on and umount * them. diff --git a/src/core/unit.c b/src/core/unit.c index a5714adf38..24a6747b10 100644 --- a/src/core/unit.c +++ b/src/core/unit.c @@ -1416,6 +1416,7 @@ int unit_start(Unit *u) { assert(u); + /* Units that aren't loaded cannot be started */ if (u->load_state != UNIT_LOADED) return -EINVAL; @@ -1444,6 +1445,15 @@ int unit_start(Unit *u) { return -EPROTO; } + /* Units of types that aren't supported cannot be + * started. Note that we do this test only after the condition + * checks, so that we rather return condition check errors + * (which are usually not considered a true failure) than "not + * supported" errors (which are considered a failure). + */ + if (!unit_supported(u)) + return -EOPNOTSUPP; + /* Forward to the main object, if we aren't it. */ following = unit_following(u); if (following) { @@ -1451,9 +1461,6 @@ int unit_start(Unit *u) { return unit_start(following); } - if (!unit_supported(u)) - return -EOPNOTSUPP; - /* If it is stopped, but we cannot start it, then fail */ if (!UNIT_VTABLE(u)->start) return -EBADR; @@ -1472,6 +1479,12 @@ int unit_start(Unit *u) { bool unit_can_start(Unit *u) { assert(u); + if (u->load_state != UNIT_LOADED) + return false; + + if (!unit_supported(u)) + return false; + return !!UNIT_VTABLE(u)->start; } @@ -3497,7 +3510,7 @@ int unit_kill_context( * them.*/ if (cg_unified() > 0 || - (detect_container(NULL) == 0 && !unit_cgroup_delegate(u))) + (detect_container() == 0 && !unit_cgroup_delegate(u))) wait_for_exit = true; if (c->send_sighup && k != KILL_KILL) { diff --git a/src/detect-virt/detect-virt.c b/src/detect-virt/detect-virt.c index 606d073cbc..97ae569ca5 100644 --- a/src/detect-virt/detect-virt.c +++ b/src/detect-virt/detect-virt.c @@ -108,9 +108,7 @@ static int parse_argv(int argc, char *argv[]) { } int main(int argc, char *argv[]) { - const char *id = NULL; - int retval = EXIT_SUCCESS; - int r; + int retval = EXIT_SUCCESS, r; /* This is mostly intended to be used for scripts which want * to detect whether we are being run in a virtualized @@ -125,42 +123,39 @@ int main(int argc, char *argv[]) { switch (arg_mode) { - case ANY_VIRTUALIZATION: { - int v; - - v = detect_virtualization(&id); - if (v < 0) { - log_error_errno(v, "Failed to check for virtualization: %m"); + case ONLY_VM: + r = detect_vm(); + if (r < 0) { + log_error_errno(r, "Failed to check for vm: %m"); return EXIT_FAILURE; } - retval = v != VIRTUALIZATION_NONE ? EXIT_SUCCESS : EXIT_FAILURE; break; - } case ONLY_CONTAINER: - r = detect_container(&id); + r = detect_container(); if (r < 0) { log_error_errno(r, "Failed to check for container: %m"); return EXIT_FAILURE; } - retval = r > 0 ? EXIT_SUCCESS : EXIT_FAILURE; break; - case ONLY_VM: - r = detect_vm(&id); + case ANY_VIRTUALIZATION: + default: + r = detect_virtualization(); if (r < 0) { - log_error_errno(r, "Failed to check for vm: %m"); + log_error_errno(r, "Failed to check for virtualization: %m"); return EXIT_FAILURE; } - retval = r > 0 ? EXIT_SUCCESS : EXIT_FAILURE; break; } if (!arg_quiet) - puts(id ? id : "none"); + puts(virtualization_to_string(r)); + + retval = r != VIRTUALIZATION_NONE ? EXIT_SUCCESS : EXIT_FAILURE; return retval; } diff --git a/src/fstab-generator/fstab-generator.c b/src/fstab-generator/fstab-generator.c index a88b68e2c0..3f8ea5647c 100644 --- a/src/fstab-generator/fstab-generator.c +++ b/src/fstab-generator/fstab-generator.c @@ -65,7 +65,7 @@ static int add_swap( return 0; } - if (detect_container(NULL) > 0) { + if (detect_container() > 0) { log_info("Running in a container, ignoring fstab swap entry for %s.", what); return 0; } diff --git a/src/getty-generator/getty-generator.c b/src/getty-generator/getty-generator.c index d23caab44a..9a4b038ef3 100644 --- a/src/getty-generator/getty-generator.c +++ b/src/getty-generator/getty-generator.c @@ -142,7 +142,7 @@ int main(int argc, char *argv[]) { umask(0022); - if (detect_container(NULL) > 0) { + if (detect_container() > 0) { _cleanup_free_ char *container_ttys = NULL; log_debug("Automatically adding console shell."); diff --git a/src/gpt-auto-generator/gpt-auto-generator.c b/src/gpt-auto-generator/gpt-auto-generator.c index 0a34f86be7..bb821797f1 100644 --- a/src/gpt-auto-generator/gpt-auto-generator.c +++ b/src/gpt-auto-generator/gpt-auto-generator.c @@ -460,7 +460,7 @@ static int add_boot(const char *what) { return 0; } - if (detect_container(NULL) > 0) { + if (detect_container() > 0) { log_debug("In a container, ignoring /boot."); return 0; } @@ -526,9 +526,9 @@ static int add_boot(const char *what) { what, "/boot", "vfat", - "EFI System Partition Automount", - false, + true, "umask=0077", + "EFI System Partition Automount", 120 * USEC_PER_SEC); return r; @@ -666,6 +666,7 @@ static int enumerate_partitions(dev_t devnum) { first = udev_enumerate_get_list_entry(e); udev_list_entry_foreach(item, first) { _cleanup_udev_device_unref_ struct udev_device *q; + unsigned long long flags; const char *stype, *subnode; sd_id128_t type_id; blkid_partition pp; @@ -705,10 +706,10 @@ static int enumerate_partitions(dev_t devnum) { if (sd_id128_from_string(stype, &type_id) < 0) continue; + flags = blkid_partition_get_flags(pp); + if (sd_id128_equal(type_id, GPT_SWAP)) { - unsigned long long flags; - flags = blkid_partition_get_flags(pp); if (flags & GPT_FLAG_NO_AUTO) continue; @@ -727,6 +728,10 @@ static int enumerate_partitions(dev_t devnum) { if (boot && nr >= boot_nr) continue; + /* Note that we do not honour the "no-auto" + * flag for the ESP, as it is often unset, to + * hide it from Windows. */ + boot_nr = nr; r = free_and_strdup(&boot, subnode); @@ -734,9 +739,7 @@ static int enumerate_partitions(dev_t devnum) { return log_oom(); } else if (sd_id128_equal(type_id, GPT_HOME)) { - unsigned long long flags; - flags = blkid_partition_get_flags(pp); if (flags & GPT_FLAG_NO_AUTO) continue; @@ -752,9 +755,7 @@ static int enumerate_partitions(dev_t devnum) { return log_oom(); } else if (sd_id128_equal(type_id, GPT_SRV)) { - unsigned long long flags; - flags = blkid_partition_get_flags(pp); if (flags & GPT_FLAG_NO_AUTO) continue; @@ -799,6 +800,10 @@ static int get_block_device(const char *path, dev_t *dev) { assert(path); assert(dev); + /* Get's the block device directly backing a file system. If + * the block device is encrypted, returns the device mapper + * block device. */ + if (lstat(path, &st)) return -errno; @@ -816,6 +821,77 @@ static int get_block_device(const char *path, dev_t *dev) { return 0; } +static int get_block_device_harder(const char *path, dev_t *dev) { + _cleanup_closedir_ DIR *d = NULL; + _cleanup_free_ char *p = NULL, *t = NULL; + struct dirent *de, *found = NULL; + const char *q; + unsigned maj, min; + dev_t dt; + int r; + + assert(path); + assert(dev); + + /* Gets the backing block device for a file system, and + * handles LUKS encrypted file systems, looking for its + * immediate parent, if there is one. */ + + r = get_block_device(path, &dt); + if (r <= 0) + return r; + + if (asprintf(&p, "/sys/dev/block/%u:%u/slaves", major(dt), minor(dt)) < 0) + return -ENOMEM; + + d = opendir(p); + if (!d) { + if (errno == ENOENT) + goto fallback; + + return -errno; + } + + FOREACH_DIRENT_ALL(de, d, return -errno) { + + if (STR_IN_SET(de->d_name, ".", "..")) + continue; + + if (!IN_SET(de->d_type, DT_LNK, DT_UNKNOWN)) + continue; + + if (found) /* Don't try to support multiple backing block devices */ + goto fallback; + + found = de; + break; + } + + if (!found) + goto fallback; + + q = strjoina(p, "/", found->d_name, "/dev"); + + r = read_one_line_file(q, &t); + if (r == -ENOENT) + goto fallback; + if (r < 0) + return r; + + if (sscanf(t, "%u:%u", &maj, &min) != 2) + return -EINVAL; + + if (maj == 0) + goto fallback; + + *dev = makedev(maj, min); + return 1; + +fallback: + *dev = dt; + return 1; +} + static int parse_proc_cmdline_item(const char *key, const char *value) { int r; @@ -883,11 +959,11 @@ static int add_mounts(void) { dev_t devno; int r; - r = get_block_device("/", &devno); + r = get_block_device_harder("/", &devno); if (r < 0) return log_error_errno(r, "Failed to determine block device of root file system: %m"); else if (r == 0) { - r = get_block_device("/usr", &devno); + r = get_block_device_harder("/usr", &devno); if (r < 0) return log_error_errno(r, "Failed to determine block device of /usr file system: %m"); else if (r == 0) { @@ -916,7 +992,7 @@ int main(int argc, char *argv[]) { umask(0022); - if (detect_container(NULL) > 0) { + if (detect_container() > 0) { log_debug("In a container, exiting."); return EXIT_SUCCESS; } diff --git a/src/hostname/hostnamed.c b/src/hostname/hostnamed.c index c423be3767..e90ae54156 100644 --- a/src/hostname/hostnamed.c +++ b/src/hostname/hostnamed.c @@ -155,11 +155,11 @@ static const char* fallback_chassis(void) { unsigned t; int v; - v = detect_virtualization(NULL); + v = detect_virtualization(); - if (v == VIRTUALIZATION_VM) + if (VIRTUALIZATION_IS_VM(v)) return "vm"; - if (v == VIRTUALIZATION_CONTAINER) + if (VIRTUALIZATION_IS_CONTAINER(v)) return "container"; r = read_one_line_file("/sys/firmware/acpi/pm_profile", &type); diff --git a/src/journal/journald-kmsg.c b/src/journal/journald-kmsg.c index e5be7f7766..51fe3aa50a 100644 --- a/src/journal/journald-kmsg.c +++ b/src/journal/journald-kmsg.c @@ -190,7 +190,7 @@ static void dev_kmsg_record(Server *s, const char *p, size_t l) { for (j = 0; l > 0 && j < N_IOVEC_KERNEL_FIELDS; j++) { char *m; - /* Meta data fields attached */ + /* Metadata fields attached */ if (*k != ' ') break; diff --git a/src/libsystemd-network/dhcp-identifier.c b/src/libsystemd-network/dhcp-identifier.c index 70c68ad131..7d9cad2a70 100644 --- a/src/libsystemd-network/dhcp-identifier.c +++ b/src/libsystemd-network/dhcp-identifier.c @@ -66,7 +66,7 @@ int dhcp_identifier_set_iaid(int ifindex, uint8_t *mac, size_t mac_len, void *_i const char *name = NULL; uint64_t id; - if (detect_container(NULL) <= 0) { + if (detect_container() <= 0) { /* not in a container, udev will be around */ _cleanup_udev_unref_ struct udev *udev; char ifindex_str[2 + DECIMAL_STR_MAX(int)]; diff --git a/src/libsystemd-network/test-dhcp6-client.c b/src/libsystemd-network/test-dhcp6-client.c index 12daac3211..9c7f9ffb1b 100644 --- a/src/libsystemd-network/test-dhcp6-client.c +++ b/src/libsystemd-network/test-dhcp6-client.c @@ -357,18 +357,6 @@ static int test_hangcheck(sd_event_source *s, uint64_t usec, void *userdata) { return 0; } -int detect_vm(const char **id) { - return 1; -} - -int detect_container(const char **id) { - return 1; -} - -int detect_virtualization(const char **id) { - return 1; -} - static void test_client_solicit_cb(sd_dhcp6_client *client, int event, void *userdata) { sd_event *e = userdata; diff --git a/src/libsystemd/sd-bus/bus-container.c b/src/libsystemd/sd-bus/bus-container.c index 56dc086ae2..5c607f49b1 100644 --- a/src/libsystemd/sd-bus/bus-container.c +++ b/src/libsystemd/sd-bus/bus-container.c @@ -29,10 +29,12 @@ #include "bus-container.h" int bus_container_connect_socket(sd_bus *b) { + _cleanup_close_pair_ int pair[2] = { -1, -1 }; _cleanup_close_ int pidnsfd = -1, mntnsfd = -1, usernsfd = -1, rootfd = -1; pid_t child; siginfo_t si; - int r; + int r, error_buf = 0; + ssize_t n; assert(b); assert(b->input_fd < 0); @@ -57,6 +59,9 @@ int bus_container_connect_socket(sd_bus *b) { bus_socket_setup(b); + if (socketpair(AF_UNIX, SOCK_SEQPACKET, 0, pair) < 0) + return -errno; + child = fork(); if (child < 0) return -errno; @@ -64,9 +69,11 @@ int bus_container_connect_socket(sd_bus *b) { if (child == 0) { pid_t grandchild; + pair[0] = safe_close(pair[0]); + r = namespace_enter(pidnsfd, mntnsfd, -1, usernsfd, rootfd); if (r < 0) - _exit(255); + _exit(EXIT_FAILURE); /* We just changed PID namespace, however it will only * take effect on the children we now fork. Hence, @@ -77,16 +84,16 @@ int bus_container_connect_socket(sd_bus *b) { grandchild = fork(); if (grandchild < 0) - _exit(255); + _exit(EXIT_FAILURE); if (grandchild == 0) { r = connect(b->input_fd, &b->sockaddr.sa, b->sockaddr_size); if (r < 0) { - if (errno == EINPROGRESS) - _exit(1); - - _exit(255); + /* Try to send error up */ + error_buf = errno; + (void) write(pair[1], &error_buf, sizeof(error_buf)); + _exit(EXIT_FAILURE); } _exit(EXIT_SUCCESS); @@ -94,24 +101,41 @@ int bus_container_connect_socket(sd_bus *b) { r = wait_for_terminate(grandchild, &si); if (r < 0) - _exit(255); + _exit(EXIT_FAILURE); if (si.si_code != CLD_EXITED) - _exit(255); + _exit(EXIT_FAILURE); _exit(si.si_status); } + pair[1] = safe_close(pair[1]); + r = wait_for_terminate(child, &si); if (r < 0) return r; + n = read(pair[0], &error_buf, sizeof(error_buf)); + if (n < 0) + return -errno; + + if (n > 0) { + if (n != sizeof(error_buf)) + return -EIO; + + if (error_buf < 0) + return -EIO; + + if (error_buf == EINPROGRESS) + return 1; + + if (error_buf > 0) + return -error_buf; + } + if (si.si_code != CLD_EXITED) return -EIO; - if (si.si_status == 1) - return 1; - if (si.si_status != EXIT_SUCCESS) return -EIO; @@ -157,7 +181,7 @@ int bus_container_connect_kernel(sd_bus *b) { if (r < 0) return r; - if (socketpair(AF_UNIX, SOCK_DGRAM, 0, pair) < 0) + if (socketpair(AF_UNIX, SOCK_SEQPACKET, 0, pair) < 0) return -errno; child = fork(); diff --git a/src/libsystemd/sd-event/sd-event.c b/src/libsystemd/sd-event/sd-event.c index 62b63ec3d9..fd39a56225 100644 --- a/src/libsystemd/sd-event/sd-event.c +++ b/src/libsystemd/sd-event/sd-event.c @@ -1708,7 +1708,7 @@ _public_ int sd_event_source_set_enabled(sd_event_source *s, int m) { s->enabled = m; - r = event_make_signal_data(s->event, s->signal.sig, NULL); + r = event_make_signal_data(s->event, SIGCHLD, NULL); if (r < 0) { s->enabled = SD_EVENT_OFF; s->event->n_enabled_child_sources--; diff --git a/src/libsystemd/sd-login/sd-login.c b/src/libsystemd/sd-login/sd-login.c index 55da26e9d9..265c7c7db2 100644 --- a/src/libsystemd/sd-login/sd-login.c +++ b/src/libsystemd/sd-login/sd-login.c @@ -645,10 +645,10 @@ _public_ int sd_seat_get_active(const char *seat, char **session, uid_t *uid) { return r; if (session && !s) - return -ENOENT; + return -ENODATA; if (uid && !t) - return -ENOENT; + return -ENODATA; if (uid && t) { r = parse_uid(t, uid); diff --git a/src/locale/localectl.c b/src/locale/localectl.c index 1e06f2251c..4a91c7420a 100644 --- a/src/locale/localectl.c +++ b/src/locale/localectl.c @@ -96,7 +96,7 @@ static void print_overridden_variables(void) { LocaleVariable j; bool print_warning = true; - if (detect_container(NULL) > 0 || arg_host) + if (detect_container() > 0 || arg_host) return; r = parse_env_file("/proc/cmdline", WHITESPACE, diff --git a/src/login/logind-dbus.c b/src/login/logind-dbus.c index 14b6e0ddad..b6fa50aa52 100644 --- a/src/login/logind-dbus.c +++ b/src/login/logind-dbus.c @@ -1789,10 +1789,9 @@ static int nologin_timeout_handler( } static int update_schedule_file(Manager *m) { - - int r; + _cleanup_free_ char *temp_path = NULL; _cleanup_fclose_ FILE *f = NULL; - _cleanup_free_ char *t = NULL, *temp_path = NULL; + int r; assert(m); @@ -1800,12 +1799,6 @@ static int update_schedule_file(Manager *m) { if (r < 0) return log_error_errno(r, "Failed to create shutdown subdirectory: %m"); - if (!isempty(m->wall_message)) { - t = cescape(m->wall_message); - if (!t) - return log_oom(); - } - r = fopen_temporary("/run/systemd/shutdown/scheduled", &f, &temp_path); if (r < 0) return log_error_errno(r, "Failed to save information about scheduled shutdowns: %m"); @@ -1820,8 +1813,17 @@ static int update_schedule_file(Manager *m) { m->enable_wall_messages, m->scheduled_shutdown_type); - if (t) + if (!isempty(m->wall_message)) { + _cleanup_free_ char *t; + + t = cescape(m->wall_message); + if (!t) { + r = -ENOMEM; + goto fail; + } + fprintf(f, "WALL_MESSAGE=%s\n", t); + } r = fflush_and_check(f); if (r < 0) diff --git a/src/login/pam_systemd.c b/src/login/pam_systemd.c index 71c84d8e0a..f66f1ce842 100644 --- a/src/login/pam_systemd.c +++ b/src/login/pam_systemd.c @@ -185,7 +185,7 @@ static int export_legacy_dbus_address( if (asprintf(&s, KERNEL_USER_BUS_ADDRESS_FMT ";" UNIX_USER_BUS_ADDRESS_FMT, uid, runtime) < 0) goto error; } else { - /* FIXME: We *realy* should move the access() check into the + /* FIXME: We *really* should move the access() check into the * daemons that spawn dbus-daemon, instead of forcing * DBUS_SESSION_BUS_ADDRESS= here. */ diff --git a/src/machine/machine-dbus.c b/src/machine/machine-dbus.c index fbeace0ed6..cc38116704 100644 --- a/src/machine/machine-dbus.c +++ b/src/machine/machine-dbus.c @@ -519,7 +519,7 @@ int bus_machine_method_open_pty(sd_bus_message *message, void *userdata, sd_bus_ return sd_bus_send(NULL, reply, NULL); } -static int container_bus_new(Machine *m, sd_bus **ret) { +static int container_bus_new(Machine *m, sd_bus_error *error, sd_bus **ret) { int r; assert(m); @@ -548,6 +548,8 @@ static int container_bus_new(Machine *m, sd_bus **ret) { bus->is_system = true; r = sd_bus_start(bus); + if (r == -ENOENT) + return sd_bus_error_set_errnof(error, r, "There is no system bus in container %s.", m->name); if (r < 0) return r; @@ -602,7 +604,7 @@ int bus_machine_method_open_login(sd_bus_message *message, void *userdata, sd_bu if (!p) return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "PTS name %s is invalid", pty_name); - r = container_bus_new(m, &allocated_bus); + r = container_bus_new(m, error, &allocated_bus); if (r < 0) return r; @@ -704,7 +706,7 @@ int bus_machine_method_open_shell(sd_bus_message *message, void *userdata, sd_bu utmp_id = path_startswith(pty_name, "/dev/"); assert(utmp_id); - r = container_bus_new(m, &allocated_bus); + r = container_bus_new(m, error, &allocated_bus); if (r < 0) return r; diff --git a/src/machine/machine.c b/src/machine/machine.c index 0d1b119dc1..a056ec3b08 100644 --- a/src/machine/machine.c +++ b/src/machine/machine.c @@ -547,8 +547,18 @@ int machine_openpt(Machine *m, int flags) { switch (m->class) { - case MACHINE_HOST: - return posix_openpt(flags); + case MACHINE_HOST: { + int fd; + + fd = posix_openpt(flags); + if (fd < 0) + return -errno; + + if (unlockpt(fd) < 0) + return -errno; + + return fd; + } case MACHINE_CONTAINER: if (m->leader <= 0) diff --git a/src/network/networkd-link.c b/src/network/networkd-link.c index 1dc9db0fca..9d4a69b0db 100644 --- a/src/network/networkd-link.c +++ b/src/network/networkd-link.c @@ -2152,7 +2152,7 @@ int link_add(Manager *m, sd_netlink_message *message, Link **ret) { log_link_debug(link, "Link %d added", link->ifindex); - if (detect_container(NULL) <= 0) { + if (detect_container() <= 0) { /* not in a container, udev will be around */ sprintf(ifindex_str, "n%d", link->ifindex); device = udev_device_new_from_device_id(m->udev, ifindex_str); diff --git a/src/network/networkd-manager.c b/src/network/networkd-manager.c index 92b607297d..b4259cafef 100644 --- a/src/network/networkd-manager.c +++ b/src/network/networkd-manager.c @@ -241,7 +241,7 @@ static int manager_connect_udev(Manager *m) { /* udev does not initialize devices inside containers, * so we rely on them being already initialized before * entering the container */ - if (detect_container(NULL) > 0) + if (detect_container() > 0) return 0; m->udev = udev_new(); diff --git a/src/nspawn/.gitignore b/src/nspawn/.gitignore new file mode 100644 index 0000000000..85c81fff24 --- /dev/null +++ b/src/nspawn/.gitignore @@ -0,0 +1 @@ +/nspawn-gperf.c diff --git a/src/nspawn/nspawn-cgroup.c b/src/nspawn/nspawn-cgroup.c new file mode 100644 index 0000000000..c0e9ccd7a4 --- /dev/null +++ b/src/nspawn/nspawn-cgroup.c @@ -0,0 +1,162 @@ +/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/ + +/*** + This file is part of systemd. + + Copyright 2015 Lennart Poettering + + systemd is free software; you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published by + the Free Software Foundation; either version 2.1 of the License, or + (at your option) any later version. + + systemd is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with systemd; If not, see <http://www.gnu.org/licenses/>. +***/ + +#include <sys/mount.h> + +#include "util.h" +#include "strv.h" +#include "mkdir.h" +#include "fileio.h" +#include "cgroup-util.h" + +#include "nspawn-cgroup.h" + +int chown_cgroup(pid_t pid, uid_t uid_shift) { + _cleanup_free_ char *path = NULL, *fs = NULL; + _cleanup_close_ int fd = -1; + const char *fn; + int r; + + r = cg_pid_get_path(NULL, pid, &path); + if (r < 0) + return log_error_errno(r, "Failed to get container cgroup path: %m"); + + r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, path, NULL, &fs); + if (r < 0) + return log_error_errno(r, "Failed to get file system path for container cgroup: %m"); + + fd = open(fs, O_RDONLY|O_CLOEXEC|O_DIRECTORY); + if (fd < 0) + return log_error_errno(errno, "Failed to open %s: %m", fs); + + FOREACH_STRING(fn, + ".", + "tasks", + "notify_on_release", + "cgroup.procs", + "cgroup.clone_children", + "cgroup.controllers", + "cgroup.subtree_control", + "cgroup.populated") + if (fchownat(fd, fn, uid_shift, uid_shift, 0) < 0) + log_full_errno(errno == ENOENT ? LOG_DEBUG : LOG_WARNING, errno, + "Failed to chown() cgroup file %s, ignoring: %m", fn); + + return 0; +} + +int sync_cgroup(pid_t pid, bool unified_requested) { + _cleanup_free_ char *cgroup = NULL; + char tree[] = "/tmp/unifiedXXXXXX", pid_string[DECIMAL_STR_MAX(pid) + 1]; + bool undo_mount = false; + const char *fn; + int unified, r; + + unified = cg_unified(); + if (unified < 0) + return log_error_errno(unified, "Failed to determine whether the unified hierachy is used: %m"); + + if ((unified > 0) == unified_requested) + return 0; + + /* When the host uses the legacy cgroup setup, but the + * container shall use the unified hierarchy, let's make sure + * we copy the path from the name=systemd hierarchy into the + * unified hierarchy. Similar for the reverse situation. */ + + r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, pid, &cgroup); + if (r < 0) + return log_error_errno(r, "Failed to get control group of " PID_FMT ": %m", pid); + + /* In order to access the unified hierarchy we need to mount it */ + if (!mkdtemp(tree)) + return log_error_errno(errno, "Failed to generate temporary mount point for unified hierarchy: %m"); + + if (unified) + r = mount("cgroup", tree, "cgroup", MS_NOSUID|MS_NOEXEC|MS_NODEV, "none,name=systemd,xattr"); + else + r = mount("cgroup", tree, "cgroup", MS_NOSUID|MS_NOEXEC|MS_NODEV, "__DEVEL__sane_behavior"); + if (r < 0) { + r = log_error_errno(errno, "Failed to mount unified hierarchy: %m"); + goto finish; + } + + undo_mount = true; + + fn = strjoina(tree, cgroup, "/cgroup.procs"); + (void) mkdir_parents(fn, 0755); + + sprintf(pid_string, PID_FMT, pid); + r = write_string_file(fn, pid_string, 0); + if (r < 0) + log_error_errno(r, "Failed to move process: %m"); + +finish: + if (undo_mount) + (void) umount(tree); + + (void) rmdir(tree); + return r; +} + +int create_subcgroup(pid_t pid, bool unified_requested) { + _cleanup_free_ char *cgroup = NULL; + const char *child; + int unified, r; + CGroupMask supported; + + /* In the unified hierarchy inner nodes may only only contain + * subgroups, but not processes. Hence, if we running in the + * unified hierarchy and the container does the same, and we + * did not create a scope unit for the container move us and + * the container into two separate subcgroups. */ + + if (!unified_requested) + return 0; + + unified = cg_unified(); + if (unified < 0) + return log_error_errno(unified, "Failed to determine whether the unified hierachy is used: %m"); + if (unified == 0) + return 0; + + r = cg_mask_supported(&supported); + if (r < 0) + return log_error_errno(r, "Failed to determine supported controllers: %m"); + + r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 0, &cgroup); + if (r < 0) + return log_error_errno(r, "Failed to get our control group: %m"); + + child = strjoina(cgroup, "/payload"); + r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, child, pid); + if (r < 0) + return log_error_errno(r, "Failed to create %s subcgroup: %m", child); + + child = strjoina(cgroup, "/supervisor"); + r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, child, 0); + if (r < 0) + return log_error_errno(r, "Failed to create %s subcgroup: %m", child); + + /* Try to enable as many controllers as possible for the new payload. */ + (void) cg_enable_everywhere(supported, supported, cgroup); + return 0; +} diff --git a/src/nspawn/nspawn-cgroup.h b/src/nspawn/nspawn-cgroup.h new file mode 100644 index 0000000000..985fdfaad5 --- /dev/null +++ b/src/nspawn/nspawn-cgroup.h @@ -0,0 +1,29 @@ +/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/ + +#pragma once + +/*** + This file is part of systemd. + + Copyright 2015 Lennart Poettering + + systemd is free software; you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published by + the Free Software Foundation; either version 2.1 of the License, or + (at your option) any later version. + + systemd is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with systemd; If not, see <http://www.gnu.org/licenses/>. +***/ + +#include <sys/types.h> +#include <stdbool.h> + +int chown_cgroup(pid_t pid, uid_t uid_shift); +int sync_cgroup(pid_t pid, bool unified_requested); +int create_subcgroup(pid_t pid, bool unified_requested); diff --git a/src/nspawn/nspawn-expose-ports.c b/src/nspawn/nspawn-expose-ports.c new file mode 100644 index 0000000000..38250b6e02 --- /dev/null +++ b/src/nspawn/nspawn-expose-ports.c @@ -0,0 +1,277 @@ +/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/ + +/*** + This file is part of systemd. + + Copyright 2015 Lennart Poettering + + systemd is free software; you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published by + the Free Software Foundation; either version 2.1 of the License, or + (at your option) any later version. + + systemd is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with systemd; If not, see <http://www.gnu.org/licenses/>. +***/ + +#include "sd-netlink.h" + +#include "util.h" +#include "in-addr-util.h" +#include "firewall-util.h" +#include "local-addresses.h" +#include "netlink-util.h" + +#include "nspawn-expose-ports.h" + +int expose_port_parse(ExposePort **l, const char *s) { + + const char *split, *e; + uint16_t container_port, host_port; + int protocol; + ExposePort *p; + int r; + + assert(l); + assert(s); + + if ((e = startswith(s, "tcp:"))) + protocol = IPPROTO_TCP; + else if ((e = startswith(s, "udp:"))) + protocol = IPPROTO_UDP; + else { + e = s; + protocol = IPPROTO_TCP; + } + + split = strchr(e, ':'); + if (split) { + char v[split - e + 1]; + + memcpy(v, e, split - e); + v[split - e] = 0; + + r = safe_atou16(v, &host_port); + if (r < 0 || host_port <= 0) + return -EINVAL; + + r = safe_atou16(split + 1, &container_port); + } else { + r = safe_atou16(e, &container_port); + host_port = container_port; + } + + if (r < 0 || container_port <= 0) + return -EINVAL; + + LIST_FOREACH(ports, p, *l) + if (p->protocol == protocol && p->host_port == host_port) + return -EEXIST; + + p = new(ExposePort, 1); + if (!p) + return -ENOMEM; + + p->protocol = protocol; + p->host_port = host_port; + p->container_port = container_port; + + LIST_PREPEND(ports, *l, p); + + return 0; +} + +void expose_port_free_all(ExposePort *p) { + + while (p) { + ExposePort *q = p; + LIST_REMOVE(ports, p, q); + free(q); + } +} + +int expose_port_flush(ExposePort* l, union in_addr_union *exposed) { + ExposePort *p; + int r, af = AF_INET; + + assert(exposed); + + if (!l) + return 0; + + if (in_addr_is_null(af, exposed)) + return 0; + + log_debug("Lost IP address."); + + LIST_FOREACH(ports, p, l) { + r = fw_add_local_dnat(false, + af, + p->protocol, + NULL, + NULL, 0, + NULL, 0, + p->host_port, + exposed, + p->container_port, + NULL); + if (r < 0) + log_warning_errno(r, "Failed to modify firewall: %m"); + } + + *exposed = IN_ADDR_NULL; + return 0; +} + +int expose_port_execute(sd_netlink *rtnl, ExposePort *l, union in_addr_union *exposed) { + _cleanup_free_ struct local_address *addresses = NULL; + _cleanup_free_ char *pretty = NULL; + union in_addr_union new_exposed; + ExposePort *p; + bool add; + int af = AF_INET, r; + + assert(exposed); + + /* Invoked each time an address is added or removed inside the + * container */ + + if (!l) + return 0; + + r = local_addresses(rtnl, 0, af, &addresses); + if (r < 0) + return log_error_errno(r, "Failed to enumerate local addresses: %m"); + + add = r > 0 && + addresses[0].family == af && + addresses[0].scope < RT_SCOPE_LINK; + + if (!add) + return expose_port_flush(l, exposed); + + new_exposed = addresses[0].address; + if (in_addr_equal(af, exposed, &new_exposed)) + return 0; + + in_addr_to_string(af, &new_exposed, &pretty); + log_debug("New container IP is %s.", strna(pretty)); + + LIST_FOREACH(ports, p, l) { + + r = fw_add_local_dnat(true, + af, + p->protocol, + NULL, + NULL, 0, + NULL, 0, + p->host_port, + &new_exposed, + p->container_port, + in_addr_is_null(af, exposed) ? NULL : exposed); + if (r < 0) + log_warning_errno(r, "Failed to modify firewall: %m"); + } + + *exposed = new_exposed; + return 0; +} + +int expose_port_send_rtnl(int send_fd) { + union { + struct cmsghdr cmsghdr; + uint8_t buf[CMSG_SPACE(sizeof(int))]; + } control = {}; + struct msghdr mh = { + .msg_control = &control, + .msg_controllen = sizeof(control), + }; + struct cmsghdr *cmsg; + _cleanup_close_ int fd = -1; + ssize_t k; + + assert(send_fd >= 0); + + fd = socket(PF_NETLINK, SOCK_RAW|SOCK_CLOEXEC|SOCK_NONBLOCK, NETLINK_ROUTE); + if (fd < 0) + return log_error_errno(errno, "Failed to allocate container netlink: %m"); + + cmsg = CMSG_FIRSTHDR(&mh); + cmsg->cmsg_level = SOL_SOCKET; + cmsg->cmsg_type = SCM_RIGHTS; + cmsg->cmsg_len = CMSG_LEN(sizeof(int)); + memcpy(CMSG_DATA(cmsg), &fd, sizeof(int)); + + mh.msg_controllen = cmsg->cmsg_len; + + /* Store away the fd in the socket, so that it stays open as + * long as we run the child */ + k = sendmsg(send_fd, &mh, MSG_NOSIGNAL); + if (k < 0) + return log_error_errno(errno, "Failed to send netlink fd: %m"); + + return 0; +} + +int expose_port_watch_rtnl( + sd_event *event, + int recv_fd, + sd_netlink_message_handler_t handler, + union in_addr_union *exposed, + sd_netlink **ret) { + + union { + struct cmsghdr cmsghdr; + uint8_t buf[CMSG_SPACE(sizeof(int))]; + } control = {}; + struct msghdr mh = { + .msg_control = &control, + .msg_controllen = sizeof(control), + }; + struct cmsghdr *cmsg; + _cleanup_netlink_unref_ sd_netlink *rtnl = NULL; + int fd, r; + ssize_t k; + + assert(event); + assert(recv_fd >= 0); + assert(ret); + + k = recvmsg(recv_fd, &mh, MSG_NOSIGNAL); + if (k < 0) + return log_error_errno(errno, "Failed to recv netlink fd: %m"); + + cmsg = CMSG_FIRSTHDR(&mh); + assert(cmsg->cmsg_level == SOL_SOCKET); + assert(cmsg->cmsg_type == SCM_RIGHTS); + assert(cmsg->cmsg_len == CMSG_LEN(sizeof(int))); + memcpy(&fd, CMSG_DATA(cmsg), sizeof(int)); + + r = sd_netlink_open_fd(&rtnl, fd); + if (r < 0) { + safe_close(fd); + return log_error_errno(r, "Failed to create rtnl object: %m"); + } + + r = sd_netlink_add_match(rtnl, RTM_NEWADDR, handler, exposed); + if (r < 0) + return log_error_errno(r, "Failed to subscribe to RTM_NEWADDR messages: %m"); + + r = sd_netlink_add_match(rtnl, RTM_DELADDR, handler, exposed); + if (r < 0) + return log_error_errno(r, "Failed to subscribe to RTM_DELADDR messages: %m"); + + r = sd_netlink_attach_event(rtnl, event, 0); + if (r < 0) + return log_error_errno(r, "Failed to add to even loop: %m"); + + *ret = rtnl; + rtnl = NULL; + + return 0; +} diff --git a/src/nspawn/nspawn-expose-ports.h b/src/nspawn/nspawn-expose-ports.h new file mode 100644 index 0000000000..39cec28695 --- /dev/null +++ b/src/nspawn/nspawn-expose-ports.h @@ -0,0 +1,45 @@ +/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/ + +#pragma once + +/*** + This file is part of systemd. + + Copyright 2015 Lennart Poettering + + systemd is free software; you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published by + the Free Software Foundation; either version 2.1 of the License, or + (at your option) any later version. + + systemd is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with systemd; If not, see <http://www.gnu.org/licenses/>. +***/ + +#include <inttypes.h> + +#include "sd-event.h" +#include "sd-netlink.h" +#include "list.h" +#include "in-addr-util.h" + +typedef struct ExposePort { + int protocol; + uint16_t host_port; + uint16_t container_port; + LIST_FIELDS(struct ExposePort, ports); +} ExposePort; + +void expose_port_free_all(ExposePort *p); +int expose_port_parse(ExposePort **l, const char *s); + +int expose_port_watch_rtnl(sd_event *event, int recv_fd, sd_netlink_message_handler_t handler, union in_addr_union *exposed, sd_netlink **ret); +int expose_port_send_rtnl(int send_fd); + +int expose_port_execute(sd_netlink *rtnl, ExposePort *l, union in_addr_union *exposed); +int expose_port_flush(ExposePort* l, union in_addr_union *exposed); diff --git a/src/nspawn/nspawn-gperf.gperf b/src/nspawn/nspawn-gperf.gperf new file mode 100644 index 0000000000..074b380306 --- /dev/null +++ b/src/nspawn/nspawn-gperf.gperf @@ -0,0 +1,38 @@ +%{ +#include <stddef.h> +#include "conf-parser.h" +#include "nspawn-settings.h" +#include "nspawn-expose-ports.h" +%} +struct ConfigPerfItem; +%null_strings +%language=ANSI-C +%define slot-name section_and_lvalue +%define hash-function-name nspawn_gperf_hash +%define lookup-function-name nspawn_gperf_lookup +%readonly-tables +%omit-struct-type +%struct-type +%includes +%% +Exec.Boot, config_parse_tristate, 0, offsetof(Settings, boot) +Exec.Parameters, config_parse_strv, 0, offsetof(Settings, parameters) +Exec.Environment, config_parse_strv, 0, offsetof(Settings, environment) +Exec.User, config_parse_string, 0, offsetof(Settings, user) +Exec.Capability, config_parse_capability, 0, offsetof(Settings, capability) +Exec.DropCapability, config_parse_capability, 0, offsetof(Settings, drop_capability) +Exec.KillSignal, config_parse_signal, 0, offsetof(Settings, kill_signal) +Exec.Personality, config_parse_personality, 0, offsetof(Settings, personality) +Exec.MachineID, config_parse_id128, 0, offsetof(Settings, machine_id) +Files.ReadOnly, config_parse_tristate, 0, offsetof(Settings, read_only) +Files.Volatile, config_parse_volatile_mode, 0, offsetof(Settings, volatile_mode) +Files.Bind, config_parse_bind, 0, 0 +Files.BindReadOnly, config_parse_bind, 1, 0 +Files.TemporaryFileSystem, config_parse_tmpfs, 0, 0 +Network.Private, config_parse_tristate, 0, offsetof(Settings, private_network) +Network.Interface, config_parse_strv, 0, offsetof(Settings, network_interfaces) +Network.MACVLAN, config_parse_strv, 0, offsetof(Settings, network_macvlan) +Network.IPVLAN, config_parse_strv, 0, offsetof(Settings, network_ipvlan) +Network.VirtualEthernet, config_parse_tristate, 0, offsetof(Settings, network_veth) +Network.Bridge config_parse_string, 0, offsetof(Settings, network_bridge) +Network.Port, config_parse_expose_port, 0, 0 diff --git a/src/nspawn/nspawn-mount.c b/src/nspawn/nspawn-mount.c new file mode 100644 index 0000000000..2bca39f45d --- /dev/null +++ b/src/nspawn/nspawn-mount.c @@ -0,0 +1,869 @@ +/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/ + +/*** + This file is part of systemd. + + Copyright 2015 Lennart Poettering + + systemd is free software; you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published by + the Free Software Foundation; either version 2.1 of the License, or + (at your option) any later version. + + systemd is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with systemd; If not, see <http://www.gnu.org/licenses/>. +***/ + +#include <sys/mount.h> + +#include "util.h" +#include "rm-rf.h" +#include "strv.h" +#include "path-util.h" +#include "mkdir.h" +#include "label.h" +#include "set.h" +#include "cgroup-util.h" + +#include "nspawn-mount.h" + +CustomMount* custom_mount_add(CustomMount **l, unsigned *n, CustomMountType t) { + CustomMount *c, *ret; + + assert(l); + assert(n); + assert(t >= 0); + assert(t < _CUSTOM_MOUNT_TYPE_MAX); + + c = realloc(*l, (*n + 1) * sizeof(CustomMount)); + if (!c) + return NULL; + + *l = c; + ret = *l + *n; + (*n)++; + + *ret = (CustomMount) { .type = t }; + + return ret; +} + +void custom_mount_free_all(CustomMount *l, unsigned n) { + unsigned i; + + for (i = 0; i < n; i++) { + CustomMount *m = l + i; + + free(m->source); + free(m->destination); + free(m->options); + + if (m->work_dir) { + (void) rm_rf(m->work_dir, REMOVE_ROOT|REMOVE_PHYSICAL); + free(m->work_dir); + } + + strv_free(m->lower); + } + + free(l); +} + +int custom_mount_compare(const void *a, const void *b) { + const CustomMount *x = a, *y = b; + int r; + + r = path_compare(x->destination, y->destination); + if (r != 0) + return r; + + if (x->type < y->type) + return -1; + if (x->type > y->type) + return 1; + + return 0; +} + +int bind_mount_parse(CustomMount **l, unsigned *n, const char *s, bool read_only) { + _cleanup_free_ char *source = NULL, *destination = NULL, *opts = NULL; + const char *p = s; + CustomMount *m; + int r; + + assert(l); + assert(n); + + r = extract_many_words(&p, ":", EXTRACT_DONT_COALESCE_SEPARATORS, &source, &destination, NULL); + if (r < 0) + return r; + if (r == 0) + return -EINVAL; + + if (r == 1) { + destination = strdup(source); + if (!destination) + return -ENOMEM; + } + + if (r == 2 && !isempty(p)) { + opts = strdup(p); + if (!opts) + return -ENOMEM; + } + + if (!path_is_absolute(source)) + return -EINVAL; + + if (!path_is_absolute(destination)) + return -EINVAL; + + m = custom_mount_add(l, n, CUSTOM_MOUNT_BIND); + if (!m) + return log_oom(); + + m->source = source; + m->destination = destination; + m->read_only = read_only; + m->options = opts; + + source = destination = opts = NULL; + return 0; +} + +int tmpfs_mount_parse(CustomMount **l, unsigned *n, const char *s) { + _cleanup_free_ char *path = NULL, *opts = NULL; + const char *p = s; + CustomMount *m; + int r; + + assert(l); + assert(n); + assert(s); + + r = extract_first_word(&p, &path, ":", EXTRACT_DONT_COALESCE_SEPARATORS); + if (r < 0) + return r; + if (r == 0) + return -EINVAL; + + if (isempty(p)) + opts = strdup("mode=0755"); + else + opts = strdup(p); + if (!opts) + return -ENOMEM; + + if (!path_is_absolute(path)) + return -EINVAL; + + m = custom_mount_add(l, n, CUSTOM_MOUNT_TMPFS); + if (!m) + return -ENOMEM; + + m->destination = path; + m->options = opts; + + path = opts = NULL; + return 0; +} + +static int tmpfs_patch_options( + const char *options, + bool userns, uid_t uid_shift, uid_t uid_range, + const char *selinux_apifs_context, + char **ret) { + + char *buf = NULL; + + if (userns && uid_shift != 0) { + assert(uid_shift != UID_INVALID); + + if (options) + (void) asprintf(&buf, "%s,uid=" UID_FMT ",gid=" UID_FMT, options, uid_shift, uid_shift); + else + (void) asprintf(&buf, "uid=" UID_FMT ",gid=" UID_FMT, uid_shift, uid_shift); + if (!buf) + return -ENOMEM; + + options = buf; + } + +#ifdef HAVE_SELINUX + if (selinux_apifs_context) { + char *t; + + if (options) + t = strjoin(options, ",context=\"", selinux_apifs_context, "\"", NULL); + else + t = strjoin("context=\"", selinux_apifs_context, "\"", NULL); + if (!t) { + free(buf); + return -ENOMEM; + } + + free(buf); + buf = t; + } +#endif + + *ret = buf; + return !!buf; +} + +int mount_all(const char *dest, + bool userns, uid_t uid_shift, uid_t uid_range, + const char *selinux_apifs_context) { + + typedef struct MountPoint { + const char *what; + const char *where; + const char *type; + const char *options; + unsigned long flags; + bool fatal; + bool userns; + } MountPoint; + + static const MountPoint mount_table[] = { + { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true, true }, + { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, true, true }, /* Bind mount first */ + { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, true, true }, /* Then, make it r/o */ + { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true, false }, + { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true, false }, + { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true, false }, + { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true, false }, + { "tmpfs", "/tmp", "tmpfs", "mode=1777", MS_STRICTATIME, true, false }, +#ifdef HAVE_SELINUX + { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, false, false }, /* Bind mount first */ + { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, false, false }, /* Then, make it r/o */ +#endif + }; + + unsigned k; + int r; + + for (k = 0; k < ELEMENTSOF(mount_table); k++) { + _cleanup_free_ char *where = NULL, *options = NULL; + const char *o; + + if (userns != mount_table[k].userns) + continue; + + where = prefix_root(dest, mount_table[k].where); + if (!where) + return log_oom(); + + r = path_is_mount_point(where, AT_SYMLINK_FOLLOW); + if (r < 0 && r != -ENOENT) + return log_error_errno(r, "Failed to detect whether %s is a mount point: %m", where); + + /* Skip this entry if it is not a remount. */ + if (mount_table[k].what && r > 0) + continue; + + r = mkdir_p(where, 0755); + if (r < 0) { + if (mount_table[k].fatal) + return log_error_errno(r, "Failed to create directory %s: %m", where); + + log_warning_errno(r, "Failed to create directory %s: %m", where); + continue; + } + + o = mount_table[k].options; + if (streq_ptr(mount_table[k].type, "tmpfs")) { + r = tmpfs_patch_options(o, userns, uid_shift, uid_range, selinux_apifs_context, &options); + if (r < 0) + return log_oom(); + if (r > 0) + o = options; + } + + if (mount(mount_table[k].what, + where, + mount_table[k].type, + mount_table[k].flags, + o) < 0) { + + if (mount_table[k].fatal) + return log_error_errno(errno, "mount(%s) failed: %m", where); + + log_warning_errno(errno, "mount(%s) failed, ignoring: %m", where); + } + } + + return 0; +} + +static int parse_mount_bind_options(const char *options, unsigned long *mount_flags, char **mount_opts) { + const char *p = options; + unsigned long flags = *mount_flags; + char *opts = NULL; + + assert(options); + + for (;;) { + _cleanup_free_ char *word = NULL; + int r = extract_first_word(&p, &word, ",", 0); + if (r < 0) + return log_error_errno(r, "Failed to extract mount option: %m"); + if (r == 0) + break; + + if (streq(word, "rbind")) + flags |= MS_REC; + else if (streq(word, "norbind")) + flags &= ~MS_REC; + else { + log_error("Invalid bind mount option: %s", word); + return -EINVAL; + } + } + + *mount_flags = flags; + /* in the future mount_opts will hold string options for mount(2) */ + *mount_opts = opts; + + return 0; +} + +static int mount_bind(const char *dest, CustomMount *m) { + struct stat source_st, dest_st; + const char *where; + unsigned long mount_flags = MS_BIND | MS_REC; + _cleanup_free_ char *mount_opts = NULL; + int r; + + assert(m); + + if (m->options) { + r = parse_mount_bind_options(m->options, &mount_flags, &mount_opts); + if (r < 0) + return r; + } + + if (stat(m->source, &source_st) < 0) + return log_error_errno(errno, "Failed to stat %s: %m", m->source); + + where = prefix_roota(dest, m->destination); + + if (stat(where, &dest_st) >= 0) { + if (S_ISDIR(source_st.st_mode) && !S_ISDIR(dest_st.st_mode)) { + log_error("Cannot bind mount directory %s on file %s.", m->source, where); + return -EINVAL; + } + + if (!S_ISDIR(source_st.st_mode) && S_ISDIR(dest_st.st_mode)) { + log_error("Cannot bind mount file %s on directory %s.", m->source, where); + return -EINVAL; + } + + } else if (errno == ENOENT) { + r = mkdir_parents_label(where, 0755); + if (r < 0) + return log_error_errno(r, "Failed to make parents of %s: %m", where); + } else { + log_error_errno(errno, "Failed to stat %s: %m", where); + return -errno; + } + + /* Create the mount point. Any non-directory file can be + * mounted on any non-directory file (regular, fifo, socket, + * char, block). + */ + if (S_ISDIR(source_st.st_mode)) + r = mkdir_label(where, 0755); + else + r = touch(where); + if (r < 0 && r != -EEXIST) + return log_error_errno(r, "Failed to create mount point %s: %m", where); + + if (mount(m->source, where, NULL, mount_flags, mount_opts) < 0) + return log_error_errno(errno, "mount(%s) failed: %m", where); + + if (m->read_only) { + r = bind_remount_recursive(where, true); + if (r < 0) + return log_error_errno(r, "Read-only bind mount failed: %m"); + } + + return 0; +} + +static int mount_tmpfs( + const char *dest, + CustomMount *m, + bool userns, uid_t uid_shift, uid_t uid_range, + const char *selinux_apifs_context) { + + const char *where, *options; + _cleanup_free_ char *buf = NULL; + int r; + + assert(dest); + assert(m); + + where = prefix_roota(dest, m->destination); + + r = mkdir_p_label(where, 0755); + if (r < 0 && r != -EEXIST) + return log_error_errno(r, "Creating mount point for tmpfs %s failed: %m", where); + + r = tmpfs_patch_options(m->options, userns, uid_shift, uid_range, selinux_apifs_context, &buf); + if (r < 0) + return log_oom(); + options = r > 0 ? buf : m->options; + + if (mount("tmpfs", where, "tmpfs", MS_NODEV|MS_STRICTATIME, options) < 0) + return log_error_errno(errno, "tmpfs mount to %s failed: %m", where); + + return 0; +} + +static char *joined_and_escaped_lower_dirs(char * const *lower) { + _cleanup_strv_free_ char **sv = NULL; + + sv = strv_copy(lower); + if (!sv) + return NULL; + + strv_reverse(sv); + + if (!strv_shell_escape(sv, ",:")) + return NULL; + + return strv_join(sv, ":"); +} + +static int mount_overlay(const char *dest, CustomMount *m) { + _cleanup_free_ char *lower = NULL; + const char *where, *options; + int r; + + assert(dest); + assert(m); + + where = prefix_roota(dest, m->destination); + + r = mkdir_label(where, 0755); + if (r < 0 && r != -EEXIST) + return log_error_errno(r, "Creating mount point for overlay %s failed: %m", where); + + (void) mkdir_p_label(m->source, 0755); + + lower = joined_and_escaped_lower_dirs(m->lower); + if (!lower) + return log_oom(); + + if (m->read_only) { + _cleanup_free_ char *escaped_source = NULL; + + escaped_source = shell_escape(m->source, ",:"); + if (!escaped_source) + return log_oom(); + + options = strjoina("lowerdir=", escaped_source, ":", lower); + } else { + _cleanup_free_ char *escaped_source = NULL, *escaped_work_dir = NULL; + + assert(m->work_dir); + (void) mkdir_label(m->work_dir, 0700); + + escaped_source = shell_escape(m->source, ",:"); + if (!escaped_source) + return log_oom(); + escaped_work_dir = shell_escape(m->work_dir, ",:"); + if (!escaped_work_dir) + return log_oom(); + + options = strjoina("lowerdir=", lower, ",upperdir=", escaped_source, ",workdir=", escaped_work_dir); + } + + if (mount("overlay", where, "overlay", m->read_only ? MS_RDONLY : 0, options) < 0) + return log_error_errno(errno, "overlay mount to %s failed: %m", where); + + return 0; +} + +int mount_custom( + const char *dest, + CustomMount *mounts, unsigned n, + bool userns, uid_t uid_shift, uid_t uid_range, + const char *selinux_apifs_context) { + + unsigned i; + int r; + + assert(dest); + + for (i = 0; i < n; i++) { + CustomMount *m = mounts + i; + + switch (m->type) { + + case CUSTOM_MOUNT_BIND: + r = mount_bind(dest, m); + break; + + case CUSTOM_MOUNT_TMPFS: + r = mount_tmpfs(dest, m, userns, uid_shift, uid_range, selinux_apifs_context); + break; + + case CUSTOM_MOUNT_OVERLAY: + r = mount_overlay(dest, m); + break; + + default: + assert_not_reached("Unknown custom mount type"); + } + + if (r < 0) + return r; + } + + return 0; +} + +static int mount_legacy_cgroup_hierarchy(const char *dest, const char *controller, const char *hierarchy, bool read_only) { + char *to; + int r; + + to = strjoina(dest, "/sys/fs/cgroup/", hierarchy); + + r = path_is_mount_point(to, 0); + if (r < 0 && r != -ENOENT) + return log_error_errno(r, "Failed to determine if %s is mounted already: %m", to); + if (r > 0) + return 0; + + mkdir_p(to, 0755); + + /* The superblock mount options of the mount point need to be + * identical to the hosts', and hence writable... */ + if (mount("cgroup", to, "cgroup", MS_NOSUID|MS_NOEXEC|MS_NODEV, controller) < 0) + return log_error_errno(errno, "Failed to mount to %s: %m", to); + + /* ... hence let's only make the bind mount read-only, not the + * superblock. */ + if (read_only) { + if (mount(NULL, to, NULL, MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL) < 0) + return log_error_errno(errno, "Failed to remount %s read-only: %m", to); + } + return 1; +} + +static int mount_legacy_cgroups( + const char *dest, + bool userns, uid_t uid_shift, uid_t uid_range, + const char *selinux_apifs_context) { + + _cleanup_set_free_free_ Set *controllers = NULL; + const char *cgroup_root; + int r; + + cgroup_root = prefix_roota(dest, "/sys/fs/cgroup"); + + /* Mount a tmpfs to /sys/fs/cgroup if it's not mounted there yet. */ + r = path_is_mount_point(cgroup_root, AT_SYMLINK_FOLLOW); + if (r < 0) + return log_error_errno(r, "Failed to determine if /sys/fs/cgroup is already mounted: %m"); + if (r == 0) { + _cleanup_free_ char *options = NULL; + + r = tmpfs_patch_options("mode=755", userns, uid_shift, uid_range, selinux_apifs_context, &options); + if (r < 0) + return log_oom(); + + if (mount("tmpfs", cgroup_root, "tmpfs", MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME, options) < 0) + return log_error_errno(errno, "Failed to mount /sys/fs/cgroup: %m"); + } + + if (cg_unified() > 0) + goto skip_controllers; + + controllers = set_new(&string_hash_ops); + if (!controllers) + return log_oom(); + + r = cg_kernel_controllers(controllers); + if (r < 0) + return log_error_errno(r, "Failed to determine cgroup controllers: %m"); + + for (;;) { + _cleanup_free_ char *controller = NULL, *origin = NULL, *combined = NULL; + + controller = set_steal_first(controllers); + if (!controller) + break; + + origin = prefix_root("/sys/fs/cgroup/", controller); + if (!origin) + return log_oom(); + + r = readlink_malloc(origin, &combined); + if (r == -EINVAL) { + /* Not a symbolic link, but directly a single cgroup hierarchy */ + + r = mount_legacy_cgroup_hierarchy(dest, controller, controller, true); + if (r < 0) + return r; + + } else if (r < 0) + return log_error_errno(r, "Failed to read link %s: %m", origin); + else { + _cleanup_free_ char *target = NULL; + + target = prefix_root(dest, origin); + if (!target) + return log_oom(); + + /* A symbolic link, a combination of controllers in one hierarchy */ + + if (!filename_is_valid(combined)) { + log_warning("Ignoring invalid combined hierarchy %s.", combined); + continue; + } + + r = mount_legacy_cgroup_hierarchy(dest, combined, combined, true); + if (r < 0) + return r; + + r = symlink_idempotent(combined, target); + if (r == -EINVAL) { + log_error("Invalid existing symlink for combined hierarchy"); + return r; + } + if (r < 0) + return log_error_errno(r, "Failed to create symlink for combined hierarchy: %m"); + } + } + +skip_controllers: + r = mount_legacy_cgroup_hierarchy(dest, "none,name=systemd,xattr", "systemd", false); + if (r < 0) + return r; + + if (mount(NULL, cgroup_root, NULL, MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME|MS_RDONLY, "mode=755") < 0) + return log_error_errno(errno, "Failed to remount %s read-only: %m", cgroup_root); + + return 0; +} + +static int mount_unified_cgroups(const char *dest) { + const char *p; + int r; + + assert(dest); + + p = strjoina(dest, "/sys/fs/cgroup"); + + r = path_is_mount_point(p, AT_SYMLINK_FOLLOW); + if (r < 0) + return log_error_errno(r, "Failed to determine if %s is mounted already: %m", p); + if (r > 0) { + p = strjoina(dest, "/sys/fs/cgroup/cgroup.procs"); + if (access(p, F_OK) >= 0) + return 0; + if (errno != ENOENT) + return log_error_errno(errno, "Failed to determine if mount point %s contains the unified cgroup hierarchy: %m", p); + + log_error("%s is already mounted but not a unified cgroup hierarchy. Refusing.", p); + return -EINVAL; + } + + if (mount("cgroup", p, "cgroup", MS_NOSUID|MS_NOEXEC|MS_NODEV, "__DEVEL__sane_behavior") < 0) + return log_error_errno(errno, "Failed to mount unified cgroup hierarchy to %s: %m", p); + + return 0; +} + +int mount_cgroups( + const char *dest, + bool unified_requested, + bool userns, uid_t uid_shift, uid_t uid_range, + const char *selinux_apifs_context) { + + if (unified_requested) + return mount_unified_cgroups(dest); + else + return mount_legacy_cgroups(dest, userns, uid_shift, uid_range, selinux_apifs_context); +} + +int mount_systemd_cgroup_writable( + const char *dest, + bool unified_requested) { + + _cleanup_free_ char *own_cgroup_path = NULL; + const char *systemd_root, *systemd_own; + int r; + + assert(dest); + + r = cg_pid_get_path(NULL, 0, &own_cgroup_path); + if (r < 0) + return log_error_errno(r, "Failed to determine our own cgroup path: %m"); + + /* If we are living in the top-level, then there's nothing to do... */ + if (path_equal(own_cgroup_path, "/")) + return 0; + + if (unified_requested) { + systemd_own = strjoina(dest, "/sys/fs/cgroup", own_cgroup_path); + systemd_root = prefix_roota(dest, "/sys/fs/cgroup"); + } else { + systemd_own = strjoina(dest, "/sys/fs/cgroup/systemd", own_cgroup_path); + systemd_root = prefix_roota(dest, "/sys/fs/cgroup/systemd"); + } + + /* Make our own cgroup a (writable) bind mount */ + if (mount(systemd_own, systemd_own, NULL, MS_BIND, NULL) < 0) + return log_error_errno(errno, "Failed to turn %s into a bind mount: %m", own_cgroup_path); + + /* And then remount the systemd cgroup root read-only */ + if (mount(NULL, systemd_root, NULL, MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL) < 0) + return log_error_errno(errno, "Failed to mount cgroup root read-only: %m"); + + return 0; +} + +int setup_volatile_state( + const char *directory, + VolatileMode mode, + bool userns, uid_t uid_shift, uid_t uid_range, + const char *selinux_apifs_context) { + + _cleanup_free_ char *buf = NULL; + const char *p, *options; + int r; + + assert(directory); + + if (mode != VOLATILE_STATE) + return 0; + + /* --volatile=state means we simply overmount /var + with a tmpfs, and the rest read-only. */ + + r = bind_remount_recursive(directory, true); + if (r < 0) + return log_error_errno(r, "Failed to remount %s read-only: %m", directory); + + p = prefix_roota(directory, "/var"); + r = mkdir(p, 0755); + if (r < 0 && errno != EEXIST) + return log_error_errno(errno, "Failed to create %s: %m", directory); + + options = "mode=755"; + r = tmpfs_patch_options(options, userns, uid_shift, uid_range, selinux_apifs_context, &buf); + if (r < 0) + return log_oom(); + if (r > 0) + options = buf; + + if (mount("tmpfs", p, "tmpfs", MS_STRICTATIME, options) < 0) + return log_error_errno(errno, "Failed to mount tmpfs to /var: %m"); + + return 0; +} + +int setup_volatile( + const char *directory, + VolatileMode mode, + bool userns, uid_t uid_shift, uid_t uid_range, + const char *selinux_apifs_context) { + + bool tmpfs_mounted = false, bind_mounted = false; + char template[] = "/tmp/nspawn-volatile-XXXXXX"; + _cleanup_free_ char *buf = NULL; + const char *f, *t, *options; + int r; + + assert(directory); + + if (mode != VOLATILE_YES) + return 0; + + /* --volatile=yes means we mount a tmpfs to the root dir, and + the original /usr to use inside it, and that read-only. */ + + if (!mkdtemp(template)) + return log_error_errno(errno, "Failed to create temporary directory: %m"); + + options = "mode=755"; + r = tmpfs_patch_options(options, userns, uid_shift, uid_range, selinux_apifs_context, &buf); + if (r < 0) + return log_oom(); + if (r > 0) + options = buf; + + if (mount("tmpfs", template, "tmpfs", MS_STRICTATIME, options) < 0) { + r = log_error_errno(errno, "Failed to mount tmpfs for root directory: %m"); + goto fail; + } + + tmpfs_mounted = true; + + f = prefix_roota(directory, "/usr"); + t = prefix_roota(template, "/usr"); + + r = mkdir(t, 0755); + if (r < 0 && errno != EEXIST) { + r = log_error_errno(errno, "Failed to create %s: %m", t); + goto fail; + } + + if (mount(f, t, NULL, MS_BIND|MS_REC, NULL) < 0) { + r = log_error_errno(errno, "Failed to create /usr bind mount: %m"); + goto fail; + } + + bind_mounted = true; + + r = bind_remount_recursive(t, true); + if (r < 0) { + log_error_errno(r, "Failed to remount %s read-only: %m", t); + goto fail; + } + + if (mount(template, directory, NULL, MS_MOVE, NULL) < 0) { + r = log_error_errno(errno, "Failed to move root mount: %m"); + goto fail; + } + + (void) rmdir(template); + + return 0; + +fail: + if (bind_mounted) + (void) umount(t); + + if (tmpfs_mounted) + (void) umount(template); + (void) rmdir(template); + return r; +} + +VolatileMode volatile_mode_from_string(const char *s) { + int b; + + if (isempty(s)) + return _VOLATILE_MODE_INVALID; + + b = parse_boolean(s); + if (b > 0) + return VOLATILE_YES; + if (b == 0) + return VOLATILE_NO; + + if (streq(s, "state")) + return VOLATILE_STATE; + + return _VOLATILE_MODE_INVALID; +} diff --git a/src/nspawn/nspawn-mount.h b/src/nspawn/nspawn-mount.h new file mode 100644 index 0000000000..5abd44cc4b --- /dev/null +++ b/src/nspawn/nspawn-mount.h @@ -0,0 +1,70 @@ +/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/ + +#pragma once + +/*** + This file is part of systemd. + + Copyright 2015 Lennart Poettering + + systemd is free software; you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published by + the Free Software Foundation; either version 2.1 of the License, or + (at your option) any later version. + + systemd is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with systemd; If not, see <http://www.gnu.org/licenses/>. +***/ + +#include <stdbool.h> + +typedef enum VolatileMode { + VOLATILE_NO, + VOLATILE_YES, + VOLATILE_STATE, + _VOLATILE_MODE_MAX, + _VOLATILE_MODE_INVALID = -1 +} VolatileMode; + +typedef enum CustomMountType { + CUSTOM_MOUNT_BIND, + CUSTOM_MOUNT_TMPFS, + CUSTOM_MOUNT_OVERLAY, + _CUSTOM_MOUNT_TYPE_MAX, + _CUSTOM_MOUNT_TYPE_INVALID = -1 +} CustomMountType; + +typedef struct CustomMount { + CustomMountType type; + bool read_only; + char *source; /* for overlayfs this is the upper directory */ + char *destination; + char *options; + char *work_dir; + char **lower; +} CustomMount; + +CustomMount* custom_mount_add(CustomMount **l, unsigned *n, CustomMountType t); + +void custom_mount_free_all(CustomMount *l, unsigned n); +int bind_mount_parse(CustomMount **l, unsigned *n, const char *s, bool read_only); +int tmpfs_mount_parse(CustomMount **l, unsigned *n, const char *s); + +int custom_mount_compare(const void *a, const void *b); + +int mount_all(const char *dest, bool userns, uid_t uid_shift, uid_t uid_range, const char *selinux_apifs_context); + +int mount_cgroups(const char *dest, bool unified_requested, bool userns, uid_t uid_shift, uid_t uid_range, const char *selinux_apifs_context); +int mount_systemd_cgroup_writable(const char *dest, bool unified_requested); + +int mount_custom(const char *dest, CustomMount *mounts, unsigned n, bool userns, uid_t uid_shift, uid_t uid_range, const char *selinux_apifs_context); + +int setup_volatile(const char *directory, VolatileMode mode, bool userns, uid_t uid_shift, uid_t uid_range, const char *selinux_apifs_context); +int setup_volatile_state(const char *directory, VolatileMode mode, bool userns, uid_t uid_shift, uid_t uid_range, const char *selinux_apifs_context); + +VolatileMode volatile_mode_from_string(const char *s); diff --git a/src/nspawn/nspawn-network.c b/src/nspawn/nspawn-network.c new file mode 100644 index 0000000000..74abe5379a --- /dev/null +++ b/src/nspawn/nspawn-network.c @@ -0,0 +1,440 @@ +/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/ + +/*** + This file is part of systemd. + + Copyright 2015 Lennart Poettering + + systemd is free software; you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published by + the Free Software Foundation; either version 2.1 of the License, or + (at your option) any later version. + + systemd is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with systemd; If not, see <http://www.gnu.org/licenses/>. +***/ + +#include <linux/veth.h> +#include <net/if.h> + +#include "sd-id128.h" +#include "sd-netlink.h" +#include "libudev.h" + +#include "util.h" +#include "ether-addr-util.h" +#include "siphash24.h" +#include "netlink-util.h" +#include "udev-util.h" + +#include "nspawn-network.h" + +#define HOST_HASH_KEY SD_ID128_MAKE(1a,37,6f,c7,46,ec,45,0b,ad,a3,d5,31,06,60,5d,b1) +#define CONTAINER_HASH_KEY SD_ID128_MAKE(c3,c4,f9,19,b5,57,b2,1c,e6,cf,14,27,03,9c,ee,a2) +#define MACVLAN_HASH_KEY SD_ID128_MAKE(00,13,6d,bc,66,83,44,81,bb,0c,f9,51,1f,24,a6,6f) + +static int generate_mac( + const char *machine_name, + struct ether_addr *mac, + sd_id128_t hash_key, + uint64_t idx) { + + uint8_t result[8]; + size_t l, sz; + uint8_t *v, *i; + int r; + + l = strlen(machine_name); + sz = sizeof(sd_id128_t) + l; + if (idx > 0) + sz += sizeof(idx); + + v = alloca(sz); + + /* fetch some persistent data unique to the host */ + r = sd_id128_get_machine((sd_id128_t*) v); + if (r < 0) + return r; + + /* combine with some data unique (on this host) to this + * container instance */ + i = mempcpy(v + sizeof(sd_id128_t), machine_name, l); + if (idx > 0) { + idx = htole64(idx); + memcpy(i, &idx, sizeof(idx)); + } + + /* Let's hash the host machine ID plus the container name. We + * use a fixed, but originally randomly created hash key here. */ + siphash24(result, v, sz, hash_key.bytes); + + assert_cc(ETH_ALEN <= sizeof(result)); + memcpy(mac->ether_addr_octet, result, ETH_ALEN); + + /* see eth_random_addr in the kernel */ + mac->ether_addr_octet[0] &= 0xfe; /* clear multicast bit */ + mac->ether_addr_octet[0] |= 0x02; /* set local assignment bit (IEEE802) */ + + return 0; +} + +int setup_veth(const char *machine_name, + pid_t pid, + char iface_name[IFNAMSIZ], + bool bridge) { + + _cleanup_netlink_message_unref_ sd_netlink_message *m = NULL; + _cleanup_netlink_unref_ sd_netlink *rtnl = NULL; + struct ether_addr mac_host, mac_container; + int r, i; + + /* Use two different interface name prefixes depending whether + * we are in bridge mode or not. */ + snprintf(iface_name, IFNAMSIZ - 1, "%s-%s", + bridge ? "vb" : "ve", machine_name); + + r = generate_mac(machine_name, &mac_container, CONTAINER_HASH_KEY, 0); + if (r < 0) + return log_error_errno(r, "Failed to generate predictable MAC address for container side: %m"); + + r = generate_mac(machine_name, &mac_host, HOST_HASH_KEY, 0); + if (r < 0) + return log_error_errno(r, "Failed to generate predictable MAC address for host side: %m"); + + r = sd_netlink_open(&rtnl); + if (r < 0) + return log_error_errno(r, "Failed to connect to netlink: %m"); + + r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0); + if (r < 0) + return log_error_errno(r, "Failed to allocate netlink message: %m"); + + r = sd_netlink_message_append_string(m, IFLA_IFNAME, iface_name); + if (r < 0) + return log_error_errno(r, "Failed to add netlink interface name: %m"); + + r = sd_netlink_message_append_ether_addr(m, IFLA_ADDRESS, &mac_host); + if (r < 0) + return log_error_errno(r, "Failed to add netlink MAC address: %m"); + + r = sd_netlink_message_open_container(m, IFLA_LINKINFO); + if (r < 0) + return log_error_errno(r, "Failed to open netlink container: %m"); + + r = sd_netlink_message_open_container_union(m, IFLA_INFO_DATA, "veth"); + if (r < 0) + return log_error_errno(r, "Failed to open netlink container: %m"); + + r = sd_netlink_message_open_container(m, VETH_INFO_PEER); + if (r < 0) + return log_error_errno(r, "Failed to open netlink container: %m"); + + r = sd_netlink_message_append_string(m, IFLA_IFNAME, "host0"); + if (r < 0) + return log_error_errno(r, "Failed to add netlink interface name: %m"); + + r = sd_netlink_message_append_ether_addr(m, IFLA_ADDRESS, &mac_container); + if (r < 0) + return log_error_errno(r, "Failed to add netlink MAC address: %m"); + + r = sd_netlink_message_append_u32(m, IFLA_NET_NS_PID, pid); + if (r < 0) + return log_error_errno(r, "Failed to add netlink namespace field: %m"); + + r = sd_netlink_message_close_container(m); + if (r < 0) + return log_error_errno(r, "Failed to close netlink container: %m"); + + r = sd_netlink_message_close_container(m); + if (r < 0) + return log_error_errno(r, "Failed to close netlink container: %m"); + + r = sd_netlink_message_close_container(m); + if (r < 0) + return log_error_errno(r, "Failed to close netlink container: %m"); + + r = sd_netlink_call(rtnl, m, 0, NULL); + if (r < 0) + return log_error_errno(r, "Failed to add new veth interfaces (host0, %s): %m", iface_name); + + i = (int) if_nametoindex(iface_name); + if (i <= 0) + return log_error_errno(errno, "Failed to resolve interface %s: %m", iface_name); + + return i; +} + +int setup_bridge(const char *veth_name, const char *bridge_name) { + _cleanup_netlink_message_unref_ sd_netlink_message *m = NULL; + _cleanup_netlink_unref_ sd_netlink *rtnl = NULL; + int r, bridge_ifi; + + assert(veth_name); + assert(bridge_name); + + bridge_ifi = (int) if_nametoindex(bridge_name); + if (bridge_ifi <= 0) + return log_error_errno(errno, "Failed to resolve interface %s: %m", bridge_name); + + r = sd_netlink_open(&rtnl); + if (r < 0) + return log_error_errno(r, "Failed to connect to netlink: %m"); + + r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, 0); + if (r < 0) + return log_error_errno(r, "Failed to allocate netlink message: %m"); + + r = sd_rtnl_message_link_set_flags(m, IFF_UP, IFF_UP); + if (r < 0) + return log_error_errno(r, "Failed to set IFF_UP flag: %m"); + + r = sd_netlink_message_append_string(m, IFLA_IFNAME, veth_name); + if (r < 0) + return log_error_errno(r, "Failed to add netlink interface name field: %m"); + + r = sd_netlink_message_append_u32(m, IFLA_MASTER, bridge_ifi); + if (r < 0) + return log_error_errno(r, "Failed to add netlink master field: %m"); + + r = sd_netlink_call(rtnl, m, 0, NULL); + if (r < 0) + return log_error_errno(r, "Failed to add veth interface to bridge: %m"); + + return bridge_ifi; +} + +static int parse_interface(struct udev *udev, const char *name) { + _cleanup_udev_device_unref_ struct udev_device *d = NULL; + char ifi_str[2 + DECIMAL_STR_MAX(int)]; + int ifi; + + ifi = (int) if_nametoindex(name); + if (ifi <= 0) + return log_error_errno(errno, "Failed to resolve interface %s: %m", name); + + sprintf(ifi_str, "n%i", ifi); + d = udev_device_new_from_device_id(udev, ifi_str); + if (!d) + return log_error_errno(errno, "Failed to get udev device for interface %s: %m", name); + + if (udev_device_get_is_initialized(d) <= 0) { + log_error("Network interface %s is not initialized yet.", name); + return -EBUSY; + } + + return ifi; +} + +int move_network_interfaces(pid_t pid, char **ifaces) { + _cleanup_udev_unref_ struct udev *udev = NULL; + _cleanup_netlink_unref_ sd_netlink *rtnl = NULL; + char **i; + int r; + + if (strv_isempty(ifaces)) + return 0; + + r = sd_netlink_open(&rtnl); + if (r < 0) + return log_error_errno(r, "Failed to connect to netlink: %m"); + + udev = udev_new(); + if (!udev) { + log_error("Failed to connect to udev."); + return -ENOMEM; + } + + STRV_FOREACH(i, ifaces) { + _cleanup_netlink_message_unref_ sd_netlink_message *m = NULL; + int ifi; + + ifi = parse_interface(udev, *i); + if (ifi < 0) + return ifi; + + r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, ifi); + if (r < 0) + return log_error_errno(r, "Failed to allocate netlink message: %m"); + + r = sd_netlink_message_append_u32(m, IFLA_NET_NS_PID, pid); + if (r < 0) + return log_error_errno(r, "Failed to append namespace PID to netlink message: %m"); + + r = sd_netlink_call(rtnl, m, 0, NULL); + if (r < 0) + return log_error_errno(r, "Failed to move interface %s to namespace: %m", *i); + } + + return 0; +} + +int setup_macvlan(const char *machine_name, pid_t pid, char **ifaces) { + _cleanup_udev_unref_ struct udev *udev = NULL; + _cleanup_netlink_unref_ sd_netlink *rtnl = NULL; + unsigned idx = 0; + char **i; + int r; + + if (strv_isempty(ifaces)) + return 0; + + r = sd_netlink_open(&rtnl); + if (r < 0) + return log_error_errno(r, "Failed to connect to netlink: %m"); + + udev = udev_new(); + if (!udev) { + log_error("Failed to connect to udev."); + return -ENOMEM; + } + + STRV_FOREACH(i, ifaces) { + _cleanup_netlink_message_unref_ sd_netlink_message *m = NULL; + _cleanup_free_ char *n = NULL; + struct ether_addr mac; + int ifi; + + ifi = parse_interface(udev, *i); + if (ifi < 0) + return ifi; + + r = generate_mac(machine_name, &mac, MACVLAN_HASH_KEY, idx++); + if (r < 0) + return log_error_errno(r, "Failed to create MACVLAN MAC address: %m"); + + r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0); + if (r < 0) + return log_error_errno(r, "Failed to allocate netlink message: %m"); + + r = sd_netlink_message_append_u32(m, IFLA_LINK, ifi); + if (r < 0) + return log_error_errno(r, "Failed to add netlink interface index: %m"); + + n = strappend("mv-", *i); + if (!n) + return log_oom(); + + strshorten(n, IFNAMSIZ-1); + + r = sd_netlink_message_append_string(m, IFLA_IFNAME, n); + if (r < 0) + return log_error_errno(r, "Failed to add netlink interface name: %m"); + + r = sd_netlink_message_append_ether_addr(m, IFLA_ADDRESS, &mac); + if (r < 0) + return log_error_errno(r, "Failed to add netlink MAC address: %m"); + + r = sd_netlink_message_append_u32(m, IFLA_NET_NS_PID, pid); + if (r < 0) + return log_error_errno(r, "Failed to add netlink namespace field: %m"); + + r = sd_netlink_message_open_container(m, IFLA_LINKINFO); + if (r < 0) + return log_error_errno(r, "Failed to open netlink container: %m"); + + r = sd_netlink_message_open_container_union(m, IFLA_INFO_DATA, "macvlan"); + if (r < 0) + return log_error_errno(r, "Failed to open netlink container: %m"); + + r = sd_netlink_message_append_u32(m, IFLA_MACVLAN_MODE, MACVLAN_MODE_BRIDGE); + if (r < 0) + return log_error_errno(r, "Failed to append macvlan mode: %m"); + + r = sd_netlink_message_close_container(m); + if (r < 0) + return log_error_errno(r, "Failed to close netlink container: %m"); + + r = sd_netlink_message_close_container(m); + if (r < 0) + return log_error_errno(r, "Failed to close netlink container: %m"); + + r = sd_netlink_call(rtnl, m, 0, NULL); + if (r < 0) + return log_error_errno(r, "Failed to add new macvlan interfaces: %m"); + } + + return 0; +} + +int setup_ipvlan(const char *machine_name, pid_t pid, char **ifaces) { + _cleanup_udev_unref_ struct udev *udev = NULL; + _cleanup_netlink_unref_ sd_netlink *rtnl = NULL; + char **i; + int r; + + if (strv_isempty(ifaces)) + return 0; + + r = sd_netlink_open(&rtnl); + if (r < 0) + return log_error_errno(r, "Failed to connect to netlink: %m"); + + udev = udev_new(); + if (!udev) { + log_error("Failed to connect to udev."); + return -ENOMEM; + } + + STRV_FOREACH(i, ifaces) { + _cleanup_netlink_message_unref_ sd_netlink_message *m = NULL; + _cleanup_free_ char *n = NULL; + int ifi; + + ifi = parse_interface(udev, *i); + if (ifi < 0) + return ifi; + + r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0); + if (r < 0) + return log_error_errno(r, "Failed to allocate netlink message: %m"); + + r = sd_netlink_message_append_u32(m, IFLA_LINK, ifi); + if (r < 0) + return log_error_errno(r, "Failed to add netlink interface index: %m"); + + n = strappend("iv-", *i); + if (!n) + return log_oom(); + + strshorten(n, IFNAMSIZ-1); + + r = sd_netlink_message_append_string(m, IFLA_IFNAME, n); + if (r < 0) + return log_error_errno(r, "Failed to add netlink interface name: %m"); + + r = sd_netlink_message_append_u32(m, IFLA_NET_NS_PID, pid); + if (r < 0) + return log_error_errno(r, "Failed to add netlink namespace field: %m"); + + r = sd_netlink_message_open_container(m, IFLA_LINKINFO); + if (r < 0) + return log_error_errno(r, "Failed to open netlink container: %m"); + + r = sd_netlink_message_open_container_union(m, IFLA_INFO_DATA, "ipvlan"); + if (r < 0) + return log_error_errno(r, "Failed to open netlink container: %m"); + + r = sd_netlink_message_append_u16(m, IFLA_IPVLAN_MODE, IPVLAN_MODE_L2); + if (r < 0) + return log_error_errno(r, "Failed to add ipvlan mode: %m"); + + r = sd_netlink_message_close_container(m); + if (r < 0) + return log_error_errno(r, "Failed to close netlink container: %m"); + + r = sd_netlink_message_close_container(m); + if (r < 0) + return log_error_errno(r, "Failed to close netlink container: %m"); + + r = sd_netlink_call(rtnl, m, 0, NULL); + if (r < 0) + return log_error_errno(r, "Failed to add new ipvlan interfaces: %m"); + } + + return 0; +} diff --git a/src/nspawn/nspawn-network.h b/src/nspawn/nspawn-network.h new file mode 100644 index 0000000000..311e6d06cb --- /dev/null +++ b/src/nspawn/nspawn-network.h @@ -0,0 +1,36 @@ +/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/ + +#pragma once + +/*** + This file is part of systemd. + + Copyright 2015 Lennart Poettering + + systemd is free software; you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published by + the Free Software Foundation; either version 2.1 of the License, or + (at your option) any later version. + + systemd is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with systemd; If not, see <http://www.gnu.org/licenses/>. +***/ + +#include <net/if.h> + +#include <sys/types.h> +#include <stdbool.h> + +int setup_veth(const char *machine_name, pid_t pid, char iface_name[IFNAMSIZ], bool bridge); + +int setup_bridge(const char *veth_name, const char *bridge_name); + +int setup_macvlan(const char *machine_name, pid_t pid, char **ifaces); +int setup_ipvlan(const char *machine_name, pid_t pid, char **ifaces); + +int move_network_interfaces(pid_t pid, char **ifaces); diff --git a/src/nspawn/nspawn-register.c b/src/nspawn/nspawn-register.c new file mode 100644 index 0000000000..b2776a61c2 --- /dev/null +++ b/src/nspawn/nspawn-register.c @@ -0,0 +1,244 @@ +/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/ + +/*** + This file is part of systemd. + + Copyright 2015 Lennart Poettering + + systemd is free software; you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published by + the Free Software Foundation; either version 2.1 of the License, or + (at your option) any later version. + + systemd is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with systemd; If not, see <http://www.gnu.org/licenses/>. +***/ + +#include "sd-bus.h" + +#include "util.h" +#include "strv.h" +#include "bus-util.h" +#include "bus-error.h" + +#include "nspawn-register.h" + +int register_machine( + const char *machine_name, + pid_t pid, + const char *directory, + sd_id128_t uuid, + int local_ifindex, + const char *slice, + CustomMount *mounts, + unsigned n_mounts, + int kill_signal, + char **properties, + bool keep_unit) { + + _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL; + _cleanup_bus_flush_close_unref_ sd_bus *bus = NULL; + int r; + + r = sd_bus_default_system(&bus); + if (r < 0) + return log_error_errno(r, "Failed to open system bus: %m"); + + if (keep_unit) { + r = sd_bus_call_method( + bus, + "org.freedesktop.machine1", + "/org/freedesktop/machine1", + "org.freedesktop.machine1.Manager", + "RegisterMachineWithNetwork", + &error, + NULL, + "sayssusai", + machine_name, + SD_BUS_MESSAGE_APPEND_ID128(uuid), + "nspawn", + "container", + (uint32_t) pid, + strempty(directory), + local_ifindex > 0 ? 1 : 0, local_ifindex); + } else { + _cleanup_bus_message_unref_ sd_bus_message *m = NULL; + char **i; + unsigned j; + + r = sd_bus_message_new_method_call( + bus, + &m, + "org.freedesktop.machine1", + "/org/freedesktop/machine1", + "org.freedesktop.machine1.Manager", + "CreateMachineWithNetwork"); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_append( + m, + "sayssusai", + machine_name, + SD_BUS_MESSAGE_APPEND_ID128(uuid), + "nspawn", + "container", + (uint32_t) pid, + strempty(directory), + local_ifindex > 0 ? 1 : 0, local_ifindex); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_open_container(m, 'a', "(sv)"); + if (r < 0) + return bus_log_create_error(r); + + if (!isempty(slice)) { + r = sd_bus_message_append(m, "(sv)", "Slice", "s", slice); + if (r < 0) + return bus_log_create_error(r); + } + + r = sd_bus_message_append(m, "(sv)", "DevicePolicy", "s", "strict"); + if (r < 0) + return bus_log_create_error(r); + + /* If you make changes here, also make sure to update + * systemd-nspawn@.service, to keep the device + * policies in sync regardless if we are run with or + * without the --keep-unit switch. */ + r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 9, + /* Allow the container to + * access and create the API + * device nodes, so that + * PrivateDevices= in the + * container can work + * fine */ + "/dev/null", "rwm", + "/dev/zero", "rwm", + "/dev/full", "rwm", + "/dev/random", "rwm", + "/dev/urandom", "rwm", + "/dev/tty", "rwm", + "/dev/net/tun", "rwm", + /* Allow the container + * access to ptys. However, + * do not permit the + * container to ever create + * these device nodes. */ + "/dev/pts/ptmx", "rw", + "char-pts", "rw"); + if (r < 0) + return bus_log_create_error(r); + + for (j = 0; j < n_mounts; j++) { + CustomMount *cm = mounts + j; + + if (cm->type != CUSTOM_MOUNT_BIND) + continue; + + r = is_device_node(cm->source); + if (r < 0) + return log_error_errno(r, "Failed to stat %s: %m", cm->source); + + if (r) { + r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 1, + cm->source, cm->read_only ? "r" : "rw"); + if (r < 0) + return log_error_errno(r, "Failed to append message arguments: %m"); + } + } + + if (kill_signal != 0) { + r = sd_bus_message_append(m, "(sv)", "KillSignal", "i", kill_signal); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_append(m, "(sv)", "KillMode", "s", "mixed"); + if (r < 0) + return bus_log_create_error(r); + } + + STRV_FOREACH(i, properties) { + r = sd_bus_message_open_container(m, 'r', "sv"); + if (r < 0) + return bus_log_create_error(r); + + r = bus_append_unit_property_assignment(m, *i); + if (r < 0) + return r; + + r = sd_bus_message_close_container(m); + if (r < 0) + return bus_log_create_error(r); + } + + r = sd_bus_message_close_container(m); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_call(bus, m, 0, &error, NULL); + } + + if (r < 0) { + log_error("Failed to register machine: %s", bus_error_message(&error, r)); + return r; + } + + return 0; +} + +int terminate_machine(pid_t pid) { + _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL; + _cleanup_bus_message_unref_ sd_bus_message *reply = NULL; + _cleanup_bus_flush_close_unref_ sd_bus *bus = NULL; + const char *path; + int r; + + r = sd_bus_default_system(&bus); + if (r < 0) + return log_error_errno(r, "Failed to open system bus: %m"); + + r = sd_bus_call_method( + bus, + "org.freedesktop.machine1", + "/org/freedesktop/machine1", + "org.freedesktop.machine1.Manager", + "GetMachineByPID", + &error, + &reply, + "u", + (uint32_t) pid); + if (r < 0) { + /* Note that the machine might already have been + * cleaned up automatically, hence don't consider it a + * failure if we cannot get the machine object. */ + log_debug("Failed to get machine: %s", bus_error_message(&error, r)); + return 0; + } + + r = sd_bus_message_read(reply, "o", &path); + if (r < 0) + return bus_log_parse_error(r); + + r = sd_bus_call_method( + bus, + "org.freedesktop.machine1", + path, + "org.freedesktop.machine1.Machine", + "Terminate", + &error, + NULL, + NULL); + if (r < 0) { + log_debug("Failed to terminate machine: %s", bus_error_message(&error, r)); + return 0; + } + + return 0; +} diff --git a/src/nspawn/nspawn-register.h b/src/nspawn/nspawn-register.h new file mode 100644 index 0000000000..b27841ff59 --- /dev/null +++ b/src/nspawn/nspawn-register.h @@ -0,0 +1,31 @@ +/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/ + +#pragma once + +/*** + This file is part of systemd. + + Copyright 2015 Lennart Poettering + + systemd is free software; you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published by + the Free Software Foundation; either version 2.1 of the License, or + (at your option) any later version. + + systemd is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with systemd; If not, see <http://www.gnu.org/licenses/>. +***/ + +#include <sys/types.h> + +#include "sd-id128.h" + +#include "nspawn-mount.h" + +int register_machine(const char *machine_name, pid_t pid, const char *directory, sd_id128_t uuid, int local_ifindex, const char *slice, CustomMount *mounts, unsigned n_mounts, int kill_signal, char **properties, bool keep_unit); +int terminate_machine(pid_t pid); diff --git a/src/nspawn/nspawn-settings.c b/src/nspawn/nspawn-settings.c new file mode 100644 index 0000000000..419f5d1c40 --- /dev/null +++ b/src/nspawn/nspawn-settings.c @@ -0,0 +1,262 @@ +/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/ + +/*** + This file is part of systemd. + + Copyright 2015 Lennart Poettering + + systemd is free software; you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published by + the Free Software Foundation; either version 2.1 of the License, or + (at your option) any later version. + + systemd is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with systemd; If not, see <http://www.gnu.org/licenses/>. +***/ + +#include "util.h" +#include "conf-parser.h" +#include "strv.h" +#include "cap-list.h" + +#include "nspawn-settings.h" + +int settings_load(FILE *f, const char *path, Settings **ret) { + _cleanup_(settings_freep) Settings *s = NULL; + int r; + + assert(path); + assert(ret); + + s = new0(Settings, 1); + if (!s) + return -ENOMEM; + + s->boot = -1; + s->personality = PERSONALITY_INVALID; + + s->read_only = -1; + s->volatile_mode = _VOLATILE_MODE_INVALID; + + s->private_network = -1; + s->network_veth = -1; + + r = config_parse(NULL, path, f, + "Exec\0" + "Network\0" + "Files\0", + config_item_perf_lookup, nspawn_gperf_lookup, + false, + false, + true, + s); + if (r < 0) + return r; + + *ret = s; + s = NULL; + + return 0; +} + +Settings* settings_free(Settings *s) { + + if (!s) + return NULL; + + strv_free(s->parameters); + strv_free(s->environment); + free(s->user); + + strv_free(s->network_interfaces); + strv_free(s->network_macvlan); + strv_free(s->network_ipvlan); + free(s->network_bridge); + expose_port_free_all(s->expose_ports); + + custom_mount_free_all(s->custom_mounts, s->n_custom_mounts); + free(s); + + return NULL; +} + +DEFINE_CONFIG_PARSE_ENUM(config_parse_volatile_mode, volatile_mode, VolatileMode, "Failed to parse volatile mode"); + +int config_parse_expose_port( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + Settings *s = data; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + r = expose_port_parse(&s->expose_ports, rvalue); + if (r == -EEXIST) { + log_syntax(unit, LOG_ERR, filename, line, r, "Duplicate port specification, ignoring: %s", rvalue); + return 0; + } + if (r < 0) { + log_syntax(unit, LOG_ERR, filename, line, r, "Failed to parse host port %s: %m", rvalue); + return 0; + } + + return 0; +} + +int config_parse_capability( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + uint64_t u = 0, *result = data; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + for (;;) { + _cleanup_free_ char *word = NULL; + int cap; + + r = extract_first_word(&rvalue, &word, NULL, 0); + if (r < 0) { + log_syntax(unit, LOG_ERR, filename, line, r, "Failed to extract capability string, ignoring: %s", rvalue); + return 0; + } + if (r == 0) + break; + + cap = capability_from_name(word); + if (cap < 0) { + log_syntax(unit, LOG_ERR, filename, line, cap, "Failed to parse capability, ignoring: %s", word); + continue; + } + + u |= 1 << ((uint64_t) cap); + } + + if (u == 0) + return 0; + + *result |= u; + return 0; +} + +int config_parse_id128( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + sd_id128_t t, *result = data; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + r = sd_id128_from_string(rvalue, &t); + if (r < 0) { + log_syntax(unit, LOG_ERR, filename, line, r, "Failed to parse 128bit ID/UUID, ignoring: %s", rvalue); + return 0; + } + + *result = t; + return 0; +} + +int config_parse_bind( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + Settings *settings = data; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + r = bind_mount_parse(&settings->custom_mounts, &settings->n_custom_mounts, rvalue, ltype); + if (r < 0) { + log_syntax(unit, LOG_ERR, filename, line, r, "Invalid bind mount specification %s: %m", rvalue); + return 0; + } + + return 0; +} + +int config_parse_tmpfs( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + Settings *settings = data; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + r = tmpfs_mount_parse(&settings->custom_mounts, &settings->n_custom_mounts, rvalue); + if (r < 0) { + log_syntax(unit, LOG_ERR, filename, line, r, "Invalid temporary file system specification %s: %m", rvalue); + return 0; + } + + if (settings->network_bridge) + settings->network_veth = true; + + if (settings->network_interfaces || + settings->network_macvlan || + settings->network_ipvlan || + settings->network_bridge || + settings->network_veth) + settings->private_network = true; + + return 0; +} diff --git a/src/nspawn/nspawn-settings.h b/src/nspawn/nspawn-settings.h new file mode 100644 index 0000000000..4cec40c1b7 --- /dev/null +++ b/src/nspawn/nspawn-settings.h @@ -0,0 +1,87 @@ +/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/ + +#pragma once + +/*** + This file is part of systemd. + + Copyright 2015 Lennart Poettering + + systemd is free software; you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published by + the Free Software Foundation; either version 2.1 of the License, or + (at your option) any later version. + + systemd is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with systemd; If not, see <http://www.gnu.org/licenses/>. +***/ + +#include <stdio.h> + +#include "macro.h" + +#include "nspawn-mount.h" +#include "nspawn-expose-ports.h" + +typedef enum SettingsMask { + SETTING_BOOT = 1 << 0, + SETTING_ENVIRONMENT = 1 << 1, + SETTING_USER = 1 << 2, + SETTING_CAPABILITY = 1 << 3, + SETTING_KILL_SIGNAL = 1 << 4, + SETTING_PERSONALITY = 1 << 5, + SETTING_MACHINE_ID = 1 << 6, + SETTING_NETWORK = 1 << 7, + SETTING_EXPOSE_PORTS = 1 << 8, + SETTING_READ_ONLY = 1 << 9, + SETTING_VOLATILE_MODE = 1 << 10, + SETTING_CUSTOM_MOUNTS = 1 << 11, + _SETTINGS_MASK_ALL = (1 << 12) -1 +} SettingsMask; + +typedef struct Settings { + /* [Run] */ + int boot; + char **parameters; + char **environment; + char *user; + uint64_t capability; + uint64_t drop_capability; + int kill_signal; + unsigned long personality; + sd_id128_t machine_id; + + /* [Image] */ + int read_only; + VolatileMode volatile_mode; + CustomMount *custom_mounts; + unsigned n_custom_mounts; + + /* [Network] */ + int private_network; + int network_veth; + char *network_bridge; + char **network_interfaces; + char **network_macvlan; + char **network_ipvlan; + ExposePort *expose_ports; +} Settings; + +int settings_load(FILE *f, const char *path, Settings **ret); +Settings* settings_free(Settings *s); + +DEFINE_TRIVIAL_CLEANUP_FUNC(Settings*, settings_free); + +const struct ConfigPerfItem* nspawn_gperf_lookup(const char *key, unsigned length); + +int config_parse_capability(const char *unit, const char *filename, unsigned line, const char *section, unsigned section_line, const char *lvalue, int ltype, const char *rvalue, void *data, void *userdata); +int config_parse_id128(const char *unit, const char *filename, unsigned line, const char *section, unsigned section_line, const char *lvalue, int ltype, const char *rvalue, void *data, void *userdata); +int config_parse_expose_port(const char *unit, const char *filename, unsigned line, const char *section, unsigned section_line, const char *lvalue, int ltype, const char *rvalue, void *data, void *userdata); +int config_parse_volatile_mode(const char *unit, const char *filename, unsigned line, const char *section, unsigned section_line, const char *lvalue, int ltype, const char *rvalue, void *data, void *userdata); +int config_parse_bind(const char *unit, const char *filename, unsigned line, const char *section, unsigned section_line, const char *lvalue, int ltype, const char *rvalue, void *data, void *userdata); +int config_parse_tmpfs(const char *unit, const char *filename, unsigned line, const char *section, unsigned section_line, const char *lvalue, int ltype, const char *rvalue, void *data, void *userdata); diff --git a/src/nspawn/nspawn-setuid.c b/src/nspawn/nspawn-setuid.c new file mode 100644 index 0000000000..eda7f62900 --- /dev/null +++ b/src/nspawn/nspawn-setuid.c @@ -0,0 +1,272 @@ +/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/ + +/*** + This file is part of systemd. + + Copyright 2015 Lennart Poettering + + systemd is free software; you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published by + the Free Software Foundation; either version 2.1 of the License, or + (at your option) any later version. + + systemd is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with systemd; If not, see <http://www.gnu.org/licenses/>. +***/ + +#include <sys/types.h> +#include <unistd.h> +#include <grp.h> + +#include "util.h" +#include "signal-util.h" +#include "mkdir.h" +#include "process-util.h" + +#include "nspawn-setuid.h" + +static int spawn_getent(const char *database, const char *key, pid_t *rpid) { + int pipe_fds[2]; + pid_t pid; + + assert(database); + assert(key); + assert(rpid); + + if (pipe2(pipe_fds, O_CLOEXEC) < 0) + return log_error_errno(errno, "Failed to allocate pipe: %m"); + + pid = fork(); + if (pid < 0) + return log_error_errno(errno, "Failed to fork getent child: %m"); + else if (pid == 0) { + int nullfd; + char *empty_env = NULL; + + if (dup3(pipe_fds[1], STDOUT_FILENO, 0) < 0) + _exit(EXIT_FAILURE); + + if (pipe_fds[0] > 2) + safe_close(pipe_fds[0]); + if (pipe_fds[1] > 2) + safe_close(pipe_fds[1]); + + nullfd = open("/dev/null", O_RDWR); + if (nullfd < 0) + _exit(EXIT_FAILURE); + + if (dup3(nullfd, STDIN_FILENO, 0) < 0) + _exit(EXIT_FAILURE); + + if (dup3(nullfd, STDERR_FILENO, 0) < 0) + _exit(EXIT_FAILURE); + + if (nullfd > 2) + safe_close(nullfd); + + (void) reset_all_signal_handlers(); + (void) reset_signal_mask(); + close_all_fds(NULL, 0); + + execle("/usr/bin/getent", "getent", database, key, NULL, &empty_env); + execle("/bin/getent", "getent", database, key, NULL, &empty_env); + _exit(EXIT_FAILURE); + } + + pipe_fds[1] = safe_close(pipe_fds[1]); + + *rpid = pid; + + return pipe_fds[0]; +} + +int change_uid_gid(const char *user, char **_home) { + char line[LINE_MAX], *x, *u, *g, *h; + const char *word, *state; + _cleanup_free_ uid_t *uids = NULL; + _cleanup_free_ char *home = NULL; + _cleanup_fclose_ FILE *f = NULL; + _cleanup_close_ int fd = -1; + unsigned n_uids = 0; + size_t sz = 0, l; + uid_t uid; + gid_t gid; + pid_t pid; + int r; + + assert(_home); + + if (!user || streq(user, "root") || streq(user, "0")) { + /* Reset everything fully to 0, just in case */ + + r = reset_uid_gid(); + if (r < 0) + return log_error_errno(r, "Failed to become root: %m"); + + *_home = NULL; + return 0; + } + + /* First, get user credentials */ + fd = spawn_getent("passwd", user, &pid); + if (fd < 0) + return fd; + + f = fdopen(fd, "r"); + if (!f) + return log_oom(); + fd = -1; + + if (!fgets(line, sizeof(line), f)) { + + if (!ferror(f)) { + log_error("Failed to resolve user %s.", user); + return -ESRCH; + } + + log_error_errno(errno, "Failed to read from getent: %m"); + return -errno; + } + + truncate_nl(line); + + wait_for_terminate_and_warn("getent passwd", pid, true); + + x = strchr(line, ':'); + if (!x) { + log_error("/etc/passwd entry has invalid user field."); + return -EIO; + } + + u = strchr(x+1, ':'); + if (!u) { + log_error("/etc/passwd entry has invalid password field."); + return -EIO; + } + + u++; + g = strchr(u, ':'); + if (!g) { + log_error("/etc/passwd entry has invalid UID field."); + return -EIO; + } + + *g = 0; + g++; + x = strchr(g, ':'); + if (!x) { + log_error("/etc/passwd entry has invalid GID field."); + return -EIO; + } + + *x = 0; + h = strchr(x+1, ':'); + if (!h) { + log_error("/etc/passwd entry has invalid GECOS field."); + return -EIO; + } + + h++; + x = strchr(h, ':'); + if (!x) { + log_error("/etc/passwd entry has invalid home directory field."); + return -EIO; + } + + *x = 0; + + r = parse_uid(u, &uid); + if (r < 0) { + log_error("Failed to parse UID of user."); + return -EIO; + } + + r = parse_gid(g, &gid); + if (r < 0) { + log_error("Failed to parse GID of user."); + return -EIO; + } + + home = strdup(h); + if (!home) + return log_oom(); + + /* Second, get group memberships */ + fd = spawn_getent("initgroups", user, &pid); + if (fd < 0) + return fd; + + fclose(f); + f = fdopen(fd, "r"); + if (!f) + return log_oom(); + fd = -1; + + if (!fgets(line, sizeof(line), f)) { + if (!ferror(f)) { + log_error("Failed to resolve user %s.", user); + return -ESRCH; + } + + log_error_errno(errno, "Failed to read from getent: %m"); + return -errno; + } + + truncate_nl(line); + + wait_for_terminate_and_warn("getent initgroups", pid, true); + + /* Skip over the username and subsequent separator whitespace */ + x = line; + x += strcspn(x, WHITESPACE); + x += strspn(x, WHITESPACE); + + FOREACH_WORD(word, l, x, state) { + char c[l+1]; + + memcpy(c, word, l); + c[l] = 0; + + if (!GREEDY_REALLOC(uids, sz, n_uids+1)) + return log_oom(); + + r = parse_uid(c, &uids[n_uids++]); + if (r < 0) { + log_error("Failed to parse group data from getent."); + return -EIO; + } + } + + r = mkdir_parents(home, 0775); + if (r < 0) + return log_error_errno(r, "Failed to make home root directory: %m"); + + r = mkdir_safe(home, 0755, uid, gid); + if (r < 0 && r != -EEXIST) + return log_error_errno(r, "Failed to make home directory: %m"); + + (void) fchown(STDIN_FILENO, uid, gid); + (void) fchown(STDOUT_FILENO, uid, gid); + (void) fchown(STDERR_FILENO, uid, gid); + + if (setgroups(n_uids, uids) < 0) + return log_error_errno(errno, "Failed to set auxiliary groups: %m"); + + if (setresgid(gid, gid, gid) < 0) + return log_error_errno(errno, "setregid() failed: %m"); + + if (setresuid(uid, uid, uid) < 0) + return log_error_errno(errno, "setreuid() failed: %m"); + + if (_home) { + *_home = home; + home = NULL; + } + + return 0; +} diff --git a/src/nspawn/nspawn-setuid.h b/src/nspawn/nspawn-setuid.h new file mode 100644 index 0000000000..33be44a946 --- /dev/null +++ b/src/nspawn/nspawn-setuid.h @@ -0,0 +1,24 @@ +/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/ + +#pragma once + +/*** + This file is part of systemd. + + Copyright 2015 Lennart Poettering + + systemd is free software; you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published by + the Free Software Foundation; either version 2.1 of the License, or + (at your option) any later version. + + systemd is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with systemd; If not, see <http://www.gnu.org/licenses/>. +***/ + +int change_uid_gid(const char *user, char **ret); diff --git a/src/nspawn/nspawn.c b/src/nspawn/nspawn.c index 1c64c3e771..33943a4b2f 100644 --- a/src/nspawn/nspawn.c +++ b/src/nspawn/nspawn.c @@ -19,95 +19,78 @@ along with systemd; If not, see <http://www.gnu.org/licenses/>. ***/ -#include <signal.h> -#include <sched.h> -#include <unistd.h> -#include <sys/types.h> -#include <sys/mount.h> -#include <stdlib.h> -#include <string.h> -#include <stdio.h> +#ifdef HAVE_BLKID +#include <blkid/blkid.h> +#endif #include <errno.h> -#include <sys/prctl.h> #include <getopt.h> -#include <grp.h> -#include <linux/fs.h> -#include <sys/socket.h> -#include <linux/netlink.h> -#include <net/if.h> -#include <linux/veth.h> -#include <sys/personality.h> #include <linux/loop.h> -#include <sys/file.h> - -#ifdef HAVE_SELINUX -#include <selinux/selinux.h> -#endif - +#include <sched.h> #ifdef HAVE_SECCOMP #include <seccomp.h> #endif - -#ifdef HAVE_BLKID -#include <blkid/blkid.h> +#ifdef HAVE_SELINUX +#include <selinux/selinux.h> #endif +#include <signal.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/file.h> +#include <sys/mount.h> +#include <sys/personality.h> +#include <sys/prctl.h> +#include <sys/types.h> +#include <unistd.h> #include "sd-daemon.h" -#include "sd-bus.h" #include "sd-id128.h" -#include "sd-netlink.h" -#include "random-util.h" -#include "log.h" -#include "util.h" -#include "mkdir.h" -#include "rm-rf.h" -#include "macro.h" -#include "missing.h" + +#include "barrier.h" +#include "base-filesystem.h" +#include "blkid-util.h" +#include "btrfs-util.h" +#include "build.h" +#include "cap-list.h" +#include "capability.h" #include "cgroup-util.h" -#include "strv.h" -#include "path-util.h" -#include "loopback-setup.h" +#include "copy.h" #include "dev-setup.h" +#include "env-util.h" +#include "event-util.h" #include "fdset.h" -#include "build.h" #include "fileio.h" -#include "bus-util.h" -#include "bus-error.h" -#include "ptyfwd.h" -#include "env-util.h" -#include "netlink-util.h" -#include "udev-util.h" -#include "blkid-util.h" +#include "formats-util.h" #include "gpt.h" -#include "siphash24.h" -#include "copy.h" -#include "base-filesystem.h" -#include "barrier.h" -#include "event-util.h" -#include "capability.h" -#include "cap-list.h" -#include "btrfs-util.h" +#include "hostname-util.h" +#include "log.h" +#include "loopback-setup.h" #include "machine-image.h" -#include "list.h" -#include "in-addr-util.h" -#include "firewall-util.h" -#include "local-addresses.h" -#include "formats-util.h" +#include "macro.h" +#include "missing.h" +#include "mkdir.h" +#include "netlink-util.h" +#include "path-util.h" #include "process-util.h" -#include "terminal-util.h" -#include "hostname-util.h" -#include "signal-util.h" - +#include "ptyfwd.h" +#include "random-util.h" +#include "rm-rf.h" #ifdef HAVE_SECCOMP #include "seccomp-util.h" #endif +#include "signal-util.h" +#include "strv.h" +#include "terminal-util.h" +#include "udev-util.h" +#include "util.h" -typedef struct ExposePort { - int protocol; - uint16_t host_port; - uint16_t container_port; - LIST_FIELDS(struct ExposePort, ports); -} ExposePort; +#include "nspawn-settings.h" +#include "nspawn-mount.h" +#include "nspawn-network.h" +#include "nspawn-expose-ports.h" +#include "nspawn-cgroup.h" +#include "nspawn-register.h" +#include "nspawn-setuid.h" typedef enum ContainerStatus { CONTAINER_TERMINATED, @@ -121,28 +104,6 @@ typedef enum LinkJournal { LINK_GUEST } LinkJournal; -typedef enum Volatile { - VOLATILE_NO, - VOLATILE_YES, - VOLATILE_STATE, -} Volatile; - -typedef enum CustomMountType { - CUSTOM_MOUNT_BIND, - CUSTOM_MOUNT_TMPFS, - CUSTOM_MOUNT_OVERLAY, -} CustomMountType; - -typedef struct CustomMount { - CustomMountType type; - bool read_only; - char *source; /* for overlayfs this is the upper directory */ - char *destination; - char *options; - char *work_dir; - char **lower; -} CustomMount; - static char *arg_directory = NULL; static char *arg_template = NULL; static char *arg_user = NULL; @@ -195,16 +156,19 @@ static char **arg_network_interfaces = NULL; static char **arg_network_macvlan = NULL; static char **arg_network_ipvlan = NULL; static bool arg_network_veth = false; -static const char *arg_network_bridge = NULL; +static char *arg_network_bridge = NULL; static unsigned long arg_personality = PERSONALITY_INVALID; static char *arg_image = NULL; -static Volatile arg_volatile = VOLATILE_NO; +static VolatileMode arg_volatile_mode = VOLATILE_NO; static ExposePort *arg_expose_ports = NULL; static char **arg_property = NULL; static uid_t arg_uid_shift = UID_INVALID, arg_uid_range = 0x10000U; static bool arg_userns = false; static int arg_kill_signal = 0; static bool arg_unified_cgroup_hierarchy = false; +static SettingsMask arg_settings_mask = 0; +static int arg_settings_trusted = -1; +static char **arg_parameters = NULL; static void help(void) { printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n" @@ -275,62 +239,10 @@ static void help(void) { " --keep-unit Do not register a scope for the machine, reuse\n" " the service unit nspawn is running in\n" " --volatile[=MODE] Run the system in volatile mode\n" + " --settings=BOOLEAN Load additional settings from .nspawn file\n" , program_invocation_short_name); } -static CustomMount* custom_mount_add(CustomMountType t) { - CustomMount *c, *ret; - - c = realloc(arg_custom_mounts, (arg_n_custom_mounts + 1) * sizeof(CustomMount)); - if (!c) - return NULL; - - arg_custom_mounts = c; - ret = arg_custom_mounts + arg_n_custom_mounts; - arg_n_custom_mounts++; - - *ret = (CustomMount) { .type = t }; - - return ret; -} - -static void custom_mount_free_all(void) { - unsigned i; - - for (i = 0; i < arg_n_custom_mounts; i++) { - CustomMount *m = &arg_custom_mounts[i]; - - free(m->source); - free(m->destination); - free(m->options); - - if (m->work_dir) { - (void) rm_rf(m->work_dir, REMOVE_ROOT|REMOVE_PHYSICAL); - free(m->work_dir); - } - - strv_free(m->lower); - } - - arg_custom_mounts = mfree(arg_custom_mounts); - arg_n_custom_mounts = 0; -} - -static int custom_mount_compare(const void *a, const void *b) { - const CustomMount *x = a, *y = b; - int r; - - r = path_compare(x->destination, y->destination); - if (r != 0) - return r; - - if (x->type < y->type) - return -1; - if (x->type > y->type) - return 1; - - return 0; -} static int custom_mounts_prepare(void) { unsigned i; @@ -439,6 +351,7 @@ static int parse_argv(int argc, char *argv[]) { ARG_PROPERTY, ARG_PRIVATE_USERS, ARG_KILL_SIGNAL, + ARG_SETTINGS, }; static const struct option options[] = { @@ -481,11 +394,13 @@ static int parse_argv(int argc, char *argv[]) { { "property", required_argument, NULL, ARG_PROPERTY }, { "private-users", optional_argument, NULL, ARG_PRIVATE_USERS }, { "kill-signal", required_argument, NULL, ARG_KILL_SIGNAL }, + { "settings", required_argument, NULL, ARG_SETTINGS }, {} }; int c, r; uint64_t plus = 0, minus = 0; + bool mask_all_settings = false, mask_no_settings = false; assert(argc >= 0); assert(argv); @@ -533,16 +448,20 @@ static int parse_argv(int argc, char *argv[]) { if (r < 0) return log_oom(); + arg_settings_mask |= SETTING_USER; break; case ARG_NETWORK_BRIDGE: - arg_network_bridge = optarg; + r = free_and_strdup(&arg_network_bridge, optarg); + if (r < 0) + return log_oom(); /* fall through */ case 'n': arg_network_veth = true; arg_private_network = true; + arg_settings_mask |= SETTING_NETWORK; break; case ARG_NETWORK_INTERFACE: @@ -550,6 +469,7 @@ static int parse_argv(int argc, char *argv[]) { return log_oom(); arg_private_network = true; + arg_settings_mask |= SETTING_NETWORK; break; case ARG_NETWORK_MACVLAN: @@ -557,6 +477,7 @@ static int parse_argv(int argc, char *argv[]) { return log_oom(); arg_private_network = true; + arg_settings_mask |= SETTING_NETWORK; break; case ARG_NETWORK_IPVLAN: @@ -567,10 +488,12 @@ static int parse_argv(int argc, char *argv[]) { case ARG_PRIVATE_NETWORK: arg_private_network = true; + arg_settings_mask |= SETTING_NETWORK; break; case 'b': arg_boot = true; + arg_settings_mask |= SETTING_BOOT; break; case ARG_UUID: @@ -579,6 +502,8 @@ static int parse_argv(int argc, char *argv[]) { log_error("Invalid UUID: %s", optarg); return r; } + + arg_settings_mask |= SETTING_MACHINE_ID; break; case 'S': @@ -611,6 +536,7 @@ static int parse_argv(int argc, char *argv[]) { case ARG_READ_ONLY: arg_read_only = true; + arg_settings_mask |= SETTING_READ_ONLY; break; case ARG_CAPABILITY: @@ -646,6 +572,7 @@ static int parse_argv(int argc, char *argv[]) { } } + arg_settings_mask |= SETTING_CAPABILITY; break; } @@ -681,83 +608,21 @@ static int parse_argv(int argc, char *argv[]) { break; case ARG_BIND: - case ARG_BIND_RO: { - const char *current = optarg; - _cleanup_free_ char *source = NULL, *destination = NULL, *opts = NULL; - CustomMount *m; - - r = extract_many_words(¤t, ":", EXTRACT_DONT_COALESCE_SEPARATORS, &source, &destination, &opts, NULL); - switch (r) { - case 1: - destination = strdup(source); - case 2: - case 3: - break; - case -ENOMEM: - return log_oom(); - default: - log_error("Invalid bind mount specification: %s", optarg); - return -EINVAL; - } - - if (!source || !destination) - return log_oom(); - - if (!path_is_absolute(source) || !path_is_absolute(destination)) { - log_error("Invalid bind mount specification: %s", optarg); - return -EINVAL; - } - - m = custom_mount_add(CUSTOM_MOUNT_BIND); - if (!m) - return log_oom(); - - m->source = source; - m->destination = destination; - m->read_only = c == ARG_BIND_RO; - m->options = opts; - - source = destination = opts = NULL; + case ARG_BIND_RO: + r = bind_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg, c == ARG_BIND_RO); + if (r < 0) + return log_error_errno(r, "Failed to parse --bind(-ro)= argument %s: %m", optarg); + arg_settings_mask |= SETTING_CUSTOM_MOUNTS; break; - } - - case ARG_TMPFS: { - const char *current = optarg; - _cleanup_free_ char *path = NULL, *opts = NULL; - CustomMount *m; - r = extract_first_word(¤t, &path, ":", EXTRACT_DONT_COALESCE_SEPARATORS); - if (r == -ENOMEM) - return log_oom(); - else if (r < 0) { - log_error("Invalid tmpfs specification: %s", optarg); - return r; - } - if (r) - opts = strdup(current); - else - opts = strdup("mode=0755"); - - if (!path || !opts) - return log_oom(); - - if (!path_is_absolute(path)) { - log_error("Invalid tmpfs specification: %s", optarg); - return -EINVAL; - } - - m = custom_mount_add(CUSTOM_MOUNT_TMPFS); - if (!m) - return log_oom(); - - m->destination = path; - m->options = opts; - - path = opts = NULL; + case ARG_TMPFS: + r = tmpfs_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg); + if (r < 0) + return log_error_errno(r, "Failed to parse --tmpfs= argument %s: %m", optarg); + arg_settings_mask |= SETTING_CUSTOM_MOUNTS; break; - } case ARG_OVERLAY: case ARG_OVERLAY_RO: { @@ -808,7 +673,7 @@ static int parse_argv(int argc, char *argv[]) { lower[n - 2] = NULL; } - m = custom_mount_add(CUSTOM_MOUNT_OVERLAY); + m = custom_mount_add(&arg_custom_mounts, &arg_n_custom_mounts, CUSTOM_MOUNT_OVERLAY); if (!m) return log_oom(); @@ -820,6 +685,7 @@ static int parse_argv(int argc, char *argv[]) { upper = destination = NULL; lower = NULL; + arg_settings_mask |= SETTING_CUSTOM_MOUNTS; break; } @@ -837,6 +703,8 @@ static int parse_argv(int argc, char *argv[]) { strv_free(arg_setenv); arg_setenv = n; + + arg_settings_mask |= SETTING_ENVIRONMENT; break; } @@ -870,85 +738,36 @@ static int parse_argv(int argc, char *argv[]) { return -EINVAL; } + arg_settings_mask |= SETTING_PERSONALITY; break; case ARG_VOLATILE: if (!optarg) - arg_volatile = VOLATILE_YES; - else { - r = parse_boolean(optarg); - if (r < 0) { - if (streq(optarg, "state")) - arg_volatile = VOLATILE_STATE; - else { - log_error("Failed to parse --volatile= argument: %s", optarg); - return r; - } - } else - arg_volatile = r ? VOLATILE_YES : VOLATILE_NO; - } - - break; - - case 'p': { - const char *split, *e; - uint16_t container_port, host_port; - int protocol; - ExposePort *p; - - if ((e = startswith(optarg, "tcp:"))) - protocol = IPPROTO_TCP; - else if ((e = startswith(optarg, "udp:"))) - protocol = IPPROTO_UDP; + arg_volatile_mode = VOLATILE_YES; else { - e = optarg; - protocol = IPPROTO_TCP; - } - - split = strchr(e, ':'); - if (split) { - char v[split - e + 1]; - - memcpy(v, e, split - e); - v[split - e] = 0; - - r = safe_atou16(v, &host_port); - if (r < 0 || host_port <= 0) { - log_error("Failed to parse host port: %s", optarg); - return -EINVAL; - } - - r = safe_atou16(split + 1, &container_port); - } else { - r = safe_atou16(e, &container_port); - host_port = container_port; - } + VolatileMode m; - if (r < 0 || container_port <= 0) { - log_error("Failed to parse host port: %s", optarg); - return -EINVAL; - } - - LIST_FOREACH(ports, p, arg_expose_ports) { - if (p->protocol == protocol && p->host_port == host_port) { - log_error("Duplicate port specification: %s", optarg); + m = volatile_mode_from_string(optarg); + if (m < 0) { + log_error("Failed to parse --volatile= argument: %s", optarg); return -EINVAL; - } + } else + arg_volatile_mode = m; } - p = new(ExposePort, 1); - if (!p) - return log_oom(); - - p->protocol = protocol; - p->host_port = host_port; - p->container_port = container_port; + arg_settings_mask |= SETTING_VOLATILE_MODE; + break; - LIST_PREPEND(ports, arg_expose_ports, p); + case 'p': + r = expose_port_parse(&arg_expose_ports, optarg); + if (r == -EEXIST) + return log_error_errno(r, "Duplicate port specification: %s", optarg); + if (r < 0) + return log_error_errno(r, "Failed to parse host port %s: %m", optarg); + arg_settings_mask |= SETTING_EXPOSE_PORTS; break; - } case ARG_PROPERTY: if (strv_extend(&arg_property, optarg) < 0) @@ -992,6 +811,42 @@ static int parse_argv(int argc, char *argv[]) { return -EINVAL; } + arg_settings_mask |= SETTING_KILL_SIGNAL; + break; + + case ARG_SETTINGS: + + /* no → do not read files + * yes → read files, do not override cmdline, trust only subset + * override → read files, override cmdline, trust only subset + * trusted → read files, do not override cmdline, trust all + */ + + r = parse_boolean(optarg); + if (r < 0) { + if (streq(optarg, "trusted")) { + mask_all_settings = false; + mask_no_settings = false; + arg_settings_trusted = true; + + } else if (streq(optarg, "override")) { + mask_all_settings = false; + mask_no_settings = true; + arg_settings_trusted = -1; + } else + return log_error_errno(r, "Failed to parse --settings= argument: %s", optarg); + } else if (r > 0) { + /* yes */ + mask_all_settings = false; + mask_no_settings = false; + arg_settings_trusted = -1; + } else { + /* no */ + mask_all_settings = true; + mask_no_settings = false; + arg_settings_trusted = false; + } + break; case '?': @@ -1044,557 +899,48 @@ static int parse_argv(int argc, char *argv[]) { return -EINVAL; } - if (arg_volatile != VOLATILE_NO && arg_read_only) { - log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy."); - return -EINVAL; - } - - if (arg_expose_ports && !arg_private_network) { - log_error("Cannot use --port= without private networking."); - return -EINVAL; - } - if (arg_userns && access("/proc/self/uid_map", F_OK) < 0) return log_error_errno(EOPNOTSUPP, "--private-users= is not supported, kernel compiled without user namespace support."); - arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus; - - if (arg_boot && arg_kill_signal <= 0) - arg_kill_signal = SIGRTMIN+3; - - r = detect_unified_cgroup_hierarchy(); - if (r < 0) - return r; - - return 1; -} - -static int tmpfs_patch_options(const char *options, char **ret) { - char *buf = NULL; - - if (arg_userns && arg_uid_shift != 0) { - assert(arg_uid_shift != UID_INVALID); - - if (options) - (void) asprintf(&buf, "%s,uid=" UID_FMT ",gid=" UID_FMT, options, arg_uid_shift, arg_uid_shift); - else - (void) asprintf(&buf, "uid=" UID_FMT ",gid=" UID_FMT, arg_uid_shift, arg_uid_shift); - if (!buf) - return -ENOMEM; - - options = buf; - } - -#ifdef HAVE_SELINUX - if (arg_selinux_apifs_context) { - char *t; - - if (options) - t = strjoin(options, ",context=\"", arg_selinux_apifs_context, "\"", NULL); - else - t = strjoin("context=\"", arg_selinux_apifs_context, "\"", NULL); - if (!t) { - free(buf); - return -ENOMEM; - } - - free(buf); - buf = t; - } -#endif - - *ret = buf; - return !!buf; -} - -static int mount_all(const char *dest, bool userns) { - - typedef struct MountPoint { - const char *what; - const char *where; - const char *type; - const char *options; - unsigned long flags; - bool fatal; - bool userns; - } MountPoint; - - static const MountPoint mount_table[] = { - { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true, true }, - { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, true, true }, /* Bind mount first */ - { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, true, true }, /* Then, make it r/o */ - { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true, false }, - { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true, false }, - { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true, false }, - { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true, false }, - { "tmpfs", "/tmp", "tmpfs", "mode=1777", MS_STRICTATIME, true, false }, -#ifdef HAVE_SELINUX - { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, false, false }, /* Bind mount first */ - { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, false, false }, /* Then, make it r/o */ -#endif - }; - - unsigned k; - int r; - - for (k = 0; k < ELEMENTSOF(mount_table); k++) { - _cleanup_free_ char *where = NULL, *options = NULL; - const char *o; - - if (userns != mount_table[k].userns) - continue; - - where = prefix_root(dest, mount_table[k].where); - if (!where) + if (argc > optind) { + arg_parameters = strv_copy(argv + optind); + if (!arg_parameters) return log_oom(); - r = path_is_mount_point(where, AT_SYMLINK_FOLLOW); - if (r < 0 && r != -ENOENT) - return log_error_errno(r, "Failed to detect whether %s is a mount point: %m", where); - - /* Skip this entry if it is not a remount. */ - if (mount_table[k].what && r > 0) - continue; - - r = mkdir_p(where, 0755); - if (r < 0) { - if (mount_table[k].fatal) - return log_error_errno(r, "Failed to create directory %s: %m", where); - - log_warning_errno(r, "Failed to create directory %s: %m", where); - continue; - } - - o = mount_table[k].options; - if (streq_ptr(mount_table[k].type, "tmpfs")) { - r = tmpfs_patch_options(o, &options); - if (r < 0) - return log_oom(); - if (r > 0) - o = options; - } - - if (mount(mount_table[k].what, - where, - mount_table[k].type, - mount_table[k].flags, - o) < 0) { - - if (mount_table[k].fatal) - return log_error_errno(errno, "mount(%s) failed: %m", where); - - log_warning_errno(errno, "mount(%s) failed, ignoring: %m", where); - } + arg_settings_mask |= SETTING_BOOT; } - return 0; -} + /* Load all settings from .nspawn files */ + if (mask_no_settings) + arg_settings_mask = 0; -static int parse_mount_bind_options(const char *options, unsigned long *mount_flags, char **mount_opts) { - const char *p = options; - unsigned long flags = *mount_flags; - char *opts = NULL; + /* Don't load any settings from .nspawn files */ + if (mask_all_settings) + arg_settings_mask = _SETTINGS_MASK_ALL; - assert(options); - - for (;;) { - _cleanup_free_ char *word = NULL; - int r = extract_first_word(&p, &word, ",", 0); - if (r < 0) - return log_error_errno(r, "Failed to extract mount option: %m"); - if (r == 0) - break; - - if (streq(word, "rbind")) - flags |= MS_REC; - else if (streq(word, "norbind")) - flags &= ~MS_REC; - else { - log_error("Invalid bind mount option: %s", word); - return -EINVAL; - } - } - - *mount_flags = flags; - /* in the future mount_opts will hold string options for mount(2) */ - *mount_opts = opts; - - return 0; -} - -static int mount_bind(const char *dest, CustomMount *m) { - struct stat source_st, dest_st; - const char *where; - unsigned long mount_flags = MS_BIND | MS_REC; - _cleanup_free_ char *mount_opts = NULL; - int r; - - assert(m); - - if (m->options) { - r = parse_mount_bind_options(m->options, &mount_flags, &mount_opts); - if (r < 0) - return r; - } - - if (stat(m->source, &source_st) < 0) - return log_error_errno(errno, "Failed to stat %s: %m", m->source); - - where = prefix_roota(dest, m->destination); - - if (stat(where, &dest_st) >= 0) { - if (S_ISDIR(source_st.st_mode) && !S_ISDIR(dest_st.st_mode)) { - log_error("Cannot bind mount directory %s on file %s.", m->source, where); - return -EINVAL; - } - - if (!S_ISDIR(source_st.st_mode) && S_ISDIR(dest_st.st_mode)) { - log_error("Cannot bind mount file %s on directory %s.", m->source, where); - return -EINVAL; - } - - } else if (errno == ENOENT) { - r = mkdir_parents_label(where, 0755); - if (r < 0) - return log_error_errno(r, "Failed to make parents of %s: %m", where); - } else { - log_error_errno(errno, "Failed to stat %s: %m", where); - return -errno; - } - - /* Create the mount point. Any non-directory file can be - * mounted on any non-directory file (regular, fifo, socket, - * char, block). - */ - if (S_ISDIR(source_st.st_mode)) - r = mkdir_label(where, 0755); - else - r = touch(where); - if (r < 0 && r != -EEXIST) - return log_error_errno(r, "Failed to create mount point %s: %m", where); - - if (mount(m->source, where, NULL, mount_flags, mount_opts) < 0) - return log_error_errno(errno, "mount(%s) failed: %m", where); - - if (m->read_only) { - r = bind_remount_recursive(where, true); - if (r < 0) - return log_error_errno(r, "Read-only bind mount failed: %m"); - } - - return 0; -} - -static int mount_tmpfs(const char *dest, CustomMount *m) { - const char *where, *options; - _cleanup_free_ char *buf = NULL; - int r; - - assert(dest); - assert(m); - - where = prefix_roota(dest, m->destination); - - r = mkdir_p_label(where, 0755); - if (r < 0 && r != -EEXIST) - return log_error_errno(r, "Creating mount point for tmpfs %s failed: %m", where); - - r = tmpfs_patch_options(m->options, &buf); - if (r < 0) - return log_oom(); - options = r > 0 ? buf : m->options; - - if (mount("tmpfs", where, "tmpfs", MS_NODEV|MS_STRICTATIME, options) < 0) - return log_error_errno(errno, "tmpfs mount to %s failed: %m", where); - - return 0; -} - -static char *joined_and_escaped_lower_dirs(char * const *lower) { - _cleanup_strv_free_ char **sv = NULL; - - sv = strv_copy(lower); - if (!sv) - return NULL; - - strv_reverse(sv); - - if (!strv_shell_escape(sv, ",:")) - return NULL; - - return strv_join(sv, ":"); -} - -static int mount_overlay(const char *dest, CustomMount *m) { - _cleanup_free_ char *lower = NULL; - const char *where, *options; - int r; - - assert(dest); - assert(m); - - where = prefix_roota(dest, m->destination); - - r = mkdir_label(where, 0755); - if (r < 0 && r != -EEXIST) - return log_error_errno(r, "Creating mount point for overlay %s failed: %m", where); - - (void) mkdir_p_label(m->source, 0755); - - lower = joined_and_escaped_lower_dirs(m->lower); - if (!lower) - return log_oom(); - - if (m->read_only) { - _cleanup_free_ char *escaped_source = NULL; - - escaped_source = shell_escape(m->source, ",:"); - if (!escaped_source) - return log_oom(); - - options = strjoina("lowerdir=", escaped_source, ":", lower); - } else { - _cleanup_free_ char *escaped_source = NULL, *escaped_work_dir = NULL; - - assert(m->work_dir); - (void) mkdir_label(m->work_dir, 0700); - - escaped_source = shell_escape(m->source, ",:"); - if (!escaped_source) - return log_oom(); - escaped_work_dir = shell_escape(m->work_dir, ",:"); - if (!escaped_work_dir) - return log_oom(); - - options = strjoina("lowerdir=", lower, ",upperdir=", escaped_source, ",workdir=", escaped_work_dir); - } - - if (mount("overlay", where, "overlay", m->read_only ? MS_RDONLY : 0, options) < 0) - return log_error_errno(errno, "overlay mount to %s failed: %m", where); - - return 0; -} - -static int mount_custom(const char *dest) { - unsigned i; - int r; - - assert(dest); - - for (i = 0; i < arg_n_custom_mounts; i++) { - CustomMount *m = &arg_custom_mounts[i]; - - switch (m->type) { - - case CUSTOM_MOUNT_BIND: - r = mount_bind(dest, m); - break; - - case CUSTOM_MOUNT_TMPFS: - r = mount_tmpfs(dest, m); - break; - - case CUSTOM_MOUNT_OVERLAY: - r = mount_overlay(dest, m); - break; - - default: - assert_not_reached("Unknown custom mount type"); - } - - if (r < 0) - return r; - } - - return 0; -} - -static int mount_legacy_cgroup_hierarchy(const char *dest, const char *controller, const char *hierarchy, bool read_only) { - char *to; - int r; - - to = strjoina(dest, "/sys/fs/cgroup/", hierarchy); - - r = path_is_mount_point(to, 0); - if (r < 0 && r != -ENOENT) - return log_error_errno(r, "Failed to determine if %s is mounted already: %m", to); - if (r > 0) - return 0; - - mkdir_p(to, 0755); - - /* The superblock mount options of the mount point need to be - * identical to the hosts', and hence writable... */ - if (mount("cgroup", to, "cgroup", MS_NOSUID|MS_NOEXEC|MS_NODEV, controller) < 0) - return log_error_errno(errno, "Failed to mount to %s: %m", to); - - /* ... hence let's only make the bind mount read-only, not the - * superblock. */ - if (read_only) { - if (mount(NULL, to, NULL, MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL) < 0) - return log_error_errno(errno, "Failed to remount %s read-only: %m", to); - } - return 1; -} - -static int mount_legacy_cgroups(const char *dest) { - _cleanup_set_free_free_ Set *controllers = NULL; - const char *cgroup_root; - int r; - - cgroup_root = prefix_roota(dest, "/sys/fs/cgroup"); - - /* Mount a tmpfs to /sys/fs/cgroup if it's not mounted there yet. */ - r = path_is_mount_point(cgroup_root, AT_SYMLINK_FOLLOW); - if (r < 0) - return log_error_errno(r, "Failed to determine if /sys/fs/cgroup is already mounted: %m"); - if (r == 0) { - _cleanup_free_ char *options = NULL; - - r = tmpfs_patch_options("mode=755", &options); - if (r < 0) - return log_oom(); - - if (mount("tmpfs", cgroup_root, "tmpfs", MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME, options) < 0) - return log_error_errno(errno, "Failed to mount /sys/fs/cgroup: %m"); - } - - if (cg_unified() > 0) - goto skip_controllers; - - controllers = set_new(&string_hash_ops); - if (!controllers) - return log_oom(); - - r = cg_kernel_controllers(controllers); - if (r < 0) - return log_error_errno(r, "Failed to determine cgroup controllers: %m"); - - for (;;) { - _cleanup_free_ char *controller = NULL, *origin = NULL, *combined = NULL; - - controller = set_steal_first(controllers); - if (!controller) - break; - - origin = prefix_root("/sys/fs/cgroup/", controller); - if (!origin) - return log_oom(); - - r = readlink_malloc(origin, &combined); - if (r == -EINVAL) { - /* Not a symbolic link, but directly a single cgroup hierarchy */ - - r = mount_legacy_cgroup_hierarchy(dest, controller, controller, true); - if (r < 0) - return r; - - } else if (r < 0) - return log_error_errno(r, "Failed to read link %s: %m", origin); - else { - _cleanup_free_ char *target = NULL; - - target = prefix_root(dest, origin); - if (!target) - return log_oom(); - - /* A symbolic link, a combination of controllers in one hierarchy */ - - if (!filename_is_valid(combined)) { - log_warning("Ignoring invalid combined hierarchy %s.", combined); - continue; - } - - r = mount_legacy_cgroup_hierarchy(dest, combined, combined, true); - if (r < 0) - return r; - - r = symlink_idempotent(combined, target); - if (r == -EINVAL) { - log_error("Invalid existing symlink for combined hierarchy"); - return r; - } - if (r < 0) - return log_error_errno(r, "Failed to create symlink for combined hierarchy: %m"); - } - } + arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus; -skip_controllers: - r = mount_legacy_cgroup_hierarchy(dest, "none,name=systemd,xattr", "systemd", false); + r = detect_unified_cgroup_hierarchy(); if (r < 0) return r; - if (mount(NULL, cgroup_root, NULL, MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME|MS_RDONLY, "mode=755") < 0) - return log_error_errno(errno, "Failed to remount %s read-only: %m", cgroup_root); - - return 0; + return 1; } -static int mount_unified_cgroups(const char *dest) { - const char *p; - int r; +static int verify_arguments(void) { - assert(dest); - - p = strjoina(dest, "/sys/fs/cgroup"); - - r = path_is_mount_point(p, AT_SYMLINK_FOLLOW); - if (r < 0) - return log_error_errno(r, "Failed to determine if %s is mounted already: %m", p); - if (r > 0) { - p = strjoina(dest, "/sys/fs/cgroup/cgroup.procs"); - if (access(p, F_OK) >= 0) - return 0; - if (errno != ENOENT) - return log_error_errno(errno, "Failed to determine if mount point %s contains the unified cgroup hierarchy: %m", p); - - log_error("%s is already mounted but not a unified cgroup hierarchy. Refusing.", p); + if (arg_volatile_mode != VOLATILE_NO && arg_read_only) { + log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy."); return -EINVAL; } - if (mount("cgroup", p, "cgroup", MS_NOSUID|MS_NOEXEC|MS_NODEV, "__DEVEL__sane_behavior") < 0) - return log_error_errno(errno, "Failed to mount unified cgroup hierarchy to %s: %m", p); - - return 0; -} - -static int mount_cgroups(const char *dest) { - if (arg_unified_cgroup_hierarchy) - return mount_unified_cgroups(dest); - else - return mount_legacy_cgroups(dest); -} - -static int mount_systemd_cgroup_writable(const char *dest) { - _cleanup_free_ char *own_cgroup_path = NULL; - const char *systemd_root, *systemd_own; - int r; - - assert(dest); - - r = cg_pid_get_path(NULL, 0, &own_cgroup_path); - if (r < 0) - return log_error_errno(r, "Failed to determine our own cgroup path: %m"); - - /* If we are living in the top-level, then there's nothing to do... */ - if (path_equal(own_cgroup_path, "/")) - return 0; - - if (arg_unified_cgroup_hierarchy) { - systemd_own = strjoina(dest, "/sys/fs/cgroup", own_cgroup_path); - systemd_root = prefix_roota(dest, "/sys/fs/cgroup"); - } else { - systemd_own = strjoina(dest, "/sys/fs/cgroup/systemd", own_cgroup_path); - systemd_root = prefix_roota(dest, "/sys/fs/cgroup/systemd"); + if (arg_expose_ports && !arg_private_network) { + log_error("Cannot use --port= without private networking."); + return -EINVAL; } - /* Make our own cgroup a (writable) bind mount */ - if (mount(systemd_own, systemd_own, NULL, MS_BIND, NULL) < 0) - return log_error_errno(errno, "Failed to turn %s into a bind mount: %m", own_cgroup_path); - - /* And then remount the systemd cgroup root read-only */ - if (mount(NULL, systemd_root, NULL, MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL) < 0) - return log_error_errno(errno, "Failed to mount cgroup root read-only: %m"); + if (arg_boot && arg_kill_signal <= 0) + arg_kill_signal = SIGRTMIN+3; return 0; } @@ -1736,114 +1082,6 @@ static int setup_resolv_conf(const char *dest) { return 0; } -static int setup_volatile_state(const char *directory) { - _cleanup_free_ char *buf = NULL; - const char *p, *options; - int r; - - assert(directory); - - if (arg_volatile != VOLATILE_STATE) - return 0; - - /* --volatile=state means we simply overmount /var - with a tmpfs, and the rest read-only. */ - - r = bind_remount_recursive(directory, true); - if (r < 0) - return log_error_errno(r, "Failed to remount %s read-only: %m", directory); - - p = prefix_roota(directory, "/var"); - r = mkdir(p, 0755); - if (r < 0 && errno != EEXIST) - return log_error_errno(errno, "Failed to create %s: %m", directory); - - options = "mode=755"; - r = tmpfs_patch_options(options, &buf); - if (r < 0) - return log_oom(); - if (r > 0) - options = buf; - - if (mount("tmpfs", p, "tmpfs", MS_STRICTATIME, options) < 0) - return log_error_errno(errno, "Failed to mount tmpfs to /var: %m"); - - return 0; -} - -static int setup_volatile(const char *directory) { - bool tmpfs_mounted = false, bind_mounted = false; - char template[] = "/tmp/nspawn-volatile-XXXXXX"; - _cleanup_free_ char *buf = NULL; - const char *f, *t, *options; - int r; - - assert(directory); - - if (arg_volatile != VOLATILE_YES) - return 0; - - /* --volatile=yes means we mount a tmpfs to the root dir, and - the original /usr to use inside it, and that read-only. */ - - if (!mkdtemp(template)) - return log_error_errno(errno, "Failed to create temporary directory: %m"); - - options = "mode=755"; - r = tmpfs_patch_options(options, &buf); - if (r < 0) - return log_oom(); - if (r > 0) - options = buf; - - if (mount("tmpfs", template, "tmpfs", MS_STRICTATIME, options) < 0) { - r = log_error_errno(errno, "Failed to mount tmpfs for root directory: %m"); - goto fail; - } - - tmpfs_mounted = true; - - f = prefix_roota(directory, "/usr"); - t = prefix_roota(template, "/usr"); - - r = mkdir(t, 0755); - if (r < 0 && errno != EEXIST) { - r = log_error_errno(errno, "Failed to create %s: %m", t); - goto fail; - } - - if (mount(f, t, NULL, MS_BIND|MS_REC, NULL) < 0) { - r = log_error_errno(errno, "Failed to create /usr bind mount: %m"); - goto fail; - } - - bind_mounted = true; - - r = bind_remount_recursive(t, true); - if (r < 0) { - log_error_errno(r, "Failed to remount %s read-only: %m", t); - goto fail; - } - - if (mount(template, directory, NULL, MS_MOVE, NULL) < 0) { - r = log_error_errno(errno, "Failed to move root mount: %m"); - goto fail; - } - - (void) rmdir(template); - - return 0; - -fail: - if (bind_mounted) - (void) umount(t); - - if (tmpfs_mounted) - (void) umount(template); - (void) rmdir(template); - return r; -} - static char* id128_format_as_uuid(sd_id128_t id, char s[37]) { assert(s); @@ -2082,132 +1320,6 @@ static int setup_kmsg(const char *dest, int kmsg_socket) { return 0; } -static int send_rtnl(int send_fd) { - union { - struct cmsghdr cmsghdr; - uint8_t buf[CMSG_SPACE(sizeof(int))]; - } control = {}; - struct msghdr mh = { - .msg_control = &control, - .msg_controllen = sizeof(control), - }; - struct cmsghdr *cmsg; - _cleanup_close_ int fd = -1; - ssize_t k; - - assert(send_fd >= 0); - - if (!arg_expose_ports) - return 0; - - fd = socket(PF_NETLINK, SOCK_RAW|SOCK_CLOEXEC|SOCK_NONBLOCK, NETLINK_ROUTE); - if (fd < 0) - return log_error_errno(errno, "Failed to allocate container netlink: %m"); - - cmsg = CMSG_FIRSTHDR(&mh); - cmsg->cmsg_level = SOL_SOCKET; - cmsg->cmsg_type = SCM_RIGHTS; - cmsg->cmsg_len = CMSG_LEN(sizeof(int)); - memcpy(CMSG_DATA(cmsg), &fd, sizeof(int)); - - mh.msg_controllen = cmsg->cmsg_len; - - /* Store away the fd in the socket, so that it stays open as - * long as we run the child */ - k = sendmsg(send_fd, &mh, MSG_NOSIGNAL); - if (k < 0) - return log_error_errno(errno, "Failed to send netlink fd: %m"); - - return 0; -} - -static int flush_ports(union in_addr_union *exposed) { - ExposePort *p; - int r, af = AF_INET; - - assert(exposed); - - if (!arg_expose_ports) - return 0; - - if (in_addr_is_null(af, exposed)) - return 0; - - log_debug("Lost IP address."); - - LIST_FOREACH(ports, p, arg_expose_ports) { - r = fw_add_local_dnat(false, - af, - p->protocol, - NULL, - NULL, 0, - NULL, 0, - p->host_port, - exposed, - p->container_port, - NULL); - if (r < 0) - log_warning_errno(r, "Failed to modify firewall: %m"); - } - - *exposed = IN_ADDR_NULL; - return 0; -} - -static int expose_ports(sd_netlink *rtnl, union in_addr_union *exposed) { - _cleanup_free_ struct local_address *addresses = NULL; - _cleanup_free_ char *pretty = NULL; - union in_addr_union new_exposed; - ExposePort *p; - bool add; - int af = AF_INET, r; - - assert(exposed); - - /* Invoked each time an address is added or removed inside the - * container */ - - if (!arg_expose_ports) - return 0; - - r = local_addresses(rtnl, 0, af, &addresses); - if (r < 0) - return log_error_errno(r, "Failed to enumerate local addresses: %m"); - - add = r > 0 && - addresses[0].family == af && - addresses[0].scope < RT_SCOPE_LINK; - - if (!add) - return flush_ports(exposed); - - new_exposed = addresses[0].address; - if (in_addr_equal(af, exposed, &new_exposed)) - return 0; - - in_addr_to_string(af, &new_exposed, &pretty); - log_debug("New container IP is %s.", strna(pretty)); - - LIST_FOREACH(ports, p, arg_expose_ports) { - - r = fw_add_local_dnat(true, - af, - p->protocol, - NULL, - NULL, 0, - NULL, 0, - p->host_port, - &new_exposed, - p->container_port, - in_addr_is_null(af, exposed) ? NULL : exposed); - if (r < 0) - log_warning_errno(r, "Failed to modify firewall: %m"); - } - - *exposed = new_exposed; - return 0; -} - static int on_address_change(sd_netlink *rtnl, sd_netlink_message *m, void *userdata) { union in_addr_union *exposed = userdata; @@ -2215,62 +1327,7 @@ static int on_address_change(sd_netlink *rtnl, sd_netlink_message *m, void *user assert(m); assert(exposed); - expose_ports(rtnl, exposed); - return 0; -} - -static int watch_rtnl(sd_event *event, int recv_fd, union in_addr_union *exposed, sd_netlink **ret) { - union { - struct cmsghdr cmsghdr; - uint8_t buf[CMSG_SPACE(sizeof(int))]; - } control = {}; - struct msghdr mh = { - .msg_control = &control, - .msg_controllen = sizeof(control), - }; - struct cmsghdr *cmsg; - _cleanup_netlink_unref_ sd_netlink *rtnl = NULL; - int fd, r; - ssize_t k; - - assert(event); - assert(recv_fd >= 0); - assert(ret); - - if (!arg_expose_ports) - return 0; - - k = recvmsg(recv_fd, &mh, MSG_NOSIGNAL); - if (k < 0) - return log_error_errno(errno, "Failed to recv netlink fd: %m"); - - cmsg = CMSG_FIRSTHDR(&mh); - assert(cmsg->cmsg_level == SOL_SOCKET); - assert(cmsg->cmsg_type == SCM_RIGHTS); - assert(cmsg->cmsg_len == CMSG_LEN(sizeof(int))); - memcpy(&fd, CMSG_DATA(cmsg), sizeof(int)); - - r = sd_netlink_open_fd(&rtnl, fd); - if (r < 0) { - safe_close(fd); - return log_error_errno(r, "Failed to create rtnl object: %m"); - } - - r = sd_netlink_add_match(rtnl, RTM_NEWADDR, on_address_change, exposed); - if (r < 0) - return log_error_errno(r, "Failed to subscribe to RTM_NEWADDR messages: %m"); - - r = sd_netlink_add_match(rtnl, RTM_DELADDR, on_address_change, exposed); - if (r < 0) - return log_error_errno(r, "Failed to subscribe to RTM_DELADDR messages: %m"); - - r = sd_netlink_attach_event(rtnl, event, 0); - if (r < 0) - return log_error_errno(r, "Failed to add to even loop: %m"); - - *ret = rtnl; - rtnl = NULL; - + expose_port_execute(rtnl, arg_expose_ports, exposed); return 0; } @@ -2447,220 +1504,6 @@ static int drop_capabilities(void) { return capability_bounding_set_drop(~arg_retain, false); } -static int register_machine(pid_t pid, int local_ifindex) { - _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL; - _cleanup_bus_flush_close_unref_ sd_bus *bus = NULL; - int r; - - if (!arg_register) - return 0; - - r = sd_bus_default_system(&bus); - if (r < 0) - return log_error_errno(r, "Failed to open system bus: %m"); - - if (arg_keep_unit) { - r = sd_bus_call_method( - bus, - "org.freedesktop.machine1", - "/org/freedesktop/machine1", - "org.freedesktop.machine1.Manager", - "RegisterMachineWithNetwork", - &error, - NULL, - "sayssusai", - arg_machine, - SD_BUS_MESSAGE_APPEND_ID128(arg_uuid), - "nspawn", - "container", - (uint32_t) pid, - strempty(arg_directory), - local_ifindex > 0 ? 1 : 0, local_ifindex); - } else { - _cleanup_bus_message_unref_ sd_bus_message *m = NULL; - char **i; - unsigned j; - - r = sd_bus_message_new_method_call( - bus, - &m, - "org.freedesktop.machine1", - "/org/freedesktop/machine1", - "org.freedesktop.machine1.Manager", - "CreateMachineWithNetwork"); - if (r < 0) - return bus_log_create_error(r); - - r = sd_bus_message_append( - m, - "sayssusai", - arg_machine, - SD_BUS_MESSAGE_APPEND_ID128(arg_uuid), - "nspawn", - "container", - (uint32_t) pid, - strempty(arg_directory), - local_ifindex > 0 ? 1 : 0, local_ifindex); - if (r < 0) - return bus_log_create_error(r); - - r = sd_bus_message_open_container(m, 'a', "(sv)"); - if (r < 0) - return bus_log_create_error(r); - - if (!isempty(arg_slice)) { - r = sd_bus_message_append(m, "(sv)", "Slice", "s", arg_slice); - if (r < 0) - return bus_log_create_error(r); - } - - r = sd_bus_message_append(m, "(sv)", "DevicePolicy", "s", "strict"); - if (r < 0) - return bus_log_create_error(r); - - /* If you make changes here, also make sure to update - * systemd-nspawn@.service, to keep the device - * policies in sync regardless if we are run with or - * without the --keep-unit switch. */ - r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 9, - /* Allow the container to - * access and create the API - * device nodes, so that - * PrivateDevices= in the - * container can work - * fine */ - "/dev/null", "rwm", - "/dev/zero", "rwm", - "/dev/full", "rwm", - "/dev/random", "rwm", - "/dev/urandom", "rwm", - "/dev/tty", "rwm", - "/dev/net/tun", "rwm", - /* Allow the container - * access to ptys. However, - * do not permit the - * container to ever create - * these device nodes. */ - "/dev/pts/ptmx", "rw", - "char-pts", "rw"); - if (r < 0) - return bus_log_create_error(r); - - for (j = 0; j < arg_n_custom_mounts; j++) { - CustomMount *cm = &arg_custom_mounts[j]; - - if (cm->type != CUSTOM_MOUNT_BIND) - continue; - - r = is_device_node(cm->source); - if (r < 0) - return log_error_errno(r, "Failed to stat %s: %m", cm->source); - - if (r) { - r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 1, - cm->source, cm->read_only ? "r" : "rw"); - if (r < 0) - return log_error_errno(r, "Failed to append message arguments: %m"); - } - } - - if (arg_kill_signal != 0) { - r = sd_bus_message_append(m, "(sv)", "KillSignal", "i", arg_kill_signal); - if (r < 0) - return bus_log_create_error(r); - - r = sd_bus_message_append(m, "(sv)", "KillMode", "s", "mixed"); - if (r < 0) - return bus_log_create_error(r); - } - - STRV_FOREACH(i, arg_property) { - r = sd_bus_message_open_container(m, 'r', "sv"); - if (r < 0) - return bus_log_create_error(r); - - r = bus_append_unit_property_assignment(m, *i); - if (r < 0) - return r; - - r = sd_bus_message_close_container(m); - if (r < 0) - return bus_log_create_error(r); - } - - r = sd_bus_message_close_container(m); - if (r < 0) - return bus_log_create_error(r); - - r = sd_bus_call(bus, m, 0, &error, NULL); - } - - if (r < 0) { - log_error("Failed to register machine: %s", bus_error_message(&error, r)); - return r; - } - - return 0; -} - -static int terminate_machine(pid_t pid) { - _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL; - _cleanup_bus_message_unref_ sd_bus_message *reply = NULL; - _cleanup_bus_flush_close_unref_ sd_bus *bus = NULL; - const char *path; - int r; - - if (!arg_register) - return 0; - - /* If we are reusing the unit, then just exit, systemd will do - * the right thing when we exit. */ - if (arg_keep_unit) - return 0; - - r = sd_bus_default_system(&bus); - if (r < 0) - return log_error_errno(r, "Failed to open system bus: %m"); - - r = sd_bus_call_method( - bus, - "org.freedesktop.machine1", - "/org/freedesktop/machine1", - "org.freedesktop.machine1.Manager", - "GetMachineByPID", - &error, - &reply, - "u", - (uint32_t) pid); - if (r < 0) { - /* Note that the machine might already have been - * cleaned up automatically, hence don't consider it a - * failure if we cannot get the machine object. */ - log_debug("Failed to get machine: %s", bus_error_message(&error, r)); - return 0; - } - - r = sd_bus_message_read(reply, "o", &path); - if (r < 0) - return bus_log_parse_error(r); - - r = sd_bus_call_method( - bus, - "org.freedesktop.machine1", - path, - "org.freedesktop.machine1.Machine", - "Terminate", - &error, - NULL, - NULL); - if (r < 0) { - log_debug("Failed to terminate machine: %s", bus_error_message(&error, r)); - return 0; - } - - return 0; -} - static int reset_audit_loginuid(void) { _cleanup_free_ char *p = NULL; int r; @@ -2693,427 +1536,6 @@ static int reset_audit_loginuid(void) { return 0; } -#define HOST_HASH_KEY SD_ID128_MAKE(1a,37,6f,c7,46,ec,45,0b,ad,a3,d5,31,06,60,5d,b1) -#define CONTAINER_HASH_KEY SD_ID128_MAKE(c3,c4,f9,19,b5,57,b2,1c,e6,cf,14,27,03,9c,ee,a2) -#define MACVLAN_HASH_KEY SD_ID128_MAKE(00,13,6d,bc,66,83,44,81,bb,0c,f9,51,1f,24,a6,6f) - -static int generate_mac(struct ether_addr *mac, sd_id128_t hash_key, uint64_t idx) { - uint8_t result[8]; - size_t l, sz; - uint8_t *v, *i; - int r; - - l = strlen(arg_machine); - sz = sizeof(sd_id128_t) + l; - if (idx > 0) - sz += sizeof(idx); - - v = alloca(sz); - - /* fetch some persistent data unique to the host */ - r = sd_id128_get_machine((sd_id128_t*) v); - if (r < 0) - return r; - - /* combine with some data unique (on this host) to this - * container instance */ - i = mempcpy(v + sizeof(sd_id128_t), arg_machine, l); - if (idx > 0) { - idx = htole64(idx); - memcpy(i, &idx, sizeof(idx)); - } - - /* Let's hash the host machine ID plus the container name. We - * use a fixed, but originally randomly created hash key here. */ - siphash24(result, v, sz, hash_key.bytes); - - assert_cc(ETH_ALEN <= sizeof(result)); - memcpy(mac->ether_addr_octet, result, ETH_ALEN); - - /* see eth_random_addr in the kernel */ - mac->ether_addr_octet[0] &= 0xfe; /* clear multicast bit */ - mac->ether_addr_octet[0] |= 0x02; /* set local assignment bit (IEEE802) */ - - return 0; -} - -static int setup_veth(pid_t pid, char iface_name[IFNAMSIZ], int *ifi) { - _cleanup_netlink_message_unref_ sd_netlink_message *m = NULL; - _cleanup_netlink_unref_ sd_netlink *rtnl = NULL; - struct ether_addr mac_host, mac_container; - int r, i; - - if (!arg_private_network) - return 0; - - if (!arg_network_veth) - return 0; - - /* Use two different interface name prefixes depending whether - * we are in bridge mode or not. */ - snprintf(iface_name, IFNAMSIZ - 1, "%s-%s", - arg_network_bridge ? "vb" : "ve", arg_machine); - - r = generate_mac(&mac_container, CONTAINER_HASH_KEY, 0); - if (r < 0) - return log_error_errno(r, "Failed to generate predictable MAC address for container side: %m"); - - r = generate_mac(&mac_host, HOST_HASH_KEY, 0); - if (r < 0) - return log_error_errno(r, "Failed to generate predictable MAC address for host side: %m"); - - r = sd_netlink_open(&rtnl); - if (r < 0) - return log_error_errno(r, "Failed to connect to netlink: %m"); - - r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0); - if (r < 0) - return log_error_errno(r, "Failed to allocate netlink message: %m"); - - r = sd_netlink_message_append_string(m, IFLA_IFNAME, iface_name); - if (r < 0) - return log_error_errno(r, "Failed to add netlink interface name: %m"); - - r = sd_netlink_message_append_ether_addr(m, IFLA_ADDRESS, &mac_host); - if (r < 0) - return log_error_errno(r, "Failed to add netlink MAC address: %m"); - - r = sd_netlink_message_open_container(m, IFLA_LINKINFO); - if (r < 0) - return log_error_errno(r, "Failed to open netlink container: %m"); - - r = sd_netlink_message_open_container_union(m, IFLA_INFO_DATA, "veth"); - if (r < 0) - return log_error_errno(r, "Failed to open netlink container: %m"); - - r = sd_netlink_message_open_container(m, VETH_INFO_PEER); - if (r < 0) - return log_error_errno(r, "Failed to open netlink container: %m"); - - r = sd_netlink_message_append_string(m, IFLA_IFNAME, "host0"); - if (r < 0) - return log_error_errno(r, "Failed to add netlink interface name: %m"); - - r = sd_netlink_message_append_ether_addr(m, IFLA_ADDRESS, &mac_container); - if (r < 0) - return log_error_errno(r, "Failed to add netlink MAC address: %m"); - - r = sd_netlink_message_append_u32(m, IFLA_NET_NS_PID, pid); - if (r < 0) - return log_error_errno(r, "Failed to add netlink namespace field: %m"); - - r = sd_netlink_message_close_container(m); - if (r < 0) - return log_error_errno(r, "Failed to close netlink container: %m"); - - r = sd_netlink_message_close_container(m); - if (r < 0) - return log_error_errno(r, "Failed to close netlink container: %m"); - - r = sd_netlink_message_close_container(m); - if (r < 0) - return log_error_errno(r, "Failed to close netlink container: %m"); - - r = sd_netlink_call(rtnl, m, 0, NULL); - if (r < 0) - return log_error_errno(r, "Failed to add new veth interfaces (host0, %s): %m", iface_name); - - i = (int) if_nametoindex(iface_name); - if (i <= 0) - return log_error_errno(errno, "Failed to resolve interface %s: %m", iface_name); - - *ifi = i; - - return 0; -} - -static int setup_bridge(const char veth_name[], int *ifi) { - _cleanup_netlink_message_unref_ sd_netlink_message *m = NULL; - _cleanup_netlink_unref_ sd_netlink *rtnl = NULL; - int r, bridge; - - if (!arg_private_network) - return 0; - - if (!arg_network_veth) - return 0; - - if (!arg_network_bridge) - return 0; - - bridge = (int) if_nametoindex(arg_network_bridge); - if (bridge <= 0) - return log_error_errno(errno, "Failed to resolve interface %s: %m", arg_network_bridge); - - *ifi = bridge; - - r = sd_netlink_open(&rtnl); - if (r < 0) - return log_error_errno(r, "Failed to connect to netlink: %m"); - - r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, 0); - if (r < 0) - return log_error_errno(r, "Failed to allocate netlink message: %m"); - - r = sd_rtnl_message_link_set_flags(m, IFF_UP, IFF_UP); - if (r < 0) - return log_error_errno(r, "Failed to set IFF_UP flag: %m"); - - r = sd_netlink_message_append_string(m, IFLA_IFNAME, veth_name); - if (r < 0) - return log_error_errno(r, "Failed to add netlink interface name field: %m"); - - r = sd_netlink_message_append_u32(m, IFLA_MASTER, bridge); - if (r < 0) - return log_error_errno(r, "Failed to add netlink master field: %m"); - - r = sd_netlink_call(rtnl, m, 0, NULL); - if (r < 0) - return log_error_errno(r, "Failed to add veth interface to bridge: %m"); - - return 0; -} - -static int parse_interface(struct udev *udev, const char *name) { - _cleanup_udev_device_unref_ struct udev_device *d = NULL; - char ifi_str[2 + DECIMAL_STR_MAX(int)]; - int ifi; - - ifi = (int) if_nametoindex(name); - if (ifi <= 0) - return log_error_errno(errno, "Failed to resolve interface %s: %m", name); - - sprintf(ifi_str, "n%i", ifi); - d = udev_device_new_from_device_id(udev, ifi_str); - if (!d) - return log_error_errno(errno, "Failed to get udev device for interface %s: %m", name); - - if (udev_device_get_is_initialized(d) <= 0) { - log_error("Network interface %s is not initialized yet.", name); - return -EBUSY; - } - - return ifi; -} - -static int move_network_interfaces(pid_t pid) { - _cleanup_udev_unref_ struct udev *udev = NULL; - _cleanup_netlink_unref_ sd_netlink *rtnl = NULL; - char **i; - int r; - - if (!arg_private_network) - return 0; - - if (strv_isempty(arg_network_interfaces)) - return 0; - - r = sd_netlink_open(&rtnl); - if (r < 0) - return log_error_errno(r, "Failed to connect to netlink: %m"); - - udev = udev_new(); - if (!udev) { - log_error("Failed to connect to udev."); - return -ENOMEM; - } - - STRV_FOREACH(i, arg_network_interfaces) { - _cleanup_netlink_message_unref_ sd_netlink_message *m = NULL; - int ifi; - - ifi = parse_interface(udev, *i); - if (ifi < 0) - return ifi; - - r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, ifi); - if (r < 0) - return log_error_errno(r, "Failed to allocate netlink message: %m"); - - r = sd_netlink_message_append_u32(m, IFLA_NET_NS_PID, pid); - if (r < 0) - return log_error_errno(r, "Failed to append namespace PID to netlink message: %m"); - - r = sd_netlink_call(rtnl, m, 0, NULL); - if (r < 0) - return log_error_errno(r, "Failed to move interface %s to namespace: %m", *i); - } - - return 0; -} - -static int setup_macvlan(pid_t pid) { - _cleanup_udev_unref_ struct udev *udev = NULL; - _cleanup_netlink_unref_ sd_netlink *rtnl = NULL; - unsigned idx = 0; - char **i; - int r; - - if (!arg_private_network) - return 0; - - if (strv_isempty(arg_network_macvlan)) - return 0; - - r = sd_netlink_open(&rtnl); - if (r < 0) - return log_error_errno(r, "Failed to connect to netlink: %m"); - - udev = udev_new(); - if (!udev) { - log_error("Failed to connect to udev."); - return -ENOMEM; - } - - STRV_FOREACH(i, arg_network_macvlan) { - _cleanup_netlink_message_unref_ sd_netlink_message *m = NULL; - _cleanup_free_ char *n = NULL; - struct ether_addr mac; - int ifi; - - ifi = parse_interface(udev, *i); - if (ifi < 0) - return ifi; - - r = generate_mac(&mac, MACVLAN_HASH_KEY, idx++); - if (r < 0) - return log_error_errno(r, "Failed to create MACVLAN MAC address: %m"); - - r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0); - if (r < 0) - return log_error_errno(r, "Failed to allocate netlink message: %m"); - - r = sd_netlink_message_append_u32(m, IFLA_LINK, ifi); - if (r < 0) - return log_error_errno(r, "Failed to add netlink interface index: %m"); - - n = strappend("mv-", *i); - if (!n) - return log_oom(); - - strshorten(n, IFNAMSIZ-1); - - r = sd_netlink_message_append_string(m, IFLA_IFNAME, n); - if (r < 0) - return log_error_errno(r, "Failed to add netlink interface name: %m"); - - r = sd_netlink_message_append_ether_addr(m, IFLA_ADDRESS, &mac); - if (r < 0) - return log_error_errno(r, "Failed to add netlink MAC address: %m"); - - r = sd_netlink_message_append_u32(m, IFLA_NET_NS_PID, pid); - if (r < 0) - return log_error_errno(r, "Failed to add netlink namespace field: %m"); - - r = sd_netlink_message_open_container(m, IFLA_LINKINFO); - if (r < 0) - return log_error_errno(r, "Failed to open netlink container: %m"); - - r = sd_netlink_message_open_container_union(m, IFLA_INFO_DATA, "macvlan"); - if (r < 0) - return log_error_errno(r, "Failed to open netlink container: %m"); - - r = sd_netlink_message_append_u32(m, IFLA_MACVLAN_MODE, MACVLAN_MODE_BRIDGE); - if (r < 0) - return log_error_errno(r, "Failed to append macvlan mode: %m"); - - r = sd_netlink_message_close_container(m); - if (r < 0) - return log_error_errno(r, "Failed to close netlink container: %m"); - - r = sd_netlink_message_close_container(m); - if (r < 0) - return log_error_errno(r, "Failed to close netlink container: %m"); - - r = sd_netlink_call(rtnl, m, 0, NULL); - if (r < 0) - return log_error_errno(r, "Failed to add new macvlan interfaces: %m"); - } - - return 0; -} - -static int setup_ipvlan(pid_t pid) { - _cleanup_udev_unref_ struct udev *udev = NULL; - _cleanup_netlink_unref_ sd_netlink *rtnl = NULL; - char **i; - int r; - - if (!arg_private_network) - return 0; - - if (strv_isempty(arg_network_ipvlan)) - return 0; - - r = sd_netlink_open(&rtnl); - if (r < 0) - return log_error_errno(r, "Failed to connect to netlink: %m"); - - udev = udev_new(); - if (!udev) { - log_error("Failed to connect to udev."); - return -ENOMEM; - } - - STRV_FOREACH(i, arg_network_ipvlan) { - _cleanup_netlink_message_unref_ sd_netlink_message *m = NULL; - _cleanup_free_ char *n = NULL; - int ifi; - - ifi = parse_interface(udev, *i); - if (ifi < 0) - return ifi; - - r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0); - if (r < 0) - return log_error_errno(r, "Failed to allocate netlink message: %m"); - - r = sd_netlink_message_append_u32(m, IFLA_LINK, ifi); - if (r < 0) - return log_error_errno(r, "Failed to add netlink interface index: %m"); - - n = strappend("iv-", *i); - if (!n) - return log_oom(); - - strshorten(n, IFNAMSIZ-1); - - r = sd_netlink_message_append_string(m, IFLA_IFNAME, n); - if (r < 0) - return log_error_errno(r, "Failed to add netlink interface name: %m"); - - r = sd_netlink_message_append_u32(m, IFLA_NET_NS_PID, pid); - if (r < 0) - return log_error_errno(r, "Failed to add netlink namespace field: %m"); - - r = sd_netlink_message_open_container(m, IFLA_LINKINFO); - if (r < 0) - return log_error_errno(r, "Failed to open netlink container: %m"); - - r = sd_netlink_message_open_container_union(m, IFLA_INFO_DATA, "ipvlan"); - if (r < 0) - return log_error_errno(r, "Failed to open netlink container: %m"); - - r = sd_netlink_message_append_u16(m, IFLA_IPVLAN_MODE, IPVLAN_MODE_L2); - if (r < 0) - return log_error_errno(r, "Failed to add ipvlan mode: %m"); - - r = sd_netlink_message_close_container(m); - if (r < 0) - return log_error_errno(r, "Failed to close netlink container: %m"); - - r = sd_netlink_message_close_container(m); - if (r < 0) - return log_error_errno(r, "Failed to close netlink container: %m"); - - r = sd_netlink_call(rtnl, m, 0, NULL); - if (r < 0) - return log_error_errno(r, "Failed to add new ipvlan interfaces: %m"); - } - - return 0; -} - static int setup_seccomp(void) { #ifdef HAVE_SECCOMP @@ -3812,247 +2234,6 @@ static void loop_remove(int nr, int *image_fd) { log_debug_errno(errno, "Failed to remove loop %d: %m", nr); } -static int spawn_getent(const char *database, const char *key, pid_t *rpid) { - int pipe_fds[2]; - pid_t pid; - - assert(database); - assert(key); - assert(rpid); - - if (pipe2(pipe_fds, O_CLOEXEC) < 0) - return log_error_errno(errno, "Failed to allocate pipe: %m"); - - pid = fork(); - if (pid < 0) - return log_error_errno(errno, "Failed to fork getent child: %m"); - else if (pid == 0) { - int nullfd; - char *empty_env = NULL; - - if (dup3(pipe_fds[1], STDOUT_FILENO, 0) < 0) - _exit(EXIT_FAILURE); - - if (pipe_fds[0] > 2) - safe_close(pipe_fds[0]); - if (pipe_fds[1] > 2) - safe_close(pipe_fds[1]); - - nullfd = open("/dev/null", O_RDWR); - if (nullfd < 0) - _exit(EXIT_FAILURE); - - if (dup3(nullfd, STDIN_FILENO, 0) < 0) - _exit(EXIT_FAILURE); - - if (dup3(nullfd, STDERR_FILENO, 0) < 0) - _exit(EXIT_FAILURE); - - if (nullfd > 2) - safe_close(nullfd); - - (void) reset_all_signal_handlers(); - (void) reset_signal_mask(); - close_all_fds(NULL, 0); - - execle("/usr/bin/getent", "getent", database, key, NULL, &empty_env); - execle("/bin/getent", "getent", database, key, NULL, &empty_env); - _exit(EXIT_FAILURE); - } - - pipe_fds[1] = safe_close(pipe_fds[1]); - - *rpid = pid; - - return pipe_fds[0]; -} - -static int change_uid_gid(char **_home) { - char line[LINE_MAX], *x, *u, *g, *h; - const char *word, *state; - _cleanup_free_ uid_t *uids = NULL; - _cleanup_free_ char *home = NULL; - _cleanup_fclose_ FILE *f = NULL; - _cleanup_close_ int fd = -1; - unsigned n_uids = 0; - size_t sz = 0, l; - uid_t uid; - gid_t gid; - pid_t pid; - int r; - - assert(_home); - - if (!arg_user || streq(arg_user, "root") || streq(arg_user, "0")) { - /* Reset everything fully to 0, just in case */ - - r = reset_uid_gid(); - if (r < 0) - return log_error_errno(r, "Failed to become root: %m"); - - *_home = NULL; - return 0; - } - - /* First, get user credentials */ - fd = spawn_getent("passwd", arg_user, &pid); - if (fd < 0) - return fd; - - f = fdopen(fd, "r"); - if (!f) - return log_oom(); - fd = -1; - - if (!fgets(line, sizeof(line), f)) { - - if (!ferror(f)) { - log_error("Failed to resolve user %s.", arg_user); - return -ESRCH; - } - - log_error_errno(errno, "Failed to read from getent: %m"); - return -errno; - } - - truncate_nl(line); - - wait_for_terminate_and_warn("getent passwd", pid, true); - - x = strchr(line, ':'); - if (!x) { - log_error("/etc/passwd entry has invalid user field."); - return -EIO; - } - - u = strchr(x+1, ':'); - if (!u) { - log_error("/etc/passwd entry has invalid password field."); - return -EIO; - } - - u++; - g = strchr(u, ':'); - if (!g) { - log_error("/etc/passwd entry has invalid UID field."); - return -EIO; - } - - *g = 0; - g++; - x = strchr(g, ':'); - if (!x) { - log_error("/etc/passwd entry has invalid GID field."); - return -EIO; - } - - *x = 0; - h = strchr(x+1, ':'); - if (!h) { - log_error("/etc/passwd entry has invalid GECOS field."); - return -EIO; - } - - h++; - x = strchr(h, ':'); - if (!x) { - log_error("/etc/passwd entry has invalid home directory field."); - return -EIO; - } - - *x = 0; - - r = parse_uid(u, &uid); - if (r < 0) { - log_error("Failed to parse UID of user."); - return -EIO; - } - - r = parse_gid(g, &gid); - if (r < 0) { - log_error("Failed to parse GID of user."); - return -EIO; - } - - home = strdup(h); - if (!home) - return log_oom(); - - /* Second, get group memberships */ - fd = spawn_getent("initgroups", arg_user, &pid); - if (fd < 0) - return fd; - - fclose(f); - f = fdopen(fd, "r"); - if (!f) - return log_oom(); - fd = -1; - - if (!fgets(line, sizeof(line), f)) { - if (!ferror(f)) { - log_error("Failed to resolve user %s.", arg_user); - return -ESRCH; - } - - log_error_errno(errno, "Failed to read from getent: %m"); - return -errno; - } - - truncate_nl(line); - - wait_for_terminate_and_warn("getent initgroups", pid, true); - - /* Skip over the username and subsequent separator whitespace */ - x = line; - x += strcspn(x, WHITESPACE); - x += strspn(x, WHITESPACE); - - FOREACH_WORD(word, l, x, state) { - char c[l+1]; - - memcpy(c, word, l); - c[l] = 0; - - if (!GREEDY_REALLOC(uids, sz, n_uids+1)) - return log_oom(); - - r = parse_uid(c, &uids[n_uids++]); - if (r < 0) { - log_error("Failed to parse group data from getent."); - return -EIO; - } - } - - r = mkdir_parents(home, 0775); - if (r < 0) - return log_error_errno(r, "Failed to make home root directory: %m"); - - r = mkdir_safe(home, 0755, uid, gid); - if (r < 0 && r != -EEXIST) - return log_error_errno(r, "Failed to make home directory: %m"); - - (void) fchown(STDIN_FILENO, uid, gid); - (void) fchown(STDOUT_FILENO, uid, gid); - (void) fchown(STDERR_FILENO, uid, gid); - - if (setgroups(n_uids, uids) < 0) - return log_error_errno(errno, "Failed to set auxiliary groups: %m"); - - if (setresgid(gid, gid, gid) < 0) - return log_error_errno(errno, "setregid() failed: %m"); - - if (setresuid(uid, uid, uid) < 0) - return log_error_errno(errno, "setreuid() failed: %m"); - - if (_home) { - *_home = home; - home = NULL; - } - - return 0; -} - /* * Return values: * < 0 : wait_for_terminate() failed to get the state of the @@ -4254,9 +2435,7 @@ static int inner_child( bool secondary, int kmsg_socket, int rtnl_socket, - FDSet *fds, - int argc, - char *argv[]) { + FDSet *fds) { _cleanup_free_ char *home = NULL; unsigned n_env = 2; @@ -4293,7 +2472,7 @@ static int inner_child( } } - r = mount_all(NULL, true); + r = mount_all(NULL, true, arg_uid_shift, arg_uid_range, arg_selinux_apifs_context); if (r < 0) return r; @@ -4304,7 +2483,7 @@ static int inner_child( return -ESRCH; } - r = mount_systemd_cgroup_writable(""); + r = mount_systemd_cgroup_writable("", arg_unified_cgroup_hierarchy); if (r < 0) return r; @@ -4329,10 +2508,12 @@ static int inner_child( if (arg_private_network) loopback_setup(); - r = send_rtnl(rtnl_socket); - if (r < 0) - return r; - rtnl_socket = safe_close(rtnl_socket); + if (arg_expose_ports) { + r = expose_port_send_rtnl(rtnl_socket); + if (r < 0) + return r; + rtnl_socket = safe_close(rtnl_socket); + } if (drop_capabilities() < 0) return log_error_errno(errno, "drop_capabilities() failed: %m"); @@ -4353,7 +2534,7 @@ static int inner_child( return log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context); #endif - r = change_uid_gid(&home); + r = change_uid_gid(arg_user, &home); if (r < 0) return r; @@ -4412,9 +2593,12 @@ static int inner_child( /* Automatically search for the init system */ - m = 1 + argc - optind; + m = 1 + strv_length(arg_parameters); a = newa(char*, m + 1); - memcpy(a + 1, argv + optind, m * sizeof(char*)); + if (strv_isempty(arg_parameters)) + a[1] = NULL; + else + memcpy(a + 1, arg_parameters, m * sizeof(char*)); a[0] = (char*) "/usr/lib/systemd/systemd"; execve(a[0], a, env_use); @@ -4424,10 +2608,10 @@ static int inner_child( a[0] = (char*) "/sbin/init"; execve(a[0], a, env_use); - } else if (argc > optind) - execvpe(argv[optind], argv + optind, env_use); + } else if (!strv_isempty(arg_parameters)) + execvpe(arg_parameters[0], arg_parameters, env_use); else { - chdir(home ? home : "/root"); + chdir(home ?: "/root"); execle("/bin/bash", "-bash", NULL, env_use); execle("/bin/sh", "-sh", NULL, env_use); } @@ -4449,9 +2633,7 @@ static int outer_child( int kmsg_socket, int rtnl_socket, int uid_shift_socket, - FDSet *fds, - int argc, - char *argv[]) { + FDSet *fds) { pid_t pid; ssize_t l; @@ -4523,11 +2705,11 @@ static int outer_child( if (mount(directory, directory, NULL, MS_BIND|MS_REC, NULL) < 0) return log_error_errno(errno, "Failed to make bind mount: %m"); - r = setup_volatile(directory); + r = setup_volatile(directory, arg_volatile_mode, arg_userns, arg_uid_shift, arg_uid_range, arg_selinux_context); if (r < 0) return r; - r = setup_volatile_state(directory); + r = setup_volatile_state(directory, arg_volatile_mode, arg_userns, arg_uid_shift, arg_uid_range, arg_selinux_context); if (r < 0) return r; @@ -4541,16 +2723,18 @@ static int outer_child( return log_error_errno(r, "Failed to make tree read-only: %m"); } - r = mount_all(directory, false); + r = mount_all(directory, false, arg_uid_shift, arg_uid_range, arg_selinux_apifs_context); if (r < 0) return r; - if (copy_devnodes(directory) < 0) + r = copy_devnodes(directory); + if (r < 0) return r; dev_setup(directory, arg_uid_shift, arg_uid_shift); - if (setup_pts(directory) < 0) + r = setup_pts(directory); + if (r < 0) return r; r = setup_propagate(directory); @@ -4577,11 +2761,11 @@ static int outer_child( if (r < 0) return r; - r = mount_custom(directory); + r = mount_custom(directory, arg_custom_mounts, arg_n_custom_mounts, arg_userns, arg_uid_shift, arg_uid_range, arg_selinux_apifs_context); if (r < 0) return r; - r = mount_cgroups(directory); + r = mount_cgroups(directory, arg_unified_cgroup_hierarchy, arg_userns, arg_uid_shift, arg_uid_range, arg_selinux_apifs_context); if (r < 0) return r; @@ -4604,7 +2788,7 @@ static int outer_child( * requested, so that we all are owned by the user if * user namespaces are turned on. */ - r = inner_child(barrier, directory, secondary, kmsg_socket, rtnl_socket, fds, argc, argv); + r = inner_child(barrier, directory, secondary, kmsg_socket, rtnl_socket, fds); if (r < 0) _exit(EXIT_FAILURE); @@ -4645,138 +2829,199 @@ static int setup_uid_map(pid_t pid) { return 0; } -static int chown_cgroup(pid_t pid) { - _cleanup_free_ char *path = NULL, *fs = NULL; - _cleanup_close_ int fd = -1; - const char *fn; +static int load_settings(void) { + _cleanup_(settings_freep) Settings *settings = NULL; + _cleanup_fclose_ FILE *f = NULL; + _cleanup_free_ char *p = NULL; + const char *fn, *i; int r; - r = cg_pid_get_path(NULL, pid, &path); - if (r < 0) - return log_error_errno(r, "Failed to get container cgroup path: %m"); + /* If all settings are masked, there's no point in looking for + * the settings file */ + if ((arg_settings_mask & _SETTINGS_MASK_ALL) == _SETTINGS_MASK_ALL) + return 0; - r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, path, NULL, &fs); - if (r < 0) - return log_error_errno(r, "Failed to get file system path for container cgroup: %m"); + fn = strjoina(arg_machine, ".nspawn"); - fd = open(fs, O_RDONLY|O_CLOEXEC|O_DIRECTORY); - if (fd < 0) - return log_error_errno(errno, "Failed to open %s: %m", fs); - - FOREACH_STRING(fn, - ".", - "tasks", - "notify_on_release", - "cgroup.procs", - "cgroup.clone_children", - "cgroup.controllers", - "cgroup.subtree_control", - "cgroup.populated") - if (fchownat(fd, fn, arg_uid_shift, arg_uid_shift, 0) < 0) - log_full_errno(errno == ENOENT ? LOG_DEBUG : LOG_WARNING, errno, - "Failed to chown() cgroup file %s, ignoring: %m", fn); + /* We first look in the admin's directories in /etc and /run */ + FOREACH_STRING(i, "/etc/systemd/nspawn", "/run/systemd/nspawn") { + _cleanup_free_ char *j = NULL; - return 0; -} + j = strjoin(i, "/", fn, NULL); + if (!j) + return log_oom(); + + f = fopen(j, "re"); + if (f) { + p = j; + j = NULL; + + /* By default we trust configuration from /etc and /run */ + if (arg_settings_trusted < 0) + arg_settings_trusted = true; + + break; + } -static int sync_cgroup(pid_t pid) { - _cleanup_free_ char *cgroup = NULL; - char tree[] = "/tmp/unifiedXXXXXX", pid_string[DECIMAL_STR_MAX(pid) + 1]; - bool undo_mount = false; - const char *fn; - int unified, r; + if (errno != ENOENT) + return log_error_errno(errno, "Failed to open %s: %m", j); + } + + if (!f) { + /* After that, let's look for a file next to the + * actual image we shall boot. */ + + if (arg_image) { + p = file_in_same_dir(arg_image, fn); + if (!p) + return log_oom(); + } else if (arg_directory) { + p = file_in_same_dir(arg_directory, fn); + if (!p) + return log_oom(); + } - unified = cg_unified(); - if (unified < 0) - return log_error_errno(unified, "Failed to determine whether the unified hierachy is used: %m"); + if (p) { + f = fopen(p, "re"); + if (!f && errno != ENOENT) + return log_error_errno(errno, "Failed to open %s: %m", p); - if ((unified > 0) == arg_unified_cgroup_hierarchy) + /* By default we do not trust configuration from /var/lib/machines */ + if (arg_settings_trusted < 0) + arg_settings_trusted = false; + } + } + + if (!f) return 0; - /* When the host uses the legacy cgroup setup, but the - * container shall use the unified hierarchy, let's make sure - * we copy the path from the name=systemd hierarchy into the - * unified hierarchy. Similar for the reverse situation. */ + log_debug("Settings are trusted: %s", yes_no(arg_settings_trusted)); - r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, pid, &cgroup); + r = settings_load(f, p, &settings); if (r < 0) - return log_error_errno(r, "Failed to get control group of " PID_FMT ": %m", pid); + return r; - /* In order to access the unified hierarchy we need to mount it */ - if (!mkdtemp(tree)) - return log_error_errno(errno, "Failed to generate temporary mount point for unified hierarchy: %m"); + /* Copy over bits from the settings, unless they have been + * explicitly masked by command line switches. */ - if (unified) - r = mount("cgroup", tree, "cgroup", MS_NOSUID|MS_NOEXEC|MS_NODEV, "none,name=systemd,xattr"); - else - r = mount("cgroup", tree, "cgroup", MS_NOSUID|MS_NOEXEC|MS_NODEV, "__DEVEL__sane_behavior"); - if (r < 0) { - r = log_error_errno(errno, "Failed to mount unified hierarchy: %m"); - goto finish; + if ((arg_settings_mask & SETTING_BOOT) == 0 && + settings->boot >= 0) { + arg_boot = settings->boot; + + strv_free(arg_parameters); + arg_parameters = settings->parameters; + settings->parameters = NULL; } - undo_mount = true; + if ((arg_settings_mask & SETTING_ENVIRONMENT) == 0 && + settings->environment) { + strv_free(arg_setenv); + arg_setenv = settings->environment; + settings->environment = NULL; + } - fn = strjoina(tree, cgroup, "/cgroup.procs"); - (void) mkdir_parents(fn, 0755); + if ((arg_settings_mask & SETTING_USER) == 0 && + settings->user) { + free(arg_user); + arg_user = settings->user; + settings->user = NULL; + } - sprintf(pid_string, PID_FMT, pid); - r = write_string_file(fn, pid_string, 0); - if (r < 0) - log_error_errno(r, "Failed to move process: %m"); + if ((arg_settings_mask & SETTING_CAPABILITY) == 0) { -finish: - if (undo_mount) - (void) umount(tree); + if (!arg_settings_trusted && settings->capability != 0) + log_warning("Ignoring Capability= setting, file %s is not trusted.", p); + else + arg_retain |= settings->capability; - (void) rmdir(tree); - return r; -} + arg_retain &= ~settings->drop_capability; + } -static int create_subcgroup(pid_t pid) { - _cleanup_free_ char *cgroup = NULL; - const char *child; - int unified, r; - CGroupMask supported; + if ((arg_settings_mask & SETTING_KILL_SIGNAL) == 0 && + settings->kill_signal > 0) + arg_kill_signal = settings->kill_signal; - /* In the unified hierarchy inner nodes may only only contain - * subgroups, but not processes. Hence, if we running in the - * unified hierarchy and the container does the same, and we - * did not create a scope unit for the container move us and - * the container into two separate subcgroups. */ + if ((arg_settings_mask & SETTING_PERSONALITY) == 0 && + settings->personality != PERSONALITY_INVALID) + arg_personality = settings->personality; - if (!arg_keep_unit) - return 0; + if ((arg_settings_mask & SETTING_MACHINE_ID) == 0 && + !sd_id128_is_null(settings->machine_id)) { - if (!arg_unified_cgroup_hierarchy) - return 0; + if (!arg_settings_trusted) + log_warning("Ignoring MachineID= setting, file %s is not trusted.", p); + else + arg_uuid = settings->machine_id; + } - unified = cg_unified(); - if (unified < 0) - return log_error_errno(unified, "Failed to determine whether the unified hierachy is used: %m"); - if (unified == 0) - return 0; + if ((arg_settings_mask & SETTING_READ_ONLY) == 0 && + settings->read_only >= 0) + arg_read_only = settings->read_only; - r = cg_mask_supported(&supported); - if (r < 0) - return log_error_errno(r, "Failed to determine supported controllers: %m"); + if ((arg_settings_mask & SETTING_VOLATILE_MODE) == 0 && + settings->volatile_mode != _VOLATILE_MODE_INVALID) + arg_volatile_mode = settings->volatile_mode; - r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 0, &cgroup); - if (r < 0) - return log_error_errno(r, "Failed to get our control group: %m"); + if ((arg_settings_mask & SETTING_CUSTOM_MOUNTS) == 0 && + settings->n_custom_mounts > 0) { - child = strjoina(cgroup, "/payload"); - r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, child, pid); - if (r < 0) - return log_error_errno(r, "Failed to create %s subcgroup: %m", child); + if (!arg_settings_trusted) + log_warning("Ignoring TemporaryFileSystem=, Bind= and BindReadOnly= settings, file %s is not trusted.", p); + else { + custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts); + arg_custom_mounts = settings->custom_mounts; + arg_n_custom_mounts = settings->n_custom_mounts; - child = strjoina(cgroup, "/supervisor"); - r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, child, 0); - if (r < 0) - return log_error_errno(r, "Failed to create %s subcgroup: %m", child); + settings->custom_mounts = NULL; + settings->n_custom_mounts = 0; + } + } + + if ((arg_settings_mask & SETTING_NETWORK) == 0 && + (settings->private_network >= 0 || + settings->network_veth >= 0 || + settings->network_bridge || + settings->network_interfaces || + settings->network_macvlan || + settings->network_ipvlan)) { + + if (!arg_settings_trusted) + log_warning("Ignoring network settings, file %s is not trusted.", p); + else { + strv_free(arg_network_interfaces); + arg_network_interfaces = settings->network_interfaces; + settings->network_interfaces = NULL; + + strv_free(arg_network_macvlan); + arg_network_macvlan = settings->network_macvlan; + settings->network_macvlan = NULL; + + strv_free(arg_network_ipvlan); + arg_network_ipvlan = settings->network_ipvlan; + settings->network_ipvlan = NULL; + + free(arg_network_bridge); + arg_network_bridge = settings->network_bridge; + settings->network_bridge = NULL; + + arg_network_veth = settings->network_veth > 0 || settings->network_bridge; + + arg_private_network = true; /* all these settings imply private networking */ + } + } + + if ((arg_settings_mask & SETTING_EXPOSE_PORTS) == 0 && + settings->expose_ports) { + + if (!arg_settings_trusted) + log_warning("Ignoring Port= setting, file %s is not trusted.", p); + else { + expose_port_free_all(arg_expose_ports); + arg_expose_ports = settings->expose_ports; + settings->expose_ports = NULL; + } + } - /* Try to enable as many controllers as possible for the new payload. */ - (void) cg_enable_everywhere(supported, supported, cgroup); return 0; } @@ -4803,15 +3048,22 @@ int main(int argc, char *argv[]) { if (r <= 0) goto finish; - r = determine_names(); - if (r < 0) - goto finish; - if (geteuid() != 0) { log_error("Need to be root."); r = -EPERM; goto finish; } + r = determine_names(); + if (r < 0) + goto finish; + + r = load_settings(); + if (r < 0) + goto finish; + + r = verify_arguments(); + if (r < 0) + goto finish; n_fd_passed = sd_listen_fds(false); if (n_fd_passed > 0) { @@ -5020,23 +3272,23 @@ int main(int argc, char *argv[]) { goto finish; } - if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) { + if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) { r = log_error_errno(errno, "Failed to create kmsg socket pair: %m"); goto finish; } - if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, rtnl_socket_pair) < 0) { + if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, rtnl_socket_pair) < 0) { r = log_error_errno(errno, "Failed to create rtnl socket pair: %m"); goto finish; } - if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, pid_socket_pair) < 0) { + if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, pid_socket_pair) < 0) { r = log_error_errno(errno, "Failed to create pid socket pair: %m"); goto finish; } if (arg_userns) - if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, uid_shift_socket_pair) < 0) { + if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, uid_shift_socket_pair) < 0) { r = log_error_errno(errno, "Failed to create uid shift socket pair: %m"); goto finish; } @@ -5092,8 +3344,7 @@ int main(int argc, char *argv[]) { kmsg_socket_pair[1], rtnl_socket_pair[1], uid_shift_socket_pair[1], - fds, - argc, argv); + fds); if (r < 0) _exit(EXIT_FAILURE); @@ -5108,6 +3359,7 @@ int main(int argc, char *argv[]) { kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]); rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]); pid_socket_pair[1] = safe_close(pid_socket_pair[1]); + uid_shift_socket_pair[1] = safe_close(uid_shift_socket_pair[1]); /* Wait for the outer child. */ r = wait_for_terminate_and_warn("namespace helper", pid, NULL); @@ -5126,7 +3378,7 @@ int main(int argc, char *argv[]) { goto finish; } if (l != sizeof(pid)) { - log_error("Short read while reading inner child PID: %m"); + log_error("Short read while reading inner child PID."); r = EIO; goto finish; } @@ -5146,7 +3398,7 @@ int main(int argc, char *argv[]) { goto finish; } if (l != sizeof(arg_uid_shift)) { - log_error("Short read while reading UID shift: %m"); + log_error("Short read while reading UID shift."); r = EIO; goto finish; } @@ -5158,39 +3410,64 @@ int main(int argc, char *argv[]) { (void) barrier_place(&barrier); /* #2 */ } - r = move_network_interfaces(pid); - if (r < 0) - goto finish; + if (arg_private_network) { - r = setup_veth(pid, veth_name, &ifi); - if (r < 0) - goto finish; + r = move_network_interfaces(pid, arg_network_interfaces); + if (r < 0) + goto finish; - r = setup_bridge(veth_name, &ifi); - if (r < 0) - goto finish; + if (arg_network_veth) { + r = setup_veth(arg_machine, pid, veth_name, !!arg_network_bridge); + if (r < 0) + goto finish; + else if (r > 0) + ifi = r; - r = setup_macvlan(pid); - if (r < 0) - goto finish; + if (arg_network_bridge) { + r = setup_bridge(veth_name, arg_network_bridge); + if (r < 0) + goto finish; + if (r > 0) + ifi = r; + } + } - r = setup_ipvlan(pid); - if (r < 0) - goto finish; + r = setup_macvlan(arg_machine, pid, arg_network_macvlan); + if (r < 0) + goto finish; - r = register_machine(pid, ifi); - if (r < 0) - goto finish; + r = setup_ipvlan(arg_machine, pid, arg_network_ipvlan); + if (r < 0) + goto finish; + } - r = sync_cgroup(pid); - if (r < 0) - goto finish; + if (arg_register) { + r = register_machine( + arg_machine, + pid, + arg_directory, + arg_uuid, + ifi, + arg_slice, + arg_custom_mounts, arg_n_custom_mounts, + arg_kill_signal, + arg_property, + arg_keep_unit); + if (r < 0) + goto finish; + } - r = create_subcgroup(pid); + r = sync_cgroup(pid, arg_unified_cgroup_hierarchy); if (r < 0) goto finish; - r = chown_cgroup(pid); + if (arg_keep_unit) { + r = create_subcgroup(pid, arg_unified_cgroup_hierarchy); + if (r < 0) + goto finish; + } + + r = chown_cgroup(pid, arg_uid_shift); if (r < 0) goto finish; @@ -5243,11 +3520,11 @@ int main(int argc, char *argv[]) { sd_event_add_signal(event, NULL, SIGCHLD, NULL, NULL); if (arg_expose_ports) { - r = watch_rtnl(event, rtnl_socket_pair[0], &exposed, &rtnl); + r = expose_port_watch_rtnl(event, rtnl_socket_pair[0], on_address_change, &exposed, &rtnl); if (r < 0) goto finish; - (void) expose_ports(rtnl, &exposed); + (void) expose_port_execute(rtnl, arg_expose_ports, &exposed); } rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]); @@ -5272,7 +3549,8 @@ int main(int argc, char *argv[]) { putc('\n', stdout); /* Kill if it is not dead yet anyway */ - terminate_machine(pid); + if (arg_register && !arg_keep_unit) + terminate_machine(pid); /* Normally redundant, but better safe than sorry */ kill(pid, SIGKILL); @@ -5310,7 +3588,7 @@ int main(int argc, char *argv[]) { break; } - flush_ports(&exposed); + expose_port_flush(arg_expose_ports, &exposed); } finish: @@ -5342,24 +3620,21 @@ finish: (void) rm_rf(p, REMOVE_ROOT); } + expose_port_flush(arg_expose_ports, &exposed); + free(arg_directory); free(arg_template); free(arg_image); free(arg_machine); free(arg_user); strv_free(arg_setenv); + free(arg_network_bridge); strv_free(arg_network_interfaces); strv_free(arg_network_macvlan); strv_free(arg_network_ipvlan); - custom_mount_free_all(); - - flush_ports(&exposed); - - while (arg_expose_ports) { - ExposePort *p = arg_expose_ports; - LIST_REMOVE(ports, arg_expose_ports, p); - free(p); - } + strv_free(arg_parameters); + custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts); + expose_port_free_all(arg_expose_ports); return r < 0 ? EXIT_FAILURE : ret; } diff --git a/src/resolve/resolved-dns-packet.h b/src/resolve/resolved-dns-packet.h index 5628e579ad..fbbabaf232 100644 --- a/src/resolve/resolved-dns-packet.h +++ b/src/resolve/resolved-dns-packet.h @@ -78,7 +78,7 @@ struct DnsPacket { DnsQuestion *question; DnsAnswer *answer; - /* Packet reception meta data */ + /* Packet reception metadata */ int ifindex; int family, ipproto; union in_addr_union sender, destination; diff --git a/src/shared/condition.c b/src/shared/condition.c index f58e84a3d0..1d7dd49e04 100644 --- a/src/shared/condition.c +++ b/src/shared/condition.c @@ -125,13 +125,12 @@ static int condition_test_kernel_command_line(Condition *c) { static int condition_test_virtualization(Condition *c) { int b, v; - const char *id; assert(c); assert(c->parameter); assert(c->type == CONDITION_VIRTUALIZATION); - v = detect_virtualization(&id); + v = detect_virtualization(); if (v < 0) return v; @@ -145,14 +144,14 @@ static int condition_test_virtualization(Condition *c) { return true; /* Then, compare categorization */ - if (v == VIRTUALIZATION_VM && streq(c->parameter, "vm")) + if (VIRTUALIZATION_IS_VM(v) && streq(c->parameter, "vm")) return true; - if (v == VIRTUALIZATION_CONTAINER && streq(c->parameter, "container")) + if (VIRTUALIZATION_IS_CONTAINER(v) && streq(c->parameter, "container")) return true; /* Finally compare id */ - return v > 0 && streq(c->parameter, id); + return v != VIRTUALIZATION_NONE && streq(c->parameter, virtualization_to_string(v)); } static int condition_test_architecture(Condition *c) { diff --git a/src/shared/conf-parser.c b/src/shared/conf-parser.c index 1f4aea6d6b..36150d8744 100644 --- a/src/shared/conf-parser.c +++ b/src/shared/conf-parser.c @@ -24,7 +24,7 @@ #include <errno.h> #include <stdlib.h> -#include "conf-parser.h" +#include "sd-messages.h" #include "conf-files.h" #include "util.h" #include "macro.h" @@ -32,7 +32,8 @@ #include "log.h" #include "utf8.h" #include "path-util.h" -#include "sd-messages.h" +#include "signal-util.h" +#include "conf-parser.h" int config_item_table_lookup( const void *table, @@ -575,6 +576,39 @@ int config_parse_bool(const char* unit, return 0; } +int config_parse_tristate( + const char* unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + int k, *t = data; + + assert(filename); + assert(lvalue); + assert(rvalue); + assert(data); + + /* A tristate is pretty much a boolean, except that it can + * also take the special value -1, indicating "uninitialized", + * much like NULL is for a pointer type. */ + + k = parse_boolean(rvalue); + if (k < 0) { + log_syntax(unit, LOG_ERR, filename, line, k, "Failed to parse boolean value, ignoring: %s", rvalue); + return 0; + } + + *t = !!k; + return 0; +} + int config_parse_string( const char *unit, const char *filename, @@ -802,3 +836,61 @@ int config_parse_log_level( *o = (*o & LOG_FACMASK) | x; return 0; } + +int config_parse_signal( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + int *sig = data, r; + + assert(filename); + assert(lvalue); + assert(rvalue); + assert(sig); + + r = signal_from_string_try_harder(rvalue); + if (r <= 0) { + log_syntax(unit, LOG_ERR, filename, line, EINVAL, "Failed to parse signal name, ignoring: %s", rvalue); + return 0; + } + + *sig = r; + return 0; +} + +int config_parse_personality( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + unsigned long *personality = data, p; + + assert(filename); + assert(lvalue); + assert(rvalue); + assert(personality); + + p = personality_from_string(rvalue); + if (p == PERSONALITY_INVALID) { + log_syntax(unit, LOG_ERR, filename, line, EINVAL, "Failed to parse personality, ignoring: %s", rvalue); + return 0; + } + + *personality = p; + return 0; +} diff --git a/src/shared/conf-parser.h b/src/shared/conf-parser.h index 66c80890d3..34e3815782 100644 --- a/src/shared/conf-parser.h +++ b/src/shared/conf-parser.h @@ -111,6 +111,7 @@ int config_parse_iec_size(const char *unit, const char *filename, unsigned line, int config_parse_si_size(const char *unit, const char *filename, unsigned line, const char *section, unsigned section_line, const char *lvalue, int ltype, const char *rvalue, void *data, void *userdata); int config_parse_iec_off(const char *unit, const char *filename, unsigned line, const char *section, unsigned section_line, const char *lvalue, int ltype, const char *rvalue, void *data, void *userdata); int config_parse_bool(const char *unit, const char *filename, unsigned line, const char *section, unsigned section_line, const char *lvalue, int ltype, const char *rvalue, void *data, void *userdata); +int config_parse_tristate(const char *unit, const char *filename, unsigned line, const char *section, unsigned section_line, const char *lvalue, int ltype, const char *rvalue, void *data, void *userdata); int config_parse_string(const char *unit, const char *filename, unsigned line, const char *section, unsigned section_line, const char *lvalue, int ltype, const char *rvalue, void *data, void *userdata); int config_parse_path(const char *unit, const char *filename, unsigned line, const char *section, unsigned section_line, const char *lvalue, int ltype, const char *rvalue, void *data, void *userdata); int config_parse_strv(const char *unit, const char *filename, unsigned line, const char *section, unsigned section_line, const char *lvalue, int ltype, const char *rvalue, void *data, void *userdata); @@ -119,6 +120,8 @@ int config_parse_nsec(const char *unit, const char *filename, unsigned line, con int config_parse_mode(const char *unit, const char *filename, unsigned line, const char *section, unsigned section_line, const char *lvalue, int ltype, const char *rvalue, void *data, void *userdata); int config_parse_log_facility(const char *unit, const char *filename, unsigned line, const char *section, unsigned section_line, const char *lvalue, int ltype, const char *rvalue, void *data, void *userdata); int config_parse_log_level(const char *unit, const char *filename, unsigned line, const char *section, unsigned section_line, const char *lvalue, int ltype, const char *rvalue, void *data, void *userdata); +int config_parse_signal(const char *unit, const char *filename, unsigned line, const char *section, unsigned section_line, const char *lvalue, int ltype, const char *rvalue, void *data, void *userdata); +int config_parse_personality(const char *unit, const char *filename, unsigned line, const char *section, unsigned section_line, const char *lvalue, int ltype, const char *rvalue, void *data, void *userdata); #define log_invalid_utf8(unit, level, config_file, config_line, error, rvalue) \ do { \ diff --git a/src/shared/efivars.c b/src/shared/efivars.c index 347cd30b09..f087c2a566 100644 --- a/src/shared/efivars.c +++ b/src/shared/efivars.c @@ -101,7 +101,7 @@ int efi_reboot_to_firmware_supported(void) { uint64_t b; _cleanup_free_ void *v = NULL; - if (!is_efi_boot() || detect_container(NULL) > 0) + if (!is_efi_boot() || detect_container() > 0) return -EOPNOTSUPP; r = efi_get_variable(EFI_VENDOR_GLOBAL, "OsIndicationsSupported", NULL, &v, &s); diff --git a/src/shared/machine-image.c b/src/shared/machine-image.c index 273dacff1f..70220bdd14 100644 --- a/src/shared/machine-image.c +++ b/src/shared/machine-image.c @@ -33,7 +33,7 @@ static const char image_search_path[] = "/var/lib/machines\0" - "/var/lib/container\0" + "/var/lib/container\0" /* legacy */ "/usr/local/lib/machines\0" "/usr/lib/machines\0"; diff --git a/src/systemctl/systemctl.c b/src/systemctl/systemctl.c index 8d80aae182..96c39aa0a9 100644 --- a/src/systemctl/systemctl.c +++ b/src/systemctl/systemctl.c @@ -535,10 +535,8 @@ static int get_unit_list( return bus_log_create_error(r); r = sd_bus_call(bus, m, 0, &error, &reply); - if (r < 0) { - log_error("Failed to list units: %s", bus_error_message(&error, r)); - return r; - } + if (r < 0) + return log_error_errno(r, "Failed to list units: %s", bus_error_message(&error, r)); r = sd_bus_message_enter_container(reply, SD_BUS_TYPE_ARRAY, "(ssssssouso)"); if (r < 0) @@ -605,7 +603,7 @@ static int get_unit_list_recursive( r = set_put(replies, reply); if (r < 0) { sd_bus_message_unref(reply); - return r; + return log_oom(); } if (arg_recursive) { @@ -614,7 +612,7 @@ static int get_unit_list_recursive( r = sd_get_machine_names(&machines); if (r < 0) - return r; + return log_error_errno(r, "Failed to get machine names: %m"); STRV_FOREACH(i, machines) { _cleanup_bus_flush_close_unref_ sd_bus *container = NULL; @@ -622,7 +620,7 @@ static int get_unit_list_recursive( r = sd_bus_open_system_machine(&container, *i); if (r < 0) { - log_error_errno(r, "Failed to connect to container %s: %m", *i); + log_warning_errno(r, "Failed to connect to container %s, ignoring: %m", *i); continue; } @@ -635,7 +633,7 @@ static int get_unit_list_recursive( r = set_put(replies, reply); if (r < 0) { sd_bus_message_unref(reply); - return r; + return log_oom(); } } @@ -1474,6 +1472,7 @@ static int list_dependencies_get_dependencies(sd_bus *bus, const char *name, cha "Requisite\0" "RequisiteOverridable\0" "Wants\0" + "ConsistsOf\0" "BindsTo\0", [DEPENDENCY_REVERSE] = "RequiredBy\0" "RequiredByOverridable\0" @@ -1743,7 +1742,7 @@ static int get_machine_list( _cleanup_free_ char *hn = NULL; size_t sz = 0; char **i; - int c = 0; + int c = 0, r; hn = gethostname_malloc(); if (!hn) @@ -1761,7 +1760,10 @@ static int get_machine_list( c++; } - sd_get_machine_names(&m); + r = sd_get_machine_names(&m); + if (r < 0) + return log_error_errno(r, "Failed to get machine list: %m"); + STRV_FOREACH(i, m) { _cleanup_free_ char *class = NULL; @@ -4267,10 +4269,8 @@ static int show_one( &error, &reply, "s", ""); - if (r < 0) { - log_error("Failed to get properties: %s", bus_error_message(&error, r)); - return r; - } + if (r < 0) + return log_error_errno(r, "Failed to get properties: %s", bus_error_message(&error, r)); r = sd_bus_message_enter_container(reply, SD_BUS_TYPE_ARRAY, "{sv}"); if (r < 0) diff --git a/src/test/test-architecture.c b/src/test/test-architecture.c index 30bdec45e5..a5b66a7d2f 100644 --- a/src/test/test-architecture.c +++ b/src/test/test-architecture.c @@ -25,18 +25,18 @@ #include "log.h" int main(int argc, char *argv[]) { - const char *id = NULL; int a, v; - v = detect_virtualization(&id); + v = detect_virtualization(); if (v == -EPERM || v == -EACCES) return EXIT_TEST_SKIP; assert_se(v >= 0); log_info("virtualization=%s id=%s", - v == VIRTUALIZATION_CONTAINER ? "container" : v == VIRTUALIZATION_VM ? "vm" : "n/a", - strna(id)); + VIRTUALIZATION_IS_CONTAINER(v) ? "container" : + VIRTUALIZATION_IS_VM(v) ? "vm" : "n/a", + virtualization_to_string(v)); a = uname_architecture(); assert_se(a >= 0); diff --git a/src/test/test-cgroup-util.c b/src/test/test-cgroup-util.c index ff7e45901c..4ecf09a29e 100644 --- a/src/test/test-cgroup-util.c +++ b/src/test/test-cgroup-util.c @@ -320,7 +320,7 @@ int main(void) { test_controller_is_valid(); test_slice_to_path(); test_shift_path(); - test_mask_supported(); + TEST_REQ_RUNNING_SYSTEMD(test_mask_supported()); return 0; } diff --git a/src/test/test-process-util.c b/src/test/test-process-util.c index e4e2efecd5..eb0f443a43 100644 --- a/src/test/test-process-util.c +++ b/src/test/test-process-util.c @@ -85,7 +85,7 @@ static void test_get_process_comm(void) { assert_se(r >= 0 || r == -EACCES); log_info("self strlen(environ): '%zu'", strlen(env)); - if (!detect_container(NULL)) + if (!detect_container()) assert_se(get_ctty_devnr(1, &h) == -ENXIO); getenv_for_pid(1, "PATH", &i); diff --git a/src/udev/udev-builtin-path_id.c b/src/udev/udev-builtin-path_id.c index f529ffcf25..01e2c659ae 100644 --- a/src/udev/udev-builtin-path_id.c +++ b/src/udev/udev-builtin-path_id.c @@ -317,6 +317,39 @@ out: return parent; } +static struct udev_device *handle_scsi_ata(struct udev_device *parent, char **path) { + struct udev *udev = udev_device_get_udev(parent); + struct udev_device *targetdev; + struct udev_device *target_parent; + struct udev_device *atadev; + const char *port_no; + + assert(parent); + assert(path); + + targetdev = udev_device_get_parent_with_subsystem_devtype(parent, "scsi", "scsi_host"); + if (!targetdev) + return NULL; + + target_parent = udev_device_get_parent(targetdev); + if (!target_parent) + return NULL; + + atadev = udev_device_new_from_subsystem_sysname(udev, "ata_port", udev_device_get_sysname(target_parent)); + if (!atadev) + return NULL; + + port_no = udev_device_get_sysattr_value(atadev, "port_no"); + if (!port_no) { + parent = NULL; + goto out; + } + path_prepend(path, "ata-%s", port_no); +out: + udev_device_unref(atadev); + return parent; +} + static struct udev_device *handle_scsi_default(struct udev_device *parent, char **path) { struct udev_device *hostdev; int host, bus, target, lun; @@ -482,19 +515,8 @@ static struct udev_device *handle_scsi(struct udev_device *parent, char **path, goto out; } - /* - * We do not support the ATA transport class, it uses global counters - * to name the ata devices which numbers spread across multiple - * controllers. - * - * The real link numbers are not exported. Also, possible chains of ports - * behind port multipliers cannot be composed that way. - * - * Until all that is solved at the kernel level, there are no by-path/ - * links for ATA devices. - */ if (strstr(name, "/ata") != NULL) { - parent = NULL; + parent = handle_scsi_ata(parent, path); goto out; } diff --git a/src/udev/udev-builtin-uaccess.c b/src/udev/udev-builtin-uaccess.c index 43bab8af63..7bf4e7f24d 100644 --- a/src/udev/udev-builtin-uaccess.c +++ b/src/udev/udev-builtin-uaccess.c @@ -45,7 +45,7 @@ static int builtin_uaccess(struct udev_device *dev, int argc, char *argv[], bool seat = "seat0"; r = sd_seat_get_active(seat, NULL, &uid); - if (r == -ENOENT) { + if (r == -ENXIO || r == -ENODATA) { /* No active session on this seat */ r = 0; goto finish; diff --git a/src/vconsole/vconsole-setup.c b/src/vconsole/vconsole-setup.c index 7bdc158ad7..6353579283 100644 --- a/src/vconsole/vconsole-setup.c +++ b/src/vconsole/vconsole-setup.c @@ -293,7 +293,7 @@ int main(int argc, char **argv) { log_warning_errno(r, "Failed to read /etc/vconsole.conf: %m"); /* Let the kernel command line override /etc/vconsole.conf */ - if (detect_container(NULL) <= 0) { + if (detect_container() <= 0) { r = parse_env_file("/proc/cmdline", WHITESPACE, "vconsole.keymap", &vc_keymap, "vconsole.keymap.toggle", &vc_keymap_toggle, diff --git a/units/systemd-nspawn@.service.in b/units/systemd-nspawn@.service.in index 074b916d38..6b86e0a7f7 100644 --- a/units/systemd-nspawn@.service.in +++ b/units/systemd-nspawn@.service.in @@ -13,7 +13,7 @@ Before=machines.target After=network.target [Service] -ExecStart=@bindir@/systemd-nspawn --quiet --keep-unit --boot --link-journal=try-guest --network-veth --machine=%I +ExecStart=@bindir@/systemd-nspawn --quiet --keep-unit --boot --link-journal=try-guest --network-veth --settings=override --machine=%I KillMode=mixed Type=notify RestartForceExitStatus=133 |