26 files changed, 280 insertions, 109 deletions
diff --git a/TODO b/TODO
index b4443ce585..1f1c2739c5 100644
--- a/TODO
+++ b/TODO
@@ -40,14 +40,6 @@ Features:
 
 * switch to ProtectSystem=strict for all our long-running services where that's possible
 
-* introduce an "invocation ID" for units, that is randomly generated, and
-  identifies each runtime-cycle of a unit. It should be set freshly each time
-  we traverse inactive → activating/active, and should be the primary key to
-  map offline data (stored in the journal) with online bus objects. Let's pass
-  this as $SYSTEMD_INVOCATION_ID to services, as well as set this as xattr on
-  the cgroup of a services. The former is accessible without privileges, the
-  latter ensures the ID cannot be faked.
-
 * If RootDirectory= is used, mount /proc, /sys, /dev into it, if not mounted yet
 
 * Permit masking specific netlink APIs with RestrictAddressFamily=
@@ -498,7 +490,6 @@ Features:
     message that works, but alraedy after a short tiemout
   - check if we can make journalctl by default use --follow mode inside of less if called without args?
   - maybe add API to send pairs of iovecs via sd_journal_send
-  - journal: when writing journal auto-rotate if time jumps backwards
   - journal: add a setgid "systemd-journal" utility to invoke from libsystemd-journal, which passes fds via STDOUT and does PK access
   - journactl: support negative filtering, i.e. FOOBAR!="waldo",
     and !FOOBAR for events without FOOBAR.
diff --git a/man/logind.conf.xml b/man/logind.conf.xml
index 9b0e181849..994e0e1140 100644
--- a/man/logind.conf.xml
+++ b/man/logind.conf.xml
@@ -261,7 +261,7 @@
         <listitem><para>Controls whether actions that <command>systemd-logind</command>
         takes when the power and sleep keys and the lid switch are triggered are subject
         to high-level inhibitor locks ("shutdown", "sleep", "idle"). Low level inhibitor
-        locks ("handle-*-key"), are always honoured, irrespective of this setting.</para>
+        locks ("handle-*-key"), are always honored, irrespective of this setting.</para>
 
         <para>These settings take boolean arguments. If <literal>no</literal>, the
         inhibitor locks taken by applications are respected. If <literal>yes</literal>,
diff --git a/man/sd-event.xml b/man/sd-event.xml
index fc615f0906..24a69bb645 100644
--- a/man/sd-event.xml
+++ b/man/sd-event.xml
@@ -97,7 +97,7 @@
     iteration a single event source is dispatched. Each time an event
     source is dispatched the kernel is polled for new events, before
     the next event source is dispatched. The event loop is designed to
-    honour priorities and provide fairness within each priority. It is
+    honor priorities and provide fairness within each priority. It is
     not designed to provide optimal throughput, as this contradicts
     these goals due the limitations of the underlying <citerefentry
     project='man-pages'><refentrytitle>epoll</refentrytitle><manvolnum>7</manvolnum></citerefentry>
diff --git a/man/sd_event_source_set_priority.xml b/man/sd_event_source_set_priority.xml
index 8c9b39fe5e..6e7032fc80 100644
--- a/man/sd_event_source_set_priority.xml
+++ b/man/sd_event_source_set_priority.xml
@@ -115,7 +115,7 @@
     reliable. However, it is guaranteed that if events are seen on
     multiple same-priority event sources at the same time, each one is
     not dispatched again until all others have been dispatched
-    once. This behaviour guarantees that within each priority
+    once. This behavior guarantees that within each priority
     particular event sources do not starve or dominate the event
     loop.</para>
 
diff --git a/man/systemctl.xml b/man/systemctl.xml
index 3b883ea754..b51badf7fe 100644
--- a/man/systemctl.xml
+++ b/man/systemctl.xml
@@ -306,7 +306,7 @@
         <para><literal>ignore-requirements</literal> is similar to
         <literal>ignore-dependencies</literal>, but only causes the
         requirement dependencies to be ignored, the ordering
-        dependencies will still be honoured.</para>
+        dependencies will still be honored.</para>
         </listitem>
 
       </varlistentry>
@@ -1006,7 +1006,7 @@ kobject-uevent 1 systemd-udevd-kernel.socket systemd-udevd.service
             desired, combine this command with the <option>--now</option> switch, or invoke <command>start</command>
             with appropriate arguments later. Note that in case of unit instance enablement (i.e. enablement of units of
             the form <filename>foo@bar.service</filename>), symlinks named the same as instances are created in the
-            unit configuration diectory, however they point to the single template unit file they are instantiated
+            unit configuration directory, however they point to the single template unit file they are instantiated
             from.</para>
 
             <para>This command expects either valid unit names (in which case various unit file directories are
diff --git a/man/systemd-coredump.xml b/man/systemd-coredump.xml
index a28dc62e5a..4a1bc8b296 100644
--- a/man/systemd-coredump.xml
+++ b/man/systemd-coredump.xml
@@ -107,7 +107,7 @@
     <citerefentry><refentrytitle>systemd-sysctl</refentrytitle><manvolnum>8</manvolnum></citerefentry>.
     </para>
 
-    <para>The behaviour of <command>systemd-coredump</command> itself is configured through the configuration file
+    <para>The behavior of <command>systemd-coredump</command> itself is configured through the configuration file
     <filename>/etc/systemd/coredump.conf</filename> and corresponding snippets
     <filename>/etc/systemd/coredump.conf.d/*.conf</filename>, see
     <citerefentry><refentrytitle>coredump.conf</refentrytitle><manvolnum>5</manvolnum></citerefentry>. A new
diff --git a/man/systemd-mount.xml b/man/systemd-mount.xml
index e6c07863c0..06b7c85bd8 100644
--- a/man/systemd-mount.xml
+++ b/man/systemd-mount.xml
@@ -260,7 +260,7 @@
   <refsect1>
     <title>The udev Database</title>
 
-    <para>If <option>--discover</option> is used, <command>systemd-mount</command> honours a couple of additional udev
+    <para>If <option>--discover</option> is used, <command>systemd-mount</command> honors a couple of additional udev
     properties of block devices:</para>
 
     <variablelist class='udev-directives'>
diff --git a/man/systemd-nspawn.xml b/man/systemd-nspawn.xml
index 5ac54df81a..f153034296 100644
--- a/man/systemd-nspawn.xml
+++ b/man/systemd-nspawn.xml
@@ -405,7 +405,7 @@
         purposes (usually in the range beyond the host's UID/GID 65536). The parameter may be specified as follows:</para>
 
         <orderedlist>
-          <listitem><para>If one or two colon-separated numers are specified, user namespacing is turned on. The first
+          <listitem><para>If one or two colon-separated numbers are specified, user namespacing is turned on. The first
           parameter specifies the first host UID/GID to assign to the container, the second parameter specifies the
           number of host UIDs/GIDs to assign to the container. If the second parameter is omitted, 65536 UIDs/GIDs are
           assigned.</para></listitem>
@@ -425,13 +425,13 @@
           range is automatically chosen. As first step, the file owner of the root directory of the container's
           directory tree is read, and it is checked that it is currently not used by the system otherwise (in
           particular, that no other container is using it). If this check is successful, the UID/GID range determined
-          this way is used, similar to the behaviour if "yes" is specified. If the check is not successful (and thus
+          this way is used, similar to the behavior if "yes" is specified. If the check is not successful (and thus
           the UID/GID range indicated in the root directory's file owner is already used elsewhere) a new – currently
           unused – UID/GID range of 65536 UIDs/GIDs is randomly chosen between the host UID/GIDs of 524288 and
           1878982656, always starting at a multiple of 65536. This setting implies
           <option>--private-users-chown</option> (see below), which has the effect that the files and directories in
           the container's directory tree will be owned by the appropriate users of the range picked. Using this option
-          makes user namespace behaviour fully automatic. Note that the first invocation of a previously unused
+          makes user namespace behavior fully automatic. Note that the first invocation of a previously unused
           container image might result in picking a new UID/GID range for it, and thus in the (possibly expensive) file
           ownership adjustment operation. However, subsequent invocations of the container will be cheap (unless of
           course the picked UID/GID range is assigned to a different use by then).</para></listitem>
@@ -440,7 +440,7 @@
         <para>It is recommended to assign at least 65536 UIDs/GIDs to each container, so that the usable UID/GID range in the
         container covers 16 bit. For best security, do not assign overlapping UID/GID ranges to multiple containers. It is
         hence a good idea to use the upper 16 bit of the host 32-bit UIDs/GIDs as container identifier, while the lower 16
-        bit encode the container UID/GID used. This is in fact the behaviour enforced by the
+        bit encode the container UID/GID used. This is in fact the behavior enforced by the
         <option>--private-users=pick</option> option.</para>
 
         <para>When user namespaces are used, the GID range assigned to each container is always chosen identical to the
@@ -722,7 +722,7 @@
         and the subdirectory is symlinked into the host at the same
         location. <literal>try-host</literal> and
         <literal>try-guest</literal> do the same but do not fail if
-        the host does not have persistent journalling enabled. If
+        the host does not have persistent journaling enabled. If
         <literal>auto</literal> (the default), and the right
         subdirectory of <filename>/var/log/journal</filename> exists,
         it will be bind mounted into the container. If the
diff --git a/man/systemd-run.xml b/man/systemd-run.xml
index 15f9119e54..2ad8cb0835 100644
--- a/man/systemd-run.xml
+++ b/man/systemd-run.xml
@@ -402,7 +402,7 @@ There is a screen on:
       when the user first logs in, and stays around as long as at least one
       login session is open. After the user logs out of the last session,
       <filename>user@.service</filename> and all services underneath it
-      are terminated. This behaviour is the default, when "lingering" is
+      are terminated. This behavior is the default, when "lingering" is
       not enabled for that user. Enabling lingering means that
       <filename>user@.service</filename> is started automatically during
       boot, even if the user is not logged in, and that the service is
diff --git a/man/systemd-system.conf.xml b/man/systemd-system.conf.xml
index a268397d09..1d995f143e 100644
--- a/man/systemd-system.conf.xml
+++ b/man/systemd-system.conf.xml
@@ -109,7 +109,7 @@
         <term><varname>CtrlAltDelBurstAction=</varname></term>
 
         <listitem><para>Defines what action will be performed
-        if user presses Ctr-Alt-Delete more than 7 times in 2s.
+        if user presses Ctrl-Alt-Delete more than 7 times in 2s.
         Can be set to <literal>reboot-force</literal>, <literal>poweroff-force</literal>
         or disabled with <literal>ignore</literal>. Defaults to
         <literal>reboot-force</literal>.
diff --git a/man/systemd.exec.xml b/man/systemd.exec.xml
index 986985ad35..c088042a51 100644
--- a/man/systemd.exec.xml
+++ b/man/systemd.exec.xml
@@ -988,7 +988,7 @@
         the unit's own user and group to themselves and everything else to the <literal>nobody</literal> user and
         group. This is useful to securely detach the user and group databases used by the unit from the rest of the
         system, and thus to create an effective sandbox environment. All files, directories, processes, IPC objects and
-        other resources owned by users/groups not equalling <literal>root</literal> or the unit's own will stay visible
+        other resources owned by users/groups not equaling <literal>root</literal> or the unit's own will stay visible
         from within the unit but appear owned by the <literal>nobody</literal> user and group. If this mode is enabled,
         all unit processes are run without privileges in the host user namespace (regardless if the unit's own
         user/group is <literal>root</literal> or not). Specifically this means that the process will have zero process
@@ -1560,7 +1560,7 @@
       <varlistentry>
         <term><varname>$MAINPID</varname></term>
 
-        <listitem><para>The PID of the units main process if it is
+        <listitem><para>The PID of the unit's main process if it is
         known. This is only set for control processes as invoked by
         <varname>ExecReload=</varname> and similar. </para></listitem>
       </varlistentry>
diff --git a/man/systemd.link.xml b/man/systemd.link.xml
index 10fddeced0..8edbe758d9 100644
--- a/man/systemd.link.xml
+++ b/man/systemd.link.xml
@@ -107,7 +107,7 @@
         <listitem>
           <para>A whitespace-separated list of shell-style globs matching
           the device name, as exposed by the udev property
-          "INTERFACE". This can not be used to match on names that have
+          "INTERFACE". This cannot be used to match on names that have
           already been changed from userspace. Caution is advised when matching on
           kernel-assigned names, as they are known to be unstable
           between reboots.</para>
diff --git a/man/systemd.socket.xml b/man/systemd.socket.xml
index d759e17289..0ce1203cfb 100644
--- a/man/systemd.socket.xml
+++ b/man/systemd.socket.xml
@@ -535,7 +535,7 @@
         and the kernel will ignore initial ACK packets without any
         data. The argument specifies the approximate amount of time
         the kernel should wait for incoming data before falling back
-        to the normal behavior of honouring empty ACK packets. This
+        to the normal behavior of honoring empty ACK packets. This
         option is beneficial for protocols where the client sends the
         data first (e.g. HTTP, in contrast to SMTP), because the
         server process will not be woken up unnecessarily before it
diff --git a/man/systemd.unit.xml b/man/systemd.unit.xml
index 9778283fec..79bdb2cd38 100644
--- a/man/systemd.unit.xml
+++ b/man/systemd.unit.xml
@@ -195,7 +195,7 @@
     instantiated units, this logic will first look for the instance <literal>.d/</literal>
     subdirectory and read its <literal>.conf</literal> files, followed by the template
     <literal>.d/</literal> subdirectory and the <literal>.conf</literal> files there. Also note that
-    settings from the <literal>[Install]</literal> section are not honoured in drop-in unit files,
+    settings from the <literal>[Install]</literal> section are not honored in drop-in unit files,
     and have no effect.</para>
 
     <para>In addition to <filename>/etc/systemd/system</filename>, the drop-in <literal>.d</literal>
diff --git a/src/basic/cgroup-util.c b/src/basic/cgroup-util.c
index 37e6928a46..cede835920 100644
--- a/src/basic/cgroup-util.c
+++ b/src/basic/cgroup-util.c
@@ -2514,6 +2514,20 @@ int cg_blkio_weight_parse(const char *s, uint64_t *ret) {
         return 0;
 }
 
+bool is_cgroup_fs(const struct statfs *s) {
+        /* True when the statfs result matches CGROUP_SUPER_MAGIC or CGROUP2_SUPER_MAGIC. */
+        return is_fs_type(s, CGROUP_SUPER_MAGIC) || is_fs_type(s, CGROUP2_SUPER_MAGIC);
+}
+
+bool fd_is_cgroup_fs(int fd) {
+        struct statfs s;
+
+        /* A bool cannot carry -errno (any non-zero value converts to true): failed fstatfs() => "not a cgroup fs". */
+        if (fstatfs(fd, &s) < 0)
+                return false;
+        return is_cgroup_fs(&s);
+}
+
 static const char *cgroup_controller_table[_CGROUP_CONTROLLER_MAX] = {
         [CGROUP_CONTROLLER_CPU] = "cpu",
         [CGROUP_CONTROLLER_CPUACCT] = "cpuacct",
diff --git a/src/basic/cgroup-util.h b/src/basic/cgroup-util.h
index 7529c9719e..0aa27c4cd7 100644
--- a/src/basic/cgroup-util.h
+++ b/src/basic/cgroup-util.h
@@ -23,6 +23,7 @@
 #include <stdbool.h>
 #include <stdint.h>
 #include <stdio.h>
+#include <sys/statfs.h>
 #include <sys/types.h>
 
 #include "def.h"
@@ -254,3 +255,6 @@ CGroupController cgroup_controller_from_string(const char *s) _pure_;
 int cg_weight_parse(const char *s, uint64_t *ret);
 int cg_cpu_shares_parse(const char *s, uint64_t *ret);
 int cg_blkio_weight_parse(const char *s, uint64_t *ret);
+
+bool is_cgroup_fs(const struct statfs *s);
+bool fd_is_cgroup_fs(int fd);
diff --git a/src/basic/rm-rf.c b/src/basic/rm-rf.c
index 43816fd1bb..baa70c2c8d 100644
--- a/src/basic/rm-rf.c
+++ b/src/basic/rm-rf.c
@@ -27,6 +27,7 @@
 #include <unistd.h>
 
 #include "btrfs-util.h"
+#include "cgroup-util.h"
 #include "fd-util.h"
 #include "log.h"
 #include "macro.h"
@@ -36,9 +37,14 @@
 #include "stat-util.h"
 #include "string-util.h"
 
+static bool is_physical_fs(const struct statfs *sfs) {
+        return !(is_temporary_fs(sfs) || is_cgroup_fs(sfs)); /* neither a temporary fs nor a cgroup fs */
+}
+
 int rm_rf_children(int fd, RemoveFlags flags, struct stat *root_dev) {
         _cleanup_closedir_ DIR *d = NULL;
         int ret = 0, r;
+        struct statfs sfs;
 
         assert(fd >= 0);
 
@@ -47,13 +53,13 @@ int rm_rf_children(int fd, RemoveFlags flags, struct stat *root_dev) {
 
         if (!(flags & REMOVE_PHYSICAL)) {
 
-                r = fd_is_temporary_fs(fd);
+                r = fstatfs(fd, &sfs);
                 if (r < 0) {
                         safe_close(fd);
-                        return r;
+                        return -errno;
                 }
 
-                if (!r) {
+                if (is_physical_fs(&sfs)) {
                         /* We refuse to clean physical file systems
                          * with this call, unless explicitly
                          * requested. This is extra paranoia just to
@@ -210,7 +216,7 @@ int rm_rf(const char *path, RemoveFlags flags) {
                         if (statfs(path, &s) < 0)
                                 return -errno;
 
-                        if (!is_temporary_fs(&s)) {
+                        if (is_physical_fs(&s)) {
                                 log_error("Attempted to remove disk file system, and we can't allow that.");
                                 return -EPERM;
                         }
diff --git a/src/journal/journal-file.c b/src/journal/journal-file.c
index 349ef74e81..49199b269f 100644
--- a/src/journal/journal-file.c
+++ b/src/journal/journal-file.c
@@ -568,8 +568,8 @@ static int journal_file_verify_header(JournalFile *f) {
                 return -ENODATA;
 
         if (f->writable) {
-                uint8_t state;
                 sd_id128_t machine_id;
+                uint8_t state;
                 int r;
 
                 r = sd_id128_get_machine(&machine_id);
@@ -590,6 +590,14 @@ static int journal_file_verify_header(JournalFile *f) {
                         log_debug("Journal file %s has unknown state %i.", f->path, state);
                         return -EBUSY;
                 }
+
+                /* Don't permit appending to files from the future. Because otherwise the realtime timestamps wouldn't
+                 * be strictly ordered in the entries in the file anymore, and we can't have that since it breaks
+                 * bisection. */
+                if (le64toh(f->header->tail_entry_realtime) > now(CLOCK_REALTIME)) {
+                        log_debug("Journal file %s is from the future, refusing to append new data to it that'd be older.", f->path);
+                        return -ETXTBSY;
+                }
         }
 
         f->compress_xz = JOURNAL_HEADER_COMPRESSED_XZ(f->header);
@@ -747,12 +755,16 @@ int journal_file_move_to_object(JournalFile *f, ObjectType type, uint64_t offset
         assert(ret);
 
         /* Objects may only be located at multiple of 64 bit */
-        if (!VALID64(offset))
+        if (!VALID64(offset)) {
+                log_debug("Attempt to move to object at non-64bit boundary: %" PRIu64, offset);
                 return -EBADMSG;
+        }
 
         /* Object may not be located in the file header */
-        if (offset < le64toh(f->header->header_size))
+        if (offset < le64toh(f->header->header_size)) {
+                log_debug("Attempt to move to object located in file header: %" PRIu64, offset);
                 return -EBADMSG;
+        }
 
         r = journal_file_move_to(f, type, false, offset, sizeof(ObjectHeader), &t);
         if (r < 0)
@@ -761,17 +773,29 @@ int journal_file_move_to_object(JournalFile *f, ObjectType type, uint64_t offset
         o = (Object*) t;
         s = le64toh(o->object.size);
 
-        if (s < sizeof(ObjectHeader))
+        if (s == 0) {
+                log_debug("Attempt to move to uninitialized object: %" PRIu64, offset);
+                return -EBADMSG;
+        }
+        if (s < sizeof(ObjectHeader)) {
+                log_debug("Attempt to move to overly short object: %" PRIu64, offset);
                 return -EBADMSG;
+        }
 
-        if (o->object.type <= OBJECT_UNUSED)
+        if (o->object.type <= OBJECT_UNUSED) {
+                log_debug("Attempt to move to object with invalid type: %" PRIu64, offset);
                 return -EBADMSG;
+        }
 
-        if (s < minimum_header_size(o))
+        if (s < minimum_header_size(o)) {
+                log_debug("Attempt to move to truncated object: %" PRIu64, offset);
                 return -EBADMSG;
+        }
 
-        if (type > OBJECT_UNUSED && o->object.type != type)
+        if (type > OBJECT_UNUSED && o->object.type != type) {
+                log_debug("Attempt to move to object of unexpected type: %" PRIu64, offset);
                 return -EBADMSG;
+        }
 
         if (s > sizeof(ObjectHeader)) {
                 r = journal_file_move_to(f, type, false, offset, s, &t);
@@ -2472,6 +2496,37 @@ int journal_file_compare_locations(JournalFile *af, JournalFile *bf) {
         return 0;
 }
 
+static int bump_array_index(uint64_t *i, direction_t direction, uint64_t n) {
+
+        /* Moves *i one step in the given direction. Returns 1 if it moved, 0 (with *i untouched) if it is already at or
+         * past n - 1 going down, or at 0 going up. NOTE(review): n - 1 wraps for n == 0; presumably callers ensure n > 0. */
+
+        if (direction == DIRECTION_DOWN) {
+                if (*i >= n - 1)
+                        return 0;
+                *i += 1;
+                return 1;
+        }
+
+        if (*i == 0)
+                return 0;
+
+        *i -= 1;
+        return 1;
+}
+
+static bool check_properly_ordered(uint64_t new_offset, uint64_t old_offset, direction_t direction) {
+
+        /* A zero offset is treated as uninitialized, and is never considered properly ordered. */
+        if (new_offset == 0 || old_offset == 0)
+                return false;
+
+        /* Going down the offsets must strictly grow; in the other direction they must strictly shrink. */
+        if (direction == DIRECTION_DOWN)
+                return old_offset < new_offset;
+        return old_offset > new_offset;
+}
+
 int journal_file_next_entry(
                 JournalFile *f,
                 uint64_t p,
@@ -2502,36 +2557,34 @@ int journal_file_next_entry(
                 if (r <= 0)
                         return r;
 
-                if (direction == DIRECTION_DOWN) {
-                        if (i >= n - 1)
-                                return 0;
-
-                        i++;
-                } else {
-                        if (i <= 0)
-                                return 0;
-
-                        i--;
-                }
+                r = bump_array_index(&i, direction, n);
+                if (r <= 0)
+                        return r;
         }
 
         /* And jump to it */
-        r = generic_array_get(f,
-                              le64toh(f->header->entry_array_offset),
-                              i,
-                              ret, &ofs);
-        if (r == -EBADMSG && direction == DIRECTION_DOWN) {
-                /* Special case: when we iterate throught the journal file linearly, and hit an entry we can't read,
-                 * consider this the end of the journal file. */
-                log_debug_errno(r, "Encountered entry we can't read while iterating through journal file. Considering this the end of the file.");
-                return 0;
+        for (;;) {
+                r = generic_array_get(f,
+                                      le64toh(f->header->entry_array_offset),
+                                      i,
+                                      ret, &ofs);
+                if (r > 0)
+                        break;
+                if (r != -EBADMSG)
+                        return r;
+
+                /* OK, so this entry is borked. Most likely some entry didn't get synced to disk properly, let's see if
+                 * the next one might work for us instead. */
+                log_debug_errno(r, "Entry item %" PRIu64 " is bad, skipping over it.", i);
+
+                r = bump_array_index(&i, direction, n);
+                if (r <= 0)
+                        return r;
         }
-        if (r <= 0)
-                return r;
 
-        if (p > 0 &&
-            (direction == DIRECTION_DOWN ? ofs <= p : ofs >= p)) {
-                log_debug("%s: entry array corrupted at entry %" PRIu64, f->path, i);
+        /* Ensure our array is properly ordered. */
+        if (p > 0 && !check_properly_ordered(ofs, p, direction)) {
+                log_debug("%s: entry array not properly ordered at entry %" PRIu64, f->path, i);
                 return -EBADMSG;
         }
 
@@ -2548,9 +2601,9 @@ int journal_file_next_entry_for_data(
                 direction_t direction,
                 Object **ret, uint64_t *offset) {
 
-        uint64_t n, i;
-        int r;
+        uint64_t i, n, ofs;
         Object *d;
+        int r;
 
         assert(f);
         assert(p > 0 || !o);
@@ -2582,25 +2635,39 @@ int journal_file_next_entry_for_data(
                 if (r <= 0)
                         return r;
 
-                if (direction == DIRECTION_DOWN) {
-                        if (i >= n - 1)
-                                return 0;
+                r = bump_array_index(&i, direction, n);
+                if (r <= 0)
+                        return r;
+        }
 
-                        i++;
-                } else {
-                        if (i <= 0)
-                                return 0;
+        for (;;) {
+                r = generic_array_get_plus_one(f,
+                                               le64toh(d->data.entry_offset),
+                                               le64toh(d->data.entry_array_offset),
+                                               i,
+                                               ret, &ofs);
+                if (r > 0)
+                        break;
+                if (r != -EBADMSG)
+                        return r;
 
-                        i--;
-                }
+                log_debug_errno(r, "Data entry item %" PRIu64 " is bad, skipping over it.", i);
+
+                r = bump_array_index(&i, direction, n);
+                if (r <= 0)
+                        return r;
+        }
 
+        /* Ensure our array is properly ordered: fail with -EBADMSG only when the ordering check does NOT hold. */
+        if (p > 0 && !check_properly_ordered(ofs, p, direction)) {
+                log_debug("%s data entry array not properly ordered at entry %" PRIu64, f->path, i);
+                return -EBADMSG;
         }
 
-        return generic_array_get_plus_one(f,
-                                          le64toh(d->data.entry_offset),
-                                          le64toh(d->data.entry_array_offset),
-                                          i,
-                                          ret, offset);
+        if (offset)
+                *offset = ofs;
+
+        return 1;
 }
 
 int journal_file_move_to_entry_by_offset_for_data(
@@ -3271,7 +3338,8 @@ int journal_file_open_reliably(
                     -EBUSY,             /* unclean shutdown */
                     -ESHUTDOWN,         /* already archived */
                     -EIO,               /* IO error, including SIGBUS on mmap */
-                    -EIDRM              /* File has been deleted */))
+                    -EIDRM,             /* File has been deleted */
+                    -ETXTBSY))          /* File is from the future */
                 return r;
 
         if ((flags & O_ACCMODE) == O_RDONLY)
diff --git a/src/journal/journal-vacuum.c b/src/journal/journal-vacuum.c
index f09dc66e03..12ce2fd56c 100644
--- a/src/journal/journal-vacuum.c
+++ b/src/journal/journal-vacuum.c
@@ -343,7 +343,7 @@ finish:
                 free(list[i].filename);
         free(list);
 
-        log_full(verbose ? LOG_INFO : LOG_DEBUG, "Vacuuming done, freed %s of archived journals on disk.", format_bytes(sbytes, sizeof(sbytes), freed));
+        log_full(verbose ? LOG_INFO : LOG_DEBUG, "Vacuuming done, freed %s of archived journals from %s.", format_bytes(sbytes, sizeof(sbytes), freed), directory);
 
         return r;
 }
diff --git a/src/journal/journalctl.c b/src/journal/journalctl.c
index 13e3b44f06..7f997487b4 100644
--- a/src/journal/journalctl.c
+++ b/src/journal/journalctl.c
@@ -1091,8 +1091,10 @@ static int discover_next_boot(sd_journal *j,
                 r = sd_journal_previous(j);
         if (r < 0)
                 return r;
-        else if (r == 0)
+        else if (r == 0) {
+                log_debug("Whoopsie! We found a boot ID but can't read its last entry.");
                 return -ENODATA; /* This shouldn't happen. We just came from this very boot ID. */
+        }
 
         r = sd_journal_get_realtime_usec(j, &next_boot->last);
         if (r < 0)
@@ -2264,7 +2266,7 @@ int main(int argc, char *argv[]) {
                 if (r < 0)
                         goto finish;
 
-                printf("Archived and active journals take up %s on disk.\n",
+                printf("Archived and active journals take up %s in the file system.\n",
                        format_bytes(sbytes, sizeof(sbytes), bytes));
                 goto finish;
         }
diff --git a/src/journal/journald-server.c b/src/journal/journald-server.c
index f01cf1d937..381182fa2c 100644
--- a/src/journal/journald-server.c
+++ b/src/journal/journald-server.c
@@ -595,52 +595,86 @@ static void server_cache_hostname(Server *s) {
 
 static bool shall_try_append_again(JournalFile *f, int r) {
         switch(r) {
+
         case -E2BIG:           /* Hit configured limit          */
         case -EFBIG:           /* Hit fs limit                  */
         case -EDQUOT:          /* Quota limit hit               */
         case -ENOSPC:          /* Disk full                     */
                 log_debug("%s: Allocation limit reached, rotating.", f->path);
                 return true;
+
         case -EIO:             /* I/O error of some kind (mmap) */
                 log_warning("%s: IO error, rotating.", f->path);
                 return true;
+
         case -EHOSTDOWN:       /* Other machine                 */
                 log_info("%s: Journal file from other machine, rotating.", f->path);
                 return true;
+
         case -EBUSY:           /* Unclean shutdown              */
                 log_info("%s: Unclean shutdown, rotating.", f->path);
                 return true;
+
         case -EPROTONOSUPPORT: /* Unsupported feature           */
                 log_info("%s: Unsupported feature, rotating.", f->path);
                 return true;
+
         case -EBADMSG:         /* Corrupted                     */
         case -ENODATA:         /* Truncated                     */
         case -ESHUTDOWN:       /* Already archived              */
                 log_warning("%s: Journal file corrupted, rotating.", f->path);
                 return true;
+
         case -EIDRM:           /* Journal file has been deleted */
                 log_warning("%s: Journal file has been deleted, rotating.", f->path);
                 return true;
+
+        case -ETXTBSY:         /* Journal file is from the future */
+                log_warning("%s: Journal file is from the future, rotating.", f->path);
+                return true;
+
         default:
                 return false;
         }
 }
 
 static void write_to_journal(Server *s, uid_t uid, struct iovec *iovec, unsigned n, int priority) {
+        bool vacuumed = false, rotate = false;
+        struct dual_timestamp ts;
         JournalFile *f;
-        bool vacuumed = false;
         int r;
 
         assert(s);
         assert(iovec);
         assert(n > 0);
 
-        f = find_journal(s, uid);
-        if (!f)
-                return;
+        /* Get the closest, linearized time we have for this log event from the event loop. (Note that we do not use
+         * the source time, and not even the time the event was originally seen, but instead simply the time we started
+         * processing it, as we want strictly linear ordering in what we write out.) */
+        assert_se(sd_event_now(s->event, CLOCK_REALTIME, &ts.realtime) >= 0);
+        assert_se(sd_event_now(s->event, CLOCK_MONOTONIC, &ts.monotonic) >= 0);
+
+        if (ts.realtime < s->last_realtime_clock) {
+                /* When the time jumps backwards, let's immediately rotate. Of course, this should not happen during
+                 * regular operation. However, when it does happen, then we should make sure that we start fresh files
+                 * to ensure that the entries in the journal files are strictly ordered by time, in order to ensure
+                 * bisection works correctly. */
+
+                log_debug("Time jumped backwards, rotating.");
+                rotate = true;
+        } else {
+
+                f = find_journal(s, uid);
+                if (!f)
+                        return;
+
+                if (journal_file_rotate_suggested(f, s->max_file_usec)) {
+                        log_debug("%s: Journal header limits reached or header out-of-date, rotating.", f->path);
+                        rotate = true;
+                }
+        }
 
-        if (journal_file_rotate_suggested(f, s->max_file_usec)) {
-                log_debug("%s: Journal header limits reached or header out-of-date, rotating.", f->path);
+        if (rotate) {
                 server_rotate(s);
                 server_vacuum(s, false, false);
                 vacuumed = true;
@@ -650,7 +684,9 @@ static void write_to_journal(Server *s, uid_t uid, struct iovec *iovec, unsigned
                         return;
         }
 
-        r = journal_file_append_entry(f, NULL, iovec, n, &s->seqnum, NULL, NULL);
+        s->last_realtime_clock = ts.realtime;
+
+        r = journal_file_append_entry(f, &ts, iovec, n, &s->seqnum, NULL, NULL);
         if (r >= 0) {
                 server_schedule_sync(s, priority);
                 return;
@@ -669,7 +705,7 @@ static void write_to_journal(Server *s, uid_t uid, struct iovec *iovec, unsigned
                 return;
 
         log_debug("Retrying write.");
-        r = journal_file_append_entry(f, NULL, iovec, n, &s->seqnum, NULL, NULL);
+        r = journal_file_append_entry(f, &ts, iovec, n, &s->seqnum, NULL, NULL);
         if (r < 0)
                 log_error_errno(r, "Failed to write entry (%d items, %zu bytes) despite vacuuming, ignoring: %m", n, IOVEC_TOTAL_SIZE(iovec, n));
         else
diff --git a/src/journal/journald-server.h b/src/journal/journald-server.h
index dfb5724794..cc68a0a690 100644
--- a/src/journal/journald-server.h
+++ b/src/journal/journald-server.h
@@ -149,6 +149,8 @@ struct Server {
         char *cgroup_root;
 
         usec_t watchdog_usec;
+
+        usec_t last_realtime_clock;
 };
 
 #define SERVER_MACHINE_ID(s) ((s)->machine_id_field + strlen("_MACHINE_ID="))
diff --git a/src/nspawn/nspawn-cgroup.c b/src/nspawn/nspawn-cgroup.c
index 6793df1286..fd0578b85c 100644
--- a/src/nspawn/nspawn-cgroup.c
+++ b/src/nspawn/nspawn-cgroup.c
@@ -25,27 +25,18 @@
 #include "mkdir.h"
 #include "mount-util.h"
 #include "nspawn-cgroup.h"
+#include "rm-rf.h"
 #include "string-util.h"
 #include "strv.h"
 #include "util.h"
 
-int chown_cgroup(pid_t pid, uid_t uid_shift) {
-        _cleanup_free_ char *path = NULL, *fs = NULL;
+static int chown_cgroup_path(const char *path, uid_t uid_shift) {
         _cleanup_close_ int fd = -1;
         const char *fn;
-        int r;
 
-        r = cg_pid_get_path(NULL, pid, &path);
-        if (r < 0)
-                return log_error_errno(r, "Failed to get container cgroup path: %m");
-
-        r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, path, NULL, &fs);
-        if (r < 0)
-                return log_error_errno(r, "Failed to get file system path for container cgroup: %m");
-
-        fd = open(fs, O_RDONLY|O_CLOEXEC|O_DIRECTORY);
+        fd = open(path, O_RDONLY|O_CLOEXEC|O_DIRECTORY);
         if (fd < 0)
-                return log_error_errno(errno, "Failed to open %s: %m", fs);
+                return -errno;
 
         FOREACH_STRING(fn,
                        ".",
@@ -63,7 +54,27 @@ int chown_cgroup(pid_t pid, uid_t uid_shift) {
         return 0;
 }
 
-int sync_cgroup(pid_t pid, CGroupUnified unified_requested) {
+/* Look up the cgroup of 'pid', resolve its file system path and pass it to chown_cgroup_path(); logs on error. */
+int chown_cgroup(pid_t pid, uid_t uid_shift) {
+        _cleanup_free_ char *path = NULL, *fs = NULL;
+        int r;
+
+        r = cg_pid_get_path(NULL, pid, &path);
+        if (r < 0)
+                return log_error_errno(r, "Failed to get container cgroup path: %m");
+
+        r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, path, NULL, &fs);
+        if (r < 0)
+                return log_error_errno(r, "Failed to get file system path for container cgroup: %m");
+
+        r = chown_cgroup_path(fs, uid_shift);
+        if (r < 0)
+                return log_error_errno(r, "Failed to chown() cgroup %s: %m", fs);
+
+        return 0;
+}
+
+int sync_cgroup(pid_t pid, CGroupUnified unified_requested, uid_t arg_uid_shift) {
         _cleanup_free_ char *cgroup = NULL;
         char tree[] = "/tmp/unifiedXXXXXX", pid_string[DECIMAL_STR_MAX(pid) + 1];
         bool undo_mount = false;
@@ -101,14 +112,26 @@ int sync_cgroup(pid_t pid, CGroupUnified unified_requested) {
 
         undo_mount = true;
 
+        /* If nspawn dies abruptly the cgroup hierarchy created below
+         * its unit isn't cleaned up. So, let's remove it
+         * https://github.com/systemd/systemd/pull/4223#issuecomment-252519810 */
+        fn = strjoina(tree, cgroup);
+        (void) rm_rf(fn, REMOVE_ROOT|REMOVE_ONLY_DIRECTORIES);
+
         fn = strjoina(tree, cgroup, "/cgroup.procs");
         (void) mkdir_parents(fn, 0755);
 
         sprintf(pid_string, PID_FMT, pid);
         r = write_string_file(fn, pid_string, 0);
-        if (r < 0)
+        if (r < 0) {
                 log_error_errno(r, "Failed to move process: %m");
+                goto finish;
+        }
 
+        fn = strjoina(tree, cgroup);
+        r = chown_cgroup_path(fn, arg_uid_shift);
+        if (r < 0)
+                log_error_errno(r, "Failed to chown() cgroup %s: %m", fn);
 finish:
         if (undo_mount)
                 (void) umount_verbose(tree);
diff --git a/src/nspawn/nspawn-cgroup.h b/src/nspawn/nspawn-cgroup.h
index dc33da8abe..fa4321ab43 100644
--- a/src/nspawn/nspawn-cgroup.h
+++ b/src/nspawn/nspawn-cgroup.h
@@ -25,5 +25,5 @@
 #include "cgroup-util.h"
 
 int chown_cgroup(pid_t pid, uid_t uid_shift);
-int sync_cgroup(pid_t pid, CGroupUnified unified_requested);
+int sync_cgroup(pid_t pid, CGroupUnified unified_requested, uid_t uid_shift);
 int create_subcgroup(pid_t pid, CGroupUnified unified_requested);
diff --git a/src/nspawn/nspawn.c b/src/nspawn/nspawn.c
index d95204f71e..14af51fc0e 100644
--- a/src/nspawn/nspawn.c
+++ b/src/nspawn/nspawn.c
@@ -3879,7 +3879,7 @@ static int run(int master,
                         return r;
         }
 
-        r = sync_cgroup(*pid, arg_unified_cgroup_hierarchy);
+        r = sync_cgroup(*pid, arg_unified_cgroup_hierarchy, arg_uid_shift);
         if (r < 0)
                 return r;
 
diff --git a/src/test/test-cgroup-util.c b/src/test/test-cgroup-util.c
index 43f8906172..c24c784e9b 100644
--- a/src/test/test-cgroup-util.c
+++ b/src/test/test-cgroup-util.c
@@ -24,6 +24,7 @@
 #include "formats-util.h"
 #include "parse-util.h"
 #include "process-util.h"
+#include "stat-util.h"
 #include "string-util.h"
 #include "test-helper.h"
 #include "user-util.h"
@@ -309,6 +310,28 @@ static void test_mask_supported(void) {
                 printf("'%s' is supported: %s\n", cgroup_controller_to_string(c), yes_no(m & CGROUP_CONTROLLER_TO_MASK(c)));
 }
 
+static void test_is_cgroup_fs(void) {
+        struct statfs sfs;
+        assert_se(statfs("/sys/fs/cgroup", &sfs) == 0);
+        if (is_temporary_fs(&sfs))
+                assert_se(statfs("/sys/fs/cgroup/systemd", &sfs) == 0);
+        assert_se(is_cgroup_fs(&sfs));
+}
+
+static void test_fd_is_cgroup_fs(void) {
+        int fd;
+
+        fd = open("/sys/fs/cgroup", O_RDONLY|O_DIRECTORY|O_CLOEXEC|O_NOFOLLOW);
+        assert_se(fd >= 0);
+        if (fd_is_temporary_fs(fd)) {
+                fd = safe_close(fd);
+                fd = open("/sys/fs/cgroup/systemd", O_RDONLY|O_DIRECTORY|O_CLOEXEC|O_NOFOLLOW);
+                assert_se(fd >= 0);
+        }
+        assert_se(fd_is_cgroup_fs(fd));
+        fd = safe_close(fd);
+}
+
 int main(void) {
         test_path_decode_unit();
         test_path_get_unit();
@@ -324,6 +347,8 @@ int main(void) {
         test_slice_to_path();
         test_shift_path();
         TEST_REQ_RUNNING_SYSTEMD(test_mask_supported());
+        TEST_REQ_RUNNING_SYSTEMD(test_is_cgroup_fs());
+        TEST_REQ_RUNNING_SYSTEMD(test_fd_is_cgroup_fs());
 
         return 0;
 }