16 files changed, 276 insertions, 89 deletions
diff --git a/src/basic/cgroup-util.c b/src/basic/cgroup-util.c
index 37e6928a46..cede835920 100644
--- a/src/basic/cgroup-util.c
+++ b/src/basic/cgroup-util.c
@@ -2514,6 +2514,20 @@ int cg_blkio_weight_parse(const char *s, uint64_t *ret) {
         return 0;
 }
 
+/* Tells whether a statfs() result carries the CGROUP_SUPER_MAGIC or the CGROUP2_SUPER_MAGIC file system type. */
+bool is_cgroup_fs(const struct statfs *s) {
+        return is_fs_type(s, CGROUP_SUPER_MAGIC) ? true : is_fs_type(s, CGROUP2_SUPER_MAGIC);
+}
+
+/* Like is_cgroup_fs(), but for an open fd. If fstatfs() fails the answer is false, never an error code. */
+bool fd_is_cgroup_fs(int fd) {
+        struct statfs s;
+
+        if (fstatfs(fd, &s) < 0)
+                return false; /* -errno is non-zero and would read as "true" once converted to bool */
+        return is_cgroup_fs(&s);
+}
+
 static const char *cgroup_controller_table[_CGROUP_CONTROLLER_MAX] = {
         [CGROUP_CONTROLLER_CPU] = "cpu",
         [CGROUP_CONTROLLER_CPUACCT] = "cpuacct",
diff --git a/src/basic/cgroup-util.h b/src/basic/cgroup-util.h
index 7529c9719e..0aa27c4cd7 100644
--- a/src/basic/cgroup-util.h
+++ b/src/basic/cgroup-util.h
@@ -23,6 +23,7 @@
 #include <stdbool.h>
 #include <stdint.h>
 #include <stdio.h>
+#include <sys/statfs.h>
 #include <sys/types.h>
 
 #include "def.h"
@@ -254,3 +255,6 @@ CGroupController cgroup_controller_from_string(const char *s) _pure_;
 int cg_weight_parse(const char *s, uint64_t *ret);
 int cg_cpu_shares_parse(const char *s, uint64_t *ret);
 int cg_blkio_weight_parse(const char *s, uint64_t *ret);
+
+bool is_cgroup_fs(const struct statfs *s);
+bool fd_is_cgroup_fs(int fd);
diff --git a/src/basic/gunicode.c b/src/basic/gunicode.c
index 542110503f..e6ac0545a4 100644
--- a/src/basic/gunicode.c
+++ b/src/basic/gunicode.c
@@ -26,7 +26,7 @@
 char *
 utf8_prev_char (const char *p)
 {
-  while (1)
+  for (;;)
     {
       p--;
       if ((*p & 0xc0) != 0x80)
diff --git a/src/basic/rm-rf.c b/src/basic/rm-rf.c
index 43816fd1bb..baa70c2c8d 100644
--- a/src/basic/rm-rf.c
+++ b/src/basic/rm-rf.c
@@ -27,6 +27,7 @@
 #include <unistd.h>
 
 #include "btrfs-util.h"
+#include "cgroup-util.h"
 #include "fd-util.h"
 #include "log.h"
 #include "macro.h"
@@ -36,9 +37,14 @@
 #include "stat-util.h"
 #include "string-util.h"
 
+static bool is_physical_fs(const struct statfs *sfs) {
+        return !(is_temporary_fs(sfs) || is_cgroup_fs(sfs)); /* i.e. neither a temporary nor a cgroup fs */
+}
+
 int rm_rf_children(int fd, RemoveFlags flags, struct stat *root_dev) {
         _cleanup_closedir_ DIR *d = NULL;
         int ret = 0, r;
+        struct statfs sfs;
 
         assert(fd >= 0);
 
@@ -47,13 +53,13 @@ int rm_rf_children(int fd, RemoveFlags flags, struct stat *root_dev) {
 
         if (!(flags & REMOVE_PHYSICAL)) {
 
-                r = fd_is_temporary_fs(fd);
+                r = fstatfs(fd, &sfs);
                 if (r < 0) {
                         safe_close(fd);
-                        return r;
+                        return -errno;
                 }
 
-                if (!r) {
+                if (is_physical_fs(&sfs)) {
                         /* We refuse to clean physical file systems
                          * with this call, unless explicitly
                          * requested. This is extra paranoia just to
@@ -210,7 +216,7 @@ int rm_rf(const char *path, RemoveFlags flags) {
                         if (statfs(path, &s) < 0)
                                 return -errno;
 
-                        if (!is_temporary_fs(&s)) {
+                        if (is_physical_fs(&s)) {
                                 log_error("Attempted to remove disk file system, and we can't allow that.");
                                 return -EPERM;
                         }
diff --git a/src/coredump/coredump.c b/src/coredump/coredump.c
index db60d0af7a..a982c204be 100644
--- a/src/coredump/coredump.c
+++ b/src/coredump/coredump.c
@@ -589,7 +589,7 @@ static int get_mount_namespace_leader(pid_t pid, pid_t *container_pid) {
         if (r < 0)
                 return r;
 
-        while (1) {
+        for (;;) {
                 ino_t parent_mntns;
 
                 r = get_process_ppid(cpid, &ppid);
diff --git a/src/journal/journal-file.c b/src/journal/journal-file.c
index 349ef74e81..49199b269f 100644
--- a/src/journal/journal-file.c
+++ b/src/journal/journal-file.c
@@ -568,8 +568,8 @@ static int journal_file_verify_header(JournalFile *f) {
                 return -ENODATA;
 
         if (f->writable) {
-                uint8_t state;
                 sd_id128_t machine_id;
+                uint8_t state;
                 int r;
 
                 r = sd_id128_get_machine(&machine_id);
@@ -590,6 +590,14 @@ static int journal_file_verify_header(JournalFile *f) {
                         log_debug("Journal file %s has unknown state %i.", f->path, state);
                         return -EBUSY;
                 }
+
+                /* Don't permit appending to files from the future. Because otherwise the realtime timestamps wouldn't
+                 * be strictly ordered in the entries in the file anymore, and we can't have that since it breaks
+                 * bisection. */
+                if (le64toh(f->header->tail_entry_realtime) > now(CLOCK_REALTIME)) {
+                        log_debug("Journal file %s is from the future, refusing to append new data to it that'd be older.", f->path);
+                        return -ETXTBSY;
+                }
         }
 
         f->compress_xz = JOURNAL_HEADER_COMPRESSED_XZ(f->header);
@@ -747,12 +755,16 @@ int journal_file_move_to_object(JournalFile *f, ObjectType type, uint64_t offset
         assert(ret);
 
         /* Objects may only be located at multiple of 64 bit */
-        if (!VALID64(offset))
+        if (!VALID64(offset)) {
+                log_debug("Attempt to move to object at non-64bit boundary: %" PRIu64, offset);
                 return -EBADMSG;
+        }
 
         /* Object may not be located in the file header */
-        if (offset < le64toh(f->header->header_size))
+        if (offset < le64toh(f->header->header_size)) {
+                log_debug("Attempt to move to object located in file header: %" PRIu64, offset);
                 return -EBADMSG;
+        }
 
         r = journal_file_move_to(f, type, false, offset, sizeof(ObjectHeader), &t);
         if (r < 0)
@@ -761,17 +773,29 @@ int journal_file_move_to_object(JournalFile *f, ObjectType type, uint64_t offset
         o = (Object*) t;
         s = le64toh(o->object.size);
 
-        if (s < sizeof(ObjectHeader))
+        if (s == 0) {
+                log_debug("Attempt to move to uninitialized object: %" PRIu64, offset);
+                return -EBADMSG;
+        }
+        if (s < sizeof(ObjectHeader)) {
+                log_debug("Attempt to move to overly short object: %" PRIu64, offset);
                 return -EBADMSG;
+        }
 
-        if (o->object.type <= OBJECT_UNUSED)
+        if (o->object.type <= OBJECT_UNUSED) {
+                log_debug("Attempt to move to object with invalid type: %" PRIu64, offset);
                 return -EBADMSG;
+        }
 
-        if (s < minimum_header_size(o))
+        if (s < minimum_header_size(o)) {
+                log_debug("Attempt to move to truncated object: %" PRIu64, offset);
                 return -EBADMSG;
+        }
 
-        if (type > OBJECT_UNUSED && o->object.type != type)
+        if (type > OBJECT_UNUSED && o->object.type != type) {
+                log_debug("Attempt to move to object of unexpected type: %" PRIu64, offset);
                 return -EBADMSG;
+        }
 
         if (s > sizeof(ObjectHeader)) {
                 r = journal_file_move_to(f, type, false, offset, s, &t);
@@ -2472,6 +2496,37 @@ int journal_file_compare_locations(JournalFile *af, JournalFile *bf) {
         return 0;
 }
 
+/* Steps *i one slot through an array of n items; returns 1 if it moved, 0 if it was already at the edge. */
+static int bump_array_index(uint64_t *i, direction_t direction, uint64_t n) {
+        bool forward = direction == DIRECTION_DOWN;
+
+        /* DIRECTION_DOWN counts upwards and stops at the last slot, n - 1. */
+        if (forward && *i >= n - 1)
+                return 0;
+
+        /* Any other direction counts downwards and stops at slot 0. */
+        if (!forward && *i == 0)
+                return 0;
+
+        if (forward)
+                (*i)++;
+        else
+                (*i)--;
+        return 1;
+}
+
+/* Checks that stepping from old_offset to new_offset moves the right way for the given direction. */
+static bool check_properly_ordered(uint64_t new_offset, uint64_t old_offset, direction_t direction) {
+        /* An offset of 0 means "uninitialized"; treat either one being unset as out of order. */
+        if (new_offset == 0 || old_offset == 0)
+                return false;
+
+        /* Going down the offsets must strictly grow, in the other direction they must strictly shrink. */
+        if (direction == DIRECTION_DOWN)
+                return new_offset > old_offset;
+        return new_offset < old_offset;
+}
+
 int journal_file_next_entry(
                 JournalFile *f,
                 uint64_t p,
@@ -2502,36 +2557,34 @@ int journal_file_next_entry(
                 if (r <= 0)
                         return r;
 
-                if (direction == DIRECTION_DOWN) {
-                        if (i >= n - 1)
-                                return 0;
-
-                        i++;
-                } else {
-                        if (i <= 0)
-                                return 0;
-
-                        i--;
-                }
+                r = bump_array_index(&i, direction, n);
+                if (r <= 0)
+                        return r;
         }
 
         /* And jump to it */
-        r = generic_array_get(f,
-                              le64toh(f->header->entry_array_offset),
-                              i,
-                              ret, &ofs);
-        if (r == -EBADMSG && direction == DIRECTION_DOWN) {
-                /* Special case: when we iterate throught the journal file linearly, and hit an entry we can't read,
-                 * consider this the end of the journal file. */
-                log_debug_errno(r, "Encountered entry we can't read while iterating through journal file. Considering this the end of the file.");
-                return 0;
+        for (;;) {
+                r = generic_array_get(f,
+                                      le64toh(f->header->entry_array_offset),
+                                      i,
+                                      ret, &ofs);
+                if (r > 0)
+                        break;
+                if (r != -EBADMSG)
+                        return r;
+
+                /* OK, so this entry is borked. Most likely some entry didn't get synced to disk properly, let's see if
+                 * the next one might work for us instead. */
+                log_debug_errno(r, "Entry item %" PRIu64 " is bad, skipping over it.", i);
+
+                r = bump_array_index(&i, direction, n);
+                if (r <= 0)
+                        return r;
         }
-        if (r <= 0)
-                return r;
 
-        if (p > 0 &&
-            (direction == DIRECTION_DOWN ? ofs <= p : ofs >= p)) {
-                log_debug("%s: entry array corrupted at entry %" PRIu64, f->path, i);
+        /* Ensure our array is properly ordered. */
+        if (p > 0 && !check_properly_ordered(ofs, p, direction)) {
+                log_debug("%s: entry array not properly ordered at entry %" PRIu64, f->path, i);
                 return -EBADMSG;
         }
 
@@ -2548,9 +2601,9 @@ int journal_file_next_entry_for_data(
                 direction_t direction,
                 Object **ret, uint64_t *offset) {
 
-        uint64_t n, i;
-        int r;
+        uint64_t i, n, ofs;
         Object *d;
+        int r;
 
         assert(f);
         assert(p > 0 || !o);
@@ -2582,25 +2635,39 @@ int journal_file_next_entry_for_data(
                 if (r <= 0)
                         return r;
 
-                if (direction == DIRECTION_DOWN) {
-                        if (i >= n - 1)
-                                return 0;
+                r = bump_array_index(&i, direction, n);
+                if (r <= 0)
+                        return r;
+        }
 
-                        i++;
-                } else {
-                        if (i <= 0)
-                                return 0;
+        for (;;) {
+                r = generic_array_get_plus_one(f,
+                                               le64toh(d->data.entry_offset),
+                                               le64toh(d->data.entry_array_offset),
+                                               i,
+                                               ret, &ofs);
+                if (r > 0)
+                        break;
+                if (r != -EBADMSG)
+                        return r;
 
-                        i--;
-                }
+                log_debug_errno(r, "Data entry item %" PRIu64 " is bad, skipping over it.", i);
+
+                r = bump_array_index(&i, direction, n);
+                if (r <= 0)
+                        return r;
+        }
 
+        /* Ensure our array is properly ordered: check_properly_ordered() is true for a valid step, so negate it. */
+        if (p > 0 && !check_properly_ordered(ofs, p, direction)) {
+                log_debug("%s data entry array not properly ordered at entry %" PRIu64, f->path, i);
+                return -EBADMSG;
+        }
 
-        return generic_array_get_plus_one(f,
-                                          le64toh(d->data.entry_offset),
-                                          le64toh(d->data.entry_array_offset),
-                                          i,
-                                          ret, offset);
+        if (offset)
+                *offset = ofs;
+
+        return 1;
 }
 
 int journal_file_move_to_entry_by_offset_for_data(
@@ -3271,7 +3338,8 @@ int journal_file_open_reliably(
                     -EBUSY,             /* unclean shutdown */
                     -ESHUTDOWN,         /* already archived */
                     -EIO,               /* IO error, including SIGBUS on mmap */
-                    -EIDRM              /* File has been deleted */))
+                    -EIDRM,             /* File has been deleted */
+                    -ETXTBSY))          /* File is from the future */
                 return r;
 
         if ((flags & O_ACCMODE) == O_RDONLY)
diff --git a/src/journal/journal-vacuum.c b/src/journal/journal-vacuum.c
index f09dc66e03..12ce2fd56c 100644
--- a/src/journal/journal-vacuum.c
+++ b/src/journal/journal-vacuum.c
@@ -343,7 +343,7 @@ finish:
                 free(list[i].filename);
         free(list);
 
-        log_full(verbose ? LOG_INFO : LOG_DEBUG, "Vacuuming done, freed %s of archived journals on disk.", format_bytes(sbytes, sizeof(sbytes), freed));
+        log_full(verbose ? LOG_INFO : LOG_DEBUG, "Vacuuming done, freed %s of archived journals from %s.", format_bytes(sbytes, sizeof(sbytes), freed), directory);
 
         return r;
 }
diff --git a/src/journal/journalctl.c b/src/journal/journalctl.c
index 4350925fb0..7f997487b4 100644
--- a/src/journal/journalctl.c
+++ b/src/journal/journalctl.c
@@ -1091,8 +1091,10 @@ static int discover_next_boot(sd_journal *j,
                 r = sd_journal_previous(j);
         if (r < 0)
                 return r;
-        else if (r == 0)
+        else if (r == 0) {
+                log_debug("Whoopsie! We found a boot ID but can't read its last entry.");
                 return -ENODATA; /* This shouldn't happen. We just came from this very boot ID. */
+        }
 
         r = sd_journal_get_realtime_usec(j, &next_boot->last);
         if (r < 0)
@@ -1112,7 +1114,7 @@ static int get_boots(
 
         bool skip_once;
         int r, count = 0;
-        BootId *head = NULL, *tail = NULL;
+        BootId *head = NULL, *tail = NULL, *id;
         const bool advance_older = boot_id && offset <= 0;
         sd_id128_t previous_boot_id;
 
@@ -1203,6 +1205,13 @@ static int get_boots(
                                 break;
                         }
                 } else {
+                        LIST_FOREACH(boot_list, id, head) {
+                                if (sd_id128_equal(id->id, current->id)) {
+                                        /* This boot ID is already in the list, so something is wrong with the
+                                         * journal files. Bail out, since we would otherwise loop forever. */
+                                        goto finish;
+                                }
+                        }
                         LIST_INSERT_AFTER(boot_list, head, tail, current);
                         tail = current;
                         current = NULL;
@@ -2257,7 +2266,7 @@ int main(int argc, char *argv[]) {
                 if (r < 0)
                         goto finish;
 
-                printf("Archived and active journals take up %s on disk.\n",
+                printf("Archived and active journals take up %s in the file system.\n",
                        format_bytes(sbytes, sizeof(sbytes), bytes));
                 goto finish;
         }
diff --git a/src/journal/journald-server.c b/src/journal/journald-server.c
index f01cf1d937..381182fa2c 100644
--- a/src/journal/journald-server.c
+++ b/src/journal/journald-server.c
@@ -595,52 +595,86 @@ static void server_cache_hostname(Server *s) {
 
 static bool shall_try_append_again(JournalFile *f, int r) {
         switch(r) {
+        /* "true": r is an error we answer by rotating the file (reason is logged); anything else: "false". */
         case -E2BIG:           /* Hit configured limit          */
         case -EFBIG:           /* Hit fs limit                  */
         case -EDQUOT:          /* Quota limit hit               */
         case -ENOSPC:          /* Disk full                     */
                 log_debug("%s: Allocation limit reached, rotating.", f->path);
                 return true;
+
         case -EIO:             /* I/O error of some kind (mmap) */
                 log_warning("%s: IO error, rotating.", f->path);
                 return true;
+
         case -EHOSTDOWN:       /* Other machine                 */
                 log_info("%s: Journal file from other machine, rotating.", f->path);
                 return true;
+
         case -EBUSY:           /* Unclean shutdown              */
                 log_info("%s: Unclean shutdown, rotating.", f->path);
                 return true;
+
         case -EPROTONOSUPPORT: /* Unsupported feature           */
                 log_info("%s: Unsupported feature, rotating.", f->path);
                 return true;
+
         case -EBADMSG:         /* Corrupted                     */
         case -ENODATA:         /* Truncated                     */
         case -ESHUTDOWN:       /* Already archived              */
                 log_warning("%s: Journal file corrupted, rotating.", f->path);
                 return true;
+
         case -EIDRM:           /* Journal file has been deleted */
                 log_warning("%s: Journal file has been deleted, rotating.", f->path);
                 return true;
+
+        case -ETXTBSY:         /* Journal file is from the future */
+                log_warning("%s: Journal file is from the future, rotating.", f->path);
+                return true;
+
         default:
                 return false;
         }
 }
 
 static void write_to_journal(Server *s, uid_t uid, struct iovec *iovec, unsigned n, int priority) {
+        bool vacuumed = false, rotate = false;
+        struct dual_timestamp ts;
         JournalFile *f;
-        bool vacuumed = false;
         int r;
 
         assert(s);
         assert(iovec);
         assert(n > 0);
 
-        f = find_journal(s, uid);
-        if (!f)
-                return;
+        /* Get the closest, linearized time we have for this log event from the event loop. (Note that we do not use
+         * the source time, and not even the time the event was originally seen, but instead simply the time we started
+         * processing it, as we want strictly linear ordering in what we write out.) */
+        assert_se(sd_event_now(s->event, CLOCK_REALTIME, &ts.realtime) >= 0);
+        assert_se(sd_event_now(s->event, CLOCK_MONOTONIC, &ts.monotonic) >= 0);
+
+        if (ts.realtime < s->last_realtime_clock) {
+                /* When the time jumps backwards, let's immediately rotate. Of course, this should not happen during
+                 * regular operation. However, when it does happen, then we should make sure that we start fresh files
+                 * to ensure that the entries in the journal files are strictly ordered by time, in order to ensure
+                 * bisection works correctly. */
+
+                log_debug("Time jumped backwards, rotating.");
+                rotate = true;
+        } else {
+
+                f = find_journal(s, uid);
+                if (!f)
+                        return;
+
+                if (journal_file_rotate_suggested(f, s->max_file_usec)) {
+                        log_debug("%s: Journal header limits reached or header out-of-date, rotating.", f->path);
+                        rotate = true;
+                }
+        }
 
-        if (journal_file_rotate_suggested(f, s->max_file_usec)) {
-                log_debug("%s: Journal header limits reached or header out-of-date, rotating.", f->path);
+        if (rotate) {
                 server_rotate(s);
                 server_vacuum(s, false, false);
                 vacuumed = true;
@@ -650,7 +684,9 @@ static void write_to_journal(Server *s, uid_t uid, struct iovec *iovec, unsigned
                         return;
         }
 
-        r = journal_file_append_entry(f, NULL, iovec, n, &s->seqnum, NULL, NULL);
+        s->last_realtime_clock = ts.realtime;
+
+        r = journal_file_append_entry(f, &ts, iovec, n, &s->seqnum, NULL, NULL);
         if (r >= 0) {
                 server_schedule_sync(s, priority);
                 return;
@@ -669,7 +705,7 @@ static void write_to_journal(Server *s, uid_t uid, struct iovec *iovec, unsigned
                 return;
 
         log_debug("Retrying write.");
-        r = journal_file_append_entry(f, NULL, iovec, n, &s->seqnum, NULL, NULL);
+        r = journal_file_append_entry(f, &ts, iovec, n, &s->seqnum, NULL, NULL);
         if (r < 0)
                 log_error_errno(r, "Failed to write entry (%d items, %zu bytes) despite vacuuming, ignoring: %m", n, IOVEC_TOTAL_SIZE(iovec, n));
         else
diff --git a/src/journal/journald-server.h b/src/journal/journald-server.h
index dfb5724794..cc68a0a690 100644
--- a/src/journal/journald-server.h
+++ b/src/journal/journald-server.h
@@ -149,6 +149,8 @@ struct Server {
         char *cgroup_root;
 
         usec_t watchdog_usec;
+
+        usec_t last_realtime_clock;
 };
 
 #define SERVER_MACHINE_ID(s) ((s)->machine_id_field + strlen("_MACHINE_ID="))
diff --git a/src/libsystemd/sd-bus/bus-error.c b/src/libsystemd/sd-bus/bus-error.c
index 26219bdeed..378f7a377a 100644
--- a/src/libsystemd/sd-bus/bus-error.c
+++ b/src/libsystemd/sd-bus/bus-error.c
@@ -70,11 +70,9 @@ BUS_ERROR_MAP_ELF_REGISTER const sd_bus_error_map bus_standard_errors[] = {
         SD_BUS_ERROR_MAP_END
 };
 
-/* GCC maps this magically to the beginning and end of the BUS_ERROR_MAP section.
- * Hide them; for currently unknown reasons they get exported to the shared libries
- * even without being listed in the sym file. */
-extern const sd_bus_error_map __start_BUS_ERROR_MAP[] _hidden_;
-extern const sd_bus_error_map __stop_BUS_ERROR_MAP[] _hidden_;
+/* GCC maps this magically to the beginning and end of the BUS_ERROR_MAP section */
+extern const sd_bus_error_map __start_BUS_ERROR_MAP[];
+extern const sd_bus_error_map __stop_BUS_ERROR_MAP[];
 
 /* Additional maps registered with sd_bus_error_add_map() are in this
  * NULL terminated array */
diff --git a/src/login/systemd-user.m4 b/src/login/systemd-user.m4
index fe38b24fef..e33963b125 100644
--- a/src/login/systemd-user.m4
+++ b/src/login/systemd-user.m4
@@ -2,6 +2,8 @@
 #
 # Used by systemd --user instances.
 
+account required pam_unix.so
+
 m4_ifdef(`HAVE_SELINUX',
 session  required pam_selinux.so close
 session  required pam_selinux.so nottys open
diff --git a/src/nspawn/nspawn-cgroup.c b/src/nspawn/nspawn-cgroup.c
index 6793df1286..fd0578b85c 100644
--- a/src/nspawn/nspawn-cgroup.c
+++ b/src/nspawn/nspawn-cgroup.c
@@ -25,27 +25,18 @@
 #include "mkdir.h"
 #include "mount-util.h"
 #include "nspawn-cgroup.h"
+#include "rm-rf.h"
 #include "string-util.h"
 #include "strv.h"
 #include "util.h"
 
-int chown_cgroup(pid_t pid, uid_t uid_shift) {
-        _cleanup_free_ char *path = NULL, *fs = NULL;
+static int chown_cgroup_path(const char *path, uid_t uid_shift) {
         _cleanup_close_ int fd = -1;
         const char *fn;
-        int r;
 
-        r = cg_pid_get_path(NULL, pid, &path);
-        if (r < 0)
-                return log_error_errno(r, "Failed to get container cgroup path: %m");
-
-        r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, path, NULL, &fs);
-        if (r < 0)
-                return log_error_errno(r, "Failed to get file system path for container cgroup: %m");
-
-        fd = open(fs, O_RDONLY|O_CLOEXEC|O_DIRECTORY);
+        fd = open(path, O_RDONLY|O_CLOEXEC|O_DIRECTORY);
         if (fd < 0)
-                return log_error_errno(errno, "Failed to open %s: %m", fs);
+                return -errno;
 
         FOREACH_STRING(fn,
                        ".",
@@ -63,7 +54,27 @@ int chown_cgroup(pid_t pid, uid_t uid_shift) {
         return 0;
 }
 
-int sync_cgroup(pid_t pid, CGroupUnified unified_requested) {
+/* Resolves the cgroup of 'pid' to its file system path and passes it to chown_cgroup_path(); errors are logged. */
+int chown_cgroup(pid_t pid, uid_t uid_shift) {
+        _cleanup_free_ char *path = NULL, *fs = NULL;
+        int r;
+
+        r = cg_pid_get_path(NULL, pid, &path);
+        if (r < 0)
+                return log_error_errno(r, "Failed to get container cgroup path: %m");
+
+        r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, path, NULL, &fs);
+        if (r < 0)
+                return log_error_errno(r, "Failed to get file system path for container cgroup: %m");
+
+        r = chown_cgroup_path(fs, uid_shift);
+        if (r < 0)
+                return log_error_errno(r, "Failed to chown() cgroup %s: %m", fs);
+
+        return 0;
+}
+
+int sync_cgroup(pid_t pid, CGroupUnified unified_requested, uid_t arg_uid_shift) {
         _cleanup_free_ char *cgroup = NULL;
         char tree[] = "/tmp/unifiedXXXXXX", pid_string[DECIMAL_STR_MAX(pid) + 1];
         bool undo_mount = false;
@@ -101,14 +112,26 @@ int sync_cgroup(pid_t pid, CGroupUnified unified_requested) {
 
         undo_mount = true;
 
+        /* If nspawn dies abruptly the cgroup hierarchy created below
+         * its unit isn't cleaned up. So, let's remove it
+         * https://github.com/systemd/systemd/pull/4223#issuecomment-252519810 */
+        fn = strjoina(tree, cgroup);
+        (void) rm_rf(fn, REMOVE_ROOT|REMOVE_ONLY_DIRECTORIES);
+
         fn = strjoina(tree, cgroup, "/cgroup.procs");
         (void) mkdir_parents(fn, 0755);
 
         sprintf(pid_string, PID_FMT, pid);
         r = write_string_file(fn, pid_string, 0);
-        if (r < 0)
+        if (r < 0) {
                 log_error_errno(r, "Failed to move process: %m");
+                goto finish;
+        }
 
+        fn = strjoina(tree, cgroup);
+        r = chown_cgroup_path(fn, arg_uid_shift);
+        if (r < 0)
+                log_error_errno(r, "Failed to chown() cgroup %s: %m", fn);
 finish:
         if (undo_mount)
                 (void) umount_verbose(tree);
diff --git a/src/nspawn/nspawn-cgroup.h b/src/nspawn/nspawn-cgroup.h
index dc33da8abe..fa4321ab43 100644
--- a/src/nspawn/nspawn-cgroup.h
+++ b/src/nspawn/nspawn-cgroup.h
@@ -25,5 +25,5 @@
 #include "cgroup-util.h"
 
 int chown_cgroup(pid_t pid, uid_t uid_shift);
-int sync_cgroup(pid_t pid, CGroupUnified unified_requested);
+int sync_cgroup(pid_t pid, CGroupUnified unified_requested, uid_t uid_shift);
 int create_subcgroup(pid_t pid, CGroupUnified unified_requested);
diff --git a/src/nspawn/nspawn.c b/src/nspawn/nspawn.c
index d95204f71e..14af51fc0e 100644
--- a/src/nspawn/nspawn.c
+++ b/src/nspawn/nspawn.c
@@ -3879,7 +3879,7 @@ static int run(int master,
                         return r;
         }
 
-        r = sync_cgroup(*pid, arg_unified_cgroup_hierarchy);
+        r = sync_cgroup(*pid, arg_unified_cgroup_hierarchy, arg_uid_shift);
         if (r < 0)
                 return r;
 
diff --git a/src/test/test-cgroup-util.c b/src/test/test-cgroup-util.c
index 43f8906172..c24c784e9b 100644
--- a/src/test/test-cgroup-util.c
+++ b/src/test/test-cgroup-util.c
@@ -24,6 +24,7 @@
 #include "formats-util.h"
 #include "parse-util.h"
 #include "process-util.h"
+#include "stat-util.h"
 #include "string-util.h"
 #include "test-helper.h"
 #include "user-util.h"
@@ -309,6 +310,28 @@ static void test_mask_supported(void) {
                 printf("'%s' is supported: %s\n", cgroup_controller_to_string(c), yes_no(m & CGROUP_CONTROLLER_TO_MASK(c)));
 }
 
+static void test_is_cgroup_fs(void) {
+        struct statfs sfs;
+        assert_se(statfs("/sys/fs/cgroup", &sfs) == 0);
+        if (is_temporary_fs(&sfs)) /* /sys/fs/cgroup is a temporary fs here: probe the hierarchy below it */
+                assert_se(statfs("/sys/fs/cgroup/systemd", &sfs) == 0);
+        assert_se(is_cgroup_fs(&sfs));
+}
+
+static void test_fd_is_cgroup_fs(void) {
+        int fd;
+
+        fd = open("/sys/fs/cgroup", O_RDONLY|O_DIRECTORY|O_CLOEXEC|O_NOFOLLOW);
+        assert_se(fd >= 0);
+        if (fd_is_temporary_fs(fd)) { /* /sys/fs/cgroup is a temporary fs here: reopen the hierarchy below it */
+                fd = safe_close(fd); /* NOTE(review): any non-zero result, presumably also an error, lands here */
+                fd = open("/sys/fs/cgroup/systemd", O_RDONLY|O_DIRECTORY|O_CLOEXEC|O_NOFOLLOW);
+                assert_se(fd >= 0);
+        }
+        assert_se(fd_is_cgroup_fs(fd));
+        fd = safe_close(fd);
+}
+
 int main(void) {
         test_path_decode_unit();
         test_path_get_unit();
@@ -324,6 +347,8 @@ int main(void) {
         test_slice_to_path();
         test_shift_path();
         TEST_REQ_RUNNING_SYSTEMD(test_mask_supported());
+        TEST_REQ_RUNNING_SYSTEMD(test_is_cgroup_fs());
+        TEST_REQ_RUNNING_SYSTEMD(test_fd_is_cgroup_fs());
 
         return 0;
 }