summaryrefslogtreecommitdiff
path: root/src/libsystemd/libsystemd-journal-internal/journal-file.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/libsystemd/libsystemd-journal-internal/journal-file.c')
-rw-r--r--src/libsystemd/libsystemd-journal-internal/journal-file.c420
1 files changed, 356 insertions, 64 deletions
diff --git a/src/libsystemd/libsystemd-journal-internal/journal-file.c b/src/libsystemd/libsystemd-journal-internal/journal-file.c
index 4ea7013a53..13db5c53de 100644
--- a/src/libsystemd/libsystemd-journal-internal/journal-file.c
+++ b/src/libsystemd/libsystemd-journal-internal/journal-file.c
@@ -20,6 +20,7 @@
#include <errno.h>
#include <fcntl.h>
#include <linux/fs.h>
+#include <pthread.h>
#include <stddef.h>
#include <sys/mman.h>
#include <sys/statvfs.h>
@@ -36,8 +37,10 @@
#include "journal-file.h"
#include "lookup3.h"
#include "parse-util.h"
+#include "path-util.h"
#include "random-util.h"
#include <systemd/sd-event.h>
+#include "set.h"
#include "string-util.h"
#include "xattr-util.h"
@@ -86,33 +89,127 @@
/* The mmap context to use for the header we pick as one above the last defined typed */
#define CONTEXT_HEADER _OBJECT_TYPE_MAX
-static int journal_file_set_online(JournalFile *f) {
+/* This may be called from a separate thread to prevent blocking the caller for the duration of fsync().
+ * As a result we use atomic operations on f->offline_state for inter-thread communications with
+ * journal_file_set_offline() and journal_file_set_online(). */
+static void journal_file_set_offline_internal(JournalFile *f) {
assert(f);
+ assert(f->fd >= 0);
+ assert(f->header);
- if (!f->writable)
- return -EPERM;
+ for (;;) {
+ switch (f->offline_state) {
+ case OFFLINE_CANCEL:
+ if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_CANCEL, OFFLINE_DONE))
+ continue;
+ return;
- if (!(f->fd >= 0 && f->header))
- return -EINVAL;
+ case OFFLINE_AGAIN_FROM_SYNCING:
+ if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_AGAIN_FROM_SYNCING, OFFLINE_SYNCING))
+ continue;
+ break;
+
+ case OFFLINE_AGAIN_FROM_OFFLINING:
+ if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_AGAIN_FROM_OFFLINING, OFFLINE_SYNCING))
+ continue;
+ break;
+
+ case OFFLINE_SYNCING:
+ (void) fsync(f->fd);
+
+ if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_SYNCING, OFFLINE_OFFLINING))
+ continue;
+
+ f->header->state = f->archive ? STATE_ARCHIVED : STATE_OFFLINE;
+ (void) fsync(f->fd);
+ break;
+
+ case OFFLINE_OFFLINING:
+ if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_OFFLINING, OFFLINE_DONE))
+ continue;
+ /* fall through */
+
+ case OFFLINE_DONE:
+ return;
+
+ case OFFLINE_JOINED:
+ log_debug("OFFLINE_JOINED unexpected offline state for journal_file_set_offline_internal()");
+ return;
+ }
+ }
+}
+
+static void * journal_file_set_offline_thread(void *arg) {
+ JournalFile *f = arg;
+
+ journal_file_set_offline_internal(f);
+
+ return NULL;
+}
+
+static int journal_file_set_offline_thread_join(JournalFile *f) {
+ int r;
+
+ assert(f);
+
+ if (f->offline_state == OFFLINE_JOINED)
+ return 0;
+
+ r = pthread_join(f->offline_thread, NULL);
+ if (r)
+ return -r;
+
+ f->offline_state = OFFLINE_JOINED;
if (mmap_cache_got_sigbus(f->mmap, f->fd))
return -EIO;
- switch (f->header->state) {
- case STATE_ONLINE:
- return 0;
+ return 0;
+}
- case STATE_OFFLINE:
- f->header->state = STATE_ONLINE;
- fsync(f->fd);
- return 0;
+/* Trigger a restart if the offline thread is mid-flight in a restartable state. */
+static bool journal_file_set_offline_try_restart(JournalFile *f) {
+ for (;;) {
+ switch (f->offline_state) {
+ case OFFLINE_AGAIN_FROM_SYNCING:
+ case OFFLINE_AGAIN_FROM_OFFLINING:
+ return true;
+
+ case OFFLINE_CANCEL:
+ if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_CANCEL, OFFLINE_AGAIN_FROM_SYNCING))
+ continue;
+ return true;
+
+ case OFFLINE_SYNCING:
+ if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_SYNCING, OFFLINE_AGAIN_FROM_SYNCING))
+ continue;
+ return true;
+
+ case OFFLINE_OFFLINING:
+ if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_OFFLINING, OFFLINE_AGAIN_FROM_OFFLINING))
+ continue;
+ return true;
default:
- return -EINVAL;
+ return false;
+ }
}
}
-int journal_file_set_offline(JournalFile *f) {
+/* Sets a journal offline.
+ *
+ * If wait is false then an offline is dispatched in a separate thread for a
+ * subsequent journal_file_set_offline() or journal_file_set_online() of the
+ * same journal to synchronize with.
+ *
+ * If wait is true, then either an existing offline thread will be restarted
+ * and joined, or if none exists the offline is simply performed in this
+ * context without involving another thread.
+ */
+int journal_file_set_offline(JournalFile *f, bool wait) {
+ bool restarted;
+ int r;
+
assert(f);
if (!f->writable)
@@ -121,22 +218,114 @@ int journal_file_set_offline(JournalFile *f) {
if (!(f->fd >= 0 && f->header))
return -EINVAL;
- if (f->header->state != STATE_ONLINE)
+ /* An offlining journal is implicitly online and may modify f->header->state,
+ * we must also join any potentially lingering offline thread when not online. */
+ if (!journal_file_is_offlining(f) && f->header->state != STATE_ONLINE)
+ return journal_file_set_offline_thread_join(f);
+
+ /* Restart an in-flight offline thread and wait if needed, or join a lingering done one. */
+ restarted = journal_file_set_offline_try_restart(f);
+ if ((restarted && wait) || !restarted) {
+ r = journal_file_set_offline_thread_join(f);
+ if (r < 0)
+ return r;
+ }
+
+ if (restarted)
return 0;
- fsync(f->fd);
+ /* Initiate a new offline. */
+ f->offline_state = OFFLINE_SYNCING;
- if (mmap_cache_got_sigbus(f->mmap, f->fd))
- return -EIO;
+ if (wait) /* Without using a thread if waiting. */
+ journal_file_set_offline_internal(f);
+ else {
+ r = pthread_create(&f->offline_thread, NULL, journal_file_set_offline_thread, f);
+ if (r > 0) {
+ f->offline_state = OFFLINE_JOINED;
+ return -r;
+ }
+ }
- f->header->state = STATE_OFFLINE;
+ return 0;
+}
+
+static int journal_file_set_online(JournalFile *f) {
+ bool joined = false;
+
+ assert(f);
+
+ if (!f->writable)
+ return -EPERM;
+
+ if (!(f->fd >= 0 && f->header))
+ return -EINVAL;
+
+ while (!joined) {
+ switch (f->offline_state) {
+ case OFFLINE_JOINED:
+ /* No offline thread, no need to wait. */
+ joined = true;
+ break;
+
+ case OFFLINE_SYNCING:
+ if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_SYNCING, OFFLINE_CANCEL))
+ continue;
+ /* Canceled syncing prior to offlining, no need to wait. */
+ break;
+
+ case OFFLINE_AGAIN_FROM_SYNCING:
+ if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_AGAIN_FROM_SYNCING, OFFLINE_CANCEL))
+ continue;
+ /* Canceled restart from syncing, no need to wait. */
+ break;
+
+ case OFFLINE_AGAIN_FROM_OFFLINING:
+ if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_AGAIN_FROM_OFFLINING, OFFLINE_CANCEL))
+ continue;
+ /* Canceled restart from offlining, must wait for offlining to complete however. */
+
+ /* fall through to wait */
+ default: {
+ int r;
+
+ r = journal_file_set_offline_thread_join(f);
+ if (r < 0)
+ return r;
+
+ joined = true;
+ break;
+ }
+ }
+ }
if (mmap_cache_got_sigbus(f->mmap, f->fd))
return -EIO;
- fsync(f->fd);
+ switch (f->header->state) {
+ case STATE_ONLINE:
+ return 0;
- return 0;
+ case STATE_OFFLINE:
+ f->header->state = STATE_ONLINE;
+ (void) fsync(f->fd);
+ return 0;
+
+ default:
+ return -EINVAL;
+ }
+}
+
+bool journal_file_is_offlining(JournalFile *f) {
+ assert(f);
+
+ __sync_synchronize();
+
+ if (f->offline_state == OFFLINE_DONE ||
+ f->offline_state == OFFLINE_JOINED)
+ return false;
+
+ return true;
}
JournalFile* journal_file_close(JournalFile *f) {
@@ -159,7 +348,7 @@ JournalFile* journal_file_close(JournalFile *f) {
sd_event_source_unref(f->post_change_timer);
}
- journal_file_set_offline(f);
+ journal_file_set_offline(f, true);
if (f->mmap && f->fd >= 0)
mmap_cache_close_fd(f->mmap, f->fd);
@@ -176,7 +365,8 @@ JournalFile* journal_file_close(JournalFile *f) {
(void) btrfs_defrag_fd(f->fd);
}
- safe_close(f->fd);
+ if (f->close_fd)
+ safe_close(f->fd);
free(f->path);
mmap_cache_unref(f->mmap);
@@ -203,6 +393,15 @@ JournalFile* journal_file_close(JournalFile *f) {
return NULL;
}
+void journal_file_close_set(Set *s) {
+ JournalFile *f;
+
+ assert(s);
+
+ while ((f = set_steal_first(s)))
+ (void) journal_file_close(f);
+}
+
static int journal_file_init_header(JournalFile *f, JournalFile *template) {
Header h = {};
ssize_t k;
@@ -240,6 +439,39 @@ static int journal_file_init_header(JournalFile *f, JournalFile *template) {
return 0;
}
+static int fsync_directory_of_file(int fd) {
+ _cleanup_free_ char *path = NULL, *dn = NULL;
+ _cleanup_close_ int dfd = -1;
+ struct stat st;
+ int r;
+
+ if (fstat(fd, &st) < 0)
+ return -errno;
+
+ if (!S_ISREG(st.st_mode))
+ return -EBADFD;
+
+ r = fd_get_path(fd, &path);
+ if (r < 0)
+ return r;
+
+ if (!path_is_absolute(path))
+ return -EINVAL;
+
+ dn = dirname_malloc(path);
+ if (!dn)
+ return -ENOMEM;
+
+ dfd = open(dn, O_RDONLY|O_CLOEXEC|O_DIRECTORY);
+ if (dfd < 0)
+ return -errno;
+
+ if (fsync(dfd) < 0)
+ return -errno;
+
+ return 0;
+}
+
static int journal_file_refresh_header(JournalFile *f) {
sd_id128_t boot_id;
int r;
@@ -263,7 +495,10 @@ static int journal_file_refresh_header(JournalFile *f) {
r = journal_file_set_online(f);
/* Sync the online state to disk */
- fsync(f->fd);
+ (void) fsync(f->fd);
+
+ /* We likely just created a new file, also sync the directory this file is located in. */
+ (void) fsync_directory_of_file(f->fd);
return r;
}
@@ -508,7 +743,11 @@ int journal_file_move_to_object(JournalFile *f, ObjectType type, uint64_t offset
/* Objects may only be located at multiple of 64 bit */
if (!VALID64(offset))
- return -EFAULT;
+ return -EBADMSG;
+
+ /* Object may not be located in the file header */
+ if (offset < le64toh(f->header->header_size))
+ return -EBADMSG;
r = journal_file_move_to(f, type, false, offset, sizeof(ObjectHeader), &t);
if (r < 0)
@@ -1123,8 +1362,8 @@ static int journal_file_append_data(
}
#endif
- if (compression == 0 && size > 0)
- memcpy(o->data.payload, data, size);
+ if (compression == 0)
+ memcpy_safe(o->data.payload, data, size);
r = journal_file_link_data(f, o, p, hash);
if (r < 0)
@@ -1389,7 +1628,7 @@ static int journal_file_append_entry_internal(
return r;
o->entry.seqnum = htole64(journal_file_entry_seqnum(f, seqnum));
- memcpy(o->entry.items, items, n_items * sizeof(EntryItem));
+ memcpy_safe(o->entry.items, items, n_items * sizeof(EntryItem));
o->entry.realtime = htole64(ts->realtime);
o->entry.monotonic = htole64(ts->monotonic);
o->entry.xor_hash = htole64(xor_hash);
@@ -1781,9 +2020,14 @@ static int generic_array_bisect(
i = right - 1;
lp = p = le64toh(array->entry_array.items[i]);
if (p <= 0)
- return -EBADMSG;
-
- r = test_object(f, p, needle);
+ r = -EBADMSG;
+ else
+ r = test_object(f, p, needle);
+ if (r == -EBADMSG) {
+ log_debug_errno(r, "Encountered invalid entry while bisecting, cutting algorithm short. (1)");
+ n = i;
+ continue;
+ }
if (r < 0)
return r;
@@ -1859,9 +2103,14 @@ static int generic_array_bisect(
p = le64toh(array->entry_array.items[i]);
if (p <= 0)
- return -EBADMSG;
-
- r = test_object(f, p, needle);
+ r = -EBADMSG;
+ else
+ r = test_object(f, p, needle);
+ if (r == -EBADMSG) {
+ log_debug_errno(r, "Encountered invalid entry while bisecting, cutting algorithm short. (2)");
+ right = n = i;
+ continue;
+ }
if (r < 0)
return r;
@@ -1977,7 +2226,7 @@ static int generic_array_bisect_plus_one(
goto found;
if (r > 0 && idx)
- (*idx) ++;
+ (*idx)++;
return r;
@@ -2266,13 +2515,18 @@ int journal_file_next_entry(
le64toh(f->header->entry_array_offset),
i,
ret, &ofs);
+ if (r == -EBADMSG && direction == DIRECTION_DOWN) {
+ /* Special case: when we iterate throught the journal file linearly, and hit an entry we can't read,
+ * consider this the end of the journal file. */
+ log_debug_errno(r, "Encountered entry we can't read while iterating through journal file. Considering this the end of the file.");
+ return 0;
+ }
if (r <= 0)
return r;
if (p > 0 &&
(direction == DIRECTION_DOWN ? ofs <= p : ofs >= p)) {
- log_debug("%s: entry array corrupted at entry %"PRIu64,
- f->path, i);
+ log_debug("%s: entry array corrupted at entry %" PRIu64, f->path, i);
return -EBADMSG;
}
@@ -2611,11 +2865,11 @@ void journal_file_print_header(JournalFile *f) {
"Data Hash Table Size: %"PRIu64"\n"
"Field Hash Table Size: %"PRIu64"\n"
"Rotate Suggested: %s\n"
- "Head Sequential Number: %"PRIu64"\n"
- "Tail Sequential Number: %"PRIu64"\n"
- "Head Realtime Timestamp: %s\n"
- "Tail Realtime Timestamp: %s\n"
- "Tail Monotonic Timestamp: %s\n"
+ "Head Sequential Number: %"PRIu64" (%"PRIx64")\n"
+ "Tail Sequential Number: %"PRIu64" (%"PRIx64")\n"
+ "Head Realtime Timestamp: %s (%"PRIx64")\n"
+ "Tail Realtime Timestamp: %s (%"PRIx64")\n"
+ "Tail Monotonic Timestamp: %s (%"PRIx64")\n"
"Objects: %"PRIu64"\n"
"Entry Objects: %"PRIu64"\n",
f->path,
@@ -2636,11 +2890,11 @@ void journal_file_print_header(JournalFile *f) {
le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
le64toh(f->header->field_hash_table_size) / sizeof(HashItem),
yes_no(journal_file_rotate_suggested(f, 0)),
- le64toh(f->header->head_entry_seqnum),
- le64toh(f->header->tail_entry_seqnum),
- format_timestamp_safe(x, sizeof(x), le64toh(f->header->head_entry_realtime)),
- format_timestamp_safe(y, sizeof(y), le64toh(f->header->tail_entry_realtime)),
- format_timespan(z, sizeof(z), le64toh(f->header->tail_entry_monotonic), USEC_PER_MSEC),
+ le64toh(f->header->head_entry_seqnum), le64toh(f->header->head_entry_seqnum),
+ le64toh(f->header->tail_entry_seqnum), le64toh(f->header->tail_entry_seqnum),
+ format_timestamp_safe(x, sizeof(x), le64toh(f->header->head_entry_realtime)), le64toh(f->header->head_entry_realtime),
+ format_timestamp_safe(y, sizeof(y), le64toh(f->header->tail_entry_realtime)), le64toh(f->header->tail_entry_realtime),
+ format_timespan(z, sizeof(z), le64toh(f->header->tail_entry_monotonic), USEC_PER_MSEC), le64toh(f->header->tail_entry_monotonic),
le64toh(f->header->n_objects),
le64toh(f->header->n_entries));
@@ -2703,6 +2957,7 @@ static int journal_file_warn_btrfs(JournalFile *f) {
}
int journal_file_open(
+ int fd,
const char *fname,
int flags,
mode_t mode,
@@ -2710,6 +2965,7 @@ int journal_file_open(
bool seal,
JournalMetrics *metrics,
MMapCache *mmap_cache,
+ Set *deferred_closes,
JournalFile *template,
JournalFile **ret) {
@@ -2718,22 +2974,24 @@ int journal_file_open(
void *h;
int r;
- assert(fname);
assert(ret);
+ assert(fd >= 0 || fname);
if ((flags & O_ACCMODE) != O_RDONLY &&
(flags & O_ACCMODE) != O_RDWR)
return -EINVAL;
- if (!endswith(fname, ".journal") &&
- !endswith(fname, ".journal~"))
- return -EINVAL;
+ if (fname) {
+ if (!endswith(fname, ".journal") &&
+ !endswith(fname, ".journal~"))
+ return -EINVAL;
+ }
f = new0(JournalFile, 1);
if (!f)
return -ENOMEM;
- f->fd = -1;
+ f->fd = fd;
f->mode = mode;
f->flags = flags;
@@ -2758,7 +3016,10 @@ int journal_file_open(
}
}
- f->path = strdup(fname);
+ if (fname)
+ f->path = strdup(fname);
+ else /* If we don't know the path, fill in something explanatory and vaguely useful */
+ asprintf(&f->path, "/proc/self/%i", fd);
if (!f->path) {
r = -ENOMEM;
goto fail;
@@ -2770,10 +3031,15 @@ int journal_file_open(
goto fail;
}
- f->fd = open(f->path, f->flags|O_CLOEXEC, f->mode);
if (f->fd < 0) {
- r = -errno;
- goto fail;
+ f->fd = open(f->path, f->flags|O_CLOEXEC, f->mode);
+ if (f->fd < 0) {
+ r = -errno;
+ goto fail;
+ }
+
+ /* fds we opened here by us should also be closed by us. */
+ f->close_fd = true;
}
r = journal_file_fstat(f);
@@ -2829,6 +3095,9 @@ int journal_file_open(
f->header = h;
if (!newly_created) {
+ if (deferred_closes)
+ journal_file_close_set(deferred_closes);
+
r = journal_file_verify_header(f);
if (r < 0)
goto fail;
@@ -2891,6 +3160,9 @@ int journal_file_open(
goto fail;
}
+ /* The file is opened now successfully, thus we take possession of any passed in fd. */
+ f->close_fd = true;
+
*ret = f;
return 0;
@@ -2898,12 +3170,12 @@ fail:
if (f->fd >= 0 && mmap_cache_got_sigbus(f->mmap, f->fd))
r = -EIO;
- journal_file_close(f);
+ (void) journal_file_close(f);
return r;
}
-int journal_file_rotate(JournalFile **f, bool compress, bool seal) {
+int journal_file_rotate(JournalFile **f, bool compress, bool seal, Set *deferred_closes) {
_cleanup_free_ char *p = NULL;
size_t l;
JournalFile *old_file, *new_file = NULL;
@@ -2917,6 +3189,11 @@ int journal_file_rotate(JournalFile **f, bool compress, bool seal) {
if (!old_file->writable)
return -EINVAL;
+ /* Is this a journal file that was passed to us as fd? If so, we synthesized a path name for it, and we refuse
+ * rotation, since we don't know the actual path, and couldn't rename the file hence.*/
+ if (path_startswith(old_file->path, "/proc/self/fd"))
+ return -EINVAL;
+
if (!endswith(old_file->path, ".journal"))
return -EINVAL;
@@ -2936,15 +3213,29 @@ int journal_file_rotate(JournalFile **f, bool compress, bool seal) {
if (r < 0 && errno != ENOENT)
return -errno;
- old_file->header->state = STATE_ARCHIVED;
+ /* Sync the rename to disk */
+ (void) fsync_directory_of_file(old_file->fd);
+
+ /* Set as archive so offlining commits w/state=STATE_ARCHIVED.
+ * Previously we would set old_file->header->state to STATE_ARCHIVED directly here,
+ * but journal_file_set_offline() short-circuits when state != STATE_ONLINE, which
+ * would result in the rotated journal never getting fsync() called before closing.
+ * Now we simply queue the archive state by setting an archive bit, leaving the state
+ * as STATE_ONLINE so proper offlining occurs. */
+ old_file->archive = true;
/* Currently, btrfs is not very good with out write patterns
* and fragments heavily. Let's defrag our journal files when
* we archive them */
old_file->defrag_on_close = true;
- r = journal_file_open(old_file->path, old_file->flags, old_file->mode, compress, seal, NULL, old_file->mmap, old_file, &new_file);
- journal_file_close(old_file);
+ r = journal_file_open(-1, old_file->path, old_file->flags, old_file->mode, compress, seal, NULL, old_file->mmap, deferred_closes, old_file, &new_file);
+
+ if (deferred_closes &&
+ set_put(deferred_closes, old_file) >= 0)
+ (void) journal_file_set_offline(old_file, false);
+ else
+ (void) journal_file_close(old_file);
*f = new_file;
return r;
@@ -2958,6 +3249,7 @@ int journal_file_open_reliably(
bool seal,
JournalMetrics *metrics,
MMapCache *mmap_cache,
+ Set *deferred_closes,
JournalFile *template,
JournalFile **ret) {
@@ -2965,7 +3257,7 @@ int journal_file_open_reliably(
size_t l;
_cleanup_free_ char *p = NULL;
- r = journal_file_open(fname, flags, mode, compress, seal, metrics, mmap_cache, template, ret);
+ r = journal_file_open(-1, fname, flags, mode, compress, seal, metrics, mmap_cache, deferred_closes, template, ret);
if (!IN_SET(r,
-EBADMSG, /* corrupted */
-ENODATA, /* truncated */
@@ -3001,12 +3293,12 @@ int journal_file_open_reliably(
/* btrfs doesn't cope well with our write pattern and
* fragments heavily. Let's defrag all files we rotate */
- (void) chattr_path(p, false, FS_NOCOW_FL);
+ (void) chattr_path(p, 0, FS_NOCOW_FL);
(void) btrfs_defrag(p);
log_warning_errno(r, "File %s corrupted or uncleanly shut down, renaming and replacing.", fname);
- return journal_file_open(fname, flags, mode, compress, seal, metrics, mmap_cache, template, ret);
+ return journal_file_open(-1, fname, flags, mode, compress, seal, metrics, mmap_cache, deferred_closes, template, ret);
}
int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p, uint64_t *seqnum, Object **ret, uint64_t *offset) {