diff options
Diffstat (limited to 'src/journal/journal-file.c')
-rw-r--r-- | src/journal/journal-file.c | 905 |
1 files changed, 724 insertions, 181 deletions
diff --git a/src/journal/journal-file.c b/src/journal/journal-file.c index be6a5522fa..49199b269f 100644 --- a/src/journal/journal-file.c +++ b/src/journal/journal-file.c @@ -1,5 +1,3 @@ -/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/ - /*** This file is part of systemd. @@ -19,22 +17,32 @@ along with systemd; If not, see <http://www.gnu.org/licenses/>. ***/ -#include <sys/mman.h> #include <errno.h> -#include <sys/uio.h> -#include <unistd.h> -#include <sys/statvfs.h> #include <fcntl.h> -#include <stddef.h> #include <linux/fs.h> +#include <pthread.h> +#include <stddef.h> +#include <sys/mman.h> +#include <sys/statvfs.h> +#include <sys/uio.h> +#include <unistd.h> +#include "alloc-util.h" #include "btrfs-util.h" +#include "chattr-util.h" +#include "compress.h" +#include "fd-util.h" +#include "journal-authenticate.h" #include "journal-def.h" #include "journal-file.h" -#include "journal-authenticate.h" #include "lookup3.h" -#include "compress.h" +#include "parse-util.h" +#include "path-util.h" #include "random-util.h" +#include "sd-event.h" +#include "set.h" +#include "string-util.h" +#include "xattr-util.h" #define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*sizeof(HashItem)) #define DEFAULT_FIELD_HASH_TABLE_SIZE (333ULL*sizeof(HashItem)) @@ -42,13 +50,16 @@ #define COMPRESSION_SIZE_THRESHOLD (512ULL) /* This is the minimum journal file size */ -#define JOURNAL_FILE_SIZE_MIN (4ULL*1024ULL*1024ULL) /* 4 MiB */ +#define JOURNAL_FILE_SIZE_MIN (512ULL*1024ULL) /* 512 KiB */ /* These are the lower and upper bounds if we deduce the max_use value * from the file system size */ #define DEFAULT_MAX_USE_LOWER (1ULL*1024ULL*1024ULL) /* 1 MiB */ #define DEFAULT_MAX_USE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */ +/* This is the default minimal use limit, how much we'll use even if keep_free suggests otherwise. */ +#define DEFAULT_MIN_USE (1ULL*1024ULL*1024ULL) /* 1 MiB */ + /* This is the upper bound if we deduce max_size from max_use */ #define DEFAULT_MAX_SIZE_UPPER (128ULL*1024ULL*1024ULL) /* 128 MiB */ @@ -60,6 +71,9 @@ * size */ #define DEFAULT_KEEP_FREE (1024ULL*1024ULL) /* 1 MB */ +/* This is the default maximum number of journal files to keep around. */ +#define DEFAULT_N_MAX_FILES (100) + /* n_data was the first entry we added after the initial file format design */ #define HEADER_SIZE_MIN ALIGN64(offsetof(Header, n_data)) @@ -75,33 +89,127 @@ /* The mmap context to use for the header we pick as one above the last defined typed */ #define CONTEXT_HEADER _OBJECT_TYPE_MAX -static int journal_file_set_online(JournalFile *f) { +/* This may be called from a separate thread to prevent blocking the caller for the duration of fsync(). + * As a result we use atomic operations on f->offline_state for inter-thread communications with + * journal_file_set_offline() and journal_file_set_online(). */ +static void journal_file_set_offline_internal(JournalFile *f) { assert(f); + assert(f->fd >= 0); + assert(f->header); - if (!f->writable) - return -EPERM; + for (;;) { + switch (f->offline_state) { + case OFFLINE_CANCEL: + if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_CANCEL, OFFLINE_DONE)) + continue; + return; - if (!(f->fd >= 0 && f->header)) - return -EINVAL; + case OFFLINE_AGAIN_FROM_SYNCING: + if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_AGAIN_FROM_SYNCING, OFFLINE_SYNCING)) + continue; + break; + + case OFFLINE_AGAIN_FROM_OFFLINING: + if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_AGAIN_FROM_OFFLINING, OFFLINE_SYNCING)) + continue; + break; + + case OFFLINE_SYNCING: + (void) fsync(f->fd); + + if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_SYNCING, OFFLINE_OFFLINING)) + continue; + + f->header->state = f->archive ? STATE_ARCHIVED : STATE_OFFLINE; + (void) fsync(f->fd); + break; + + case OFFLINE_OFFLINING: + if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_OFFLINING, OFFLINE_DONE)) + continue; + /* fall through */ + + case OFFLINE_DONE: + return; + + case OFFLINE_JOINED: + log_debug("OFFLINE_JOINED unexpected offline state for journal_file_set_offline_internal()"); + return; + } + } +} + +static void * journal_file_set_offline_thread(void *arg) { + JournalFile *f = arg; + + journal_file_set_offline_internal(f); + + return NULL; +} + +static int journal_file_set_offline_thread_join(JournalFile *f) { + int r; + + assert(f); + + if (f->offline_state == OFFLINE_JOINED) + return 0; + + r = pthread_join(f->offline_thread, NULL); + if (r) + return -r; + + f->offline_state = OFFLINE_JOINED; if (mmap_cache_got_sigbus(f->mmap, f->fd)) return -EIO; - switch(f->header->state) { - case STATE_ONLINE: - return 0; + return 0; +} - case STATE_OFFLINE: - f->header->state = STATE_ONLINE; - fsync(f->fd); - return 0; +/* Trigger a restart if the offline thread is mid-flight in a restartable state. */ +static bool journal_file_set_offline_try_restart(JournalFile *f) { + for (;;) { + switch (f->offline_state) { + case OFFLINE_AGAIN_FROM_SYNCING: + case OFFLINE_AGAIN_FROM_OFFLINING: + return true; + + case OFFLINE_CANCEL: + if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_CANCEL, OFFLINE_AGAIN_FROM_SYNCING)) + continue; + return true; + + case OFFLINE_SYNCING: + if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_SYNCING, OFFLINE_AGAIN_FROM_SYNCING)) + continue; + return true; + + case OFFLINE_OFFLINING: + if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_OFFLINING, OFFLINE_AGAIN_FROM_OFFLINING)) + continue; + return true; default: - return -EINVAL; + return false; + } } } -int journal_file_set_offline(JournalFile *f) { +/* Sets a journal offline. + * + * If wait is false then an offline is dispatched in a separate thread for a + * subsequent journal_file_set_offline() or journal_file_set_online() of the + * same journal to synchronize with. + * + * If wait is true, then either an existing offline thread will be restarted + * and joined, or if none exists the offline is simply performed in this + * context without involving another thread. + */ +int journal_file_set_offline(JournalFile *f, bool wait) { + bool restarted; + int r; + assert(f); if (!f->writable) @@ -110,34 +218,142 @@ int journal_file_set_offline(JournalFile *f) { if (!(f->fd >= 0 && f->header)) return -EINVAL; - if (f->header->state != STATE_ONLINE) + /* An offlining journal is implicitly online and may modify f->header->state, + * we must also join any potentially lingering offline thread when not online. */ + if (!journal_file_is_offlining(f) && f->header->state != STATE_ONLINE) + return journal_file_set_offline_thread_join(f); + + /* Restart an in-flight offline thread and wait if needed, or join a lingering done one. */ + restarted = journal_file_set_offline_try_restart(f); + if ((restarted && wait) || !restarted) { + r = journal_file_set_offline_thread_join(f); + if (r < 0) + return r; + } + + if (restarted) return 0; - fsync(f->fd); + /* Initiate a new offline. */ + f->offline_state = OFFLINE_SYNCING; - if (mmap_cache_got_sigbus(f->mmap, f->fd)) - return -EIO; + if (wait) /* Without using a thread if waiting. */ + journal_file_set_offline_internal(f); + else { + r = pthread_create(&f->offline_thread, NULL, journal_file_set_offline_thread, f); + if (r > 0) { + f->offline_state = OFFLINE_JOINED; + return -r; + } + } + + return 0; +} - f->header->state = STATE_OFFLINE; +static int journal_file_set_online(JournalFile *f) { + bool joined = false; + + assert(f); + + if (!f->writable) + return -EPERM; + + if (!(f->fd >= 0 && f->header)) + return -EINVAL; + + while (!joined) { + switch (f->offline_state) { + case OFFLINE_JOINED: + /* No offline thread, no need to wait. */ + joined = true; + break; + + case OFFLINE_SYNCING: + if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_SYNCING, OFFLINE_CANCEL)) + continue; + /* Canceled syncing prior to offlining, no need to wait. */ + break; + + case OFFLINE_AGAIN_FROM_SYNCING: + if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_AGAIN_FROM_SYNCING, OFFLINE_CANCEL)) + continue; + /* Canceled restart from syncing, no need to wait. */ + break; + + case OFFLINE_AGAIN_FROM_OFFLINING: + if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_AGAIN_FROM_OFFLINING, OFFLINE_CANCEL)) + continue; + /* Canceled restart from offlining, must wait for offlining to complete however. */ + + /* fall through to wait */ + default: { + int r; + + r = journal_file_set_offline_thread_join(f); + if (r < 0) + return r; + + joined = true; + break; + } + } + } if (mmap_cache_got_sigbus(f->mmap, f->fd)) return -EIO; - fsync(f->fd); + switch (f->header->state) { + case STATE_ONLINE: + return 0; - return 0; + case STATE_OFFLINE: + f->header->state = STATE_ONLINE; + (void) fsync(f->fd); + return 0; + + default: + return -EINVAL; + } +} + +bool journal_file_is_offlining(JournalFile *f) { + assert(f); + + __sync_synchronize(); + + if (f->offline_state == OFFLINE_DONE || + f->offline_state == OFFLINE_JOINED) + return false; + + return true; } -void journal_file_close(JournalFile *f) { +JournalFile* journal_file_close(JournalFile *f) { assert(f); #ifdef HAVE_GCRYPT /* Write the final tag */ - if (f->seal && f->writable) - journal_file_append_tag(f); + if (f->seal && f->writable) { + int r; + + r = journal_file_append_tag(f); + if (r < 0) + log_error_errno(r, "Failed to append tag when closing journal: %m"); + } #endif - journal_file_set_offline(f); + if (f->post_change_timer) { + int enabled; + + if (sd_event_source_get_enabled(f->post_change_timer, &enabled) >= 0) + if (enabled == SD_EVENT_ONESHOT) + journal_file_post_change(f); + + (void) sd_event_source_set_enabled(f->post_change_timer, SD_EVENT_OFF); + sd_event_source_unref(f->post_change_timer); + } + + journal_file_set_offline(f, true); if (f->mmap && f->fd >= 0) mmap_cache_close_fd(f->mmap, f->fd); @@ -154,11 +370,11 @@ void journal_file_close(JournalFile *f) { (void) btrfs_defrag_fd(f->fd); } - safe_close(f->fd); + if (f->close_fd) + safe_close(f->fd); free(f->path); - if (f->mmap) - mmap_cache_unref(f->mmap); + mmap_cache_unref(f->mmap); ordered_hashmap_free_free(f->chain_cache); @@ -169,7 +385,7 @@ void journal_file_close(JournalFile *f) { #ifdef HAVE_GCRYPT if (f->fss_file) munmap(f->fss_file, PAGE_ALIGN(f->fss_file_size)); - else if (f->fsprg_state) + else free(f->fsprg_state); free(f->fsprg_seed); @@ -179,6 +395,16 @@ void journal_file_close(JournalFile *f) { #endif free(f); + return NULL; +} + +void journal_file_close_set(Set *s) { + JournalFile *f; + + assert(s); + + while ((f = set_steal_first(s))) + (void) journal_file_close(f); } static int journal_file_init_header(JournalFile *f, JournalFile *template) { @@ -218,11 +444,45 @@ static int journal_file_init_header(JournalFile *f, JournalFile *template) { return 0; } +static int fsync_directory_of_file(int fd) { + _cleanup_free_ char *path = NULL, *dn = NULL; + _cleanup_close_ int dfd = -1; + struct stat st; + int r; + + if (fstat(fd, &st) < 0) + return -errno; + + if (!S_ISREG(st.st_mode)) + return -EBADFD; + + r = fd_get_path(fd, &path); + if (r < 0) + return r; + + if (!path_is_absolute(path)) + return -EINVAL; + + dn = dirname_malloc(path); + if (!dn) + return -ENOMEM; + + dfd = open(dn, O_RDONLY|O_CLOEXEC|O_DIRECTORY); + if (dfd < 0) + return -errno; + + if (fsync(dfd) < 0) + return -errno; + + return 0; +} + static int journal_file_refresh_header(JournalFile *f) { sd_id128_t boot_id; int r; assert(f); + assert(f->header); r = sd_id128_get_machine(&f->header->machine_id); if (r < 0) @@ -240,7 +500,10 @@ static int journal_file_refresh_header(JournalFile *f) { r = journal_file_set_online(f); /* Sync the online state to disk */ - fsync(f->fd); + (void) fsync(f->fd); + + /* We likely just created a new file, also sync the directory this file is located in. */ + (void) fsync_directory_of_file(f->fd); return r; } @@ -249,6 +512,7 @@ static int journal_file_verify_header(JournalFile *f) { uint32_t flags; assert(f); + assert(f->header); if (memcmp(f->header->signature, HEADER_SIGNATURE, 8)) return -EBADMSG; @@ -304,8 +568,8 @@ static int journal_file_verify_header(JournalFile *f) { return -ENODATA; if (f->writable) { - uint8_t state; sd_id128_t machine_id; + uint8_t state; int r; r = sd_id128_get_machine(&machine_id); @@ -326,6 +590,14 @@ static int journal_file_verify_header(JournalFile *f) { log_debug("Journal file %s has unknown state %i.", f->path, state); return -EBUSY; } + + /* Don't permit appending to files from the future. Because otherwise the realtime timestamps wouldn't + * be strictly ordered in the entries in the file anymore, and we can't have that since it breaks + * bisection. */ + if (le64toh(f->header->tail_entry_realtime) > now(CLOCK_REALTIME)) { + log_debug("Journal file %s is from the future, refusing to append new data to it that'd be older.", f->path); + return -ETXTBSY; + } } f->compress_xz = JOURNAL_HEADER_COMPRESSED_XZ(f->header); @@ -357,6 +629,7 @@ static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) int r; assert(f); + assert(f->header); /* We assume that this file is not sparse, and we know that * for sure, since we always call posix_fallocate() @@ -398,12 +671,7 @@ static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) if (fstatvfs(f->fd, &svfs) >= 0) { uint64_t available; - available = svfs.f_bfree * svfs.f_bsize; - - if (available >= f->metrics.keep_free) - available -= f->metrics.keep_free; - else - available = 0; + available = LESS_BY((uint64_t) svfs.f_bfree * (uint64_t) svfs.f_bsize, f->metrics.keep_free); if (new_size - old_size > available) return -E2BIG; @@ -487,8 +755,16 @@ int journal_file_move_to_object(JournalFile *f, ObjectType type, uint64_t offset assert(ret); /* Objects may only be located at multiple of 64 bit */ - if (!VALID64(offset)) - return -EFAULT; + if (!VALID64(offset)) { + log_debug("Attempt to move to object at non-64bit boundary: %" PRIu64, offset); + return -EBADMSG; + } + + /* Object may not be located in the file header */ + if (offset < le64toh(f->header->header_size)) { + log_debug("Attempt to move to object located in file header: %" PRIu64, offset); + return -EBADMSG; + } r = journal_file_move_to(f, type, false, offset, sizeof(ObjectHeader), &t); if (r < 0) @@ -497,17 +773,29 @@ int journal_file_move_to_object(JournalFile *f, ObjectType type, uint64_t offset o = (Object*) t; s = le64toh(o->object.size); - if (s < sizeof(ObjectHeader)) + if (s == 0) { + log_debug("Attempt to move to uninitialized object: %" PRIu64, offset); + return -EBADMSG; + } + if (s < sizeof(ObjectHeader)) { + log_debug("Attempt to move to overly short object: %" PRIu64, offset); return -EBADMSG; + } - if (o->object.type <= OBJECT_UNUSED) + if (o->object.type <= OBJECT_UNUSED) { + log_debug("Attempt to move to object with invalid type: %" PRIu64, offset); return -EBADMSG; + } - if (s < minimum_header_size(o)) + if (s < minimum_header_size(o)) { + log_debug("Attempt to move to truncated object: %" PRIu64, offset); return -EBADMSG; + } - if (type > OBJECT_UNUSED && o->object.type != type) + if (type > OBJECT_UNUSED && o->object.type != type) { + log_debug("Attempt to move to object of unexpected type: %" PRIu64, offset); return -EBADMSG; + } if (s > sizeof(ObjectHeader)) { r = journal_file_move_to(f, type, false, offset, s, &t); @@ -525,6 +813,7 @@ static uint64_t journal_file_entry_seqnum(JournalFile *f, uint64_t *seqnum) { uint64_t r; assert(f); + assert(f->header); r = le64toh(f->header->tail_entry_seqnum) + 1; @@ -554,6 +843,7 @@ int journal_file_append_object(JournalFile *f, ObjectType type, uint64_t size, O void *t; assert(f); + assert(f->header); assert(type > OBJECT_UNUSED && type < _OBJECT_TYPE_MAX); assert(size >= sizeof(ObjectHeader)); assert(offset); @@ -603,11 +893,12 @@ static int journal_file_setup_data_hash_table(JournalFile *f) { int r; assert(f); + assert(f->header); - /* We estimate that we need 1 hash table entry per 768 of - journal file and we want to make sure we never get beyond - 75% fill level. Calculate the hash table size for the - maximum file size based on these metrics. */ + /* We estimate that we need 1 hash table entry per 768 bytes + of journal file and we want to make sure we never get + beyond 75% fill level. Calculate the hash table size for + the maximum file size based on these metrics. */ s = (f->metrics.max_size * 4 / 768 / 3) * sizeof(HashItem); if (s < DEFAULT_DATA_HASH_TABLE_SIZE) @@ -636,6 +927,7 @@ static int journal_file_setup_field_hash_table(JournalFile *f) { int r; assert(f); + assert(f->header); /* We use a fixed size hash table for the fields as this * number should grow very slowly only */ @@ -656,12 +948,16 @@ static int journal_file_setup_field_hash_table(JournalFile *f) { return 0; } -static int journal_file_map_data_hash_table(JournalFile *f) { +int journal_file_map_data_hash_table(JournalFile *f) { uint64_t s, p; void *t; int r; assert(f); + assert(f->header); + + if (f->data_hash_table) + return 0; p = le64toh(f->header->data_hash_table_offset); s = le64toh(f->header->data_hash_table_size); @@ -678,12 +974,16 @@ static int journal_file_map_data_hash_table(JournalFile *f) { return 0; } -static int journal_file_map_field_hash_table(JournalFile *f) { +int journal_file_map_field_hash_table(JournalFile *f) { uint64_t s, p; void *t; int r; assert(f); + assert(f->header); + + if (f->field_hash_table) + return 0; p = le64toh(f->header->field_hash_table_offset); s = le64toh(f->header->field_hash_table_size); @@ -710,6 +1010,8 @@ static int journal_file_link_field( int r; assert(f); + assert(f->header); + assert(f->field_hash_table); assert(o); assert(offset > 0); @@ -753,6 +1055,8 @@ static int journal_file_link_data( int r; assert(f); + assert(f->header); + assert(f->data_hash_table); assert(o); assert(offset > 0); @@ -801,12 +1105,21 @@ int journal_file_find_field_object_with_hash( int r; assert(f); + assert(f->header); assert(field && size > 0); + /* If the field hash table is empty, we can't find anything */ + if (le64toh(f->header->field_hash_table_size) <= 0) + return 0; + + /* Map the field hash table, if it isn't mapped yet. */ + r = journal_file_map_field_hash_table(f); + if (r < 0) + return r; + osize = offsetof(Object, field.payload) + size; m = le64toh(f->header->field_hash_table_size) / sizeof(HashItem); - if (m <= 0) return -EBADMSG; @@ -864,8 +1177,18 @@ int journal_file_find_data_object_with_hash( int r; assert(f); + assert(f->header); assert(data || size == 0); + /* If there's no data hash table, then there's no entry. */ + if (le64toh(f->header->data_hash_table_size) <= 0) + return 0; + + /* Map the data hash table, if it isn't mapped yet. */ + r = journal_file_map_data_hash_table(f); + if (r < 0) + return r; + osize = offsetof(Object, data.payload) + size; m = le64toh(f->header->data_hash_table_size) / sizeof(HashItem); @@ -1032,7 +1355,7 @@ static int journal_file_append_data( r = journal_file_find_data_object_with_hash(f, data, size, hash, &o, &p); if (r < 0) return r; - else if (r > 0) { + if (r > 0) { if (ret) *ret = o; @@ -1051,29 +1374,36 @@ static int journal_file_append_data( o->data.hash = htole64(hash); #if defined(HAVE_XZ) || defined(HAVE_LZ4) - if (f->compress_xz && - size >= COMPRESSION_SIZE_THRESHOLD) { + if (JOURNAL_FILE_COMPRESS(f) && size >= COMPRESSION_SIZE_THRESHOLD) { size_t rsize = 0; - compression = compress_blob(data, size, o->data.payload, &rsize); + compression = compress_blob(data, size, o->data.payload, size - 1, &rsize); - if (compression) { + if (compression >= 0) { o->object.size = htole64(offsetof(Object, data.payload) + rsize); o->object.flags |= compression; log_debug("Compressed data object %"PRIu64" -> %zu using %s", size, rsize, object_compressed_to_string(compression)); - } + } else + /* Compression didn't work, we don't really care why, let's continue without compression */ + compression = 0; } #endif - if (!compression && size > 0) - memcpy(o->data.payload, data, size); + if (compression == 0) + memcpy_safe(o->data.payload, data, size); r = journal_file_link_data(f, o, p, hash); if (r < 0) return r; +#ifdef HAVE_GCRYPT + r = journal_file_hmac_put_object(f, OBJECT_DATA, o, p); + if (r < 0) + return r; +#endif + /* The linking might have altered the window, so let's * refresh our pointer */ r = journal_file_move_to_object(f, OBJECT_DATA, p, &o); @@ -1098,12 +1428,6 @@ static int journal_file_append_data( fo->field.head_data_offset = le64toh(p); } -#ifdef HAVE_GCRYPT - r = journal_file_hmac_put_object(f, OBJECT_DATA, o, p); - if (r < 0) - return r; -#endif - if (ret) *ret = o; @@ -1150,6 +1474,7 @@ static int link_entry_into_array(JournalFile *f, Object *o; assert(f); + assert(f->header); assert(first); assert(idx); assert(p > 0); @@ -1270,6 +1595,7 @@ static int journal_file_link_entry(JournalFile *f, Object *o, uint64_t offset) { int r; assert(f); + assert(f->header); assert(o); assert(offset > 0); @@ -1320,6 +1646,7 @@ static int journal_file_append_entry_internal( int r; assert(f); + assert(f->header); assert(items || n_items == 0); assert(ts); @@ -1330,7 +1657,7 @@ static int journal_file_append_entry_internal( return r; o->entry.seqnum = htole64(journal_file_entry_seqnum(f, seqnum)); - memcpy(o->entry.items, items, n_items * sizeof(EntryItem)); + memcpy_safe(o->entry.items, items, n_items * sizeof(EntryItem)); o->entry.realtime = htole64(ts->realtime); o->entry.monotonic = htole64(ts->monotonic); o->entry.xor_hash = htole64(xor_hash); @@ -1366,7 +1693,84 @@ void journal_file_post_change(JournalFile *f) { __sync_synchronize(); if (ftruncate(f->fd, f->last_stat.st_size) < 0) - log_error_errno(errno, "Failed to truncate file to its own size: %m"); + log_debug_errno(errno, "Failed to truncate file to its own size: %m"); +} + +static int post_change_thunk(sd_event_source *timer, uint64_t usec, void *userdata) { + assert(userdata); + + journal_file_post_change(userdata); + + return 1; +} + +static void schedule_post_change(JournalFile *f) { + sd_event_source *timer; + int enabled, r; + uint64_t now; + + assert(f); + assert(f->post_change_timer); + + timer = f->post_change_timer; + + r = sd_event_source_get_enabled(timer, &enabled); + if (r < 0) { + log_debug_errno(r, "Failed to get ftruncate timer state: %m"); + goto fail; + } + + if (enabled == SD_EVENT_ONESHOT) + return; + + r = sd_event_now(sd_event_source_get_event(timer), CLOCK_MONOTONIC, &now); + if (r < 0) { + log_debug_errno(r, "Failed to get clock's now for scheduling ftruncate: %m"); + goto fail; + } + + r = sd_event_source_set_time(timer, now+f->post_change_timer_period); + if (r < 0) { + log_debug_errno(r, "Failed to set time for scheduling ftruncate: %m"); + goto fail; + } + + r = sd_event_source_set_enabled(timer, SD_EVENT_ONESHOT); + if (r < 0) { + log_debug_errno(r, "Failed to enable scheduled ftruncate: %m"); + goto fail; + } + + return; + +fail: + /* On failure, let's simply post the change immediately. */ + journal_file_post_change(f); +} + +/* Enable coalesced change posting in a timer on the provided sd_event instance */ +int journal_file_enable_post_change_timer(JournalFile *f, sd_event *e, usec_t t) { + _cleanup_(sd_event_source_unrefp) sd_event_source *timer = NULL; + int r; + + assert(f); + assert_return(!f->post_change_timer, -EINVAL); + assert(e); + assert(t); + + r = sd_event_add_time(e, &timer, CLOCK_MONOTONIC, 0, 0, post_change_thunk, f); + if (r < 0) + return r; + + r = sd_event_source_set_enabled(timer, SD_EVENT_OFF); + if (r < 0) + return r; + + f->post_change_timer = timer; + timer = NULL; + f->post_change_timer_period = t; + + return r; } static int entry_item_cmp(const void *_a, const void *_b) { @@ -1387,6 +1791,7 @@ int journal_file_append_entry(JournalFile *f, const dual_timestamp *ts, const st struct dual_timestamp _ts; assert(f); + assert(f->header); assert(iovec || n_iovec == 0); if (!ts) { @@ -1394,10 +1799,6 @@ int journal_file_append_entry(JournalFile *f, const dual_timestamp *ts, const st ts = &_ts; } - if (f->tail_entry_monotonic_valid && - ts->monotonic < le64toh(f->header->tail_entry_monotonic)) - return -EINVAL; - #ifdef HAVE_GCRYPT r = journal_file_maybe_append_tag(f, ts->realtime); if (r < 0) @@ -1434,7 +1835,10 @@ int journal_file_append_entry(JournalFile *f, const dual_timestamp *ts, const st if (mmap_cache_got_sigbus(f->mmap, f->fd)) r = -EIO; - journal_file_post_change(f); + if (f->post_change_timer) + schedule_post_change(f); + else + journal_file_post_change(f); return r; } @@ -1645,9 +2049,14 @@ static int generic_array_bisect( i = right - 1; lp = p = le64toh(array->entry_array.items[i]); if (p <= 0) - return -EBADMSG; - - r = test_object(f, p, needle); + r = -EBADMSG; + else + r = test_object(f, p, needle); + if (r == -EBADMSG) { + log_debug_errno(r, "Encountered invalid entry while bisecting, cutting algorithm short. (1)"); + n = i; + continue; + } if (r < 0) return r; @@ -1723,9 +2132,14 @@ static int generic_array_bisect( p = le64toh(array->entry_array.items[i]); if (p <= 0) - return -EBADMSG; - - r = test_object(f, p, needle); + r = -EBADMSG; + else + r = test_object(f, p, needle); + if (r == -EBADMSG) { + log_debug_errno(r, "Encountered invalid entry while bisecting, cutting algorithm short. (2)"); + right = n = i; + continue; + } if (r < 0) return r; @@ -1841,7 +2255,7 @@ static int generic_array_bisect_plus_one( goto found; if (r > 0 && idx) - (*idx) ++; + (*idx)++; return r; @@ -1899,6 +2313,8 @@ int journal_file_move_to_entry_by_seqnum( direction_t direction, Object **ret, uint64_t *offset) { + assert(f); + assert(f->header); return generic_array_bisect(f, le64toh(f->header->entry_array_offset), @@ -1934,6 +2350,8 @@ int journal_file_move_to_entry_by_realtime( direction_t direction, Object **ret, uint64_t *offset) { + assert(f); + assert(f->header); return generic_array_bisect(f, le64toh(f->header->entry_array_offset), @@ -2026,7 +2444,9 @@ void journal_file_save_location(JournalFile *f, Object *o, uint64_t offset) { int journal_file_compare_locations(JournalFile *af, JournalFile *bf) { assert(af); + assert(af->header); assert(bf); + assert(bf->header); assert(af->location_type == LOCATION_SEEK); assert(bf->location_type == LOCATION_SEEK); @@ -2076,6 +2496,37 @@ int journal_file_compare_locations(JournalFile *af, JournalFile *bf) { return 0; } +static int bump_array_index(uint64_t *i, direction_t direction, uint64_t n) { + + /* Increase or decrease the specified index, in the right direction. */ + + if (direction == DIRECTION_DOWN) { + if (*i >= n - 1) + return 0; + + (*i) ++; + } else { + if (*i <= 0) + return 0; + + (*i) --; + } + + return 1; +} + +static bool check_properly_ordered(uint64_t new_offset, uint64_t old_offset, direction_t direction) { + + /* Consider it an error if any of the two offsets is uninitialized */ + if (old_offset == 0 || new_offset == 0) + return false; + + /* If we go down, the new offset must be larger than the old one. */ + return direction == DIRECTION_DOWN ? + new_offset > old_offset : + new_offset < old_offset; +} + int journal_file_next_entry( JournalFile *f, uint64_t p, @@ -2086,6 +2537,7 @@ int journal_file_next_entry( int r; assert(f); + assert(f->header); n = le64toh(f->header->n_entries); if (n <= 0) @@ -2105,31 +2557,34 @@ int journal_file_next_entry( if (r <= 0) return r; - if (direction == DIRECTION_DOWN) { - if (i >= n - 1) - return 0; - - i++; - } else { - if (i <= 0) - return 0; - - i--; - } + r = bump_array_index(&i, direction, n); + if (r <= 0) + return r; } /* And jump to it */ - r = generic_array_get(f, - le64toh(f->header->entry_array_offset), - i, - ret, &ofs); - if (r <= 0) - return r; + for (;;) { + r = generic_array_get(f, + le64toh(f->header->entry_array_offset), + i, + ret, &ofs); + if (r > 0) + break; + if (r != -EBADMSG) + return r; + + /* OK, so this entry is borked. Most likely some entry didn't get synced to disk properly, let's see if + * the next one might work for us instead. */ + log_debug_errno(r, "Entry item %" PRIu64 " is bad, skipping over it.", i); - if (p > 0 && - (direction == DIRECTION_DOWN ? ofs <= p : ofs >= p)) { - log_debug("%s: entry array corrupted at entry %"PRIu64, - f->path, i); + r = bump_array_index(&i, direction, n); + if (r <= 0) + return r; + } + + /* Ensure our array is properly ordered. */ + if (p > 0 && !check_properly_ordered(ofs, p, direction)) { + log_debug("%s: entry array not properly ordered at entry %" PRIu64, f->path, i); return -EBADMSG; } @@ -2146,9 +2601,9 @@ int journal_file_next_entry_for_data( direction_t direction, Object **ret, uint64_t *offset) { - uint64_t n, i; - int r; + uint64_t i, n, ofs; Object *d; + int r; assert(f); assert(p > 0 || !o); @@ -2180,25 +2635,39 @@ int journal_file_next_entry_for_data( if (r <= 0) return r; - if (direction == DIRECTION_DOWN) { - if (i >= n - 1) - return 0; + r = bump_array_index(&i, direction, n); + if (r <= 0) + return r; + } + + for (;;) { + r = generic_array_get_plus_one(f, + le64toh(d->data.entry_offset), + le64toh(d->data.entry_array_offset), + i, + ret, &ofs); + if (r > 0) + break; + if (r != -EBADMSG) + return r; - i++; - } else { - if (i <= 0) - return 0; + log_debug_errno(r, "Data entry item %" PRIu64 " is bad, skipping over it.", i); - i--; - } + r = bump_array_index(&i, direction, n); + if (r <= 0) + return r; + } + /* Ensure our array is properly ordered. */ + if (p > 0 && check_properly_ordered(ofs, p, direction)) { + log_debug("%s data entry array not properly ordered at entry %" PRIu64, f->path, i); + return -EBADMSG; } - return generic_array_get_plus_one(f, - le64toh(d->data.entry_offset), - le64toh(d->data.entry_array_offset), - i, - ret, offset); + if (offset) + *offset = ofs; + + return 1; } int journal_file_move_to_entry_by_offset_for_data( @@ -2368,6 +2837,7 @@ void journal_file_dump(JournalFile *f) { uint64_t p; assert(f); + assert(f->header); journal_file_print_header(f); @@ -2452,6 +2922,7 @@ void journal_file_print_header(JournalFile *f) { char bytes[FORMAT_BYTES_MAX]; assert(f); + assert(f->header); printf("File Path: %s\n" "File ID: %s\n" @@ -2466,11 +2937,11 @@ void journal_file_print_header(JournalFile *f) { "Data Hash Table Size: %"PRIu64"\n" "Field Hash Table Size: %"PRIu64"\n" "Rotate Suggested: %s\n" - "Head Sequential Number: %"PRIu64"\n" - "Tail Sequential Number: %"PRIu64"\n" - "Head Realtime Timestamp: %s\n" - "Tail Realtime Timestamp: %s\n" - "Tail Monotonic Timestamp: %s\n" + "Head Sequential Number: %"PRIu64" (%"PRIx64")\n" + "Tail Sequential Number: %"PRIu64" (%"PRIx64")\n" + "Head Realtime Timestamp: %s (%"PRIx64")\n" + "Tail Realtime Timestamp: %s (%"PRIx64")\n" + "Tail Monotonic Timestamp: %s (%"PRIx64")\n" "Objects: %"PRIu64"\n" "Entry Objects: %"PRIu64"\n", f->path, @@ -2491,11 +2962,11 @@ void journal_file_print_header(JournalFile *f) { le64toh(f->header->data_hash_table_size) / sizeof(HashItem), le64toh(f->header->field_hash_table_size) / sizeof(HashItem), yes_no(journal_file_rotate_suggested(f, 0)), - le64toh(f->header->head_entry_seqnum), - le64toh(f->header->tail_entry_seqnum), - format_timestamp_safe(x, sizeof(x), le64toh(f->header->head_entry_realtime)), - format_timestamp_safe(y, sizeof(y), le64toh(f->header->tail_entry_realtime)), - format_timespan(z, sizeof(z), le64toh(f->header->tail_entry_monotonic), USEC_PER_MSEC), + le64toh(f->header->head_entry_seqnum), le64toh(f->header->head_entry_seqnum), + le64toh(f->header->tail_entry_seqnum), le64toh(f->header->tail_entry_seqnum), + format_timestamp_safe(x, sizeof(x), le64toh(f->header->head_entry_realtime)), le64toh(f->header->head_entry_realtime), + format_timestamp_safe(y, sizeof(y), le64toh(f->header->tail_entry_realtime)), le64toh(f->header->tail_entry_realtime), + format_timespan(z, sizeof(z), le64toh(f->header->tail_entry_monotonic), USEC_PER_MSEC), le64toh(f->header->tail_entry_monotonic), le64toh(f->header->n_objects), le64toh(f->header->n_entries)); @@ -2519,7 +2990,7 @@ void journal_file_print_header(JournalFile *f) { le64toh(f->header->n_entry_arrays)); if (fstat(f->fd, &st) >= 0) - printf("Disk usage: %s\n", format_bytes(bytes, sizeof(bytes), (off_t) st.st_blocks * 512ULL)); + printf("Disk usage: %s\n", format_bytes(bytes, sizeof(bytes), (uint64_t) st.st_blocks * 512ULL)); } static int journal_file_warn_btrfs(JournalFile *f) { @@ -2558,6 +3029,7 @@ static int journal_file_warn_btrfs(JournalFile *f) { } int journal_file_open( + int fd, const char *fname, int flags, mode_t mode, @@ -2565,6 +3037,7 @@ int journal_file_open( bool seal, JournalMetrics *metrics, MMapCache *mmap_cache, + Set *deferred_closes, JournalFile *template, JournalFile **ret) { @@ -2573,22 +3046,24 @@ int journal_file_open( void *h; int r; - assert(fname); assert(ret); + assert(fd >= 0 || fname); if ((flags & O_ACCMODE) != O_RDONLY && (flags & O_ACCMODE) != O_RDWR) return -EINVAL; - if (!endswith(fname, ".journal") && - !endswith(fname, ".journal~")) - return -EINVAL; + if (fname) { + if (!endswith(fname, ".journal") && + !endswith(fname, ".journal~")) + return -EINVAL; + } f = new0(JournalFile, 1); if (!f) return -ENOMEM; - f->fd = -1; + f->fd = fd; f->mode = mode; f->flags = flags; @@ -2613,7 +3088,10 @@ int journal_file_open( } } - f->path = strdup(fname); + if (fname) + f->path = strdup(fname); + else /* If we don't know the path, fill in something explanatory and vaguely useful */ + asprintf(&f->path, "/proc/self/%i", fd); if (!f->path) { r = -ENOMEM; goto fail; @@ -2625,10 +3103,15 @@ int journal_file_open( goto fail; } - f->fd = open(f->path, f->flags|O_CLOEXEC, f->mode); if (f->fd < 0) { - r = -errno; - goto fail; + f->fd = open(f->path, f->flags|O_CLOEXEC, f->mode); + if (f->fd < 0) { + r = -errno; + goto fail; + } + + /* fds we opened here by us should also be closed by us. */ + f->close_fd = true; } r = journal_file_fstat(f); @@ -2673,7 +3156,7 @@ int journal_file_open( } if (f->last_stat.st_size < (off_t) HEADER_SIZE_MIN) { - r = -EIO; + r = -ENODATA; goto fail; } @@ -2684,6 +3167,9 @@ int journal_file_open( f->header = h; if (!newly_created) { + if (deferred_closes) + journal_file_close_set(deferred_closes); + r = journal_file_verify_header(f); if (r < 0) goto fail; @@ -2731,19 +3217,24 @@ int journal_file_open( #endif } - r = journal_file_map_field_hash_table(f); - if (r < 0) - goto fail; - - r = journal_file_map_data_hash_table(f); - if (r < 0) - goto fail; - if (mmap_cache_got_sigbus(f->mmap, f->fd)) { r = -EIO; goto fail; } + if (template && template->post_change_timer) { + r = journal_file_enable_post_change_timer( + f, + sd_event_source_get_event(template->post_change_timer), + template->post_change_timer_period); + + if (r < 0) + goto fail; + } + + /* The file is opened now successfully, thus we take possession of any passed in fd. */ + f->close_fd = true; + *ret = f; return 0; @@ -2751,12 +3242,12 @@ fail: if (f->fd >= 0 && mmap_cache_got_sigbus(f->mmap, f->fd)) r = -EIO; - journal_file_close(f); + (void) journal_file_close(f); return r; } -int journal_file_rotate(JournalFile **f, bool compress, bool seal) { +int journal_file_rotate(JournalFile **f, bool compress, bool seal, Set *deferred_closes) { _cleanup_free_ char *p = NULL; size_t l; JournalFile *old_file, *new_file = NULL; @@ -2770,6 +3261,11 @@ int journal_file_rotate(JournalFile **f, bool compress, bool seal) { if (!old_file->writable) return -EINVAL; + /* Is this a journal file that was passed to us as fd? If so, we synthesized a path name for it, and we refuse + * rotation, since we don't know the actual path, and couldn't rename the file hence.*/ + if (path_startswith(old_file->path, "/proc/self/fd")) + return -EINVAL; + if (!endswith(old_file->path, ".journal")) return -EINVAL; @@ -2789,15 +3285,29 @@ int journal_file_rotate(JournalFile **f, bool compress, bool seal) { if (r < 0 && errno != ENOENT) return -errno; - old_file->header->state = STATE_ARCHIVED; + /* Sync the rename to disk */ + (void) fsync_directory_of_file(old_file->fd); + + /* Set as archive so offlining commits w/state=STATE_ARCHIVED. + * Previously we would set old_file->header->state to STATE_ARCHIVED directly here, + * but journal_file_set_offline() short-circuits when state != STATE_ONLINE, which + * would result in the rotated journal never getting fsync() called before closing. + * Now we simply queue the archive state by setting an archive bit, leaving the state + * as STATE_ONLINE so proper offlining occurs. */ + old_file->archive = true; /* Currently, btrfs is not very good with out write patterns * and fragments heavily. Let's defrag our journal files when * we archive them */ old_file->defrag_on_close = true; - r = journal_file_open(old_file->path, old_file->flags, old_file->mode, compress, seal, NULL, old_file->mmap, old_file, &new_file); - journal_file_close(old_file); + r = journal_file_open(-1, old_file->path, old_file->flags, old_file->mode, compress, seal, NULL, old_file->mmap, deferred_closes, old_file, &new_file); + + if (deferred_closes && + set_put(deferred_closes, old_file) >= 0) + (void) journal_file_set_offline(old_file, false); + else + (void) journal_file_close(old_file); *f = new_file; return r; @@ -2811,6 +3321,7 @@ int journal_file_open_reliably( bool seal, JournalMetrics *metrics, MMapCache *mmap_cache, + Set *deferred_closes, JournalFile *template, JournalFile **ret) { @@ -2818,8 +3329,7 @@ int journal_file_open_reliably( size_t l; _cleanup_free_ char *p = NULL; - r = journal_file_open(fname, flags, mode, compress, seal, - metrics, mmap_cache, template, ret); + r = journal_file_open(-1, fname, flags, mode, compress, seal, metrics, mmap_cache, deferred_closes, template, ret); if (!IN_SET(r, -EBADMSG, /* corrupted */ -ENODATA, /* truncated */ @@ -2828,7 +3338,8 @@ int journal_file_open_reliably( -EBUSY, /* unclean shutdown */ -ESHUTDOWN, /* already archived */ -EIO, /* IO error, including SIGBUS on mmap */ - -EIDRM /* File has been deleted */)) + -EIDRM, /* File has been deleted */ + -ETXTBSY)) /* File is from the future */ return r; if ((flags & O_ACCMODE) == O_RDONLY) @@ -2849,20 +3360,18 @@ int journal_file_open_reliably( random_u64()) < 0) return -ENOMEM; - r = rename(fname, p); - if (r < 0) + if (rename(fname, p) < 0) return -errno; /* btrfs doesn't cope well with our write pattern and * fragments heavily. Let's defrag all files we rotate */ - (void) chattr_path(p, false, FS_NOCOW_FL); + (void) chattr_path(p, 0, FS_NOCOW_FL); (void) btrfs_defrag(p); - log_warning("File %s corrupted or uncleanly shut down, renaming and replacing.", fname); + log_warning_errno(r, "File %s corrupted or uncleanly shut down, renaming and replacing.", fname); - return journal_file_open(fname, flags, mode, compress, seal, - metrics, mmap_cache, template, ret); + return journal_file_open(-1, fname, flags, mode, compress, seal, metrics, mmap_cache, deferred_closes, template, ret); } int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p, uint64_t *seqnum, Object **ret, uint64_t *offset) { @@ -2949,16 +3458,35 @@ int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint6 return r; } +void journal_reset_metrics(JournalMetrics *m) { + assert(m); + + /* Set everything to "pick automatic values". */ + + *m = (JournalMetrics) { + .min_use = (uint64_t) -1, + .max_use = (uint64_t) -1, + .min_size = (uint64_t) -1, + .max_size = (uint64_t) -1, + .keep_free = (uint64_t) -1, + .n_max_files = (uint64_t) -1, + }; +} + void journal_default_metrics(JournalMetrics *m, int fd) { - uint64_t fs_size = 0; + char a[FORMAT_BYTES_MAX], b[FORMAT_BYTES_MAX], c[FORMAT_BYTES_MAX], d[FORMAT_BYTES_MAX], e[FORMAT_BYTES_MAX]; struct statvfs ss; - char a[FORMAT_BYTES_MAX], b[FORMAT_BYTES_MAX], c[FORMAT_BYTES_MAX], d[FORMAT_BYTES_MAX]; + uint64_t fs_size; assert(m); assert(fd >= 0); if (fstatvfs(fd, &ss) >= 0) fs_size = ss.f_frsize * ss.f_blocks; + else { + log_debug_errno(errno, "Failed to detremine disk size: %m"); + fs_size = 0; + } if (m->max_use == (uint64_t) -1) { @@ -2975,10 +3503,16 @@ void journal_default_metrics(JournalMetrics *m, int fd) { } else { m->max_use = PAGE_ALIGN(m->max_use); - if (m->max_use < JOURNAL_FILE_SIZE_MIN*2) + if (m->max_use != 0 && m->max_use < JOURNAL_FILE_SIZE_MIN*2) m->max_use = JOURNAL_FILE_SIZE_MIN*2; } + if (m->min_use == (uint64_t) -1) + m->min_use = DEFAULT_MIN_USE; + + if (m->min_use > m->max_use) + m->min_use = m->max_use; + if (m->max_size == (uint64_t) -1) { m->max_size = PAGE_ALIGN(m->max_use / 8); /* 8 chunks */ @@ -2987,11 +3521,13 @@ void journal_default_metrics(JournalMetrics *m, int fd) { } else m->max_size = PAGE_ALIGN(m->max_size); - if (m->max_size < JOURNAL_FILE_SIZE_MIN) - m->max_size = JOURNAL_FILE_SIZE_MIN; + if (m->max_size != 0) { + if (m->max_size < JOURNAL_FILE_SIZE_MIN) + m->max_size = JOURNAL_FILE_SIZE_MIN; - if (m->max_size*2 > m->max_use) - m->max_use = m->max_size*2; + if (m->max_use != 0 && m->max_size*2 > m->max_use) + m->max_use = m->max_size*2; + } if (m->min_size == (uint64_t) -1) m->min_size = JOURNAL_FILE_SIZE_MIN; @@ -3001,7 +3537,7 @@ void journal_default_metrics(JournalMetrics *m, int fd) { if (m->min_size < JOURNAL_FILE_SIZE_MIN) m->min_size = JOURNAL_FILE_SIZE_MIN; - if (m->min_size > m->max_size) + if (m->max_size != 0 && m->min_size > m->max_size) m->max_size = m->min_size; } @@ -3017,15 +3553,21 @@ void journal_default_metrics(JournalMetrics *m, int fd) { m->keep_free = DEFAULT_KEEP_FREE; } - log_debug("Fixed max_use=%s max_size=%s min_size=%s keep_free=%s", - format_bytes(a, sizeof(a), m->max_use), - format_bytes(b, sizeof(b), m->max_size), - format_bytes(c, sizeof(c), m->min_size), - format_bytes(d, sizeof(d), m->keep_free)); + if (m->n_max_files == (uint64_t) -1) + m->n_max_files = DEFAULT_N_MAX_FILES; + + log_debug("Fixed min_use=%s max_use=%s max_size=%s min_size=%s keep_free=%s n_max_files=%" PRIu64, + format_bytes(a, sizeof(a), m->min_use), + format_bytes(b, sizeof(b), m->max_use), + format_bytes(c, sizeof(c), m->max_size), + format_bytes(d, sizeof(d), m->min_size), + format_bytes(e, sizeof(e), m->keep_free), + m->n_max_files); } int journal_file_get_cutoff_realtime_usec(JournalFile *f, usec_t *from, usec_t *to) { assert(f); + assert(f->header); assert(from || to); if (from) { @@ -3089,6 +3631,7 @@ int journal_file_get_cutoff_monotonic_usec(JournalFile *f, sd_id128_t boot_id, u bool journal_file_rotate_suggested(JournalFile *f, usec_t max_file_usec) { assert(f); + assert(f->header); /* If we gained new header fields we gained new features, * hence suggest a rotation */ |