From fa6ac76083b8ffc1309876459f54f9f0e2843731 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Tue, 30 Dec 2014 20:57:53 +0100 Subject: journald: process SIGBUS for the memory maps we set up Even though we use fallocate() it appears that file systems like btrfs will trigger SIGBUS on certain low-disk-space situation. We should handle that, hence catch the signal, add it to a list of invalidated pages, and replace the page with an empty memory area. After each write check if SIGBUS was triggered, and consider the write invalid if it was. This should make journald a lot more robust with file systems where fallocate() is not reliable, for example all CoW file systems (btrfs...), where changing written data can fail with disk full errors. https://bugzilla.redhat.com/show_bug.cgi?id=1045810 --- .gitignore | 1 + Makefile.am | 11 ++- src/journal/journal-file.c | 67 ++++++++++++++----- src/journal/journal-file.h | 2 +- src/journal/journald-server.c | 3 + src/journal/journald.c | 4 ++ src/journal/mmap-cache.c | 126 +++++++++++++++++++++++++++++++--- src/journal/mmap-cache.h | 5 +- src/shared/sigbus.c | 152 ++++++++++++++++++++++++++++++++++++++++++ src/shared/sigbus.h | 25 +++++++ src/test/test-sigbus.c | 62 +++++++++++++++++ 11 files changed, 430 insertions(+), 28 deletions(-) create mode 100644 src/shared/sigbus.c create mode 100644 src/shared/sigbus.h create mode 100644 src/test/test-sigbus.c diff --git a/.gitignore b/.gitignore index 078fd9ad7f..70ee4f63dd 100644 --- a/.gitignore +++ b/.gitignore @@ -233,6 +233,7 @@ /test-rtnl-manual /test-sched-prio /test-set +/test-sigbus /test-sleep /test-socket-util /test-ssd diff --git a/Makefile.am b/Makefile.am index 73e911f6bf..10fc8a9c84 100644 --- a/Makefile.am +++ b/Makefile.am @@ -901,6 +901,8 @@ libsystemd_shared_la_SOURCES = \ src/shared/verbs.h \ src/shared/machine-image.c \ src/shared/machine-image.h \ + src/shared/sigbus.c \ + src/shared/sigbus.h \ src/shared/build.h if HAVE_UTMP @@ -1386,7 +1388,8 @@ tests += \ test-locale-util \ test-execute \ test-copy \ - test-cap-list + test-cap-list \ + test-sigbus EXTRA_DIST += \ test/a.service \ @@ -1580,6 +1583,12 @@ test_copy_SOURCES = \ test_copy_LDADD = \ libsystemd-shared.la +test_sigbus_SOURCES = \ + src/test/test-sigbus.c + +test_sigbus_LDADD = \ + libsystemd-shared.la + test_condition_SOURCES = \ src/test/test-condition.c diff --git a/src/journal/journal-file.c b/src/journal/journal-file.c index 48c27ee627..44a96928e0 100644 --- a/src/journal/journal-file.c +++ b/src/journal/journal-file.c @@ -67,6 +67,9 @@ /* How much to increase the journal file size at once each time we allocate something new. */ #define FILE_SIZE_INCREASE (8ULL*1024ULL*1024ULL) /* 8MB */ +/* The mmap context to use for the header we pick as one above the last defined typed */ +#define CONTEXT_HEADER _OBJECT_TYPE_MAX + static int journal_file_set_online(JournalFile *f) { assert(f); @@ -76,6 +79,9 @@ static int journal_file_set_online(JournalFile *f) { if (!(f->fd >= 0 && f->header)) return -EINVAL; + if (mmap_cache_got_sigbus(f->mmap, f->fd)) + return -EIO; + switch(f->header->state) { case STATE_ONLINE: return 0; @@ -104,8 +110,14 @@ int journal_file_set_offline(JournalFile *f) { fsync(f->fd); + if (mmap_cache_got_sigbus(f->mmap, f->fd)) + return -EIO; + f->header->state = STATE_OFFLINE; + if (mmap_cache_got_sigbus(f->mmap, f->fd)) + return -EIO; + fsync(f->fd); return 0; @@ -120,14 +132,10 @@ void journal_file_close(JournalFile *f) { journal_file_append_tag(f); #endif - /* Sync everything to disk, before we mark the file offline */ - if (f->mmap && f->fd >= 0) - mmap_cache_close_fd(f->mmap, f->fd); - journal_file_set_offline(f); - if (f->header) - munmap(f->header, PAGE_ALIGN(sizeof(Header))); + if (f->mmap && f->fd >= 0) + mmap_cache_close_fd(f->mmap, f->fd); safe_close(f->fd); free(f->path); @@ -194,8 +202,8 @@ static int journal_file_init_header(JournalFile *f, JournalFile *template) { } static int journal_file_refresh_header(JournalFile *f) { - int r; sd_id128_t boot_id; + int r; assert(f); @@ -212,12 +220,12 @@ static int journal_file_refresh_header(JournalFile *f) { f->header->boot_id = boot_id; - journal_file_set_online(f); + r = journal_file_set_online(f); /* Sync the online state to disk */ fsync(f->fd); - return 0; + return r; } static int journal_file_verify_header(JournalFile *f) { @@ -321,6 +329,9 @@ static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) * for sure, since we always call posix_fallocate() * ourselves */ + if (mmap_cache_got_sigbus(f->mmap, f->fd)) + return -EIO; + old_size = le64toh(f->header->header_size) + le64toh(f->header->arena_size); @@ -376,6 +387,7 @@ static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) static unsigned type_to_context(ObjectType type) { /* One context for each type, plus one catch-all for the rest */ assert_cc(_OBJECT_TYPE_MAX <= MMAP_CACHE_MAX_CONTEXTS); + assert_cc(CONTEXT_HEADER < MMAP_CACHE_MAX_CONTEXTS); return type > OBJECT_UNUSED && type < _OBJECT_TYPE_MAX ? type : 0; } @@ -1357,6 +1369,14 @@ int journal_file_append_entry(JournalFile *f, const dual_timestamp *ts, const st r = journal_file_append_entry_internal(f, ts, xor_hash, items, n_iovec, seqnum, ret, offset); + /* If the memory mapping triggered a SIGBUS then we return an + * IO error and ignore the error code passed down to us, since + * it is very likely just an effect of a nullified replacement + * mapping page */ + + if (mmap_cache_got_sigbus(f->mmap, f->fd)) + r = -EIO; + journal_file_post_change(f); return r; @@ -1712,7 +1732,6 @@ found: return 1; } - static int generic_array_bisect_plus_one( JournalFile *f, uint64_t extra, @@ -2457,9 +2476,10 @@ int journal_file_open( JournalFile *template, JournalFile **ret) { + bool newly_created = false; JournalFile *f; + void *h; int r; - bool newly_created = false; assert(fname); assert(ret); @@ -2564,13 +2584,14 @@ int journal_file_open( goto fail; } - f->header = mmap(NULL, PAGE_ALIGN(sizeof(Header)), prot_from_flags(flags), MAP_SHARED, f->fd, 0); - if (f->header == MAP_FAILED) { - f->header = NULL; + r = mmap_cache_get(f->mmap, f->fd, f->prot, CONTEXT_HEADER, true, 0, PAGE_ALIGN(sizeof(Header)), &f->last_stat, &h); + if (r < 0) { r = -errno; goto fail; } + f->header = h; + if (!newly_created) { r = journal_file_verify_header(f); if (r < 0) @@ -2627,10 +2648,18 @@ int journal_file_open( if (r < 0) goto fail; + if (mmap_cache_got_sigbus(f->mmap, f->fd)) { + r = -EIO; + goto fail; + } + *ret = f; return 0; fail: + if (f->fd >= 0 && mmap_cache_got_sigbus(f->mmap, f->fd)) + r = -EIO; + journal_file_close(f); return r; @@ -2697,7 +2726,8 @@ int journal_file_open_reliably( r != -EHOSTDOWN && /* other machine */ r != -EPROTONOSUPPORT && /* incompatible feature */ r != -EBUSY && /* unclean shutdown */ - r != -ESHUTDOWN /* already archived */) + r != -ESHUTDOWN && /* already archived */ + r != -EIO /* IO error, including SIGBUS on mmap */) return r; if ((flags & O_ACCMODE) == O_RDONLY) @@ -2804,7 +2834,12 @@ int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint6 return r; } - return journal_file_append_entry_internal(to, &ts, xor_hash, items, n, seqnum, ret, offset); + r = journal_file_append_entry_internal(to, &ts, xor_hash, items, n, seqnum, ret, offset); + + if (mmap_cache_got_sigbus(to->mmap, to->fd)) + return -EIO; + + return r; } void journal_default_metrics(JournalMetrics *m, int fd) { diff --git a/src/journal/journal-file.h b/src/journal/journal-file.h index 01bb4e038a..19fd7257f4 100644 --- a/src/journal/journal-file.h +++ b/src/journal/journal-file.h @@ -27,7 +27,7 @@ #include #endif -#include "systemd/sd-id128.h" +#include "sd-id128.h" #include "sparse-endian.h" #include "journal-def.h" diff --git a/src/journal/journald-server.c b/src/journal/journald-server.c index a2a2e197c0..6d037cfec4 100644 --- a/src/journal/journald-server.c +++ b/src/journal/journald-server.c @@ -452,6 +452,7 @@ bool shall_try_append_again(JournalFile *f, int r) { -EFBIG Hit fs limit -EDQUOT Quota limit hit -ENOSPC Disk full + -EIO I/O error of some kind (mmap) -EHOSTDOWN Other machine -EBUSY Unclean shutdown -EPROTONOSUPPORT Unsupported feature @@ -469,6 +470,8 @@ bool shall_try_append_again(JournalFile *f, int r) { log_info("%s: Unsupported feature, rotating.", f->path); else if (r == -EBADMSG || r == -ENODATA || r == ESHUTDOWN) log_warning("%s: Journal file corrupted, rotating.", f->path); + else if (r == -EIO) + log_warning("%s: IO error, rotating.", f->path); else return false; diff --git a/src/journal/journald.c b/src/journal/journald.c index 604c8617bb..80f4634f67 100644 --- a/src/journal/journald.c +++ b/src/journal/journald.c @@ -33,6 +33,8 @@ #include "journald-kmsg.h" #include "journald-syslog.h" +#include "sigbus.h" + int main(int argc, char *argv[]) { Server server; int r; @@ -49,6 +51,8 @@ int main(int argc, char *argv[]) { umask(0022); + sigbus_install(); + r = server_init(&server); if (r < 0) goto finish; diff --git a/src/journal/mmap-cache.c b/src/journal/mmap-cache.c index 4c940aaa24..ab21cdc288 100644 --- a/src/journal/mmap-cache.c +++ b/src/journal/mmap-cache.c @@ -29,6 +29,7 @@ #include "log.h" #include "util.h" #include "macro.h" +#include "sigbus.h" #include "mmap-cache.h" typedef struct Window Window; @@ -38,6 +39,7 @@ typedef struct FileDescriptor FileDescriptor; struct Window { MMapCache *cache; + bool invalidated; bool keep_always; bool in_unused; @@ -65,6 +67,7 @@ struct Context { struct FileDescriptor { MMapCache *cache; int fd; + bool sigbus; LIST_HEAD(Window, windows); }; @@ -134,6 +137,21 @@ static void window_unlink(Window *w) { } } +static void window_invalidate(Window *w) { + assert(w); + + if (w->invalidated) + return; + + /* Replace the window with anonymous pages. This is useful + * when we hit a SIGBUS and want to make sure the file cannot + * trigger any further SIGBUS, possibly overrunning the sigbus + * queue. */ + + assert_se(mmap(w->ptr, w->size, w->prot, MAP_PRIVATE|MAP_ANONYMOUS|MAP_FIXED, -1, 0) == w->ptr); + w->invalidated = true; +} + static void window_free(Window *w) { assert(w); @@ -383,6 +401,9 @@ static int try_context( return 0; } + if (c->window->fd->sigbus) + return -EIO; + c->window->keep_always |= keep_always; *ret = (uint8_t*) c->window->ptr + (offset - c->window->offset); @@ -414,6 +435,9 @@ static int find_mmap( assert(f->fd == fd); + if (f->sigbus) + return -EIO; + LIST_FOREACH(by_fd, w, f->windows) if (window_matches(w, fd, prot, offset, size)) break; @@ -572,27 +596,111 @@ int mmap_cache_get( return add_mmap(m, fd, prot, context, keep_always, offset, size, st, ret); } -void mmap_cache_close_fd(MMapCache *m, int fd) { +unsigned mmap_cache_get_hit(MMapCache *m) { + assert(m); + + return m->n_hit; +} + +unsigned mmap_cache_get_missed(MMapCache *m) { + assert(m); + + return m->n_missed; +} + +static void mmap_cache_process_sigbus(MMapCache *m) { + bool found = false; FileDescriptor *f; + Iterator i; + int r; assert(m); - assert(fd >= 0); - f = hashmap_get(m->fds, INT_TO_PTR(fd + 1)); - if (!f) + /* Iterate through all triggered pages and mark their files as + * invalidated */ + for (;;) { + bool ours; + void *addr; + + r = sigbus_pop(&addr); + if (_likely_(r == 0)) + break; + if (r < 0) { + log_error_errno(r, "SIGBUS handling failed: %m"); + abort(); + } + + ours = false; + HASHMAP_FOREACH(f, m->fds, i) { + Window *w; + + LIST_FOREACH(by_fd, w, f->windows) { + if ((uint8_t*) addr >= (uint8_t*) w->ptr && + (uint8_t*) addr < (uint8_t*) w->ptr + w->size) { + found = ours = f->sigbus = true; + break; + } + } + + if (ours) + break; + } + + /* Didn't find a matching window, give up */ + if (!ours) { + log_error("Unknown SIGBUS page, aborting."); + abort(); + } + } + + /* The list of triggered pages is now empty. Now, let's remap + * all windows of the triggered file to anonymous maps, so + * that no page of the file in question is triggered again, so + * that we can be sure not to hit the queue size limit. */ + if (_likely_(!found)) return; - fd_free(f); + HASHMAP_FOREACH(f, m->fds, i) { + Window *w; + + if (!f->sigbus) + continue; + + LIST_FOREACH(by_fd, w, f->windows) + window_invalidate(w); + } } -unsigned mmap_cache_get_hit(MMapCache *m) { +bool mmap_cache_got_sigbus(MMapCache *m, int fd) { + FileDescriptor *f; + assert(m); + assert(fd >= 0); - return m->n_hit; + mmap_cache_process_sigbus(m); + + f = hashmap_get(m->fds, INT_TO_PTR(fd + 1)); + if (!f) + return false; + + return f->sigbus; } -unsigned mmap_cache_get_missed(MMapCache *m) { +void mmap_cache_close_fd(MMapCache *m, int fd) { + FileDescriptor *f; + assert(m); + assert(fd >= 0); - return m->n_missed; + /* Make sure that any queued SIGBUS are first dispatched, so + * that we don't end up with a SIGBUS entry we cannot relate + * to any existing memory map */ + + mmap_cache_process_sigbus(m); + + f = hashmap_get(m->fds, INT_TO_PTR(fd + 1)); + if (!f) + return; + + fd_free(f); } diff --git a/src/journal/mmap-cache.h b/src/journal/mmap-cache.h index fe2c83d751..a85c2b6063 100644 --- a/src/journal/mmap-cache.h +++ b/src/journal/mmap-cache.h @@ -25,7 +25,8 @@ #include #include -#define MMAP_CACHE_MAX_CONTEXTS 8 +/* One context per object type, plus one of the header, plus one "additional" one */ +#define MMAP_CACHE_MAX_CONTEXTS 9 typedef struct MMapCache MMapCache; @@ -47,3 +48,5 @@ void mmap_cache_close_fd(MMapCache *m, int fd); unsigned mmap_cache_get_hit(MMapCache *m); unsigned mmap_cache_get_missed(MMapCache *m); + +bool mmap_cache_got_sigbus(MMapCache *m, int fd); diff --git a/src/shared/sigbus.c b/src/shared/sigbus.c new file mode 100644 index 0000000000..0108603fe8 --- /dev/null +++ b/src/shared/sigbus.c @@ -0,0 +1,152 @@ +/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/ + +/*** + This file is part of systemd. + + Copyright 2014 Lennart Poettering + + systemd is free software; you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published by + the Free Software Foundation; either version 2.1 of the License, or + (at your option) any later version. + + systemd is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with systemd; If not, see . +***/ + +#include +#include + +#include "macro.h" +#include "util.h" +#include "sigbus.h" + +#define SIGBUS_QUEUE_MAX 64 + +static struct sigaction old_sigaction; +static unsigned n_installed = 0; + +/* We maintain a fixed size list of page addresses that triggered a + SIGBUS. We access with list with atomic operations, so that we + don't have to deal with locks between signal handler and main + programs in possibly multiple threads. */ + +static void* volatile sigbus_queue[SIGBUS_QUEUE_MAX]; +static volatile sig_atomic_t n_sigbus_queue = 0; + +static void sigbus_push(void *addr) { + unsigned u; + + assert(addr); + + /* Find a free place, increase the number of entries and leave, if we can */ + for (u = 0; u < SIGBUS_QUEUE_MAX; u++) + if (__sync_bool_compare_and_swap(&sigbus_queue[u], NULL, addr)) { + __sync_fetch_and_add(&n_sigbus_queue, 1); + return; + } + + /* If we can't, make sure the queue size is out of bounds, to + * mark it as overflow */ + for (;;) { + unsigned c; + + __sync_synchronize(); + c = n_sigbus_queue; + + if (c > SIGBUS_QUEUE_MAX) /* already overflow */ + return; + + if (__sync_bool_compare_and_swap(&n_sigbus_queue, c, c + SIGBUS_QUEUE_MAX)) + return; + } +} + +int sigbus_pop(void **ret) { + assert(ret); + + for (;;) { + unsigned u, c; + + __sync_synchronize(); + c = n_sigbus_queue; + + if (_likely_(c == 0)) + return 0; + + if (_unlikely_(c >= SIGBUS_QUEUE_MAX)) + return -EOVERFLOW; + + for (u = 0; u < SIGBUS_QUEUE_MAX; u++) { + void *addr; + + addr = sigbus_queue[u]; + if (!addr) + continue; + + if (__sync_bool_compare_and_swap(&sigbus_queue[u], addr, NULL)) { + __sync_fetch_and_sub(&n_sigbus_queue, 1); + *ret = addr; + return 1; + } + } + } +} + +static void sigbus_handler(int sn, siginfo_t *si, void *data) { + unsigned long ul; + void *aligned; + + assert(sn == SIGBUS); + assert(si); + + if (si->si_code != BUS_ADRERR || !si->si_addr) { + assert_se(sigaction(SIGBUS, &old_sigaction, NULL) == 0); + raise(SIGBUS); + return; + } + + ul = (unsigned long) si->si_addr; + ul = ul / page_size(); + ul = ul * page_size(); + aligned = (void*) ul; + + /* Let's remember which address failed */ + sigbus_push(aligned); + + /* Replace mapping with an anonymous page, so that the + * execution can continue, however with a zeroed out page */ + assert_se(mmap(aligned, page_size(), PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS|MAP_FIXED, -1, 0) == aligned); +} + +void sigbus_install(void) { + struct sigaction sa = { + .sa_sigaction = sigbus_handler, + .sa_flags = SA_SIGINFO, + }; + + n_installed++; + + if (n_installed == 1) + assert_se(sigaction(SIGBUS, &sa, &old_sigaction) == 0); + + return; +} + +void sigbus_reset(void) { + + if (n_installed <= 0) + return; + + n_installed--; + + if (n_installed == 0) + assert_se(sigaction(SIGBUS, &old_sigaction, NULL) == 0); + + return; +} diff --git a/src/shared/sigbus.h b/src/shared/sigbus.h new file mode 100644 index 0000000000..25593af2d4 --- /dev/null +++ b/src/shared/sigbus.h @@ -0,0 +1,25 @@ +/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/ + +/*** + This file is part of systemd. + + Copyright 2014 Lennart Poettering + + systemd is free software; you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published by + the Free Software Foundation; either version 2.1 of the License, or + (at your option) any later version. + + systemd is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with systemd; If not, see . +***/ + +void sigbus_install(void); +void sigbus_reset(void); + +int sigbus_pop(void **ret); diff --git a/src/test/test-sigbus.c b/src/test/test-sigbus.c new file mode 100644 index 0000000000..39d0fec894 --- /dev/null +++ b/src/test/test-sigbus.c @@ -0,0 +1,62 @@ +/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/ + +/*** + This file is part of systemd. + + Copyright 2014 Lennart Poettering + + systemd is free software; you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published by + the Free Software Foundation; either version 2.1 of the License, or + (at your option) any later version. + + systemd is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with systemd; If not, see . +***/ + +#include + +#include "util.h" +#include "sigbus.h" + +int main(int argc, char *argv[]) { + _cleanup_close_ int fd = -1; + char template[] = "/tmp/sigbus-test-XXXXXX"; + void *addr = NULL; + uint8_t *p; + + sigbus_install(); + + assert(sigbus_pop(&addr) == 0); + + assert_se((fd = mkostemp(template, O_RDWR|O_CREAT|O_EXCL)) >= 0); + assert_se(unlink(template) >= 0); + assert_se(fallocate(fd, 0, 0, page_size() * 8) >= 0); + + p = mmap(NULL, page_size() * 16, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0); + assert_se(p != MAP_FAILED); + + assert_se(sigbus_pop(&addr) == 0); + + p[0] = 0xFF; + assert_se(sigbus_pop(&addr) == 0); + + p[page_size()] = 0xFF; + assert_se(sigbus_pop(&addr) == 0); + + p[page_size()*8] = 0xFF; + p[page_size()*8+1] = 0xFF; + p[page_size()*10] = 0xFF; + assert_se(sigbus_pop(&addr) > 0); + assert_se(addr == p + page_size() * 8); + assert_se(sigbus_pop(&addr) > 0); + assert_se(addr == p + page_size() * 10); + assert_se(sigbus_pop(&addr) == 0); + + sigbus_reset(); +} -- cgit v1.2.3-54-g00ecf