From d79ca7a622abbb0df6f5166cc0e4669373d9a614 Mon Sep 17 00:00:00 2001 From: Klearchos Chaloulos Date: Tue, 5 Apr 2016 13:47:04 +0300 Subject: journal-upload: Update watchdog while in curl_easy_perform It has been observed that a combination of high log throughput, low I/O speed on the journal-remote side and many nodes uploading simultaneously causes the journal-upload process to dump core because of watchdog starvation. This happens because journal-upload stays in curl_easy_perform() for as long as it cannot upload fast enough to reach the end of the journal. Currently journal-upload will return from curl_easy_perform() only when the end of the journal is reached. Therefore a check is added in journal_input_callback(), which will update the watchdog if the elapsed time since the start of the uploading process is greater than WATCHDOG_USEC/2. --- src/journal-remote/journal-upload-journal.c | 25 +++++++++++++++++++++++++ src/journal-remote/journal-upload.c | 1 + src/journal-remote/journal-upload.h | 1 + 3 files changed, 27 insertions(+) (limited to 'src/journal-remote') diff --git a/src/journal-remote/journal-upload-journal.c b/src/journal-remote/journal-upload-journal.c index e61b6bc68f..ac6eb58a9f 100644 --- a/src/journal-remote/journal-upload-journal.c +++ b/src/journal-remote/journal-upload-journal.c @@ -25,6 +25,7 @@ #include "log.h" #include "utf8.h" #include "util.h" +#include "sd-daemon.h" /** * Write up to size bytes to buf. 
Return negative on error, and number of @@ -242,6 +243,28 @@ static ssize_t write_entry(char *buf, size_t size, Uploader *u) { assert_not_reached("WTF?"); } +static inline void check_update_watchdog(Uploader *u) { + usec_t watchdog_usec; + static usec_t before; + usec_t after; + usec_t elapsed_time; + + if (sd_watchdog_enabled(false, &watchdog_usec) < 0) + return; + if (u->reset_reference_timestamp) { + before = now(CLOCK_MONOTONIC); + u->reset_reference_timestamp = false; + } else { + after = now(CLOCK_MONOTONIC); + elapsed_time = usec_sub(after, before); + if (elapsed_time > watchdog_usec / 2) { + log_debug("Update watchdog timer"); + sd_notify(false, "WATCHDOG=1"); + u->reset_reference_timestamp = true; + } + } +} + static size_t journal_input_callback(void *buf, size_t size, size_t nmemb, void *userp) { Uploader *u = userp; int r; @@ -252,6 +275,8 @@ static size_t journal_input_callback(void *buf, size_t size, size_t nmemb, void assert(u); assert(nmemb <= SSIZE_MAX / size); + check_update_watchdog(u); + j = u->journal; while (j && filled < size * nmemb) { diff --git a/src/journal-remote/journal-upload.c b/src/journal-remote/journal-upload.c index 6e1c3bb9ef..f2e9117f9f 100644 --- a/src/journal-remote/journal-upload.c +++ b/src/journal-remote/journal-upload.c @@ -494,6 +494,7 @@ static int perform_upload(Uploader *u) { assert(u); + u->reset_reference_timestamp = true; code = curl_easy_perform(u->easy); if (code) { if (u->error[0]) diff --git a/src/journal-remote/journal-upload.h b/src/journal-remote/journal-upload.h index b8cd04d527..a31735bd08 100644 --- a/src/journal-remote/journal-upload.h +++ b/src/journal-remote/journal-upload.h @@ -48,6 +48,7 @@ typedef struct Uploader { size_t entries_sent; char *last_cursor, *current_cursor; + bool reset_reference_timestamp; } Uploader; #define JOURNAL_UPLOAD_POLL_TIMEOUT (10 * USEC_PER_SEC) -- cgit v1.2.3-54-g00ecf From 0aa176a751a00b5645007c4d0763078ce2824aba Mon Sep 17 00:00:00 2001 From: Zbigniew Jędrzejewski-Szmek 
Date: Tue, 5 Apr 2016 20:27:48 -0400 Subject: journal-upload: make watchdog state non-static Also parse watchdog config when creating the Uploader object. --- src/journal-remote/journal-upload-journal.c | 22 ++++++++-------------- src/journal-remote/journal-upload.c | 4 +++- src/journal-remote/journal-upload.h | 4 +++- 3 files changed, 14 insertions(+), 16 deletions(-) (limited to 'src/journal-remote') diff --git a/src/journal-remote/journal-upload-journal.c b/src/journal-remote/journal-upload-journal.c index ac6eb58a9f..8ce8e1895e 100644 --- a/src/journal-remote/journal-upload-journal.c +++ b/src/journal-remote/journal-upload-journal.c @@ -244,24 +244,18 @@ static ssize_t write_entry(char *buf, size_t size, Uploader *u) { } static inline void check_update_watchdog(Uploader *u) { - usec_t watchdog_usec; - static usec_t before; usec_t after; usec_t elapsed_time; - if (sd_watchdog_enabled(false, &watchdog_usec) < 0) + if (u->watchdog_usec <= 0) return; - if (u->reset_reference_timestamp) { - before = now(CLOCK_MONOTONIC); - u->reset_reference_timestamp = false; - } else { - after = now(CLOCK_MONOTONIC); - elapsed_time = usec_sub(after, before); - if (elapsed_time > watchdog_usec / 2) { - log_debug("Update watchdog timer"); - sd_notify(false, "WATCHDOG=1"); - u->reset_reference_timestamp = true; - } + + after = now(CLOCK_MONOTONIC); + elapsed_time = usec_sub(after, u->watchdog_timestamp); + if (elapsed_time > u->watchdog_usec / 2) { + log_debug("Update watchdog timer"); + sd_notify(false, "WATCHDOG=1"); + u->watchdog_timestamp = after; } } diff --git a/src/journal-remote/journal-upload.c b/src/journal-remote/journal-upload.c index f2e9117f9f..4647cfdeb3 100644 --- a/src/journal-remote/journal-upload.c +++ b/src/journal-remote/journal-upload.c @@ -463,6 +463,8 @@ static int setup_uploader(Uploader *u, const char *url, const char *state_file) if (r < 0) return log_error_errno(r, "Failed to set up signals: %m"); + (void) sd_watchdog_enabled(false, &u->watchdog_usec); + 
return load_cursor_state(u); } @@ -494,7 +496,7 @@ static int perform_upload(Uploader *u) { assert(u); - u->reset_reference_timestamp = true; + u->watchdog_timestamp = now(CLOCK_MONOTONIC); code = curl_easy_perform(u->easy); if (code) { if (u->error[0]) diff --git a/src/journal-remote/journal-upload.h b/src/journal-remote/journal-upload.h index a31735bd08..5711905f86 100644 --- a/src/journal-remote/journal-upload.h +++ b/src/journal-remote/journal-upload.h @@ -4,6 +4,7 @@ #include "sd-event.h" #include "sd-journal.h" +#include "time-util.h" typedef enum { ENTRY_CURSOR = 0, /* Nothing actually written yet. */ @@ -48,7 +49,8 @@ typedef struct Uploader { size_t entries_sent; char *last_cursor, *current_cursor; - bool reset_reference_timestamp; + usec_t watchdog_timestamp; + usec_t watchdog_usec; } Uploader; #define JOURNAL_UPLOAD_POLL_TIMEOUT (10 * USEC_PER_SEC) -- cgit v1.2.3-54-g00ecf