Diffstat (limited to 'lib/wbt.c')
-rw-r--r-- | lib/wbt.c | 288
1 file changed, 211 insertions, 77 deletions
@@ -1,5 +1,5 @@
 /*
- * buffered writeback throttling. losely based on CoDel. We can't drop
+ * buffered writeback throttling. loosely based on CoDel. We can't drop
  * packets for IO scheduling, so the logic is something like this:
  *
  * - Monitor latencies in a defined window of time.
@@ -9,30 +9,31 @@
  * - For any window where we don't have solid data on what the latencies
  *   look like, retain status quo.
  * - If latencies look good, decrement scaling step.
+ * - If we're only doing writes, allow the scaling step to go negative. This
+ *   will temporarily boost write performance, snapping back to a stable
+ *   scaling step of 0 if reads show up or the heavy writers finish. Unlike
+ *   positive scaling steps where we shrink the monitoring window, a negative
+ *   scaling step retains the default step==0 window size.
  *
  * Copyright (C) 2016 Jens Axboe
  *
- * Things that (may) need changing:
- *
- *   - Different scaling of background/normal/high priority writeback.
- *     We may have to violate guarantees for max.
- *   - We can have mismatches between the stat window and our window.
- *
  */
 #include <linux/kernel.h>
 #include <linux/blk_types.h>
 #include <linux/slab.h>
 #include <linux/backing-dev.h>
 #include <linux/wbt.h>
+#include <linux/swap.h>
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/wbt.h>
 
 enum {
         /*
-         * Might need to be higher
+         * Default setting, we'll scale up (to 75% of QD max) or down (min 1)
+         * from here depending on device stats
          */
-        RWB_MAX_DEPTH   = 64,
+        RWB_DEF_DEPTH   = 16,
 
         /*
          * 100msec window
@@ -40,10 +41,9 @@ enum {
         RWB_WINDOW_NSEC         = 100 * 1000 * 1000ULL,
 
         /*
-         * Disregard stats, if we don't meet these minimums
+         * Disregard stats, if we don't meet this minimum
          */
         RWB_MIN_WRITE_SAMPLES   = 3,
-        RWB_MIN_READ_SAMPLES    = 1,
 
         /*
          * If we have this number of consecutive windows with not enough
@@ -89,18 +89,51 @@ static void wb_timestamp(struct rq_wb *rwb, unsigned long *var)
         }
 }
 
-void __wbt_done(struct rq_wb *rwb)
+/*
+ * If a task was rate throttled in balance_dirty_pages() within the last
+ * second or so, use that to indicate a higher cleaning rate.
+ */
+static bool wb_recent_wait(struct rq_wb *rwb)
+{
+        struct bdi_writeback *wb = &rwb->bdi->wb;
+
+        return time_before(jiffies, wb->dirty_sleep + HZ);
+}
+
+static inline struct rq_wait *get_rq_wait(struct rq_wb *rwb, bool is_kswapd)
+{
+        return &rwb->rq_wait[is_kswapd];
+}
+
+static void rwb_wake_all(struct rq_wb *rwb)
+{
+        int i;
+
+        for (i = 0; i < WBT_NUM_RWQ; i++) {
+                struct rq_wait *rqw = &rwb->rq_wait[i];
+
+                if (waitqueue_active(&rqw->wait))
+                        wake_up_all(&rqw->wait);
+        }
+}
+
+void __wbt_done(struct rq_wb *rwb, enum wbt_flags wb_acct)
 {
+        struct rq_wait *rqw;
         int inflight, limit;
 
-        inflight = atomic_dec_return(&rwb->inflight);
+        if (!(wb_acct & WBT_TRACKED))
+                return;
+
+        rqw = get_rq_wait(rwb, wb_acct & WBT_KSWAPD);
+        inflight = atomic_dec_return(&rqw->inflight);
 
         /*
          * wbt got disabled with IO in flight. Wake up any potential
          * waiters, we don't have to do more than that.
          */
         if (unlikely(!rwb_enabled(rwb))) {
-                wake_up_all(&rwb->wait);
+                rwb_wake_all(rwb);
                 return;
         }
 
@@ -108,7 +141,7 @@ void __wbt_done(struct rq_wb *rwb)
          * If the device does write back caching, drop further down
          * before we wake people up.
          */
-        if (rwb->wc && !atomic_read(&rwb->bdi->wb.dirty_sleeping))
+        if (rwb->wc && !wb_recent_wait(rwb))
                 limit = 0;
         else
                 limit = rwb->wb_normal;
@@ -119,11 +152,11 @@ void __wbt_done(struct rq_wb *rwb)
         if (inflight && inflight >= limit)
                 return;
 
-        if (waitqueue_active(&rwb->wait)) {
+        if (waitqueue_active(&rqw->wait)) {
                 int diff = limit - inflight;
 
                 if (!inflight || diff >= rwb->wb_background / 2)
-                        wake_up_nr(&rwb->wait, 1);
+                        wake_up(&rqw->wait);
         }
 }
 
@@ -136,27 +169,33 @@ void wbt_done(struct rq_wb *rwb, struct wb_issue_stat *stat)
         if (!rwb)
                 return;
 
-        if (!wbt_tracked(stat)) {
+        if (!wbt_is_tracked(stat)) {
                 if (rwb->sync_cookie == stat) {
                         rwb->sync_issue = 0;
                         rwb->sync_cookie = NULL;
                 }
 
-                wb_timestamp(rwb, &rwb->last_comp);
+                if (wbt_is_read(stat))
+                        wb_timestamp(rwb, &rwb->last_comp);
+                wbt_clear_state(stat);
         } else {
                 WARN_ON_ONCE(stat == rwb->sync_cookie);
-                __wbt_done(rwb);
-                wbt_clear_tracked(stat);
+                __wbt_done(rwb, wbt_stat_to_mask(stat));
+                wbt_clear_state(stat);
         }
 }
 
-static void calc_wb_limits(struct rq_wb *rwb)
+/*
+ * Return true, if we can't increase the depth further by scaling
+ */
+static bool calc_wb_limits(struct rq_wb *rwb)
 {
         unsigned int depth;
+        bool ret = false;
 
         if (!rwb->min_lat_nsec) {
                 rwb->wb_max = rwb->wb_normal = rwb->wb_background = 0;
-                return;
+                return false;
         }
 
         /*
@@ -167,22 +206,44 @@ static void calc_wb_limits(struct rq_wb *rwb)
          * scaling down, then keep a setting of 1/1/1.
          */
         if (rwb->queue_depth == 1) {
-                if (rwb->scale_step)
+                if (rwb->scale_step > 0)
                         rwb->wb_max = rwb->wb_normal = 1;
-                else
+                else {
                         rwb->wb_max = rwb->wb_normal = 2;
+                        ret = true;
+                }
                 rwb->wb_background = 1;
         } else {
-                depth = min_t(unsigned int, RWB_MAX_DEPTH, rwb->queue_depth);
+                /*
+                 * scale_step == 0 is our default state. If we have suffered
+                 * latency spikes, step will be > 0, and we shrink the
+                 * allowed write depths. If step is < 0, we're only doing
+                 * writes, and we allow a temporarily higher depth to
+                 * increase performance.
+                 */
+                depth = min_t(unsigned int, RWB_DEF_DEPTH, rwb->queue_depth);
+                if (rwb->scale_step > 0)
+                        depth = 1 + ((depth - 1) >> min(31, rwb->scale_step));
+                else if (rwb->scale_step < 0) {
+                        unsigned int maxd = 3 * rwb->queue_depth / 4;
+
+                        depth = 1 + ((depth - 1) << -rwb->scale_step);
+                        if (depth > maxd) {
+                                depth = maxd;
+                                ret = true;
+                        }
+                }
 
                 /*
                  * Set our max/normal/bg queue depths based on how far
                  * we have scaled down (->scale_step).
                  */
-                rwb->wb_max = 1 + ((depth - 1) >> min(31U, rwb->scale_step));
+                rwb->wb_max = depth;
                 rwb->wb_normal = (rwb->wb_max + 1) / 2;
                 rwb->wb_background = (rwb->wb_max + 3) / 4;
         }
+
+        return ret;
 }
 
 static bool inline stat_sample_valid(struct blk_rq_stat *stat)
@@ -209,8 +270,9 @@ static u64 rwb_sync_issue_lat(struct rq_wb *rwb)
 }
 
 enum {
-        LAT_OK,
+        LAT_OK = 1,
         LAT_UNKNOWN,
+        LAT_UNKNOWN_WRITES,
         LAT_EXCEEDED,
 };
 
@@ -234,8 +296,21 @@ static int __latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat)
                 return LAT_EXCEEDED;
         }
 
-        if (!stat_sample_valid(stat))
+        /*
+         * No read/write mix, if stat isn't valid
+         */
+        if (!stat_sample_valid(stat)) {
+                /*
+                 * If we had writes in this stat window and the window is
+                 * current, we're only doing writes. If a task recently
+                 * waited or still has writes in flights, consider us doing
+                 * just writes as well.
+                 */
+                if ((stat[1].nr_samples && rwb->stat_ops->is_current(stat)) ||
+                    wb_recent_wait(rwb) || wbt_inflight(rwb))
+                        return LAT_UNKNOWN_WRITES;
                 return LAT_UNKNOWN;
+        }
 
         /*
          * If the 'min' latency exceeds our target, step down.
@@ -269,23 +344,27 @@ static void rwb_trace_step(struct rq_wb *rwb, const char *msg)
 
 static void scale_up(struct rq_wb *rwb)
 {
         /*
-         * If we're at 0, we can't go lower.
+         * Hit max in previous round, stop here
          */
-        if (!rwb->scale_step)
+        if (rwb->scaled_max)
                 return;
 
         rwb->scale_step--;
         rwb->unknown_cnt = 0;
         rwb->stat_ops->clear(rwb->ops_data);
-        calc_wb_limits(rwb);
-        if (waitqueue_active(&rwb->wait))
-                wake_up_all(&rwb->wait);
+        rwb->scaled_max = calc_wb_limits(rwb);
+
+        rwb_wake_all(rwb);
 
         rwb_trace_step(rwb, "step up");
 }
 
-static void scale_down(struct rq_wb *rwb)
+/*
+ * Scale rwb down. If 'hard_throttle' is set, do it quicker, since we
+ * had a latency violation.
+ */
+static void scale_down(struct rq_wb *rwb, bool hard_throttle)
 {
         /*
          * Stop scaling down when we've hit the limit. This also prevents
@@ -295,7 +374,12 @@ static void scale_down(struct rq_wb *rwb)
         if (rwb->wb_max == 1)
                 return;
 
-        rwb->scale_step++;
+        if (rwb->scale_step < 0 && hard_throttle)
+                rwb->scale_step = 0;
+        else
+                rwb->scale_step++;
+
+        rwb->scaled_max = false;
         rwb->unknown_cnt = 0;
         rwb->stat_ops->clear(rwb->ops_data);
         calc_wb_limits(rwb);
@@ -306,13 +390,23 @@ static void rwb_arm_timer(struct rq_wb *rwb)
 {
         unsigned long expires;
 
-        /*
-         * We should speed this up, using some variant of a fast integer
-         * inverse square root calculation. Since we only do this for
-         * every window expiration, it's not a huge deal, though.
-         */
-        rwb->cur_win_nsec = div_u64(rwb->win_nsec << 4,
+        if (rwb->scale_step > 0) {
+                /*
+                 * We should speed this up, using some variant of a fast
+                 * integer inverse square root calculation. Since we only do
+                 * this for every window expiration, it's not a huge deal,
+                 * though.
+                 */
+                rwb->cur_win_nsec = div_u64(rwb->win_nsec << 4,
                                         int_sqrt((rwb->scale_step + 1) << 8));
+        } else {
+                /*
+                 * For step < 0, we don't want to increase/decrease the
+                 * window size.
+                 */
+                rwb->cur_win_nsec = rwb->win_nsec;
+        }
+
         expires = jiffies + nsecs_to_jiffies(rwb->cur_win_nsec);
         mod_timer(&rwb->window_timer, expires);
 }
@@ -320,28 +414,45 @@ static void rwb_arm_timer(struct rq_wb *rwb)
 static void wb_timer_fn(unsigned long data)
 {
         struct rq_wb *rwb = (struct rq_wb *) data;
+        unsigned int inflight = wbt_inflight(rwb);
         int status;
 
+        status = latency_exceeded(rwb);
+
+        trace_wbt_timer(rwb->bdi, status, rwb->scale_step, inflight);
+
         /*
          * If we exceeded the latency target, step down. If we did not,
          * step one level up. If we don't know enough to say either exceeded
          * or ok, then don't do anything.
          */
-        status = latency_exceeded(rwb);
         switch (status) {
         case LAT_EXCEEDED:
-                scale_down(rwb);
+                scale_down(rwb, true);
                 break;
         case LAT_OK:
                 scale_up(rwb);
                 break;
+        case LAT_UNKNOWN_WRITES:
+                scale_up(rwb);
+                break;
         case LAT_UNKNOWN:
+                if (++rwb->unknown_cnt < RWB_UNKNOWN_BUMP)
+                        break;
                 /*
-                 * We had no read samples, start bumping up the write
-                 * depth slowly
+                 * We get here for two reasons:
+                 *
+                 * 1) We previously scaled reduced depth, and we currently
+                 *    don't have a valid read/write sample. For that case,
+                 *    slowly return to center state (step == 0).
+                 * 2) We started a the center step, but don't have a valid
+                 *    read/write sample, but we do have writes going on.
+                 *    Allow step to go negative, to increase write perf.
                 */
-                if (++rwb->unknown_cnt >= RWB_UNKNOWN_BUMP)
+                if (rwb->scale_step > 0)
                         scale_up(rwb);
+                else if (rwb->scale_step < 0)
+                        scale_down(rwb, false);
                 break;
         default:
                 break;
@@ -350,17 +461,17 @@ static void wb_timer_fn(unsigned long data)
         /*
          * Re-arm timer, if we have IO in flight
          */
-        if (rwb->scale_step || atomic_read(&rwb->inflight))
+        if (rwb->scale_step || inflight)
                 rwb_arm_timer(rwb);
 }
 
 void wbt_update_limits(struct rq_wb *rwb)
 {
         rwb->scale_step = 0;
+        rwb->scaled_max = false;
         calc_wb_limits(rwb);
 
-        if (waitqueue_active(&rwb->wait))
-                wake_up_all(&rwb->wait);
+        rwb_wake_all(rwb);
 }
 
 static bool close_io(struct rq_wb *rwb)
@@ -378,13 +489,14 @@ static inline unsigned int get_limit(struct rq_wb *rwb, unsigned long rw)
         unsigned int limit;
 
         /*
-         * At this point we know it's a buffered write. If REQ_SYNC is
-         * set, then it's WB_SYNC_ALL writeback, and we'll use the max
-         * limit for that. If the write is marked as a background write,
-         * then use the idle limit, or go to normal if we haven't had
-         * competing IO for a bit.
+         * At this point we know it's a buffered write. If this is
+         * kswapd trying to free memory, or REQ_SYNC is set, set, then
+         * it's WB_SYNC_ALL writeback, and we'll use the max limit for
+         * that. If the write is marked as a background write, then use
+         * the idle limit, or go to normal if we haven't had competing
+         * IO for a bit.
          */
-        if ((rw & REQ_HIPRIO) || atomic_read(&rwb->bdi->wb.dirty_sleeping))
+        if ((rw & REQ_HIPRIO) || wb_recent_wait(rwb) || current_is_kswapd())
                 limit = rwb->wb_max;
         else if ((rw & REQ_BG) || close_io(rwb)) {
                 /*
@@ -398,7 +510,8 @@ static inline unsigned int get_limit(struct rq_wb *rwb, unsigned long rw)
         return limit;
 }
 
-static inline bool may_queue(struct rq_wb *rwb, unsigned long rw)
+static inline bool may_queue(struct rq_wb *rwb, struct rq_wait *rqw,
+                             unsigned long rw)
 {
         /*
          * inc it here even if disabled, since we'll dec it at completion.
@@ -406,11 +519,11 @@ static inline bool may_queue(struct rq_wb *rwb, unsigned long rw)
          * and someone turned it off at the same time.
          */
         if (!rwb_enabled(rwb)) {
-                atomic_inc(&rwb->inflight);
+                atomic_inc(&rqw->inflight);
                 return true;
         }
 
-        return atomic_inc_below(&rwb->inflight, get_limit(rwb, rw));
+        return atomic_inc_below(&rqw->inflight, get_limit(rwb, rw));
 }
 
 /*
@@ -419,16 +532,17 @@ static inline bool may_queue(struct rq_wb *rwb, unsigned long rw)
  */
 static void __wbt_wait(struct rq_wb *rwb, unsigned long rw, spinlock_t *lock)
 {
+        struct rq_wait *rqw = get_rq_wait(rwb, current_is_kswapd());
         DEFINE_WAIT(wait);
 
-        if (may_queue(rwb, rw))
+        if (may_queue(rwb, rqw, rw))
                 return;
 
         do {
-                prepare_to_wait_exclusive(&rwb->wait, &wait,
+                prepare_to_wait_exclusive(&rqw->wait, &wait,
                                                 TASK_UNINTERRUPTIBLE);
 
-                if (may_queue(rwb, rw))
+                if (may_queue(rwb, rqw, rw))
                         break;
 
                 if (lock)
@@ -440,15 +554,17 @@ static void __wbt_wait(struct rq_wb *rwb, unsigned long rw, spinlock_t *lock)
                 spin_lock_irq(lock);
         } while (1);
 
-        finish_wait(&rwb->wait, &wait);
+        finish_wait(&rqw->wait, &wait);
 }
 
 static inline bool wbt_should_throttle(struct rq_wb *rwb, unsigned int rw)
 {
+        const int op = rw >> BIO_OP_SHIFT;
+
         /*
          * If not a WRITE (or a discard), do nothing
          */
-        if (!(rw & REQ_WRITE) || (rw & REQ_DISCARD))
+        if (!(op == REQ_OP_WRITE || op == REQ_OP_DISCARD))
                 return false;
 
         /*
@@ -466,14 +582,20 @@ static inline bool wbt_should_throttle(struct rq_wb *rwb, unsigned int rw)
  * in an irq held spinlock, if it holds one when calling this function.
  * If we do sleep, we'll release and re-grab it.
  */
-bool wbt_wait(struct rq_wb *rwb, unsigned int rw, spinlock_t *lock)
+unsigned int wbt_wait(struct rq_wb *rwb, unsigned int rw, spinlock_t *lock)
 {
+        unsigned int ret = 0;
+
         if (!rwb_enabled(rwb))
-                return false;
+                return 0;
+
+        if ((rw >> BIO_OP_SHIFT) == REQ_OP_READ)
+                ret = WBT_READ;
 
         if (!wbt_should_throttle(rwb, rw)) {
-                wb_timestamp(rwb, &rwb->last_issue);
-                return false;
+                if (ret & WBT_READ)
+                        wb_timestamp(rwb, &rwb->last_issue);
+                return ret;
         }
 
         __wbt_wait(rwb, rw, lock);
@@ -481,7 +603,10 @@ bool wbt_wait(struct rq_wb *rwb, unsigned int rw, spinlock_t *lock)
         if (!timer_pending(&rwb->window_timer))
                 rwb_arm_timer(rwb);
 
-        return true;
+        if (current_is_kswapd())
+                ret |= WBT_KSWAPD;
+
+        return ret | WBT_TRACKED;
 }
 
 void wbt_issue(struct rq_wb *rwb, struct wb_issue_stat *stat)
@@ -499,7 +624,7 @@ void wbt_issue(struct rq_wb *rwb, struct wb_issue_stat *stat)
          * only use the address to compare with, which is why we store the
          * sync_issue time locally.
          */
-        if (!wbt_tracked(stat) && !rwb->sync_issue) {
+        if (wbt_is_read(stat) && !rwb->sync_issue) {
                 rwb->sync_cookie = stat;
                 rwb->sync_issue = wbt_issue_stat_get_time(stat);
         }
@@ -531,9 +656,11 @@ void wbt_set_write_cache(struct rq_wb *rwb, bool write_cache_on)
 
 void wbt_disable(struct rq_wb *rwb)
 {
-        del_timer_sync(&rwb->window_timer);
-        rwb->win_nsec = rwb->min_lat_nsec = 0;
-        wbt_update_limits(rwb);
+        if (rwb) {
+                del_timer_sync(&rwb->window_timer);
+                rwb->win_nsec = rwb->min_lat_nsec = 0;
+                wbt_update_limits(rwb);
+        }
 }
 EXPORT_SYMBOL_GPL(wbt_disable);
 
@@ -541,20 +668,27 @@ struct rq_wb *wbt_init(struct backing_dev_info *bdi, struct wb_stat_ops *ops,
                      void *ops_data)
 {
         struct rq_wb *rwb;
+        int i;
+
+        if (!ops->get || !ops->is_current || !ops->clear)
+                return ERR_PTR(-EINVAL);
 
         rwb = kzalloc(sizeof(*rwb), GFP_KERNEL);
         if (!rwb)
                 return ERR_PTR(-ENOMEM);
 
-        atomic_set(&rwb->inflight, 0);
-        init_waitqueue_head(&rwb->wait);
+        for (i = 0; i < WBT_NUM_RWQ; i++) {
+                atomic_set(&rwb->rq_wait[i].inflight, 0);
+                init_waitqueue_head(&rwb->rq_wait[i].wait);
+        }
+
         setup_timer(&rwb->window_timer, wb_timer_fn, (unsigned long) rwb);
         rwb->wc = 1;
-        rwb->queue_depth = RWB_MAX_DEPTH;
+        rwb->queue_depth = RWB_DEF_DEPTH;
         rwb->last_comp = rwb->last_issue = jiffies;
         rwb->bdi = bdi;
         rwb->win_nsec = RWB_WINDOW_NSEC;
-        rwb->stat_ops = ops,
+        rwb->stat_ops = ops;
         rwb->ops_data = ops_data;
         wbt_update_limits(rwb);
         return rwb;
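
To make the new scaling arithmetic concrete, here is a minimal user-space sketch that mirrors the depth calculation calc_wb_limits() performs after this patch: a positive scale_step shrinks the allowed write depth by right-shifting, a negative scale_step (write-only workloads) grows it by left-shifting, capped at 3/4 of the device queue depth. This is an illustration, not the kernel code; the helper name scaled_depth(), the standalone main(), and the example queue depth of 32 are assumptions made for the demo.

#include <stdio.h>

#define RWB_DEF_DEPTH   16      /* default depth, as in the patch */

/* Mirror of the depth math in calc_wb_limits() (illustrative only). */
static unsigned int scaled_depth(unsigned int queue_depth, int scale_step)
{
        unsigned int depth = queue_depth < RWB_DEF_DEPTH ? queue_depth : RWB_DEF_DEPTH;

        if (scale_step > 0) {
                /* Latency spikes seen: roughly halve the depth per step. */
                int shift = scale_step < 31 ? scale_step : 31;

                depth = 1 + ((depth - 1) >> shift);
        } else if (scale_step < 0) {
                /* Write-only boost: grow the depth, cap at 3/4 of queue depth. */
                unsigned int maxd = 3 * queue_depth / 4;

                depth = 1 + ((depth - 1) << -scale_step);
                if (depth > maxd)
                        depth = maxd;   /* calc_wb_limits() reports this as "scaled_max" */
        }

        return depth;
}

int main(void)
{
        int step;

        /* Assume a device with a queue depth of 32 for the example. */
        for (step = 3; step >= -2; step--)
                printf("scale_step %2d -> wb_max %2u\n", step, scaled_depth(32, step));

        return 0;
}

From the resulting wb_max, the patch derives wb_normal = (wb_max + 1) / 2 and wb_background = (wb_max + 3) / 4, and scale_up() stops boosting once calc_wb_limits() signals that the 3/4 cap was reached (rwb->scaled_max).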