Diffstat (limited to 'lib/wbt.c')
 lib/wbt.c | 288 +++++++++++++++++++++++++++++++++++++++++++----------------
 1 file changed, 211 insertions(+), 77 deletions(-)
diff --git a/lib/wbt.c b/lib/wbt.c
index cc5a24270..257c7b099 100644
--- a/lib/wbt.c
+++ b/lib/wbt.c
@@ -1,5 +1,5 @@
/*
- * buffered writeback throttling. losely based on CoDel. We can't drop
+ * buffered writeback throttling. loosely based on CoDel. We can't drop
* packets for IO scheduling, so the logic is something like this:
*
* - Monitor latencies in a defined window of time.
@@ -9,30 +9,31 @@
* - For any window where we don't have solid data on what the latencies
* look like, retain status quo.
* - If latencies look good, decrement scaling step.
+ * - If we're only doing writes, allow the scaling step to go negative. This
+ * will temporarily boost write performance, snapping back to a stable
+ * scaling step of 0 if reads show up or the heavy writers finish. Unlike
+ * positive scaling steps where we shrink the monitoring window, a negative
+ * scaling step retains the default step==0 window size.
*
* Copyright (C) 2016 Jens Axboe
*
- * Things that (may) need changing:
- *
- * - Different scaling of background/normal/high priority writeback.
- * We may have to violate guarantees for max.
- * - We can have mismatches between the stat window and our window.
- *
*/
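The bullet list above is the whole control loop. As an illustrative sketch only (not part of the patch: the enum and the adjust_step() name are invented for this example), the per-window decision reduces to roughly this:

enum window_result {
	WIN_GOOD,		/* latencies under the target */
	WIN_EXCEEDED,		/* latencies over the target */
	WIN_WRITES_ONLY,	/* no reads at all this window */
	WIN_UNKNOWN,		/* not enough samples to judge */
};

/*
 * Positive steps throttle buffered writeback harder, 0 is the default,
 * and negative steps are the write-only boost this patch introduces.
 */
static int adjust_step(int step, enum window_result res)
{
	switch (res) {
	case WIN_EXCEEDED:
		/* hard throttle: a negative step snaps straight back to 0 */
		return step < 0 ? 0 : step + 1;
	case WIN_GOOD:
	case WIN_WRITES_ONLY:
		/* back off the throttling one step; may go negative */
		return step - 1;
	case WIN_UNKNOWN:
	default:
		/*
		 * Shown simplified: the real code only drifts back toward
		 * step 0 after several consecutive unknown windows, and
		 * otherwise retains the status quo.
		 */
		if (step > 0)
			return step - 1;
		return step < 0 ? step + 1 : 0;
	}
}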
#include <linux/kernel.h>
#include <linux/blk_types.h>
#include <linux/slab.h>
#include <linux/backing-dev.h>
#include <linux/wbt.h>
+#include <linux/swap.h>
#define CREATE_TRACE_POINTS
#include <trace/events/wbt.h>
enum {
/*
- * Might need to be higher
+ * Default setting; we'll scale up (to 75% of QD max) or down (min 1)
+ * from here depending on device stats
*/
- RWB_MAX_DEPTH = 64,
+ RWB_DEF_DEPTH = 16,
/*
* 100msec window
@@ -40,10 +41,9 @@ enum {
RWB_WINDOW_NSEC = 100 * 1000 * 1000ULL,
/*
- * Disregard stats, if we don't meet these minimums
+ * Disregard stats, if we don't meet this minimum
*/
RWB_MIN_WRITE_SAMPLES = 3,
- RWB_MIN_READ_SAMPLES = 1,
/*
* If we have this number of consecutive windows with not enough
@@ -89,18 +89,51 @@ static void wb_timestamp(struct rq_wb *rwb, unsigned long *var)
}
}
-void __wbt_done(struct rq_wb *rwb)
+/*
+ * If a task was rate throttled in balance_dirty_pages() within the last
+ * second or so, use that to indicate a higher cleaning rate.
+ */
+static bool wb_recent_wait(struct rq_wb *rwb)
+{
+ struct bdi_writeback *wb = &rwb->bdi->wb;
+
+ return time_before(jiffies, wb->dirty_sleep + HZ);
+}
+
+static inline struct rq_wait *get_rq_wait(struct rq_wb *rwb, bool is_kswapd)
+{
+ return &rwb->rq_wait[is_kswapd];
+}
+
+static void rwb_wake_all(struct rq_wb *rwb)
+{
+ int i;
+
+ for (i = 0; i < WBT_NUM_RWQ; i++) {
+ struct rq_wait *rqw = &rwb->rq_wait[i];
+
+ if (waitqueue_active(&rqw->wait))
+ wake_up_all(&rqw->wait);
+ }
+}
+
+void __wbt_done(struct rq_wb *rwb, enum wbt_flags wb_acct)
{
+ struct rq_wait *rqw;
int inflight, limit;
- inflight = atomic_dec_return(&rwb->inflight);
+ if (!(wb_acct & WBT_TRACKED))
+ return;
+
+ rqw = get_rq_wait(rwb, wb_acct & WBT_KSWAPD);
+ inflight = atomic_dec_return(&rqw->inflight);
/*
* wbt got disabled with IO in flight. Wake up any potential
* waiters, we don't have to do more than that.
*/
if (unlikely(!rwb_enabled(rwb))) {
- wake_up_all(&rwb->wait);
+ rwb_wake_all(rwb);
return;
}
@@ -108,7 +141,7 @@ void __wbt_done(struct rq_wb *rwb)
* If the device does write back caching, drop further down
* before we wake people up.
*/
- if (rwb->wc && !atomic_read(&rwb->bdi->wb.dirty_sleeping))
+ if (rwb->wc && !wb_recent_wait(rwb))
limit = 0;
else
limit = rwb->wb_normal;
@@ -119,11 +152,11 @@ void __wbt_done(struct rq_wb *rwb)
if (inflight && inflight >= limit)
return;
- if (waitqueue_active(&rwb->wait)) {
+ if (waitqueue_active(&rqw->wait)) {
int diff = limit - inflight;
if (!inflight || diff >= rwb->wb_background / 2)
- wake_up_nr(&rwb->wait, 1);
+ wake_up(&rqw->wait);
}
}
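A quick worked example of the wake condition above, assuming limit = wb_normal = 8 and wb_background = 4 (illustrative numbers, not from the patch):

	inflight drops to 7:  7 < 8, but diff = 8 - 7 = 1 <  4 / 2  -> no wake yet
	inflight drops to 6:  diff = 8 - 6 = 2           >= 4 / 2   -> wake one waiter
	inflight drops to 0:  always wake
	write cache on, no recent dirty_sleep: limit = 0, so waiters are
	                      only woken once inflight reaches 0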
@@ -136,27 +169,33 @@ void wbt_done(struct rq_wb *rwb, struct wb_issue_stat *stat)
if (!rwb)
return;
- if (!wbt_tracked(stat)) {
+ if (!wbt_is_tracked(stat)) {
if (rwb->sync_cookie == stat) {
rwb->sync_issue = 0;
rwb->sync_cookie = NULL;
}
- wb_timestamp(rwb, &rwb->last_comp);
+ if (wbt_is_read(stat))
+ wb_timestamp(rwb, &rwb->last_comp);
+ wbt_clear_state(stat);
} else {
WARN_ON_ONCE(stat == rwb->sync_cookie);
- __wbt_done(rwb);
- wbt_clear_tracked(stat);
+ __wbt_done(rwb, wbt_stat_to_mask(stat));
+ wbt_clear_state(stat);
}
}
-static void calc_wb_limits(struct rq_wb *rwb)
+/*
+ * Return true, if we can't increase the depth further by scaling
+ */
+static bool calc_wb_limits(struct rq_wb *rwb)
{
unsigned int depth;
+ bool ret = false;
if (!rwb->min_lat_nsec) {
rwb->wb_max = rwb->wb_normal = rwb->wb_background = 0;
- return;
+ return false;
}
/*
@@ -167,22 +206,44 @@ static void calc_wb_limits(struct rq_wb *rwb)
* scaling down, then keep a setting of 1/1/1.
*/
if (rwb->queue_depth == 1) {
- if (rwb->scale_step)
+ if (rwb->scale_step > 0)
rwb->wb_max = rwb->wb_normal = 1;
- else
+ else {
rwb->wb_max = rwb->wb_normal = 2;
+ ret = true;
+ }
rwb->wb_background = 1;
} else {
- depth = min_t(unsigned int, RWB_MAX_DEPTH, rwb->queue_depth);
+ /*
+ * scale_step == 0 is our default state. If we have suffered
+ * latency spikes, step will be > 0, and we shrink the
+ * allowed write depths. If step is < 0, we're only doing
+ * writes, and we allow a temporarily higher depth to
+ * increase performance.
+ */
+ depth = min_t(unsigned int, RWB_DEF_DEPTH, rwb->queue_depth);
+ if (rwb->scale_step > 0)
+ depth = 1 + ((depth - 1) >> min(31, rwb->scale_step));
+ else if (rwb->scale_step < 0) {
+ unsigned int maxd = 3 * rwb->queue_depth / 4;
+
+ depth = 1 + ((depth - 1) << -rwb->scale_step);
+ if (depth > maxd) {
+ depth = maxd;
+ ret = true;
+ }
+ }
/*
* Set our max/normal/bg queue depths based on how far
* we have scaled down (->scale_step).
*/
- rwb->wb_max = 1 + ((depth - 1) >> min(31U, rwb->scale_step));
+ rwb->wb_max = depth;
rwb->wb_normal = (rwb->wb_max + 1) / 2;
rwb->wb_background = (rwb->wb_max + 3) / 4;
}
+
+ return ret;
}
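To make the shift arithmetic above concrete, a worked example (assuming queue_depth = 32, so the step == 0 depth is RWB_DEF_DEPTH = 16 and the negative-step cap is 3 * 32 / 4 = 24; the max/normal/background split follows from the divisions just above):

	scale_step  2:  depth = 1 + ((16 - 1) >> 2) = 4             -> limits  4/2/1
	scale_step  1:  depth = 1 + ((16 - 1) >> 1) = 8             -> limits  8/4/2
	scale_step  0:  depth = 16                                  -> limits 16/8/4
	scale_step -1:  depth = 1 + ((16 - 1) << 1) = 31, capped at
	                24, so calc_wb_limits() returns true        -> limits 24/12/6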
static bool inline stat_sample_valid(struct blk_rq_stat *stat)
@@ -209,8 +270,9 @@ static u64 rwb_sync_issue_lat(struct rq_wb *rwb)
}
enum {
- LAT_OK,
+ LAT_OK = 1,
LAT_UNKNOWN,
+ LAT_UNKNOWN_WRITES,
LAT_EXCEEDED,
};
@@ -234,8 +296,21 @@ static int __latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat)
return LAT_EXCEEDED;
}
- if (!stat_sample_valid(stat))
+ /*
+ * No read/write mix, if stat isn't valid
+ */
+ if (!stat_sample_valid(stat)) {
+ /*
+ * If we had writes in this stat window and the window is
+ * current, we're only doing writes. If a task recently
+ * waited or still has writes in flight, consider us doing
+ * just writes as well.
+ */
+ if ((stat[1].nr_samples && rwb->stat_ops->is_current(stat)) ||
+ wb_recent_wait(rwb) || wbt_inflight(rwb))
+ return LAT_UNKNOWN_WRITES;
return LAT_UNKNOWN;
+ }
/*
* If the 'min' latency exceeds our target, step down.
@@ -269,23 +344,27 @@ static void rwb_trace_step(struct rq_wb *rwb, const char *msg)
static void scale_up(struct rq_wb *rwb)
{
/*
- * If we're at 0, we can't go lower.
+ * Hit max in previous round, stop here
*/
- if (!rwb->scale_step)
+ if (rwb->scaled_max)
return;
rwb->scale_step--;
rwb->unknown_cnt = 0;
rwb->stat_ops->clear(rwb->ops_data);
- calc_wb_limits(rwb);
- if (waitqueue_active(&rwb->wait))
- wake_up_all(&rwb->wait);
+ rwb->scaled_max = calc_wb_limits(rwb);
+
+ rwb_wake_all(rwb);
rwb_trace_step(rwb, "step up");
}
-static void scale_down(struct rq_wb *rwb)
+/*
+ * Scale rwb down. If 'hard_throttle' is set, do it quicker, since we
+ * had a latency violation.
+ */
+static void scale_down(struct rq_wb *rwb, bool hard_throttle)
{
/*
* Stop scaling down when we've hit the limit. This also prevents
@@ -295,7 +374,12 @@ static void scale_down(struct rq_wb *rwb)
if (rwb->wb_max == 1)
return;
- rwb->scale_step++;
+ if (rwb->scale_step < 0 && hard_throttle)
+ rwb->scale_step = 0;
+ else
+ rwb->scale_step++;
+
+ rwb->scaled_max = false;
rwb->unknown_cnt = 0;
rwb->stat_ops->clear(rwb->ops_data);
calc_wb_limits(rwb);
@@ -306,13 +390,23 @@ static void rwb_arm_timer(struct rq_wb *rwb)
{
unsigned long expires;
- /*
- * We should speed this up, using some variant of a fast integer
- * inverse square root calculation. Since we only do this for
- * every window expiration, it's not a huge deal, though.
- */
- rwb->cur_win_nsec = div_u64(rwb->win_nsec << 4,
+ if (rwb->scale_step > 0) {
+ /*
+ * We should speed this up, using some variant of a fast
+ * integer inverse square root calculation. Since we only do
+ * this for every window expiration, it's not a huge deal,
+ * though.
+ */
+ rwb->cur_win_nsec = div_u64(rwb->win_nsec << 4,
int_sqrt((rwb->scale_step + 1) << 8));
+ } else {
+ /*
+ * For step < 0, we don't want to increase/decrease the
+ * window size.
+ */
+ rwb->cur_win_nsec = rwb->win_nsec;
+ }
+
expires = jiffies + nsecs_to_jiffies(rwb->cur_win_nsec);
mod_timer(&rwb->window_timer, expires);
}
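Since int_sqrt((scale_step + 1) << 8) equals 16 * sqrt(scale_step + 1), the expression above shrinks the window to roughly win_nsec / sqrt(scale_step + 1). With the default 100msec window, for example:

	scale_step 1:   100ms * 16 / int_sqrt(512)  = 1600ms / 22 ~= 72ms
	scale_step 3:   100ms * 16 / int_sqrt(1024) = 1600ms / 32  = 50ms
	scale_step <= 0: cur_win_nsec stays at the full 100ms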
@@ -320,28 +414,45 @@ static void rwb_arm_timer(struct rq_wb *rwb)
static void wb_timer_fn(unsigned long data)
{
struct rq_wb *rwb = (struct rq_wb *) data;
+ unsigned int inflight = wbt_inflight(rwb);
int status;
+ status = latency_exceeded(rwb);
+
+ trace_wbt_timer(rwb->bdi, status, rwb->scale_step, inflight);
+
/*
* If we exceeded the latency target, step down. If we did not,
* step one level up. If we don't know enough to say either exceeded
* or ok, then don't do anything.
*/
- status = latency_exceeded(rwb);
switch (status) {
case LAT_EXCEEDED:
- scale_down(rwb);
+ scale_down(rwb, true);
break;
case LAT_OK:
scale_up(rwb);
break;
+ case LAT_UNKNOWN_WRITES:
+ scale_up(rwb);
+ break;
case LAT_UNKNOWN:
+ if (++rwb->unknown_cnt < RWB_UNKNOWN_BUMP)
+ break;
/*
- * We had no read samples, start bumping up the write
- * depth slowly
+ * We get here for two reasons:
+ *
+ * 1) We previously scaled down to a reduced depth, and we currently
+ * don't have a valid read/write sample. For that case,
+ * slowly return to center state (step == 0).
+ * 2) We started at the center step, but don't have a valid
+ * read/write sample, though we do have writes going on.
+ * Allow step to go negative, to increase write perf.
*/
- if (++rwb->unknown_cnt >= RWB_UNKNOWN_BUMP)
+ if (rwb->scale_step > 0)
scale_up(rwb);
+ else if (rwb->scale_step < 0)
+ scale_down(rwb, false);
break;
default:
break;
@@ -350,17 +461,17 @@ static void wb_timer_fn(unsigned long data)
/*
* Re-arm timer, if we have IO in flight
*/
- if (rwb->scale_step || atomic_read(&rwb->inflight))
+ if (rwb->scale_step || inflight)
rwb_arm_timer(rwb);
}
void wbt_update_limits(struct rq_wb *rwb)
{
rwb->scale_step = 0;
+ rwb->scaled_max = false;
calc_wb_limits(rwb);
- if (waitqueue_active(&rwb->wait))
- wake_up_all(&rwb->wait);
+ rwb_wake_all(rwb);
}
static bool close_io(struct rq_wb *rwb)
@@ -378,13 +489,14 @@ static inline unsigned int get_limit(struct rq_wb *rwb, unsigned long rw)
unsigned int limit;
/*
- * At this point we know it's a buffered write. If REQ_SYNC is
- * set, then it's WB_SYNC_ALL writeback, and we'll use the max
- * limit for that. If the write is marked as a background write,
- * then use the idle limit, or go to normal if we haven't had
- * competing IO for a bit.
+ * At this point we know it's a buffered write. If this is
+ * kswapd trying to free memory, or REQ_SYNC is set, then
+ * it's WB_SYNC_ALL writeback, and we'll use the max limit for
+ * that. If the write is marked as a background write, then use
+ * the idle limit, or go to normal if we haven't had competing
+ * IO for a bit.
*/
- if ((rw & REQ_HIPRIO) || atomic_read(&rwb->bdi->wb.dirty_sleeping))
+ if ((rw & REQ_HIPRIO) || wb_recent_wait(rwb) || current_is_kswapd())
limit = rwb->wb_max;
else if ((rw & REQ_BG) || close_io(rwb)) {
/*
@@ -398,7 +510,8 @@ static inline unsigned int get_limit(struct rq_wb *rwb, unsigned long rw)
return limit;
}
-static inline bool may_queue(struct rq_wb *rwb, unsigned long rw)
+static inline bool may_queue(struct rq_wb *rwb, struct rq_wait *rqw,
+ unsigned long rw)
{
/*
* inc it here even if disabled, since we'll dec it at completion.
@@ -406,11 +519,11 @@ static inline bool may_queue(struct rq_wb *rwb, unsigned long rw)
* and someone turned it off at the same time.
*/
if (!rwb_enabled(rwb)) {
- atomic_inc(&rwb->inflight);
+ atomic_inc(&rqw->inflight);
return true;
}
- return atomic_inc_below(&rwb->inflight, get_limit(rwb, rw));
+ return atomic_inc_below(&rqw->inflight, get_limit(rwb, rw));
}
/*
@@ -419,16 +532,17 @@ static inline bool may_queue(struct rq_wb *rwb, unsigned long rw)
*/
static void __wbt_wait(struct rq_wb *rwb, unsigned long rw, spinlock_t *lock)
{
+ struct rq_wait *rqw = get_rq_wait(rwb, current_is_kswapd());
DEFINE_WAIT(wait);
- if (may_queue(rwb, rw))
+ if (may_queue(rwb, rqw, rw))
return;
do {
- prepare_to_wait_exclusive(&rwb->wait, &wait,
+ prepare_to_wait_exclusive(&rqw->wait, &wait,
TASK_UNINTERRUPTIBLE);
- if (may_queue(rwb, rw))
+ if (may_queue(rwb, rqw, rw))
break;
if (lock)
@@ -440,15 +554,17 @@ static void __wbt_wait(struct rq_wb *rwb, unsigned long rw, spinlock_t *lock)
spin_lock_irq(lock);
} while (1);
- finish_wait(&rwb->wait, &wait);
+ finish_wait(&rqw->wait, &wait);
}
static inline bool wbt_should_throttle(struct rq_wb *rwb, unsigned int rw)
{
+ const int op = rw >> BIO_OP_SHIFT;
+
/*
* If not a WRITE (or a discard), do nothing
*/
- if (!(rw & REQ_WRITE) || (rw & REQ_DISCARD))
+ if (!(op == REQ_OP_WRITE || op == REQ_OP_DISCARD))
return false;
/*
@@ -466,14 +582,20 @@ static inline bool wbt_should_throttle(struct rq_wb *rwb, unsigned int rw)
* in an irq held spinlock, if it holds one when calling this function.
* If we do sleep, we'll release and re-grab it.
*/
-bool wbt_wait(struct rq_wb *rwb, unsigned int rw, spinlock_t *lock)
+unsigned int wbt_wait(struct rq_wb *rwb, unsigned int rw, spinlock_t *lock)
{
+ unsigned int ret = 0;
+
if (!rwb_enabled(rwb))
- return false;
+ return 0;
+
+ if ((rw >> BIO_OP_SHIFT) == REQ_OP_READ)
+ ret = WBT_READ;
if (!wbt_should_throttle(rwb, rw)) {
- wb_timestamp(rwb, &rwb->last_issue);
- return false;
+ if (ret & WBT_READ)
+ wb_timestamp(rwb, &rwb->last_issue);
+ return ret;
}
__wbt_wait(rwb, rw, lock);
@@ -481,7 +603,10 @@ bool wbt_wait(struct rq_wb *rwb, unsigned int rw, spinlock_t *lock)
if (!timer_pending(&rwb->window_timer))
rwb_arm_timer(rwb);
- return true;
+ if (current_is_kswapd())
+ ret |= WBT_KSWAPD;
+
+ return ret | WBT_TRACKED;
}
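The changed return value makes wbt_wait() the producer of the accounting mask that __wbt_done() consumes on completion. A rough sketch of how a block-layer hook would pair the two calls (submit_hook(), complete_hook() and rq_flags are assumptions for this example, not names from the patch; the mask is expected to be stashed in the request's issue stat in between):

static unsigned int submit_hook(struct rq_wb *rwb, unsigned int rq_flags,
				spinlock_t *lock)
{
	/* May sleep if we are over the allowed write depth */
	return wbt_wait(rwb, rq_flags, lock);
}

static void complete_hook(struct rq_wb *rwb, unsigned int wb_acct)
{
	/* Does nothing unless WBT_TRACKED was set at submit time */
	__wbt_done(rwb, wb_acct);
}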
void wbt_issue(struct rq_wb *rwb, struct wb_issue_stat *stat)
@@ -499,7 +624,7 @@ void wbt_issue(struct rq_wb *rwb, struct wb_issue_stat *stat)
* only use the address to compare with, which is why we store the
* sync_issue time locally.
*/
- if (!wbt_tracked(stat) && !rwb->sync_issue) {
+ if (wbt_is_read(stat) && !rwb->sync_issue) {
rwb->sync_cookie = stat;
rwb->sync_issue = wbt_issue_stat_get_time(stat);
}
@@ -531,9 +656,11 @@ void wbt_set_write_cache(struct rq_wb *rwb, bool write_cache_on)
void wbt_disable(struct rq_wb *rwb)
{
- del_timer_sync(&rwb->window_timer);
- rwb->win_nsec = rwb->min_lat_nsec = 0;
- wbt_update_limits(rwb);
+ if (rwb) {
+ del_timer_sync(&rwb->window_timer);
+ rwb->win_nsec = rwb->min_lat_nsec = 0;
+ wbt_update_limits(rwb);
+ }
}
EXPORT_SYMBOL_GPL(wbt_disable);
@@ -541,20 +668,27 @@ struct rq_wb *wbt_init(struct backing_dev_info *bdi, struct wb_stat_ops *ops,
void *ops_data)
{
struct rq_wb *rwb;
+ int i;
+
+ if (!ops->get || !ops->is_current || !ops->clear)
+ return ERR_PTR(-EINVAL);
rwb = kzalloc(sizeof(*rwb), GFP_KERNEL);
if (!rwb)
return ERR_PTR(-ENOMEM);
- atomic_set(&rwb->inflight, 0);
- init_waitqueue_head(&rwb->wait);
+ for (i = 0; i < WBT_NUM_RWQ; i++) {
+ atomic_set(&rwb->rq_wait[i].inflight, 0);
+ init_waitqueue_head(&rwb->rq_wait[i].wait);
+ }
+
setup_timer(&rwb->window_timer, wb_timer_fn, (unsigned long) rwb);
rwb->wc = 1;
- rwb->queue_depth = RWB_MAX_DEPTH;
+ rwb->queue_depth = RWB_DEF_DEPTH;
rwb->last_comp = rwb->last_issue = jiffies;
rwb->bdi = bdi;
rwb->win_nsec = RWB_WINDOW_NSEC;
- rwb->stat_ops = ops,
+ rwb->stat_ops = ops;
rwb->ops_data = ops_data;
wbt_update_limits(rwb);
return rwb;
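For context on the new ops check, a minimal setup sketch (my_stat_ops and my_cookie are placeholders; only the three required callbacks and the wbt_init() signature come from this patch):

	struct rq_wb *rwb;

	/*
	 * my_stat_ops must provide ->get, ->is_current and ->clear, or
	 * wbt_init() now fails with -EINVAL; my_cookie is stored as
	 * ops_data for the callbacks.
	 */
	rwb = wbt_init(bdi, &my_stat_ops, my_cookie);
	if (IS_ERR(rwb))
		return PTR_ERR(rwb);

Also worth noting for callers: wbt_disable() is now NULL-safe, so teardown paths can call it unconditionally.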