path: root/lib/wbt.c
author    André Fabian Silva Delgado <emulatorman@parabola.nu>    2016-10-20 00:10:27 -0300
committer André Fabian Silva Delgado <emulatorman@parabola.nu>    2016-10-20 00:10:27 -0300
commit    d0b2f91bede3bd5e3d24dd6803e56eee959c1797 (patch)
tree      7fee4ab0509879c373c4f2cbd5b8a5be5b4041ee /lib/wbt.c
parent    e914f8eb445e8f74b00303c19c2ffceaedd16a05 (diff)
Linux-libre 4.8.2-gnupck-4.8.2-gnu
Diffstat (limited to 'lib/wbt.c')
-rw-r--r--  lib/wbt.c | 288
1 file changed, 211 insertions, 77 deletions
diff --git a/lib/wbt.c b/lib/wbt.c
index cc5a24270..257c7b099 100644
--- a/lib/wbt.c
+++ b/lib/wbt.c
@@ -1,5 +1,5 @@
/*
- * buffered writeback throttling. losely based on CoDel. We can't drop
+ * buffered writeback throttling. loosely based on CoDel. We can't drop
* packets for IO scheduling, so the logic is something like this:
*
* - Monitor latencies in a defined window of time.
@@ -9,30 +9,31 @@
* - For any window where we don't have solid data on what the latencies
* look like, retain status quo.
* - If latencies look good, decrement scaling step.
+ * - If we're only doing writes, allow the scaling step to go negative. This
+ * will temporarily boost write performance, snapping back to a stable
+ * scaling step of 0 if reads show up or the heavy writers finish. Unlike
+ * positive scaling steps where we shrink the monitoring window, a negative
+ * scaling step retains the default step==0 window size.
*
* Copyright (C) 2016 Jens Axboe
*
- * Things that (may) need changing:
- *
- * - Different scaling of background/normal/high priority writeback.
- * We may have to violate guarantees for max.
- * - We can have mismatches between the stat window and our window.
- *
*/
#include <linux/kernel.h>
#include <linux/blk_types.h>
#include <linux/slab.h>
#include <linux/backing-dev.h>
#include <linux/wbt.h>
+#include <linux/swap.h>
#define CREATE_TRACE_POINTS
#include <trace/events/wbt.h>
enum {
/*
- * Might need to be higher
+ * Default setting, we'll scale up (to 75% of QD max) or down (min 1)
+ * from here depending on device stats
*/
- RWB_MAX_DEPTH = 64,
+ RWB_DEF_DEPTH = 16,
/*
* 100msec window
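
The header comment above describes the per-window control loop only in prose. As a rough orientation aid, here is a minimal userspace restatement of that loop under stated simplifications: it ignores the unknown-window hysteresis (RWB_UNKNOWN_BUMP) and the scaled_max clamping that the real wb_timer_fn() further down applies, and all names are illustrative.

enum window_verdict { WIN_LAT_OK, WIN_LAT_EXCEEDED, WIN_WRITES_ONLY, WIN_UNKNOWN };

/* One call per monitoring window; returns the new scaling step. */
static int next_scale_step(int step, enum window_verdict v)
{
	switch (v) {
	case WIN_LAT_EXCEEDED:
		/* latency violation: throttle harder; negative steps snap back to 0 */
		return step < 0 ? 0 : step + 1;
	case WIN_LAT_OK:
	case WIN_WRITES_ONLY:
		/* good latencies, or a write-only workload: allow more depth */
		return step - 1;
	case WIN_UNKNOWN:
	default:
		/* not enough data: slowly drift back toward the default step 0 */
		if (step > 0)
			return step - 1;
		if (step < 0)
			return step + 1;
		return step;
	}
}
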
@@ -40,10 +41,9 @@ enum {
RWB_WINDOW_NSEC = 100 * 1000 * 1000ULL,
/*
- * Disregard stats, if we don't meet these minimums
+ * Disregard stats, if we don't meet this minimum
*/
RWB_MIN_WRITE_SAMPLES = 3,
- RWB_MIN_READ_SAMPLES = 1,
/*
* If we have this number of consecutive windows with not enough
@@ -89,18 +89,51 @@ static void wb_timestamp(struct rq_wb *rwb, unsigned long *var)
}
}
-void __wbt_done(struct rq_wb *rwb)
+/*
+ * If a task was rate throttled in balance_dirty_pages() within the last
+ * second or so, use that to indicate a higher cleaning rate.
+ */
+static bool wb_recent_wait(struct rq_wb *rwb)
+{
+ struct bdi_writeback *wb = &rwb->bdi->wb;
+
+ return time_before(jiffies, wb->dirty_sleep + HZ);
+}
+
+static inline struct rq_wait *get_rq_wait(struct rq_wb *rwb, bool is_kswapd)
+{
+ return &rwb->rq_wait[is_kswapd];
+}
+
+static void rwb_wake_all(struct rq_wb *rwb)
+{
+ int i;
+
+ for (i = 0; i < WBT_NUM_RWQ; i++) {
+ struct rq_wait *rqw = &rwb->rq_wait[i];
+
+ if (waitqueue_active(&rqw->wait))
+ wake_up_all(&rqw->wait);
+ }
+}
+
+void __wbt_done(struct rq_wb *rwb, enum wbt_flags wb_acct)
{
+ struct rq_wait *rqw;
int inflight, limit;
- inflight = atomic_dec_return(&rwb->inflight);
+ if (!(wb_acct & WBT_TRACKED))
+ return;
+
+ rqw = get_rq_wait(rwb, wb_acct & WBT_KSWAPD);
+ inflight = atomic_dec_return(&rqw->inflight);
/*
* wbt got disabled with IO in flight. Wake up any potential
* waiters, we don't have to do more than that.
*/
if (unlikely(!rwb_enabled(rwb))) {
- wake_up_all(&rwb->wait);
+ rwb_wake_all(rwb);
return;
}
@@ -108,7 +141,7 @@ void __wbt_done(struct rq_wb *rwb)
* If the device does write back caching, drop further down
* before we wake people up.
*/
- if (rwb->wc && !atomic_read(&rwb->bdi->wb.dirty_sleeping))
+ if (rwb->wc && !wb_recent_wait(rwb))
limit = 0;
else
limit = rwb->wb_normal;
@@ -119,11 +152,11 @@ void __wbt_done(struct rq_wb *rwb)
if (inflight && inflight >= limit)
return;
- if (waitqueue_active(&rwb->wait)) {
+ if (waitqueue_active(&rqw->wait)) {
int diff = limit - inflight;
if (!inflight || diff >= rwb->wb_background / 2)
- wake_up_nr(&rwb->wait, 1);
+ wake_up(&rqw->wait);
}
}
@@ -136,27 +169,33 @@ void wbt_done(struct rq_wb *rwb, struct wb_issue_stat *stat)
if (!rwb)
return;
- if (!wbt_tracked(stat)) {
+ if (!wbt_is_tracked(stat)) {
if (rwb->sync_cookie == stat) {
rwb->sync_issue = 0;
rwb->sync_cookie = NULL;
}
- wb_timestamp(rwb, &rwb->last_comp);
+ if (wbt_is_read(stat))
+ wb_timestamp(rwb, &rwb->last_comp);
+ wbt_clear_state(stat);
} else {
WARN_ON_ONCE(stat == rwb->sync_cookie);
- __wbt_done(rwb);
- wbt_clear_tracked(stat);
+ __wbt_done(rwb, wbt_stat_to_mask(stat));
+ wbt_clear_state(stat);
}
}
-static void calc_wb_limits(struct rq_wb *rwb)
+/*
+ * Return true, if we can't increase the depth further by scaling
+ */
+static bool calc_wb_limits(struct rq_wb *rwb)
{
unsigned int depth;
+ bool ret = false;
if (!rwb->min_lat_nsec) {
rwb->wb_max = rwb->wb_normal = rwb->wb_background = 0;
- return;
+ return false;
}
/*
@@ -167,22 +206,44 @@ static void calc_wb_limits(struct rq_wb *rwb)
* scaling down, then keep a setting of 1/1/1.
*/
if (rwb->queue_depth == 1) {
- if (rwb->scale_step)
+ if (rwb->scale_step > 0)
rwb->wb_max = rwb->wb_normal = 1;
- else
+ else {
rwb->wb_max = rwb->wb_normal = 2;
+ ret = true;
+ }
rwb->wb_background = 1;
} else {
- depth = min_t(unsigned int, RWB_MAX_DEPTH, rwb->queue_depth);
+ /*
+ * scale_step == 0 is our default state. If we have suffered
+ * latency spikes, step will be > 0, and we shrink the
+ * allowed write depths. If step is < 0, we're only doing
+ * writes, and we allow a temporarily higher depth to
+ * increase performance.
+ */
+ depth = min_t(unsigned int, RWB_DEF_DEPTH, rwb->queue_depth);
+ if (rwb->scale_step > 0)
+ depth = 1 + ((depth - 1) >> min(31, rwb->scale_step));
+ else if (rwb->scale_step < 0) {
+ unsigned int maxd = 3 * rwb->queue_depth / 4;
+
+ depth = 1 + ((depth - 1) << -rwb->scale_step);
+ if (depth > maxd) {
+ depth = maxd;
+ ret = true;
+ }
+ }
/*
* Set our max/normal/bg queue depths based on how far
* we have scaled down (->scale_step).
*/
- rwb->wb_max = 1 + ((depth - 1) >> min(31U, rwb->scale_step));
+ rwb->wb_max = depth;
rwb->wb_normal = (rwb->wb_max + 1) / 2;
rwb->wb_background = (rwb->wb_max + 3) / 4;
}
+
+ return ret;
}
static bool inline stat_sample_valid(struct blk_rq_stat *stat)
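
For orientation, the new depth scaling above can be restated as a small standalone function with a worked example. This is illustrative only and simply mirrors the arithmetic in calc_wb_limits(): start from min(RWB_DEF_DEPTH, queue_depth), roughly halve per positive step, roughly double per negative step but never past 75% of the queue depth, then derive the normal and background limits from the max.

#include <stdio.h>

#define RWB_DEF_DEPTH	16

/* illustrative restatement of the depth arithmetic in calc_wb_limits() */
static unsigned int scaled_wb_max(unsigned int queue_depth, int scale_step)
{
	unsigned int depth = queue_depth < RWB_DEF_DEPTH ? queue_depth : RWB_DEF_DEPTH;

	if (scale_step > 0) {
		/* latency spikes seen: roughly halve the depth per step */
		depth = 1 + ((depth - 1) >> (scale_step > 31 ? 31 : scale_step));
	} else if (scale_step < 0) {
		/* write-only boost, but never past 75% of the device queue depth */
		unsigned int maxd = 3 * queue_depth / 4;

		depth = 1 + ((depth - 1) << -scale_step);
		if (depth > maxd)
			depth = maxd;
	}
	return depth;
}

int main(void)
{
	int step;

	/* queue_depth 32: step -1 -> 24 (capped), 0 -> 16, 1 -> 8, 2 -> 4 */
	for (step = -1; step <= 2; step++) {
		unsigned int max = scaled_wb_max(32, step);

		printf("step %2d: max %2u normal %2u background %2u\n",
		       step, max, (max + 1) / 2, (max + 3) / 4);
	}
	return 0;
}
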
@@ -209,8 +270,9 @@ static u64 rwb_sync_issue_lat(struct rq_wb *rwb)
}
enum {
- LAT_OK,
+ LAT_OK = 1,
LAT_UNKNOWN,
+ LAT_UNKNOWN_WRITES,
LAT_EXCEEDED,
};
@@ -234,8 +296,21 @@ static int __latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat)
return LAT_EXCEEDED;
}
- if (!stat_sample_valid(stat))
+ /*
+ * No read/write mix, if stat isn't valid
+ */
+ if (!stat_sample_valid(stat)) {
+ /*
+ * If we had writes in this stat window and the window is
+ * current, we're only doing writes. If a task recently
+ * waited or still has writes in flight, consider us doing
+ * just writes as well.
+ */
+ if ((stat[1].nr_samples && rwb->stat_ops->is_current(stat)) ||
+ wb_recent_wait(rwb) || wbt_inflight(rwb))
+ return LAT_UNKNOWN_WRITES;
return LAT_UNKNOWN;
+ }
/*
* If the 'min' latency exceeds our target, step down.
@@ -269,23 +344,27 @@ static void rwb_trace_step(struct rq_wb *rwb, const char *msg)
static void scale_up(struct rq_wb *rwb)
{
/*
- * If we're at 0, we can't go lower.
+ * Hit max in previous round, stop here
*/
- if (!rwb->scale_step)
+ if (rwb->scaled_max)
return;
rwb->scale_step--;
rwb->unknown_cnt = 0;
rwb->stat_ops->clear(rwb->ops_data);
- calc_wb_limits(rwb);
- if (waitqueue_active(&rwb->wait))
- wake_up_all(&rwb->wait);
+ rwb->scaled_max = calc_wb_limits(rwb);
+
+ rwb_wake_all(rwb);
rwb_trace_step(rwb, "step up");
}
-static void scale_down(struct rq_wb *rwb)
+/*
+ * Scale rwb down. If 'hard_throttle' is set, do it quicker, since we
+ * had a latency violation.
+ */
+static void scale_down(struct rq_wb *rwb, bool hard_throttle)
{
/*
* Stop scaling down when we've hit the limit. This also prevents
@@ -295,7 +374,12 @@ static void scale_down(struct rq_wb *rwb)
if (rwb->wb_max == 1)
return;
- rwb->scale_step++;
+ if (rwb->scale_step < 0 && hard_throttle)
+ rwb->scale_step = 0;
+ else
+ rwb->scale_step++;
+
+ rwb->scaled_max = false;
rwb->unknown_cnt = 0;
rwb->stat_ops->clear(rwb->ops_data);
calc_wb_limits(rwb);
@@ -306,13 +390,23 @@ static void rwb_arm_timer(struct rq_wb *rwb)
{
unsigned long expires;
- /*
- * We should speed this up, using some variant of a fast integer
- * inverse square root calculation. Since we only do this for
- * every window expiration, it's not a huge deal, though.
- */
- rwb->cur_win_nsec = div_u64(rwb->win_nsec << 4,
+ if (rwb->scale_step > 0) {
+ /*
+ * We should speed this up, using some variant of a fast
+ * integer inverse square root calculation. Since we only do
+ * this for every window expiration, it's not a huge deal,
+ * though.
+ */
+ rwb->cur_win_nsec = div_u64(rwb->win_nsec << 4,
int_sqrt((rwb->scale_step + 1) << 8));
+ } else {
+ /*
+ * For step < 0, we don't want to increase/decrease the
+ * window size.
+ */
+ rwb->cur_win_nsec = rwb->win_nsec;
+ }
+
expires = jiffies + nsecs_to_jiffies(rwb->cur_win_nsec);
mod_timer(&rwb->window_timer, expires);
}
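
The fixed-point expression above works out to win_nsec * 16 / int_sqrt((scale_step + 1) * 256), i.e. roughly win_nsec / sqrt(scale_step + 1) for positive steps, while zero and negative steps keep the default window. A small illustrative userspace check of those numbers (int_sqrt64() is a naive stand-in for the kernel's int_sqrt()):

#include <stdio.h>
#include <stdint.h>

/* naive stand-in for the kernel's int_sqrt(); fine for these small inputs */
static uint64_t int_sqrt64(uint64_t x)
{
	uint64_t r = 0;

	while ((r + 1) * (r + 1) <= x)
		r++;
	return r;
}

/* mirrors rwb_arm_timer(): shrink the window only for positive steps */
static uint64_t scaled_window_nsec(uint64_t win_nsec, int scale_step)
{
	if (scale_step <= 0)
		return win_nsec;
	return (win_nsec << 4) / int_sqrt64(((uint64_t)scale_step + 1) << 8);
}

int main(void)
{
	int step;

	/* 100msec default window: step 0 -> 100ms, 1 -> ~72.7ms, 3 -> 50ms */
	for (step = 0; step <= 3; step++)
		printf("step %d: %llu nsec\n", step,
		       (unsigned long long)scaled_window_nsec(100ULL * 1000 * 1000, step));
	return 0;
}
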
@@ -320,28 +414,45 @@ static void rwb_arm_timer(struct rq_wb *rwb)
static void wb_timer_fn(unsigned long data)
{
struct rq_wb *rwb = (struct rq_wb *) data;
+ unsigned int inflight = wbt_inflight(rwb);
int status;
+ status = latency_exceeded(rwb);
+
+ trace_wbt_timer(rwb->bdi, status, rwb->scale_step, inflight);
+
/*
* If we exceeded the latency target, step down. If we did not,
* step one level up. If we don't know enough to say either exceeded
* or ok, then don't do anything.
*/
- status = latency_exceeded(rwb);
switch (status) {
case LAT_EXCEEDED:
- scale_down(rwb);
+ scale_down(rwb, true);
break;
case LAT_OK:
scale_up(rwb);
break;
+ case LAT_UNKNOWN_WRITES:
+ scale_up(rwb);
+ break;
case LAT_UNKNOWN:
+ if (++rwb->unknown_cnt < RWB_UNKNOWN_BUMP)
+ break;
/*
- * We had no read samples, start bumping up the write
- * depth slowly
+ * We get here for two reasons:
+ *
+ * 1) We previously scaled to a reduced depth, and we currently
+ * don't have a valid read/write sample. For that case,
+ * slowly return to center state (step == 0).
+ * 2) We started at the center step, but don't have a valid
+ * read/write sample, but we do have writes going on.
+ * Allow step to go negative, to increase write perf.
*/
- if (++rwb->unknown_cnt >= RWB_UNKNOWN_BUMP)
+ if (rwb->scale_step > 0)
scale_up(rwb);
+ else if (rwb->scale_step < 0)
+ scale_down(rwb, false);
break;
default:
break;
@@ -350,17 +461,17 @@ static void wb_timer_fn(unsigned long data)
/*
* Re-arm timer, if we have IO in flight
*/
- if (rwb->scale_step || atomic_read(&rwb->inflight))
+ if (rwb->scale_step || inflight)
rwb_arm_timer(rwb);
}
void wbt_update_limits(struct rq_wb *rwb)
{
rwb->scale_step = 0;
+ rwb->scaled_max = false;
calc_wb_limits(rwb);
- if (waitqueue_active(&rwb->wait))
- wake_up_all(&rwb->wait);
+ rwb_wake_all(rwb);
}
static bool close_io(struct rq_wb *rwb)
@@ -378,13 +489,14 @@ static inline unsigned int get_limit(struct rq_wb *rwb, unsigned long rw)
unsigned int limit;
/*
- * At this point we know it's a buffered write. If REQ_SYNC is
- * set, then it's WB_SYNC_ALL writeback, and we'll use the max
- * limit for that. If the write is marked as a background write,
- * then use the idle limit, or go to normal if we haven't had
- * competing IO for a bit.
+ * At this point we know it's a buffered write. If this is
+ * kswapd trying to free memory, or REQ_SYNC is set, then
+ * it's WB_SYNC_ALL writeback, and we'll use the max limit for
+ * that. If the write is marked as a background write, then use
+ * the idle limit, or go to normal if we haven't had competing
+ * IO for a bit.
*/
- if ((rw & REQ_HIPRIO) || atomic_read(&rwb->bdi->wb.dirty_sleeping))
+ if ((rw & REQ_HIPRIO) || wb_recent_wait(rwb) || current_is_kswapd())
limit = rwb->wb_max;
else if ((rw & REQ_BG) || close_io(rwb)) {
/*
@@ -398,7 +510,8 @@ static inline unsigned int get_limit(struct rq_wb *rwb, unsigned long rw)
return limit;
}
-static inline bool may_queue(struct rq_wb *rwb, unsigned long rw)
+static inline bool may_queue(struct rq_wb *rwb, struct rq_wait *rqw,
+ unsigned long rw)
{
/*
* inc it here even if disabled, since we'll dec it at completion.
@@ -406,11 +519,11 @@ static inline bool may_queue(struct rq_wb *rwb, unsigned long rw)
* and someone turned it off at the same time.
*/
if (!rwb_enabled(rwb)) {
- atomic_inc(&rwb->inflight);
+ atomic_inc(&rqw->inflight);
return true;
}
- return atomic_inc_below(&rwb->inflight, get_limit(rwb, rw));
+ return atomic_inc_below(&rqw->inflight, get_limit(rwb, rw));
}
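
may_queue() leans on atomic_inc_below(), which is defined earlier in wbt.c and not part of this diff. A minimal sketch of the semantics it is assumed to have: increment the counter only while it is still below the limit, and report whether the increment happened.

static bool atomic_inc_below(atomic_t *v, int below)
{
	int cur = atomic_read(v);

	for (;;) {
		int old;

		/* already at (or past) the limit: refuse without touching it */
		if (cur >= below)
			return false;
		/* try to bump it; retry if someone else changed it meanwhile */
		old = atomic_cmpxchg(v, cur, cur + 1);
		if (old == cur)
			break;
		cur = old;
	}

	return true;
}
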
/*
@@ -419,16 +532,17 @@ static inline bool may_queue(struct rq_wb *rwb, unsigned long rw)
*/
static void __wbt_wait(struct rq_wb *rwb, unsigned long rw, spinlock_t *lock)
{
+ struct rq_wait *rqw = get_rq_wait(rwb, current_is_kswapd());
DEFINE_WAIT(wait);
- if (may_queue(rwb, rw))
+ if (may_queue(rwb, rqw, rw))
return;
do {
- prepare_to_wait_exclusive(&rwb->wait, &wait,
+ prepare_to_wait_exclusive(&rqw->wait, &wait,
TASK_UNINTERRUPTIBLE);
- if (may_queue(rwb, rw))
+ if (may_queue(rwb, rqw, rw))
break;
if (lock)
@@ -440,15 +554,17 @@ static void __wbt_wait(struct rq_wb *rwb, unsigned long rw, spinlock_t *lock)
spin_lock_irq(lock);
} while (1);
- finish_wait(&rwb->wait, &wait);
+ finish_wait(&rqw->wait, &wait);
}
static inline bool wbt_should_throttle(struct rq_wb *rwb, unsigned int rw)
{
+ const int op = rw >> BIO_OP_SHIFT;
+
/*
* If not a WRITE (or a discard), do nothing
*/
- if (!(rw & REQ_WRITE) || (rw & REQ_DISCARD))
+ if (!(op == REQ_OP_WRITE || op == REQ_OP_DISCARD))
return false;
/*
@@ -466,14 +582,20 @@ static inline bool wbt_should_throttle(struct rq_wb *rwb, unsigned int rw)
* in an irq held spinlock, if it holds one when calling this function.
* If we do sleep, we'll release and re-grab it.
*/
-bool wbt_wait(struct rq_wb *rwb, unsigned int rw, spinlock_t *lock)
+unsigned int wbt_wait(struct rq_wb *rwb, unsigned int rw, spinlock_t *lock)
{
+ unsigned int ret = 0;
+
if (!rwb_enabled(rwb))
- return false;
+ return 0;
+
+ if ((rw >> BIO_OP_SHIFT) == REQ_OP_READ)
+ ret = WBT_READ;
if (!wbt_should_throttle(rwb, rw)) {
- wb_timestamp(rwb, &rwb->last_issue);
- return false;
+ if (ret & WBT_READ)
+ wb_timestamp(rwb, &rwb->last_issue);
+ return ret;
}
__wbt_wait(rwb, rw, lock);
@@ -481,7 +603,10 @@ bool wbt_wait(struct rq_wb *rwb, unsigned int rw, spinlock_t *lock)
if (!timer_pending(&rwb->window_timer))
rwb_arm_timer(rwb);
- return true;
+ if (current_is_kswapd())
+ ret |= WBT_KSWAPD;
+
+ return ret | WBT_TRACKED;
}
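
wbt_wait() now hands back a flag mask rather than a bool. In the real path that mask is stashed in the request's wb_issue_stat and recovered at completion via wbt_stat_to_mask(), as wbt_done() above shows; the sketch below short-circuits that and calls __wbt_done() directly, just to show the round trip. It is illustrative only: q->rq_wb, bio->bi_opf and the locking context are assumptions about the caller, not part of this file.

	unsigned int wb_acct;

	spin_lock_irq(q->queue_lock);
	/* may drop and re-take q->queue_lock internally if it has to sleep */
	wb_acct = wbt_wait(q->rq_wb, bio->bi_opf, q->queue_lock);
	/* ... allocate and issue the request, remembering wb_acct ... */
	spin_unlock_irq(q->queue_lock);

	/* completion path: __wbt_done() ignores the call unless WBT_TRACKED is set */
	__wbt_done(q->rq_wb, wb_acct);
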
void wbt_issue(struct rq_wb *rwb, struct wb_issue_stat *stat)
@@ -499,7 +624,7 @@ void wbt_issue(struct rq_wb *rwb, struct wb_issue_stat *stat)
* only use the address to compare with, which is why we store the
* sync_issue time locally.
*/
- if (!wbt_tracked(stat) && !rwb->sync_issue) {
+ if (wbt_is_read(stat) && !rwb->sync_issue) {
rwb->sync_cookie = stat;
rwb->sync_issue = wbt_issue_stat_get_time(stat);
}
@@ -531,9 +656,11 @@ void wbt_set_write_cache(struct rq_wb *rwb, bool write_cache_on)
void wbt_disable(struct rq_wb *rwb)
{
- del_timer_sync(&rwb->window_timer);
- rwb->win_nsec = rwb->min_lat_nsec = 0;
- wbt_update_limits(rwb);
+ if (rwb) {
+ del_timer_sync(&rwb->window_timer);
+ rwb->win_nsec = rwb->min_lat_nsec = 0;
+ wbt_update_limits(rwb);
+ }
}
EXPORT_SYMBOL_GPL(wbt_disable);
@@ -541,20 +668,27 @@ struct rq_wb *wbt_init(struct backing_dev_info *bdi, struct wb_stat_ops *ops,
void *ops_data)
{
struct rq_wb *rwb;
+ int i;
+
+ if (!ops->get || !ops->is_current || !ops->clear)
+ return ERR_PTR(-EINVAL);
rwb = kzalloc(sizeof(*rwb), GFP_KERNEL);
if (!rwb)
return ERR_PTR(-ENOMEM);
- atomic_set(&rwb->inflight, 0);
- init_waitqueue_head(&rwb->wait);
+ for (i = 0; i < WBT_NUM_RWQ; i++) {
+ atomic_set(&rwb->rq_wait[i].inflight, 0);
+ init_waitqueue_head(&rwb->rq_wait[i].wait);
+ }
+
setup_timer(&rwb->window_timer, wb_timer_fn, (unsigned long) rwb);
rwb->wc = 1;
- rwb->queue_depth = RWB_MAX_DEPTH;
+ rwb->queue_depth = RWB_DEF_DEPTH;
rwb->last_comp = rwb->last_issue = jiffies;
rwb->bdi = bdi;
rwb->win_nsec = RWB_WINDOW_NSEC;
- rwb->stat_ops = ops,
+ rwb->stat_ops = ops;
rwb->ops_data = ops_data;
wbt_update_limits(rwb);
return rwb;
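
wbt_init() now rejects a wb_stat_ops that is missing ->get, ->is_current or ->clear, so a caller has to wire up all three. A hypothetical registration sketch, with every my_* name, the q->rq_wb field and the QUEUE_FLAG_WC test being illustrative assumptions (callback bodies elided):

static struct wb_stat_ops my_wb_stat_ops = {
	.get		= my_stats_get,		/* snapshot the latency stat window */
	.is_current	= my_stats_is_current,	/* is that window still live? */
	.clear		= my_stats_clear,	/* reset the stat window */
};

static int my_wbt_enable(struct request_queue *q)
{
	struct rq_wb *rwb;

	rwb = wbt_init(&q->backing_dev_info, &my_wb_stat_ops, q);
	if (IS_ERR(rwb))
		return PTR_ERR(rwb);

	/* let wbt know whether the device has a volatile write cache */
	wbt_set_write_cache(rwb, test_bit(QUEUE_FLAG_WC, &q->queue_flags));
	q->rq_wb = rwb;
	return 0;
}
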