Diffstat (limited to 'block/bfq-iosched.c')
-rw-r--r--  block/bfq-iosched.c  810
1 file changed, 568 insertions(+), 242 deletions(-)
diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index 9190a5554..eef6ff49c 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -83,7 +83,7 @@ static const int bfq_back_max = 16 * 1024;
static const int bfq_back_penalty = 2;
/* Idling period duration, in ns. */
-static u64 bfq_slice_idle = NSEC_PER_SEC / 125;
+static u32 bfq_slice_idle = NSEC_PER_SEC / 125;
/* Minimum number of assigned budgets for which stats are safe to compute. */
static const int bfq_stats_min_budgets = 194;
@@ -103,7 +103,7 @@ static const int bfq_timeout = HZ / 8;
struct kmem_cache *bfq_pool;
-/* Below this threshold (in ms), we consider thinktime immediate. */
+/* Below this threshold (in ns), we consider thinktime immediate. */
#define BFQ_MIN_TT (2 * NSEC_PER_MSEC)
/* hw_tag detection: parallel requests threshold and min samples needed. */
@@ -114,8 +114,12 @@ struct kmem_cache *bfq_pool;
#define BFQQ_CLOSE_THR (sector_t)(8 * 1024)
#define BFQQ_SEEKY(bfqq) (hweight32(bfqq->seek_history) > 32/8)
-/* Min samples used for peak rate estimation (for autotuning). */
-#define BFQ_PEAK_RATE_SAMPLES 32
+/* Min number of samples required to perform peak-rate update */
+#define BFQ_RATE_MIN_SAMPLES 32
+/* Min observation time interval required to perform a peak-rate update (ns) */
+#define BFQ_RATE_MIN_INTERVAL (300*NSEC_PER_MSEC)
+/* Target observation time interval for a peak-rate update (ns) */
+#define BFQ_RATE_REF_INTERVAL NSEC_PER_SEC
/* Shift used for peak rate fixed precision calculations. */
#define BFQ_RATE_SHIFT 16
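Reader's note, not part of the patch: peak rates in BFQ are kept in fixed point, as sectors per microsecond scaled by BFQ_RATE_SHIFT. A minimal, self-contained sketch of the conversion the new log statements below perform, with the two constants repeated only so it compiles on its own:

#include <stdint.h>

#define BFQ_RATE_SHIFT 16          /* same scaling as above */
#define USEC_PER_SEC   1000000ULL

/* Convert a fixed-point rate (sectors/usec << BFQ_RATE_SHIFT) to sectors/sec. */
static inline uint64_t rate_to_sectors_per_sec(uint64_t rate_fp)
{
	return (USEC_PER_SEC * rate_fp) >> BFQ_RATE_SHIFT;
}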
@@ -634,6 +638,23 @@ bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_io_cq *bic)
bfq_mark_bfqq_IO_bound(bfqq);
else
bfq_clear_bfqq_IO_bound(bfqq);
+
+ bfqq->wr_coeff = bic->saved_wr_coeff;
+ bfqq->wr_start_at_switch_to_srt = bic->saved_wr_start_at_switch_to_srt;
+ BUG_ON(time_is_after_jiffies(bfqq->wr_start_at_switch_to_srt));
+ bfqq->last_wr_start_finish = bic->saved_last_wr_start_finish;
+ BUG_ON(time_is_after_jiffies(bfqq->last_wr_start_finish));
+
+ if (bfqq->wr_coeff > 1 && (bfq_bfqq_in_large_burst(bfqq) ||
+ time_is_before_jiffies(bfqq->last_wr_start_finish +
+ bfqq->wr_cur_max_time))) {
+ bfq_log_bfqq(bfqq->bfqd, bfqq,
+ "resume state: switching off wr");
+
+ bfqq->wr_coeff = 1;
+ }
+ /* make sure weight will be updated, however we got here */
+ bfqq->entity.prio_changed = 1;
}
static int bfqq_process_refs(struct bfq_queue *bfqq)
@@ -1052,6 +1073,7 @@ static bool bfq_bfqq_update_budg_for_activation(struct bfq_data *bfqd,
* operation, is reset only when bfqq is selected for
* service (see bfq_get_next_queue).
*/
+ BUG_ON(bfqq->max_budget < 0);
entity->budget = min_t(unsigned long,
bfq_bfqq_budget_left(bfqq),
bfqq->max_budget);
@@ -1060,6 +1082,7 @@ static bool bfq_bfqq_update_budg_for_activation(struct bfq_data *bfqd,
return true;
}
+ BUG_ON(bfqq->max_budget < 0);
entity->budget = max_t(unsigned long, bfqq->max_budget,
bfq_serv_to_charge(bfqq->next_rq, bfqq));
BUG_ON(entity->budget < 0);
@@ -1082,6 +1105,7 @@ static void bfq_update_bfqq_wr_on_rq_arrival(struct bfq_data *bfqd,
bfqq->wr_coeff = bfqd->bfq_wr_coeff;
bfqq->wr_cur_max_time = bfq_wr_duration(bfqd);
} else {
+ bfqq->wr_start_at_switch_to_srt = jiffies;
bfqq->wr_coeff = bfqd->bfq_wr_coeff *
BFQ_SOFTRT_WEIGHT_FACTOR;
bfqq->wr_cur_max_time =
@@ -1115,32 +1139,13 @@ static void bfq_update_bfqq_wr_on_rq_arrival(struct bfq_data *bfqd,
jiffies,
jiffies_to_msecs(bfqq->
wr_cur_max_time));
- } else if (time_before(
- bfqq->last_wr_start_finish +
- bfqq->wr_cur_max_time,
- jiffies +
- bfqd->bfq_wr_rt_max_time) &&
- soft_rt) {
+ } else if (soft_rt) {
/*
- * The remaining weight-raising time is lower
- * than bfqd->bfq_wr_rt_max_time, which means
- * that the application is enjoying weight
- * raising either because deemed soft-rt in
- * the near past, or because deemed interactive
- * a long ago.
- * In both cases, resetting now the current
- * remaining weight-raising time for the
- * application to the weight-raising duration
- * for soft rt applications would not cause any
- * latency increase for the application (as the
- * new duration would be higher than the
- * remaining time).
- *
- * In addition, the application is now meeting
- * the requirements for being deemed soft rt.
- * In the end we can correctly and safely
- * (re)charge the weight-raising duration for
- * the application with the weight-raising
+ * The application is now or still meeting the
+ * requirements for being deemed soft rt. We
+ * can then correctly and safely (re)charge
+ * the weight-raising duration for the
+ * application with the weight-raising
* duration for soft rt applications.
*
* In particular, doing this recharge now, i.e.,
@@ -1164,14 +1169,22 @@ static void bfq_update_bfqq_wr_on_rq_arrival(struct bfq_data *bfqd,
* latency because the application is not
* weight-raised while they are pending.
*/
+ if (bfqq->wr_cur_max_time !=
+ bfqd->bfq_wr_rt_max_time) {
+ bfqq->wr_start_at_switch_to_srt =
+ bfqq->last_wr_start_finish;
+ BUG_ON(time_is_after_jiffies(bfqq->last_wr_start_finish));
+
+ bfqq->wr_cur_max_time =
+ bfqd->bfq_wr_rt_max_time;
+ bfqq->wr_coeff = bfqd->bfq_wr_coeff *
+ BFQ_SOFTRT_WEIGHT_FACTOR;
+ bfq_log_bfqq(bfqd, bfqq,
+ "switching to soft_rt wr");
+ } else
+ bfq_log_bfqq(bfqd, bfqq,
+ "moving forward soft_rt wr duration");
bfqq->last_wr_start_finish = jiffies;
- bfqq->wr_cur_max_time =
- bfqd->bfq_wr_rt_max_time;
- bfqq->wr_coeff = bfqd->bfq_wr_coeff *
- BFQ_SOFTRT_WEIGHT_FACTOR;
- bfq_log_bfqq(bfqd, bfqq,
- "switching to soft_rt wr, or "
- " just moving forward duration");
}
}
}
@@ -1450,14 +1463,24 @@ static struct request *bfq_find_rq_fmerge(struct bfq_data *bfqd,
return NULL;
}
+static sector_t get_sdist(sector_t last_pos, struct request *rq)
+{
+ sector_t sdist = 0;
+
+ if (last_pos) {
+ if (last_pos < blk_rq_pos(rq))
+ sdist = blk_rq_pos(rq) - last_pos;
+ else
+ sdist = last_pos - blk_rq_pos(rq);
+ }
+
+ return sdist;
+}
+
static void bfq_activate_request(struct request_queue *q, struct request *rq)
{
struct bfq_data *bfqd = q->elevator->elevator_data;
-
bfqd->rq_in_driver++;
- bfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq);
- bfq_log(bfqd, "activate_request: new bfqd->last_position %llu",
- (unsigned long long) bfqd->last_position);
}
static void bfq_deactivate_request(struct request_queue *q, struct request *rq)
@@ -1622,11 +1645,16 @@ static void bfq_bfqq_end_wr(struct bfq_queue *bfqq)
bfqq->bfqd->wr_busy_queues--;
bfqq->wr_coeff = 1;
bfqq->wr_cur_max_time = 0;
+ bfqq->last_wr_start_finish = jiffies;
/*
* Trigger a weight change on the next invocation of
* __bfq_entity_update_weight_prio.
*/
bfqq->entity.prio_changed = 1;
+ bfq_log_bfqq(bfqq->bfqd, bfqq,
+ "end_wr: wrais ending at %lu, rais_max_time %u",
+ bfqq->last_wr_start_finish,
+ jiffies_to_msecs(bfqq->wr_cur_max_time));
bfq_log_bfqq(bfqq->bfqd, bfqq, "end_wr: wr_busy %d",
bfqq->bfqd->wr_busy_queues);
}
@@ -1934,18 +1962,24 @@ check_scheduled:
static void bfq_bfqq_save_state(struct bfq_queue *bfqq)
{
+ struct bfq_io_cq *bic = bfqq->bic;
+
/*
* If !bfqq->bic, the queue is already shared or its requests
* have already been redirected to a shared queue; both idle window
* and weight raising state have already been saved. Do nothing.
*/
- if (!bfqq->bic)
+ if (!bic)
return;
- bfqq->bic->saved_idle_window = bfq_bfqq_idle_window(bfqq);
- bfqq->bic->saved_IO_bound = bfq_bfqq_IO_bound(bfqq);
- bfqq->bic->saved_in_large_burst = bfq_bfqq_in_large_burst(bfqq);
- bfqq->bic->was_in_burst_list = !hlist_unhashed(&bfqq->burst_list_node);
+ bic->saved_idle_window = bfq_bfqq_idle_window(bfqq);
+ bic->saved_IO_bound = bfq_bfqq_IO_bound(bfqq);
+ bic->saved_in_large_burst = bfq_bfqq_in_large_burst(bfqq);
+ bic->was_in_burst_list = !hlist_unhashed(&bfqq->burst_list_node);
+ bic->saved_wr_coeff = bfqq->wr_coeff;
+ bic->saved_wr_start_at_switch_to_srt = bfqq->wr_start_at_switch_to_srt;
+ bic->saved_last_wr_start_finish = bfqq->last_wr_start_finish;
+ BUG_ON(time_is_after_jiffies(bfqq->last_wr_start_finish));
}
static void bfq_get_bic_reference(struct bfq_queue *bfqq)
@@ -1984,6 +2018,7 @@ bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic,
new_bfqq->wr_coeff = bfqq->wr_coeff;
new_bfqq->wr_cur_max_time = bfqq->wr_cur_max_time;
new_bfqq->last_wr_start_finish = bfqq->last_wr_start_finish;
+ new_bfqq->wr_start_at_switch_to_srt = bfqq->wr_start_at_switch_to_srt;
if (bfq_bfqq_busy(new_bfqq))
bfqd->wr_busy_queues++;
new_bfqq->entity.prio_changed = 1;
@@ -2116,9 +2151,10 @@ static void __bfq_set_in_service_queue(struct bfq_data *bfqd,
BUG_ON(bfqq == bfqd->in_service_queue);
BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list));
- if (bfqq->wr_coeff > 1 &&
+ if (time_is_before_jiffies(bfqq->last_wr_start_finish) &&
+ bfqq->wr_coeff > 1 &&
bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time &&
- time_is_before_jiffies(bfqq->budget_timeout)) {
+ time_is_before_jiffies(bfqq->budget_timeout)) {
/*
* For soft real-time queues, move the start
* of the weight-raising period forward by the
@@ -2144,7 +2180,20 @@ static void __bfq_set_in_service_queue(struct bfq_data *bfqd,
* request.
*/
bfqq->last_wr_start_finish += jiffies -
- bfqq->budget_timeout;
+ max_t(unsigned long, bfqq->last_wr_start_finish,
+ bfqq->budget_timeout);
+ if (time_is_after_jiffies(bfqq->last_wr_start_finish)) {
+ pr_crit(
+ "BFQ WARNING:last %lu budget %lu jiffies %lu",
+ bfqq->last_wr_start_finish,
+ bfqq->budget_timeout,
+ jiffies);
+ pr_crit("diff %lu", jiffies -
+ max_t(unsigned long,
+ bfqq->last_wr_start_finish,
+ bfqq->budget_timeout));
+ bfqq->last_wr_start_finish = jiffies;
+ }
}
bfq_set_budget_timeout(bfqd, bfqq);
@@ -2172,7 +2221,7 @@ static void bfq_arm_slice_timer(struct bfq_data *bfqd)
{
struct bfq_queue *bfqq = bfqd->in_service_queue;
struct bfq_io_cq *bic;
- unsigned long sl;
+ u32 sl;
BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list));
@@ -2206,15 +2255,326 @@ static void bfq_arm_slice_timer(struct bfq_data *bfqd)
*/
if (BFQQ_SEEKY(bfqq) && bfqq->wr_coeff == 1 &&
bfq_symmetric_scenario(bfqd))
- sl = min_t(u64, sl, BFQ_MIN_TT);
+ sl = min_t(u32, sl, BFQ_MIN_TT);
bfqd->last_idling_start = ktime_get();
hrtimer_start(&bfqd->idle_slice_timer, ns_to_ktime(sl),
HRTIMER_MODE_REL);
bfqg_stats_set_start_idle_time(bfqq_group(bfqq));
- bfq_log(bfqd, "arm idle: %llu/%llu ms",
- div_u64(sl, NSEC_PER_MSEC),
- div_u64(bfqd->bfq_slice_idle, NSEC_PER_MSEC));
+ bfq_log(bfqd, "arm idle: %ld/%ld ms",
+ sl / NSEC_PER_MSEC, bfqd->bfq_slice_idle / NSEC_PER_MSEC);
+}
+
+/*
+ * In autotuning mode, max_budget is dynamically recomputed as the
+ * amount of sectors transferred in timeout at the estimated peak
+ * rate. This enables BFQ to utilize a full timeslice with a full
+ * budget, even if the in-service queue is served at peak rate. And
+ * this maximises throughput with sequential workloads.
+ */
+static unsigned long bfq_calc_max_budget(struct bfq_data *bfqd)
+{
+ return (u64)bfqd->peak_rate * USEC_PER_MSEC *
+ jiffies_to_msecs(bfqd->bfq_timeout)>>BFQ_RATE_SHIFT;
+}
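A reviewer-style sketch of the arithmetic above, with hypothetical names and the fixed-point assumption made explicit (peak_rate in sectors/usec << BFQ_RATE_SHIFT, timeout in milliseconds); it is an illustration, not part of the patch:

#include <stdint.h>

#define BFQ_RATE_SHIFT 16
#define USEC_PER_MSEC  1000ULL

/* Sectors that fit in one timeout at the estimated peak rate. */
static uint64_t sketch_max_budget(uint64_t peak_rate_fp, uint64_t timeout_ms)
{
	return (peak_rate_fp * USEC_PER_MSEC * timeout_ms) >> BFQ_RATE_SHIFT;
}

For example, a ~512 MB/s device does about 1 sector/usec, i.e. 1 << 16 in fixed point; with the default 125 ms timeout (bfq_timeout = HZ/8) this yields 125000 sectors, roughly 64 MB per budget.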
+
+/*
+ * Update parameters related to throughput and responsiveness, as a
+ * function of the estimated peak rate. See comments on
+ * bfq_calc_max_budget(), and on T_slow and T_fast arrays.
+ */
+void update_thr_responsiveness_params(struct bfq_data *bfqd)
+{
+ int dev_type = blk_queue_nonrot(bfqd->queue);
+
+ if (bfqd->bfq_user_max_budget == 0) {
+ bfqd->bfq_max_budget =
+ bfq_calc_max_budget(bfqd);
+ BUG_ON(bfqd->bfq_max_budget < 0);
+ bfq_log(bfqd, "new max_budget = %d",
+ bfqd->bfq_max_budget);
+ }
+
+ if (bfqd->device_speed == BFQ_BFQD_FAST &&
+ bfqd->peak_rate < device_speed_thresh[dev_type]) {
+ bfqd->device_speed = BFQ_BFQD_SLOW;
+ bfqd->RT_prod = R_slow[dev_type] *
+ T_slow[dev_type];
+ } else if (bfqd->device_speed == BFQ_BFQD_SLOW &&
+ bfqd->peak_rate > device_speed_thresh[dev_type]) {
+ bfqd->device_speed = BFQ_BFQD_FAST;
+ bfqd->RT_prod = R_fast[dev_type] *
+ T_fast[dev_type];
+ }
+
+ bfq_log(bfqd,
+"dev_type %s dev_speed_class = %s (%llu sects/sec), thresh %llu setcs/sec",
+ dev_type == 0 ? "ROT" : "NONROT",
+ bfqd->device_speed == BFQ_BFQD_FAST ? "FAST" : "SLOW",
+ bfqd->device_speed == BFQ_BFQD_FAST ?
+ (USEC_PER_SEC*(u64)R_fast[dev_type])>>BFQ_RATE_SHIFT :
+ (USEC_PER_SEC*(u64)R_slow[dev_type])>>BFQ_RATE_SHIFT,
+ (USEC_PER_SEC*(u64)device_speed_thresh[dev_type])>>
+ BFQ_RATE_SHIFT);
+}
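A short reader's note on why the product is cached (hedged, based on how bfq_wr_duration() elsewhere in this file consumes RT_prod):

/*
 * RT_prod = R_ref * T_ref caches "reference rate * reference load time"
 * for the current device class; the interactive weight-raising duration
 * is then obtained roughly as RT_prod / peak_rate, i.e. the reference
 * load time stretched or shrunk by how much slower or faster this
 * device is than the reference one.
 */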
+
+void bfq_reset_rate_computation(struct bfq_data *bfqd, struct request *rq)
+{
+ if (rq != NULL) { /* new rq dispatch now, reset accordingly */
+ bfqd->last_dispatch = bfqd->first_dispatch = ktime_get_ns();
+ bfqd->peak_rate_samples = 1;
+ bfqd->sequential_samples = 0;
+ bfqd->tot_sectors_dispatched = bfqd->last_rq_max_size =
+ blk_rq_sectors(rq);
+ } else /* no new rq dispatched, just reset the number of samples */
+ bfqd->peak_rate_samples = 0; /* full re-init on next disp. */
+
+ bfq_log(bfqd,
+ "reset_rate_computation at end, sample %u/%u tot_sects %llu",
+ bfqd->peak_rate_samples, bfqd->sequential_samples,
+ bfqd->tot_sectors_dispatched);
+}
+
+void bfq_update_rate_reset(struct bfq_data *bfqd, struct request *rq)
+{
+ u32 rate, weight, divisor;
+
+ /*
+ * For the convergence property to hold (see comments on
+ * bfq_update_peak_rate()) and for the assessment to be
+ * reliable, a minimum number of samples must be present, and
+ * a minimum amount of time must have elapsed. If not so, do
+ * not compute new rate. Just reset parameters, to get ready
+ * for a new evaluation attempt.
+ */
+ if (bfqd->peak_rate_samples < BFQ_RATE_MIN_SAMPLES ||
+ bfqd->delta_from_first < BFQ_RATE_MIN_INTERVAL) {
+ bfq_log(bfqd,
+ "update_rate_reset: only resetting, delta_first %lluus samples %d",
+ bfqd->delta_from_first>>10, bfqd->peak_rate_samples);
+ goto reset_computation;
+ }
+
+ /*
+ * If a new request completion has occurred after last
+ * dispatch, then, to approximate the rate at which requests
+ * have been served by the device, it is more precise to
+ * extend the observation interval to the last completion.
+ */
+ bfqd->delta_from_first =
+ max_t(u64, bfqd->delta_from_first,
+ bfqd->last_completion - bfqd->first_dispatch);
+
+ BUG_ON(bfqd->delta_from_first == 0);
+ /*
+ * Rate computed in sects/usec, and not sects/nsec, for
+ * precision issues.
+ */
+ rate = div64_ul(bfqd->tot_sectors_dispatched<<BFQ_RATE_SHIFT,
+ div_u64(bfqd->delta_from_first, NSEC_PER_USEC));
+
+ bfq_log(bfqd,
+"update_rate_reset: tot_sects %llu delta_first %lluus rate %llu sects/s (%d)",
+ bfqd->tot_sectors_dispatched, bfqd->delta_from_first>>10,
+ ((USEC_PER_SEC*(u64)rate)>>BFQ_RATE_SHIFT),
+ rate > 20<<BFQ_RATE_SHIFT);
+
+ /*
+ * Peak rate not updated if:
+ * - the percentage of sequential dispatches is below 3/4 of the
+ * total, and rate is below the current estimated peak rate
+ * - rate is unreasonably high (> 20M sectors/sec)
+ */
+ if ((bfqd->peak_rate_samples > (3 * bfqd->sequential_samples)>>2 &&
+ rate <= bfqd->peak_rate) ||
+ rate > 20<<BFQ_RATE_SHIFT) {
+ bfq_log(bfqd,
+ "update_rate_reset: goto reset, samples %u/%u rate/peak %llu/%llu",
+ bfqd->peak_rate_samples, bfqd->sequential_samples,
+ ((USEC_PER_SEC*(u64)rate)>>BFQ_RATE_SHIFT),
+ ((USEC_PER_SEC*(u64)bfqd->peak_rate)>>BFQ_RATE_SHIFT));
+ goto reset_computation;
+ } else {
+ bfq_log(bfqd,
+ "update_rate_reset: do update, samples %u/%u rate/peak %llu/%llu",
+ bfqd->peak_rate_samples, bfqd->sequential_samples,
+ ((USEC_PER_SEC*(u64)rate)>>BFQ_RATE_SHIFT),
+ ((USEC_PER_SEC*(u64)bfqd->peak_rate)>>BFQ_RATE_SHIFT));
+ }
+
+ /*
+ * We have to update the peak rate, at last! To this purpose,
+ * we use a low-pass filter. We compute the smoothing constant
+ * of the filter as a function of the 'weight' of the new
+ * measured rate.
+ *
+ * As can be seen in next formulas, we define this weight as a
+ * quantity proportional to how sequential the workload is,
+ * and to how long the observation time interval is.
+ *
+ * The weight runs from 0 to 8. The maximum value of the
+ * weight, 8, yields the minimum value for the smoothing
+ * constant. At this minimum value for the smoothing constant,
+ * the measured rate contributes for half of the next value of
+ * the estimated peak rate.
+ *
+ * So, the first step is to compute the weight as a function
+ * of how sequential the workload is. Note that the weight
+ * cannot reach 9, because bfqd->sequential_samples cannot
+ * become equal to bfqd->peak_rate_samples, which, in its
+ * turn, holds true because bfqd->sequential_samples is not
+ * incremented for the first sample.
+ */
+ weight = (9 * bfqd->sequential_samples) / bfqd->peak_rate_samples;
+
+ /*
+ * Second step: further refine the weight as a function of the
+ * duration of the observation interval.
+ */
+ weight = min_t(u32, 8,
+ div_u64(weight * bfqd->delta_from_first,
+ BFQ_RATE_REF_INTERVAL));
+
+ /*
+ * Divisor ranging from 10, for minimum weight, to 2, for
+ * maximum weight.
+ */
+ divisor = 10 - weight;
+ BUG_ON(divisor == 0);
+
+ /*
+ * Finally, update peak rate:
+ *
+ * peak_rate = peak_rate * (divisor-1) / divisor + rate / divisor
+ */
+ bfqd->peak_rate *= divisor-1;
+ bfqd->peak_rate /= divisor;
+ rate /= divisor; /* smoothing constant alpha = 1/divisor */
+
+ bfq_log(bfqd,
+ "update_rate_reset: divisor %d tmp_peak_rate %llu tmp_rate %u",
+ divisor,
+ ((USEC_PER_SEC*(u64)bfqd->peak_rate)>>BFQ_RATE_SHIFT),
+ (u32)((USEC_PER_SEC*(u64)rate)>>BFQ_RATE_SHIFT));
+
+ BUG_ON(bfqd->peak_rate == 0);
+ BUG_ON(bfqd->peak_rate > 20<<BFQ_RATE_SHIFT);
+
+ bfqd->peak_rate += rate;
+ update_thr_responsiveness_params(bfqd);
+ BUG_ON(bfqd->peak_rate > 20<<BFQ_RATE_SHIFT);
+
+reset_computation:
+ bfq_reset_rate_computation(bfqd, rq);
+}
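A hedged sketch of the smoothing step implemented above, pulling the weight, divisor and alpha = 1/divisor filter into one standalone helper (illustrative names only, not part of the patch):

#include <stdint.h>

static uint64_t sketch_filter_peak_rate(uint64_t old_peak, uint64_t rate,
					uint32_t samples, uint32_t seq_samples,
					uint64_t delta_ns, uint64_t ref_interval_ns)
{
	uint32_t weight, divisor;

	weight = (9 * seq_samples) / samples;	/* sequentiality: 0..8 */
	weight = (uint32_t)((weight * delta_ns) / ref_interval_ns);
	if (weight > 8)				/* scale by interval length, cap at 8 */
		weight = 8;
	divisor = 10 - weight;			/* 10 (min weight) .. 2 (max weight) */

	/* alpha = 1/divisor: the old estimate keeps (divisor - 1)/divisor */
	return old_peak * (divisor - 1) / divisor + rate / divisor;
}

With, say, 40 samples of which 36 are sequential over a full 1 s interval, weight = 8 and divisor = 2, so the new measurement contributes half of the updated estimate.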
+
+/*
+ * Update the read/write peak rate (the main quantity used for
+ * auto-tuning, see update_thr_responsiveness_params()).
+ *
+ * It is not trivial to estimate the peak rate (correctly): because of
+ * the presence of sw and hw queues between the scheduler and the
+ * device components that finally serve I/O requests, it is hard to
+ * say exactly when a given dispatched request is served inside the
+ * device, and for how long. As a consequence, it is hard to know
+ * precisely at what rate a given set of requests is actually served
+ * by the device.
+ *
+ * On the opposite end, the dispatch time of any request is trivially
+ * available, and, from this piece of information, the "dispatch rate"
+ * of requests can be immediately computed. So, the idea in the next
+ * function is to use what is known, namely request dispatch times
+ * (plus, when useful, request completion times), to estimate what is
+ * unknown, namely in-device request service rate.
+ *
+ * The main issue is that, because of the above facts, the rate at
+ * which a certain set of requests is dispatched over a certain time
+ * interval can vary greatly with respect to the rate at which the
+ * same requests are then served. But, since the size of any
+ * intermediate queue is limited, and the service scheme is lossless
+ * (no request is silently dropped), the following obvious convergence
+ * property holds: the number of requests dispatched MUST become
+ * closer and closer to the number of requests completed as the
+ * observation interval grows. This is the key property used in
+ * the next function to estimate the peak service rate as a function
+ * of the observed dispatch rate. The function assumes to be invoked
+ * on every request dispatch.
+ */
+void bfq_update_peak_rate(struct bfq_data *bfqd, struct request *rq)
+{
+ u64 now_ns = ktime_get_ns();
+
+ if (bfqd->peak_rate_samples == 0) { /* first dispatch */
+ bfq_log(bfqd,
+ "update_peak_rate: goto reset, samples %d",
+ bfqd->peak_rate_samples);
+ bfq_reset_rate_computation(bfqd, rq);
+ goto update_last_values; /* will add one sample */
+ }
+
+ /*
+ * Device idle for very long: the observation interval lasting
+ * up to this dispatch cannot be a valid observation interval
+ * for computing a new peak rate (similarly to the late-
+ * completion event in bfq_completed_request()). Go to
+ * update_rate_and_reset to have the following three steps
+ * taken:
+ * - close the observation interval at the last (previous)
+ * request dispatch or completion
+ * - compute rate, if possible, for that observation interval
+ * - start a new observation interval with this dispatch
+ */
+ if (now_ns - bfqd->last_dispatch > 100*NSEC_PER_MSEC &&
+ bfqd->rq_in_driver == 0) {
+ bfq_log(bfqd,
+"update_peak_rate: jumping to updating&resetting delta_last %lluus samples %d",
+ (now_ns - bfqd->last_dispatch)>>10,
+ bfqd->peak_rate_samples);
+ goto update_rate_and_reset;
+ }
+
+ /* Update sampling information */
+ bfqd->peak_rate_samples++;
+
+ if ((bfqd->rq_in_driver > 0 ||
+ now_ns - bfqd->last_completion < BFQ_MIN_TT)
+ && get_sdist(bfqd->last_position, rq) < BFQQ_SEEK_THR)
+ bfqd->sequential_samples++;
+
+ bfqd->tot_sectors_dispatched += blk_rq_sectors(rq);
+
+ /* Reset max observed rq size every 32 dispatches */
+ if (likely(bfqd->peak_rate_samples % 32))
+ bfqd->last_rq_max_size =
+ max_t(u32, blk_rq_sectors(rq), bfqd->last_rq_max_size);
+ else
+ bfqd->last_rq_max_size = blk_rq_sectors(rq);
+
+ bfqd->delta_from_first = now_ns - bfqd->first_dispatch;
+
+ bfq_log(bfqd,
+ "update_peak_rate: added samples %u/%u tot_sects %llu delta_first %lluus",
+ bfqd->peak_rate_samples, bfqd->sequential_samples,
+ bfqd->tot_sectors_dispatched,
+ bfqd->delta_from_first>>10);
+
+ /* Target observation interval not yet reached, go on sampling */
+ if (bfqd->delta_from_first < BFQ_RATE_REF_INTERVAL)
+ goto update_last_values;
+
+update_rate_and_reset:
+ bfq_update_rate_reset(bfqd, rq);
+update_last_values:
+ bfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq);
+ bfqd->last_dispatch = now_ns;
+
+ bfq_log(bfqd,
+ "update_peak_rate: delta_first %lluus last_pos %llu peak_rate %llu",
+ (now_ns - bfqd->first_dispatch)>>10,
+ (unsigned long long) bfqd->last_position,
+ ((USEC_PER_SEC*(u64)bfqd->peak_rate)>>BFQ_RATE_SHIFT));
+ bfq_log(bfqd,
+ "update_peak_rate: samples at end %d", bfqd->peak_rate_samples);
}
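A reader's summary of the sampling flow above, written as a comment and only paraphrasing the function (no new behaviour):

/*
 * Observation-interval life cycle in bfq_update_peak_rate():
 *  - first dispatch: bfq_reset_rate_computation(rq) opens the interval;
 *  - every later dispatch: one more sample, possibly sequential, and
 *    blk_rq_sectors(rq) added to tot_sectors_dispatched;
 *  - interval >= BFQ_RATE_REF_INTERVAL (1 s): bfq_update_rate_reset(rq)
 *    runs the low-pass filter, if enough samples were collected, and
 *    reopens the interval;
 *  - device idle for more than 100 ms with no request in the driver:
 *    the interval is closed at the previous dispatch/completion and
 *    bfq_update_rate_reset(rq) is invoked as above.
 */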
/*
@@ -2235,6 +2595,7 @@ static void bfq_dispatch_insert(struct request_queue *q, struct request *rq)
* incrementing bfqq->dispatched.
*/
bfqq->dispatched++;
+ bfq_update_peak_rate(q->elevator->elevator_data, rq);
bfq_remove_request(rq);
elv_dispatch_sort(q, rq);
@@ -2474,27 +2835,15 @@ static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd,
bfqq->entity.budget);
}
-static unsigned long bfq_calc_max_budget(struct bfq_data *bfqd)
-{
- /*
- * The max_budget calculated when autotuning is equal to the
- * amount of sectors transferred in timeout at the
- * estimated peak rate.
- */
- return bfqd->peak_rate * 1000 * jiffies_to_msecs(bfqd->bfq_timeout) >>
- BFQ_RATE_SHIFT;
-}
-
/*
- * Update the read peak rate (quantity used for auto-tuning) as a
- * function of the rate at which bfqq has been served, and check
- * whether the process associated with bfqq is "slow". Return true if
- * the process is slow. The slow flag is used, in addition to the
- * budget timeout, to reduce the amount of service provided to seeky
- * processes, and hence reduce their chances to lower the
- * throughput. More details in the body of the function.
+ * Return true if the process associated with bfqq is "slow". The slow
+ * flag is used, in addition to the budget timeout, to reduce the
+ * amount of service provided to seeky processes, and thus reduce
+ * their chances to lower the throughput. More details in the comments
+ * on the function bfq_bfqq_expire().
*
- * An important observation is in order: with devices with internal
+ * An important observation is in order: as discussed in the comments
+ * on the function bfq_update_peak_rate(), with devices with internal
* queues, it is hard if ever possible to know when and for how long
* an I/O request is processed by the device (apart from the trivial
* I/O pattern where a new request is dispatched only after the
@@ -2502,26 +2851,27 @@ static unsigned long bfq_calc_max_budget(struct bfq_data *bfqd)
* the real rate at which the I/O requests of each bfq_queue are
* served. In fact, for an I/O scheduler like BFQ, serving a
* bfq_queue means just dispatching its requests during its service
- * slot, i.e., until the budget of the queue is exhausted, or the
- * queue remains idle, or, finally, a timeout fires. But, during the
- * service slot of a bfq_queue, the device may be still processing
- * requests of bfq_queues served in previous service slots. On the
- * opposite end, the requests of the in-service bfq_queue may be
- * completed after the service slot of the queue finishes. Anyway,
- * unless more sophisticated solutions are used (where possible), the
- * sum of the sizes of the requests dispatched during the service slot
- * of a bfq_queue is probably the only approximation available for
- * the service received by the bfq_queue during its service slot. And,
- * as written above, this sum is the quantity used in this function to
- * evaluate the peak rate.
+ * slot (i.e., until the budget of the queue is exhausted, or the
+ * queue remains idle, or, finally, a timeout fires). But, during the
+ * service slot of a bfq_queue, around 100 ms at most, the device may
+ * be even still processing requests of bfq_queues served in previous
+ * service slots. On the opposite end, the requests of the in-service
+ * bfq_queue may be completed after the service slot of the queue
+ * finishes.
+ *
+ * Anyway, unless more sophisticated solutions are used
+ * (where possible), the sum of the sizes of the requests dispatched
+ * during the service slot of a bfq_queue is probably the only
+ * approximation available for the service received by the bfq_queue
+ * during its service slot. And this sum is the quantity used in this
+ * function to evaluate the I/O speed of a process.
*/
-static bool bfq_update_peak_rate(struct bfq_data *bfqd, struct bfq_queue *bfqq,
+static bool bfq_bfqq_is_slow(struct bfq_data *bfqd, struct bfq_queue *bfqq,
bool compensate, enum bfqq_expiration reason,
unsigned long *delta_ms)
{
- u64 bw, bwdiv10, delta_usecs, delta_ms_tmp;
ktime_t delta_ktime;
- int update = 0;
+ u32 delta_usecs;
bool slow = BFQQ_SEEKY(bfqq); /* if delta too short, use seekyness */
if (!bfq_bfqq_sync(bfqq))
@@ -2534,129 +2884,45 @@ static bool bfq_update_peak_rate(struct bfq_data *bfqd, struct bfq_queue *bfqq,
delta_ktime = ktime_sub(delta_ktime, bfqd->last_budget_start);
delta_usecs = ktime_to_us(delta_ktime);
- /* Don't trust short/unrealistic values. */
+ /* don't trust short/unrealistic values. */
if (delta_usecs < 1000 || delta_usecs >= LONG_MAX) {
if (blk_queue_nonrot(bfqd->queue))
- *delta_ms = BFQ_MIN_TT; /*
- * give same worst-case
- * guarantees as
- * idling for seeky
- */
- else /* Charge at least one seek */
- *delta_ms = jiffies_to_msecs(bfq_slice_idle);
+ /*
+ * give same worst-case guarantees as idling
+ * for seeky
+ */
+ *delta_ms = BFQ_MIN_TT / NSEC_PER_MSEC;
+ else /* charge at least one seek */
+ *delta_ms = bfq_slice_idle / NSEC_PER_MSEC;
+
+ bfq_log(bfqd, "bfq_bfqq_is_slow: unrealistic %u", delta_usecs);
+
return slow;
}
- delta_ms_tmp = delta_usecs;
- do_div(delta_ms_tmp, NSEC_PER_MSEC);
- *delta_ms = delta_ms_tmp;
+ *delta_ms = delta_usecs / USEC_PER_MSEC;
/*
- * Calculate the bandwidth for the last slice. We use a 64 bit
- * value to store the peak rate, in sectors per usec in fixed
- * point math. We do so to have enough precision in the estimate
- * and to avoid overflows.
- */
- bw = (u64)bfqq->entity.service << BFQ_RATE_SHIFT;
- do_div(bw, (unsigned long)delta_usecs);
-
- bfq_log(bfqd, "measured bw = %llu sects/sec",
- (1000000*bw)>>BFQ_RATE_SHIFT);
- /*
- * Use only long (> 20ms) intervals to filter out spikes for
- * the peak rate estimation.
+ * Use only long (> 20ms) intervals to filter out excessive
+ * spikes in service rate estimation.
*/
if (delta_usecs > 20000) {
- bool fully_sequential = bfqq->seek_history == 0;
- /*
- * Soft real-time queues are not good candidates for
- * evaluating bw, as they are likely to be slow even
- * if sequential.
- */
- bool non_soft_rt = bfqq->wr_coeff == 1 ||
- bfqq->wr_cur_max_time != bfqd->bfq_wr_rt_max_time;
- bool consumed_large_budget =
- reason == BFQ_BFQQ_BUDGET_EXHAUSTED &&
- bfqq->entity.budget >= bfqd->bfq_max_budget * 2 / 3;
- bool served_for_long_time =
- reason == BFQ_BFQQ_BUDGET_TIMEOUT ||
- consumed_large_budget;
-
- BUG_ON(bfqq->seek_history == 0 &&
- hweight32(bfqq->seek_history) != 0);
-
- if (bw > bfqd->peak_rate ||
- (bfq_bfqq_sync(bfqq) && fully_sequential && non_soft_rt &&
- served_for_long_time)) {
- /*
- * To smooth oscillations use a low-pass filter with
- * alpha=9/10, i.e.,
- * new_rate = (9/10) * old_rate + (1/10) * bw
- */
- bwdiv10 = bw;
- do_div(bwdiv10, 10);
- if (bwdiv10 == 0)
- return false; /* bw too low to be used */
- bfqd->peak_rate *= 9;
- do_div(bfqd->peak_rate, 10);
- bfqd->peak_rate += bwdiv10;
- update = 1;
- bfq_log(bfqd, "new peak_rate = %llu sects/sec",
- (1000000*bfqd->peak_rate)>>BFQ_RATE_SHIFT);
- }
-
- update |= bfqd->peak_rate_samples == BFQ_PEAK_RATE_SAMPLES - 1;
-
- if (bfqd->peak_rate_samples < BFQ_PEAK_RATE_SAMPLES)
- bfqd->peak_rate_samples++;
-
- if (bfqd->peak_rate_samples == BFQ_PEAK_RATE_SAMPLES &&
- update) {
- int dev_type = blk_queue_nonrot(bfqd->queue);
-
- if (bfqd->bfq_user_max_budget == 0) {
- bfqd->bfq_max_budget =
- bfq_calc_max_budget(bfqd);
- bfq_log(bfqd, "new max_budget = %d",
- bfqd->bfq_max_budget);
- }
- if (bfqd->device_speed == BFQ_BFQD_FAST &&
- bfqd->peak_rate < device_speed_thresh[dev_type]) {
- bfqd->device_speed = BFQ_BFQD_SLOW;
- bfqd->RT_prod = R_slow[dev_type] *
- T_slow[dev_type];
- } else if (bfqd->device_speed == BFQ_BFQD_SLOW &&
- bfqd->peak_rate > device_speed_thresh[dev_type]) {
- bfqd->device_speed = BFQ_BFQD_FAST;
- bfqd->RT_prod = R_fast[dev_type] *
- T_fast[dev_type];
- }
- bfq_log(bfqd,
- "dev_type %d dev_speed_class = %d (%d sects/sec), thresh %d setcs/sec",
- dev_type, bfqd->device_speed,
- bfqd->device_speed == BFQ_BFQD_FAST ?
- (1000000*R_fast[dev_type])>>BFQ_RATE_SHIFT :
- (1000000*R_slow[dev_type])>>BFQ_RATE_SHIFT,
- (1000000*device_speed_thresh[dev_type])>>
- BFQ_RATE_SHIFT);
- }
/*
- * Caveat: processes doing IO in the slower disk zones
- * tend to be slow(er) even if not seeky. In this
- * respect, the estimated peak rate is likely to be an
- * average over the disk surface. Accordingly, to not
- * be too harsh with unlucky processes, a process is
- * deemed slow only if its bw has been lower than half
- * of the estimated peak rate.
+ * Caveat for rotational devices: processes doing I/O
+ * in the slower disk zones tend to be slow(er) even
+ * if not seeky. In this respect, the estimated peak
+ * rate is likely to be an average over the disk
+ * surface. Accordingly, to not be too harsh with
+ * unlucky processes, a process is deemed slow only if
+ * its rate has been lower than half of the estimated
+ * peak rate.
*/
- slow = bw < bfqd->peak_rate / 2;
+ slow = bfqq->entity.service < bfqd->bfq_max_budget / 2;
+ bfq_log(bfqd, "bfq_bfqq_is_slow: relative rate %d/%d",
+ bfqq->entity.service, bfqd->bfq_max_budget);
}
- bfq_log_bfqq(bfqd, bfqq,
- "update_peak_rate: bw %llu sect/s, peak rate %llu, slow %d",
- (1000000*bw)>>BFQ_RATE_SHIFT,
- (1000000*bfqd->peak_rate)>>BFQ_RATE_SHIFT,
- bw < bfqd->peak_rate / 2);
+ bfq_log_bfqq(bfqd, bfqq, "bfq_bfqq_is_slow: slow %d", slow);
return slow;
}
@@ -2785,10 +3051,9 @@ static void bfq_bfqq_expire(struct bfq_data *bfqd,
BUG_ON(bfqq != bfqd->in_service_queue);
/*
- * Update device peak rate for autotuning and check whether the
- * process is slow (see bfq_update_peak_rate).
+ * Check whether the process is slow (see bfq_bfqq_is_slow).
*/
- slow = bfq_update_peak_rate(bfqd, bfqq, compensate, reason, &delta);
+ slow = bfq_bfqq_is_slow(bfqd, bfqq, compensate, reason, &delta);
/*
* Increase service_from_backlogged before next statement,
@@ -3285,6 +3550,9 @@ static void bfq_update_wr_data(struct bfq_data *bfqd, struct bfq_queue *bfqq)
struct bfq_entity *entity = &bfqq->entity;
if (bfqq->wr_coeff > 1) { /* queue is being weight-raised */
+ BUG_ON(bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time &&
+ time_is_after_jiffies(bfqq->last_wr_start_finish));
+
bfq_log_bfqq(bfqd, bfqq,
"raising period dur %u/%u msec, old coeff %u, w %d(%d)",
jiffies_to_msecs(jiffies - bfqq->last_wr_start_finish),
@@ -3302,15 +3570,26 @@ static void bfq_update_wr_data(struct bfq_data *bfqd, struct bfq_queue *bfqq)
* time has elapsed from the beginning of this
* weight-raising period, then end weight raising.
*/
- if (bfq_bfqq_in_large_burst(bfqq) ||
- time_is_before_jiffies(bfqq->last_wr_start_finish +
- bfqq->wr_cur_max_time)) {
- bfqq->last_wr_start_finish = jiffies;
- bfq_log_bfqq(bfqd, bfqq,
- "wrais ending at %lu, rais_max_time %u",
- bfqq->last_wr_start_finish,
- jiffies_to_msecs(bfqq->wr_cur_max_time));
+ if (bfq_bfqq_in_large_burst(bfqq))
bfq_bfqq_end_wr(bfqq);
+ else if (time_is_before_jiffies(bfqq->last_wr_start_finish +
+ bfqq->wr_cur_max_time)) {
+ if (bfqq->wr_cur_max_time != bfqd->bfq_wr_rt_max_time ||
+ time_is_before_jiffies(bfqq->wr_start_at_switch_to_srt +
+ bfq_wr_duration(bfqd)))
+ bfq_bfqq_end_wr(bfqq);
+ else {
+ /* switch back to interactive wr */
+ bfqq->wr_coeff = bfqd->bfq_wr_coeff;
+ bfqq->wr_cur_max_time = bfq_wr_duration(bfqd);
+ bfqq->last_wr_start_finish =
+ bfqq->wr_start_at_switch_to_srt;
+ BUG_ON(time_is_after_jiffies(
+ bfqq->last_wr_start_finish));
+ bfqq->entity.prio_changed = 1;
+ bfq_log_bfqq(bfqd, bfqq,
+ "back to interactive wr");
+ }
}
}
/* Update weight both if it must be raised and if it must be lowered */
@@ -3468,6 +3747,21 @@ static int bfq_dispatch_requests(struct request_queue *q, int force)
if (unlikely(force))
return bfq_forced_dispatch(bfqd);
+ /*
+ * Force device to serve one request at a time if
+ * strict_guarantees is true. Forcing this service scheme is
+ * currently the ONLY way to guarantee that the request
+ * service order enforced by the scheduler is respected by a
+ * queueing device. Otherwise the device is free even to make
+ * some unlucky request wait for as long as the device
+ * wishes.
+ *
+ * Of course, serving one request at a time may cause loss of
+ * throughput.
+ */
+ if (bfqd->strict_guarantees && bfqd->rq_in_driver > 0)
+ return 0;
+
bfqq = bfq_select_queue(bfqd);
if (!bfqq)
return 0;
@@ -3701,9 +3995,11 @@ static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
bfqq->pid = pid;
bfqq->wr_coeff = 1;
- bfqq->last_wr_start_finish = bfq_smallest_from_now();
+ bfqq->last_wr_start_finish = jiffies;
+ bfqq->wr_start_at_switch_to_srt = bfq_smallest_from_now();
bfqq->budget_timeout = bfq_smallest_from_now();
bfqq->split_time = bfq_smallest_from_now();
+
/*
* Set to the value for which bfqq will not be deemed as
* soft rt when it becomes backlogged.
@@ -3797,7 +4093,7 @@ static void bfq_update_io_thinktime(struct bfq_data *bfqd,
struct bfq_ttime *ttime = &bic->ttime;
u64 elapsed = ktime_get_ns() - bic->ttime.last_end_request;
- elapsed = min(elapsed, 2UL * bfqd->bfq_slice_idle);
+ elapsed = min_t(u64, elapsed, 2 * bfqd->bfq_slice_idle);
ttime->ttime_samples = (7*bic->ttime.ttime_samples + 256) / 8;
ttime->ttime_total = div_u64(7*ttime->ttime_total + 256*elapsed, 8);
@@ -3805,22 +4101,13 @@ static void bfq_update_io_thinktime(struct bfq_data *bfqd,
ttime->ttime_samples);
}
-
static void
bfq_update_io_seektime(struct bfq_data *bfqd, struct bfq_queue *bfqq,
struct request *rq)
{
- sector_t sdist = 0;
-
- if (bfqq->last_request_pos) {
- if (bfqq->last_request_pos < blk_rq_pos(rq))
- sdist = blk_rq_pos(rq) - bfqq->last_request_pos;
- else
- sdist = bfqq->last_request_pos - blk_rq_pos(rq);
- }
-
bfqq->seek_history <<= 1;
- bfqq->seek_history |= (sdist > BFQQ_SEEK_THR);
+ bfqq->seek_history |=
+ get_sdist(bfqq->last_request_pos, rq) > BFQQ_SEEK_THR;
}
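A reader's note on how this feeds seekiness detection, restating the BFQQ_SEEKY definition near the top of the file rather than adding anything new:

/*
 * seek_history is a 32-bit shift register: each arriving request shifts
 * in one bit, set when its distance from the previous request exceeds
 * BFQQ_SEEK_THR. BFQQ_SEEKY() then flags the queue as seeky when more
 * than 32/8 = 4 of the last 32 requests were that far apart.
 */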
/*
@@ -4013,6 +4300,8 @@ static void bfq_completed_request(struct request_queue *q, struct request *rq)
{
struct bfq_queue *bfqq = RQ_BFQQ(rq);
struct bfq_data *bfqd = bfqq->bfqd;
+ u64 now_ns;
+ u32 delta_us;
bfq_log_bfqq(bfqd, bfqq, "completed one req with %u sects left",
blk_rq_sectors(rq));
@@ -4043,7 +4332,44 @@ static void bfq_completed_request(struct request_queue *q, struct request *rq)
&bfqd->queue_weights_tree);
}
- RQ_BIC(rq)->ttime.last_end_request = ktime_get_ns();
+ now_ns = ktime_get_ns();
+
+ RQ_BIC(rq)->ttime.last_end_request = now_ns;
+
+ /*
+ * Using us instead of ns, to get a reasonable precision in
+ * computing rate in next check.
+ */
+ delta_us = div_u64(now_ns - bfqd->last_completion, NSEC_PER_USEC);
+
+ bfq_log(bfqd, "rq_completed: delta %uus/%luus max_size %u rate %llu/%llu",
+ delta_us, BFQ_MIN_TT/NSEC_PER_USEC, bfqd->last_rq_max_size,
+ (USEC_PER_SEC*
+ (u64)((bfqd->last_rq_max_size<<BFQ_RATE_SHIFT)/delta_us))
+ >>BFQ_RATE_SHIFT,
+ (USEC_PER_SEC*(u64)(1UL<<(BFQ_RATE_SHIFT-10)))>>BFQ_RATE_SHIFT);
+
+ /*
+ * If the request took rather long to complete, and, according
+ * to the maximum request size recorded, this completion latency
+ * implies that the request was certainly served at a very low
+ * rate (less than 1M sectors/sec), then the whole observation
+ * interval that lasts up to this time instant cannot be a
+ * valid time interval for computing a new peak rate. Invoke
+ * bfq_update_rate_reset to have the following three steps
+ * taken:
+ * - close the observation interval at the last (previous)
+ * request dispatch or completion
+ * - compute rate, if possible, for that observation interval
+ * - reset to zero samples, which will trigger a proper
+ * re-initialization of the observation interval on next
+ * dispatch
+ */
+ if (delta_us > BFQ_MIN_TT/NSEC_PER_USEC &&
+ (bfqd->last_rq_max_size<<BFQ_RATE_SHIFT)/delta_us <
+ 1UL<<(BFQ_RATE_SHIFT - 10))
+ bfq_update_rate_reset(bfqd, NULL);
+ bfqd->last_completion = now_ns;
/*
* If we are waiting to discover whether the request pattern
@@ -4729,14 +5055,6 @@ static ssize_t bfq_weights_store(struct elevator_queue *e,
return count;
}
-static unsigned long bfq_estimated_max_budget(struct bfq_data *bfqd)
-{
- if (bfqd->peak_rate_samples >= BFQ_PEAK_RATE_SAMPLES)
- return bfq_calc_max_budget(bfqd);
- else
- return bfq_default_max_budget;
-}
-
static ssize_t bfq_max_budget_store(struct elevator_queue *e,
const char *page, size_t count)
{
@@ -4745,7 +5063,7 @@ static ssize_t bfq_max_budget_store(struct elevator_queue *e,
int ret = bfq_var_store(&__data, (page), count);
if (__data == 0)
- bfqd->bfq_max_budget = bfq_estimated_max_budget(bfqd);
+ bfqd->bfq_max_budget = bfq_calc_max_budget(bfqd);
else {
if (__data > INT_MAX)
__data = INT_MAX;
@@ -4775,7 +5093,7 @@ static ssize_t bfq_timeout_sync_store(struct elevator_queue *e,
bfqd->bfq_timeout = msecs_to_jiffies(__data);
if (bfqd->bfq_user_max_budget == 0)
- bfqd->bfq_max_budget = bfq_estimated_max_budget(bfqd);
+ bfqd->bfq_max_budget = bfq_calc_max_budget(bfqd);
return ret;
}
@@ -4790,8 +5108,8 @@ static ssize_t bfq_strict_guarantees_store(struct elevator_queue *e,
if (__data > 1)
__data = 1;
if (!bfqd->strict_guarantees && __data == 1
- && bfqd->bfq_slice_idle < msecs_to_jiffies(8))
- bfqd->bfq_slice_idle = msecs_to_jiffies(8);
+ && bfqd->bfq_slice_idle < 8 * NSEC_PER_MSEC)
+ bfqd->bfq_slice_idle = 8 * NSEC_PER_MSEC;
bfqd->strict_guarantees = __data;
@@ -4891,7 +5209,7 @@ static struct blkcg_policy blkcg_policy_bfq = {
static int __init bfq_init(void)
{
int ret;
- char msg[50] = "BFQ I/O-scheduler: v8r3";
+ char msg[50] = "BFQ I/O-scheduler: v8r4";
#ifdef CONFIG_BFQ_GROUP_IOSCHED
ret = blkcg_policy_register(&blkcg_policy_bfq);
@@ -4904,14 +5222,22 @@ static int __init bfq_init(void)
goto err_pol_unreg;
/*
- * Times to load large popular applications for the typical systems
- * installed on the reference devices (see the comments before the
- * definitions of the two arrays).
+ * Times to load large popular applications for the typical
+ * systems installed on the reference devices (see the
+ * comments before the definitions of the next two
+ * arrays). Actually, we use slightly slower values, as the
+ * estimated peak rate tends to be smaller than the actual
+ * peak rate. The reason for this last fact is that estimates
+ * are computed over much shorter time intervals than the long
+ * intervals typically used for benchmarking. Why? First, to
+ * adapt more quickly to variations. Second, because an I/O
+ * scheduler cannot rely on a peak-rate-evaluation workload to
+ * be run for a long time.
*/
- T_slow[0] = msecs_to_jiffies(3500);
- T_slow[1] = msecs_to_jiffies(1500);
- T_fast[0] = msecs_to_jiffies(8000);
- T_fast[1] = msecs_to_jiffies(3000);
+ T_slow[0] = msecs_to_jiffies(3500); /* actually 4 sec */
+ T_slow[1] = msecs_to_jiffies(1000); /* actually 1.5 sec */
+ T_fast[0] = msecs_to_jiffies(7000); /* actually 8 sec */
+ T_fast[1] = msecs_to_jiffies(2500); /* actually 3 sec */
/*
* Thresholds that determine the switch between speed classes