From 037d32aa8f748e39844d2a5b607fb063b4583843 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Fabian=20Silva=20Delgado?= Date: Mon, 24 Oct 2016 00:01:43 -0300 Subject: Linux-libre 4.8.4-gnu --- block/bfq-iosched.c | 810 ++++++++++++++++++++++++++++++++++++---------------- block/bfq-sched.c | 32 +++ block/bfq.h | 39 ++- block/cfq-iosched.c | 13 +- 4 files changed, 644 insertions(+), 250 deletions(-) (limited to 'block') diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 9190a5554..eef6ff49c 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -83,7 +83,7 @@ static const int bfq_back_max = 16 * 1024; static const int bfq_back_penalty = 2; /* Idling period duration, in ns. */ -static u64 bfq_slice_idle = NSEC_PER_SEC / 125; +static u32 bfq_slice_idle = NSEC_PER_SEC / 125; /* Minimum number of assigned budgets for which stats are safe to compute. */ static const int bfq_stats_min_budgets = 194; @@ -103,7 +103,7 @@ static const int bfq_timeout = HZ / 8; struct kmem_cache *bfq_pool; -/* Below this threshold (in ms), we consider thinktime immediate. */ +/* Below this threshold (in ns), we consider thinktime immediate. */ #define BFQ_MIN_TT (2 * NSEC_PER_MSEC) /* hw_tag detection: parallel requests threshold and min samples needed. */ @@ -114,8 +114,12 @@ struct kmem_cache *bfq_pool; #define BFQQ_CLOSE_THR (sector_t)(8 * 1024) #define BFQQ_SEEKY(bfqq) (hweight32(bfqq->seek_history) > 32/8) -/* Min samples used for peak rate estimation (for autotuning). */ -#define BFQ_PEAK_RATE_SAMPLES 32 +/* Min number of samples required to perform peak-rate update */ +#define BFQ_RATE_MIN_SAMPLES 32 +/* Min observation time interval required to perform a peak-rate update (ns) */ +#define BFQ_RATE_MIN_INTERVAL 300*NSEC_PER_MSEC +/* Target observation time interval for a peak-rate update (ns) */ +#define BFQ_RATE_REF_INTERVAL NSEC_PER_SEC /* Shift used for peak rate fixed precision calculations. */ #define BFQ_RATE_SHIFT 16 @@ -634,6 +638,23 @@ bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_io_cq *bic) bfq_mark_bfqq_IO_bound(bfqq); else bfq_clear_bfqq_IO_bound(bfqq); + + bfqq->wr_coeff = bic->saved_wr_coeff; + bfqq->wr_start_at_switch_to_srt = bic->saved_wr_start_at_switch_to_srt; + BUG_ON(time_is_after_jiffies(bfqq->wr_start_at_switch_to_srt)); + bfqq->last_wr_start_finish = bic->saved_last_wr_start_finish; + BUG_ON(time_is_after_jiffies(bfqq->last_wr_start_finish)); + + if (bfqq->wr_coeff > 1 && (bfq_bfqq_in_large_burst(bfqq) || + time_is_before_jiffies(bfqq->last_wr_start_finish + + bfqq->wr_cur_max_time))) { + bfq_log_bfqq(bfqq->bfqd, bfqq, + "resume state: switching off wr"); + + bfqq->wr_coeff = 1; + } + /* make sure weight will be updated, however we got here */ + bfqq->entity.prio_changed = 1; } static int bfqq_process_refs(struct bfq_queue *bfqq) @@ -1052,6 +1073,7 @@ static bool bfq_bfqq_update_budg_for_activation(struct bfq_data *bfqd, * operation, is reset only when bfqq is selected for * service (see bfq_get_next_queue). 
*/ + BUG_ON(bfqq->max_budget < 0); entity->budget = min_t(unsigned long, bfq_bfqq_budget_left(bfqq), bfqq->max_budget); @@ -1060,6 +1082,7 @@ static bool bfq_bfqq_update_budg_for_activation(struct bfq_data *bfqd, return true; } + BUG_ON(bfqq->max_budget < 0); entity->budget = max_t(unsigned long, bfqq->max_budget, bfq_serv_to_charge(bfqq->next_rq, bfqq)); BUG_ON(entity->budget < 0); @@ -1082,6 +1105,7 @@ static void bfq_update_bfqq_wr_on_rq_arrival(struct bfq_data *bfqd, bfqq->wr_coeff = bfqd->bfq_wr_coeff; bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); } else { + bfqq->wr_start_at_switch_to_srt = jiffies; bfqq->wr_coeff = bfqd->bfq_wr_coeff * BFQ_SOFTRT_WEIGHT_FACTOR; bfqq->wr_cur_max_time = @@ -1115,32 +1139,13 @@ static void bfq_update_bfqq_wr_on_rq_arrival(struct bfq_data *bfqd, jiffies, jiffies_to_msecs(bfqq-> wr_cur_max_time)); - } else if (time_before( - bfqq->last_wr_start_finish + - bfqq->wr_cur_max_time, - jiffies + - bfqd->bfq_wr_rt_max_time) && - soft_rt) { + } else if (soft_rt) { /* - * The remaining weight-raising time is lower - * than bfqd->bfq_wr_rt_max_time, which means - * that the application is enjoying weight - * raising either because deemed soft-rt in - * the near past, or because deemed interactive - * a long ago. - * In both cases, resetting now the current - * remaining weight-raising time for the - * application to the weight-raising duration - * for soft rt applications would not cause any - * latency increase for the application (as the - * new duration would be higher than the - * remaining time). - * - * In addition, the application is now meeting - * the requirements for being deemed soft rt. - * In the end we can correctly and safely - * (re)charge the weight-raising duration for - * the application with the weight-raising + * The application is now or still meeting the + * requirements for being deemed soft rt. We + * can then correctly and safely (re)charge + * the weight-raising duration for the + * application with the weight-raising * duration for soft rt applications. * * In particular, doing this recharge now, i.e., @@ -1164,14 +1169,22 @@ static void bfq_update_bfqq_wr_on_rq_arrival(struct bfq_data *bfqd, * latency because the application is not * weight-raised while they are pending. 
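The branch added just below distinguishes an actual switch to soft real-time weight raising from a mere extension of an ongoing soft-rt period. A minimal standalone sketch of that decision follows; the struct and the parameters are simplified stand-ins, not the real bfq_queue fields.

/* Simplified stand-in for the weight-raising fields involved (assumption). */
struct wr_state {
	unsigned int  wr_coeff;                 /* current wr coefficient */
	unsigned long wr_cur_max_time;          /* length of current wr period */
	unsigned long last_wr_start_finish;     /* start of current wr period */
	unsigned long wr_start_at_switch_to_srt;
};

/*
 * A request arrives from a queue that now qualifies as soft real-time:
 * if the queue is not already in soft-rt weight raising, remember when
 * the interrupted (interactive) period started, so that a later
 * fall-back to interactive weight raising remains possible; in any
 * case (re)start the soft-rt period now.
 */
static void charge_soft_rt_wr(struct wr_state *q, unsigned long now,
			      unsigned int base_coeff,
			      unsigned int srt_factor,
			      unsigned long srt_max_time)
{
	if (q->wr_cur_max_time != srt_max_time) {
		q->wr_start_at_switch_to_srt = q->last_wr_start_finish;
		q->wr_cur_max_time = srt_max_time;
		q->wr_coeff = base_coeff * srt_factor;
	}
	q->last_wr_start_finish = now;
}

Recording wr_start_at_switch_to_srt at the moment of the switch is what later lets the expiry path fall back to the interrupted interactive period instead of ending weight raising outright.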
*/ + if (bfqq->wr_cur_max_time != + bfqd->bfq_wr_rt_max_time) { + bfqq->wr_start_at_switch_to_srt = + bfqq->last_wr_start_finish; + BUG_ON(time_is_after_jiffies(bfqq->last_wr_start_finish)); + + bfqq->wr_cur_max_time = + bfqd->bfq_wr_rt_max_time; + bfqq->wr_coeff = bfqd->bfq_wr_coeff * + BFQ_SOFTRT_WEIGHT_FACTOR; + bfq_log_bfqq(bfqd, bfqq, + "switching to soft_rt wr"); + } else + bfq_log_bfqq(bfqd, bfqq, + "moving forward soft_rt wr duration"); bfqq->last_wr_start_finish = jiffies; - bfqq->wr_cur_max_time = - bfqd->bfq_wr_rt_max_time; - bfqq->wr_coeff = bfqd->bfq_wr_coeff * - BFQ_SOFTRT_WEIGHT_FACTOR; - bfq_log_bfqq(bfqd, bfqq, - "switching to soft_rt wr, or " - " just moving forward duration"); } } } @@ -1450,14 +1463,24 @@ static struct request *bfq_find_rq_fmerge(struct bfq_data *bfqd, return NULL; } +static sector_t get_sdist(sector_t last_pos, struct request *rq) +{ + sector_t sdist = 0; + + if (last_pos) { + if (last_pos < blk_rq_pos(rq)) + sdist = blk_rq_pos(rq) - last_pos; + else + sdist = last_pos - blk_rq_pos(rq); + } + + return sdist; +} + static void bfq_activate_request(struct request_queue *q, struct request *rq) { struct bfq_data *bfqd = q->elevator->elevator_data; - bfqd->rq_in_driver++; - bfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq); - bfq_log(bfqd, "activate_request: new bfqd->last_position %llu", - (unsigned long long) bfqd->last_position); } static void bfq_deactivate_request(struct request_queue *q, struct request *rq) @@ -1622,11 +1645,16 @@ static void bfq_bfqq_end_wr(struct bfq_queue *bfqq) bfqq->bfqd->wr_busy_queues--; bfqq->wr_coeff = 1; bfqq->wr_cur_max_time = 0; + bfqq->last_wr_start_finish = jiffies; /* * Trigger a weight change on the next invocation of * __bfq_entity_update_weight_prio. */ bfqq->entity.prio_changed = 1; + bfq_log_bfqq(bfqq->bfqd, bfqq, + "end_wr: wrais ending at %lu, rais_max_time %u", + bfqq->last_wr_start_finish, + jiffies_to_msecs(bfqq->wr_cur_max_time)); bfq_log_bfqq(bfqq->bfqd, bfqq, "end_wr: wr_busy %d", bfqq->bfqd->wr_busy_queues); } @@ -1934,18 +1962,24 @@ check_scheduled: static void bfq_bfqq_save_state(struct bfq_queue *bfqq) { + struct bfq_io_cq *bic = bfqq->bic; + /* * If !bfqq->bic, the queue is already shared or its requests * have already been redirected to a shared queue; both idle window * and weight raising state have already been saved. Do nothing. 
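bfq_bfqq_save_state(), whose body follows, now also stashes the weight-raising state in the bfq_io_cq, and bfq_bfqq_resume_state(), patched earlier in this file, restores it and switches weight raising off if it expired while the queue was merged. A compressed sketch of that round trip on simplified stand-in structures (an illustration, not the kernel types):

/* Only the fields this sketch needs; the real code uses bfq_io_cq/bfq_queue. */
struct saved_wr {
	unsigned int  coeff;
	unsigned long last_start_finish;
	unsigned long start_at_switch_to_srt;
};

struct queue_wr {
	unsigned int  coeff;
	unsigned long cur_max_time;
	unsigned long last_start_finish;
	unsigned long start_at_switch_to_srt;
	int           in_large_burst;
	int           prio_changed;
};

static void save_wr(const struct queue_wr *q, struct saved_wr *s)
{
	s->coeff = q->coeff;
	s->last_start_finish = q->last_start_finish;
	s->start_at_switch_to_srt = q->start_at_switch_to_srt;
}

static void resume_wr(struct queue_wr *q, const struct saved_wr *s,
		      unsigned long now)
{
	q->coeff = s->coeff;
	q->last_start_finish = s->last_start_finish;
	q->start_at_switch_to_srt = s->start_at_switch_to_srt;

	/* wr expired while merged, or queue is in a large burst: drop it */
	if (q->coeff > 1 &&
	    (q->in_large_burst ||
	     now - q->last_start_finish > q->cur_max_time))
		q->coeff = 1;

	q->prio_changed = 1;	/* force a weight update either way */
}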
*/ - if (!bfqq->bic) + if (!bic) return; - bfqq->bic->saved_idle_window = bfq_bfqq_idle_window(bfqq); - bfqq->bic->saved_IO_bound = bfq_bfqq_IO_bound(bfqq); - bfqq->bic->saved_in_large_burst = bfq_bfqq_in_large_burst(bfqq); - bfqq->bic->was_in_burst_list = !hlist_unhashed(&bfqq->burst_list_node); + bic->saved_idle_window = bfq_bfqq_idle_window(bfqq); + bic->saved_IO_bound = bfq_bfqq_IO_bound(bfqq); + bic->saved_in_large_burst = bfq_bfqq_in_large_burst(bfqq); + bic->was_in_burst_list = !hlist_unhashed(&bfqq->burst_list_node); + bic->saved_wr_coeff = bfqq->wr_coeff; + bic->saved_wr_start_at_switch_to_srt = bfqq->wr_start_at_switch_to_srt; + bic->saved_last_wr_start_finish = bfqq->last_wr_start_finish; + BUG_ON(time_is_after_jiffies(bfqq->last_wr_start_finish)); } static void bfq_get_bic_reference(struct bfq_queue *bfqq) @@ -1984,6 +2018,7 @@ bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic, new_bfqq->wr_coeff = bfqq->wr_coeff; new_bfqq->wr_cur_max_time = bfqq->wr_cur_max_time; new_bfqq->last_wr_start_finish = bfqq->last_wr_start_finish; + new_bfqq->wr_start_at_switch_to_srt = bfqq->wr_start_at_switch_to_srt; if (bfq_bfqq_busy(new_bfqq)) bfqd->wr_busy_queues++; new_bfqq->entity.prio_changed = 1; @@ -2116,9 +2151,10 @@ static void __bfq_set_in_service_queue(struct bfq_data *bfqd, BUG_ON(bfqq == bfqd->in_service_queue); BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list)); - if (bfqq->wr_coeff > 1 && + if (time_is_before_jiffies(bfqq->last_wr_start_finish) && + bfqq->wr_coeff > 1 && bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time && - time_is_before_jiffies(bfqq->budget_timeout)) { + time_is_before_jiffies(bfqq->budget_timeout)) { /* * For soft real-time queues, move the start * of the weight-raising period forward by the @@ -2144,7 +2180,20 @@ static void __bfq_set_in_service_queue(struct bfq_data *bfqd, * request. */ bfqq->last_wr_start_finish += jiffies - - bfqq->budget_timeout; + max_t(unsigned long, bfqq->last_wr_start_finish, + bfqq->budget_timeout); + if (time_is_after_jiffies(bfqq->last_wr_start_finish)) { + pr_crit( + "BFQ WARNING:last %lu budget %lu jiffies %lu", + bfqq->last_wr_start_finish, + bfqq->budget_timeout, + jiffies); + pr_crit("diff %lu", jiffies - + max_t(unsigned long, + bfqq->last_wr_start_finish, + bfqq->budget_timeout)); + bfqq->last_wr_start_finish = jiffies; + } } bfq_set_budget_timeout(bfqd, bfqq); @@ -2172,7 +2221,7 @@ static void bfq_arm_slice_timer(struct bfq_data *bfqd) { struct bfq_queue *bfqq = bfqd->in_service_queue; struct bfq_io_cq *bic; - unsigned long sl; + u32 sl; BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list)); @@ -2206,15 +2255,326 @@ static void bfq_arm_slice_timer(struct bfq_data *bfqd) */ if (BFQQ_SEEKY(bfqq) && bfqq->wr_coeff == 1 && bfq_symmetric_scenario(bfqd)) - sl = min_t(u64, sl, BFQ_MIN_TT); + sl = min_t(u32, sl, BFQ_MIN_TT); bfqd->last_idling_start = ktime_get(); hrtimer_start(&bfqd->idle_slice_timer, ns_to_ktime(sl), HRTIMER_MODE_REL); bfqg_stats_set_start_idle_time(bfqq_group(bfqq)); - bfq_log(bfqd, "arm idle: %llu/%llu ms", - div_u64(sl, NSEC_PER_MSEC), - div_u64(bfqd->bfq_slice_idle, NSEC_PER_MSEC)); + bfq_log(bfqd, "arm idle: %ld/%ld ms", + sl / NSEC_PER_MSEC, bfqd->bfq_slice_idle / NSEC_PER_MSEC); +} + +/* + * In autotuning mode, max_budget is dynamically recomputed as the + * amount of sectors transferred in timeout at the estimated peak + * rate. This enables BFQ to utilize a full timeslice with a full + * budget, even if the in-service queue is served at peak rate. And + * this maximises throughput with sequential workloads. 
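Since peak_rate is stored in sectors/usec as a fixed-point value with BFQ_RATE_SHIFT fractional bits, the new bfq_calc_max_budget() that follows boils down to peak_rate multiplied by the timeout, expressed in sectors. A standalone rendering of that arithmetic; the example figures in the comment are illustrative assumptions, chosen only to show the scale.

#include <stdint.h>

#define BFQ_RATE_SHIFT	16	/* fractional bits of the fixed-point rate */
#define USEC_PER_MSEC	1000ULL

/*
 * peak_rate is (sectors/usec) << BFQ_RATE_SHIFT; multiplying by the
 * timeout in usec and dropping the fractional bits yields sectors.
 */
static uint64_t calc_max_budget(uint32_t peak_rate, unsigned int timeout_ms)
{
	return ((uint64_t)peak_rate * USEC_PER_MSEC * timeout_ms)
		>> BFQ_RATE_SHIFT;
}

/*
 * Example: a device doing ~200 MB/s moves ~400000 sectors/s, i.e. about
 * 0.4 sectors/usec, stored as 0.4 * 65536 ~= 26214.  With the default
 * timeout of HZ/8 (125 ms) this gives 26214 * 1000 * 125 >> 16 ~= 50000
 * sectors, roughly 25 MiB of budget per timeslice.
 */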
+ */ +static unsigned long bfq_calc_max_budget(struct bfq_data *bfqd) +{ + return (u64)bfqd->peak_rate * USEC_PER_MSEC * + jiffies_to_msecs(bfqd->bfq_timeout)>>BFQ_RATE_SHIFT; +} + +/* + * Update parameters related to throughput and responsiveness, as a + * function of the estimated peak rate. See comments on + * bfq_calc_max_budget(), and on T_slow and T_fast arrays. + */ +void update_thr_responsiveness_params(struct bfq_data *bfqd) +{ + int dev_type = blk_queue_nonrot(bfqd->queue); + + if (bfqd->bfq_user_max_budget == 0) { + bfqd->bfq_max_budget = + bfq_calc_max_budget(bfqd); + BUG_ON(bfqd->bfq_max_budget < 0); + bfq_log(bfqd, "new max_budget = %d", + bfqd->bfq_max_budget); + } + + if (bfqd->device_speed == BFQ_BFQD_FAST && + bfqd->peak_rate < device_speed_thresh[dev_type]) { + bfqd->device_speed = BFQ_BFQD_SLOW; + bfqd->RT_prod = R_slow[dev_type] * + T_slow[dev_type]; + } else if (bfqd->device_speed == BFQ_BFQD_SLOW && + bfqd->peak_rate > device_speed_thresh[dev_type]) { + bfqd->device_speed = BFQ_BFQD_FAST; + bfqd->RT_prod = R_fast[dev_type] * + T_fast[dev_type]; + } + + bfq_log(bfqd, +"dev_type %s dev_speed_class = %s (%llu sects/sec), thresh %llu setcs/sec", + dev_type == 0 ? "ROT" : "NONROT", + bfqd->device_speed == BFQ_BFQD_FAST ? "FAST" : "SLOW", + bfqd->device_speed == BFQ_BFQD_FAST ? + (USEC_PER_SEC*(u64)R_fast[dev_type])>>BFQ_RATE_SHIFT : + (USEC_PER_SEC*(u64)R_slow[dev_type])>>BFQ_RATE_SHIFT, + (USEC_PER_SEC*(u64)device_speed_thresh[dev_type])>> + BFQ_RATE_SHIFT); +} + +void bfq_reset_rate_computation(struct bfq_data *bfqd, struct request *rq) +{ + if (rq != NULL) { /* new rq dispatch now, reset accordingly */ + bfqd->last_dispatch = bfqd->first_dispatch = ktime_get_ns() ; + bfqd->peak_rate_samples = 1; + bfqd->sequential_samples = 0; + bfqd->tot_sectors_dispatched = bfqd->last_rq_max_size = + blk_rq_sectors(rq); + } else /* no new rq dispatched, just reset the number of samples */ + bfqd->peak_rate_samples = 0; /* full re-init on next disp. */ + + bfq_log(bfqd, + "reset_rate_computation at end, sample %u/%u tot_sects %llu", + bfqd->peak_rate_samples, bfqd->sequential_samples, + bfqd->tot_sectors_dispatched); +} + +void bfq_update_rate_reset(struct bfq_data *bfqd, struct request *rq) +{ + u32 rate, weight, divisor; + + /* + * For the convergence property to hold (see comments on + * bfq_update_peak_rate()) and for the assessment to be + * reliable, a minimum number of samples must be present, and + * a minimum amount of time must have elapsed. If not so, do + * not compute new rate. Just reset parameters, to get ready + * for a new evaluation attempt. + */ + if (bfqd->peak_rate_samples < BFQ_RATE_MIN_SAMPLES || + bfqd->delta_from_first < BFQ_RATE_MIN_INTERVAL) { + bfq_log(bfqd, + "update_rate_reset: only resetting, delta_first %lluus samples %d", + bfqd->delta_from_first>>10, bfqd->peak_rate_samples); + goto reset_computation; + } + + /* + * If a new request completion has occurred after last + * dispatch, then, to approximate the rate at which requests + * have been served by the device, it is more precise to + * extend the observation interval to the last completion. + */ + bfqd->delta_from_first = + max_t(u64, bfqd->delta_from_first, + bfqd->last_completion - bfqd->first_dispatch); + + BUG_ON(bfqd->delta_from_first == 0); + /* + * Rate computed in sects/usec, and not sects/nsec, for + * precision issues. 
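The hunk that follows computes the rate of the just-closed observation interval in this sects/usec fixed-point format, and then folds it into peak_rate through the low-pass filter described in the comments further down. A condensed standalone sketch of both steps, with simplified names and plain C integer types (an approximation of the logic, not the kernel code itself):

#include <stdint.h>

#define BFQ_RATE_SHIFT		16		/* fixed-point fractional bits */
#define NSEC_PER_USEC		1000ULL
#define USEC_PER_SEC		1000000ULL
#define RATE_REF_INTERVAL_NS	1000000000ULL	/* 1 s reference interval */

/*
 * Rate over one observation interval, in (sectors/usec) << BFQ_RATE_SHIFT.
 * Working in usec rather than nsec keeps precision without overflowing
 * the shifted numerator.
 */
static uint32_t interval_rate(uint64_t sectors, uint64_t delta_ns)
{
	uint64_t delta_us = delta_ns / NSEC_PER_USEC;

	return (uint32_t)((sectors << BFQ_RATE_SHIFT) /
			  (delta_us ? delta_us : 1));
}

/* For logging: convert the fixed-point rate back to sectors per second. */
static uint64_t rate_to_sects_per_sec(uint32_t rate)
{
	return (USEC_PER_SEC * (uint64_t)rate) >> BFQ_RATE_SHIFT;
}

/*
 * Low-pass filter: the weight (0..8) grows with the fraction of
 * sequential samples and with the interval length; divisor = 10 - weight,
 * so the new measurement contributes between 1/10 and 1/2 of the updated
 * estimate (smoothing constant alpha = 1/divisor).  Assumes tot_samples
 * is non-zero and larger than seq_samples, as in the hunk.
 */
static uint32_t filter_peak_rate(uint32_t peak_rate, uint32_t rate,
				 unsigned int seq_samples,
				 unsigned int tot_samples, uint64_t delta_ns)
{
	uint32_t weight = (9 * seq_samples) / tot_samples;
	uint32_t divisor;

	weight = (uint32_t)((uint64_t)weight * delta_ns /
			    RATE_REF_INTERVAL_NS);
	if (weight > 8)
		weight = 8;
	divisor = 10 - weight;

	return peak_rate * (divisor - 1) / divisor + rate / divisor;
}

With a fully sequential, second-long interval the divisor bottoms out at 2, so a genuinely faster device pulls the estimate up quickly, while short or seeky intervals barely move it.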
+ */ + rate = div64_ul(bfqd->tot_sectors_dispatched<delta_from_first, NSEC_PER_USEC)); + + bfq_log(bfqd, +"update_rate_reset: tot_sects %llu delta_first %lluus rate %llu sects/s (%d)", + bfqd->tot_sectors_dispatched, bfqd->delta_from_first>>10, + ((USEC_PER_SEC*(u64)rate)>>BFQ_RATE_SHIFT), + rate > 20< 20M sectors/sec) + */ + if ((bfqd->peak_rate_samples > (3 * bfqd->sequential_samples)>>2 && + rate <= bfqd->peak_rate) || + rate > 20<peak_rate_samples, bfqd->sequential_samples, + ((USEC_PER_SEC*(u64)rate)>>BFQ_RATE_SHIFT), + ((USEC_PER_SEC*(u64)bfqd->peak_rate)>>BFQ_RATE_SHIFT)); + goto reset_computation; + } else { + bfq_log(bfqd, + "update_rate_reset: do update, samples %u/%u rate/peak %llu/%llu", + bfqd->peak_rate_samples, bfqd->sequential_samples, + ((USEC_PER_SEC*(u64)rate)>>BFQ_RATE_SHIFT), + ((USEC_PER_SEC*(u64)bfqd->peak_rate)>>BFQ_RATE_SHIFT)); + } + + /* + * We have to update the peak rate, at last! To this purpose, + * we use a low-pass filter. We compute the smoothing constant + * of the filter as a function of the 'weight' of the new + * measured rate. + * + * As can be seen in next formulas, we define this weight as a + * quantity proportional to how sequential the workload is, + * and to how long the observation time interval is. + * + * The weight runs from 0 to 8. The maximum value of the + * weight, 8, yields the minimum value for the smoothing + * constant. At this minimum value for the smoothing constant, + * the measured rate contributes for half of the next value of + * the estimated peak rate. + * + * So, the first step is to compute the weight as a function + * of how sequential the workload is. Note that the weight + * cannot reach 9, because bfqd->sequential_samples cannot + * become equal to bfqd->peak_rate_samples, which, in its + * turn, holds true because bfqd->sequential_samples is not + * incremented for the first sample. + */ + weight = (9 * bfqd->sequential_samples) / bfqd->peak_rate_samples; + + /* + * Second step: further refine the weight as a function of the + * duration of the observation interval. + */ + weight = min_t(u32, 8, + div_u64(weight * bfqd->delta_from_first, + BFQ_RATE_REF_INTERVAL)); + + /* + * Divisor ranging from 10, for minimum weight, to 2, for + * maximum weight. + */ + divisor = 10 - weight; + BUG_ON(divisor == 0); + + /* + * Finally, update peak rate: + * + * peak_rate = peak_rate * (divisor-1) / divisor + rate / divisor + */ + bfqd->peak_rate *= divisor-1; + bfqd->peak_rate /= divisor; + rate /= divisor; /* smoothing constant alpha = 1/divisor */ + + bfq_log(bfqd, + "update_rate_reset: divisor %d tmp_peak_rate %llu tmp_rate %u", + divisor, + ((USEC_PER_SEC*(u64)bfqd->peak_rate)>>BFQ_RATE_SHIFT), + (u32)((USEC_PER_SEC*(u64)rate)>>BFQ_RATE_SHIFT)); + + BUG_ON(bfqd->peak_rate == 0); + BUG_ON(bfqd->peak_rate > 20<peak_rate += rate; + update_thr_responsiveness_params(bfqd); + BUG_ON(bfqd->peak_rate > 20<peak_rate_samples == 0) { /* first dispatch */ + bfq_log(bfqd, + "update_peak_rate: goto reset, samples %d", + bfqd->peak_rate_samples) ; + bfq_reset_rate_computation(bfqd, rq); + goto update_last_values; /* will add one sample */ + } + + /* + * Device idle for very long: the observation interval lasting + * up to this dispatch cannot be a valid observation interval + * for computing a new peak rate (similarly to the late- + * completion event in bfq_completed_request()). 
Go to + * update_rate_and_reset to have the following three steps + * taken: + * - close the observation interval at the last (previous) + * request dispatch or completion + * - compute rate, if possible, for that observation interval + * - start a new observation interval with this dispatch + */ + if (now_ns - bfqd->last_dispatch > 100*NSEC_PER_MSEC && + bfqd->rq_in_driver == 0) { + bfq_log(bfqd, +"update_peak_rate: jumping to updating&resetting delta_last %lluus samples %d", + (now_ns - bfqd->last_dispatch)>>10, + bfqd->peak_rate_samples) ; + goto update_rate_and_reset; + } + + /* Update sampling information */ + bfqd->peak_rate_samples++; + + if ((bfqd->rq_in_driver > 0 || + now_ns - bfqd->last_completion < BFQ_MIN_TT) + && get_sdist(bfqd->last_position, rq) < BFQQ_SEEK_THR) + bfqd->sequential_samples++; + + bfqd->tot_sectors_dispatched += blk_rq_sectors(rq); + + /* Reset max observed rq size every 32 dispatches */ + if (likely(bfqd->peak_rate_samples % 32)) + bfqd->last_rq_max_size = + max_t(u32, blk_rq_sectors(rq), bfqd->last_rq_max_size); + else + bfqd->last_rq_max_size = blk_rq_sectors(rq); + + bfqd->delta_from_first = now_ns - bfqd->first_dispatch; + + bfq_log(bfqd, + "update_peak_rate: added samples %u/%u tot_sects %llu delta_first %lluus", + bfqd->peak_rate_samples, bfqd->sequential_samples, + bfqd->tot_sectors_dispatched, + bfqd->delta_from_first>>10); + + /* Target observation interval not yet reached, go on sampling */ + if (bfqd->delta_from_first < BFQ_RATE_REF_INTERVAL) + goto update_last_values; + +update_rate_and_reset: + bfq_update_rate_reset(bfqd, rq); +update_last_values: + bfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq); + bfqd->last_dispatch = now_ns; + + bfq_log(bfqd, + "update_peak_rate: delta_first %lluus last_pos %llu peak_rate %llu", + (now_ns - bfqd->first_dispatch)>>10, + (unsigned long long) bfqd->last_position, + ((USEC_PER_SEC*(u64)bfqd->peak_rate)>>BFQ_RATE_SHIFT)); + bfq_log(bfqd, + "update_peak_rate: samples at end %d", bfqd->peak_rate_samples); } /* @@ -2235,6 +2595,7 @@ static void bfq_dispatch_insert(struct request_queue *q, struct request *rq) * incrementing bfqq->dispatched. */ bfqq->dispatched++; + bfq_update_peak_rate(q->elevator->elevator_data, rq); bfq_remove_request(rq); elv_dispatch_sort(q, rq); @@ -2474,27 +2835,15 @@ static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd, bfqq->entity.budget); } -static unsigned long bfq_calc_max_budget(struct bfq_data *bfqd) -{ - /* - * The max_budget calculated when autotuning is equal to the - * amount of sectors transferred in timeout at the - * estimated peak rate. - */ - return bfqd->peak_rate * 1000 * jiffies_to_msecs(bfqd->bfq_timeout) >> - BFQ_RATE_SHIFT; -} - /* - * Update the read peak rate (quantity used for auto-tuning) as a - * function of the rate at which bfqq has been served, and check - * whether the process associated with bfqq is "slow". Return true if - * the process is slow. The slow flag is used, in addition to the - * budget timeout, to reduce the amount of service provided to seeky - * processes, and hence reduce their chances to lower the - * throughput. More details in the body of the function. + * Return true if the process associated with bfqq is "slow". The slow + * flag is used, in addition to the budget timeout, to reduce the + * amount of service provided to seeky processes, and thus reduce + * their chances to lower the throughput. More details in the comments + * on the function bfq_bfqq_expire(). 
* - * An important observation is in order: with devices with internal + * An important observation is in order: as discussed in the comments + * on the function bfq_update_peak_rate(), with devices with internal * queues, it is hard if ever possible to know when and for how long * an I/O request is processed by the device (apart from the trivial * I/O pattern where a new request is dispatched only after the @@ -2502,26 +2851,27 @@ static unsigned long bfq_calc_max_budget(struct bfq_data *bfqd) * the real rate at which the I/O requests of each bfq_queue are * served. In fact, for an I/O scheduler like BFQ, serving a * bfq_queue means just dispatching its requests during its service - * slot, i.e., until the budget of the queue is exhausted, or the - * queue remains idle, or, finally, a timeout fires. But, during the - * service slot of a bfq_queue, the device may be still processing - * requests of bfq_queues served in previous service slots. On the - * opposite end, the requests of the in-service bfq_queue may be - * completed after the service slot of the queue finishes. Anyway, - * unless more sophisticated solutions are used (where possible), the - * sum of the sizes of the requests dispatched during the service slot - * of a bfq_queue is probably the only approximation available for - * the service received by the bfq_queue during its service slot. And, - * as written above, this sum is the quantity used in this function to - * evaluate the peak rate. + * slot (i.e., until the budget of the queue is exhausted, or the + * queue remains idle, or, finally, a timeout fires). But, during the + * service slot of a bfq_queue, around 100 ms at most, the device may + * be even still processing requests of bfq_queues served in previous + * service slots. On the opposite end, the requests of the in-service + * bfq_queue may be completed after the service slot of the queue + * finishes. + * + * Anyway, unless more sophisticated solutions are used + * (where possible), the sum of the sizes of the requests dispatched + * during the service slot of a bfq_queue is probably the only + * approximation available for the service received by the bfq_queue + * during its service slot. And this sum is the quantity used in this + * function to evaluate the I/O speed of a process. */ -static bool bfq_update_peak_rate(struct bfq_data *bfqd, struct bfq_queue *bfqq, +static bool bfq_bfqq_is_slow(struct bfq_data *bfqd, struct bfq_queue *bfqq, bool compensate, enum bfqq_expiration reason, unsigned long *delta_ms) { - u64 bw, bwdiv10, delta_usecs, delta_ms_tmp; ktime_t delta_ktime; - int update = 0; + u32 delta_usecs; bool slow = BFQQ_SEEKY(bfqq); /* if delta too short, use seekyness */ if (!bfq_bfqq_sync(bfqq)) @@ -2534,129 +2884,45 @@ static bool bfq_update_peak_rate(struct bfq_data *bfqd, struct bfq_queue *bfqq, delta_ktime = ktime_sub(delta_ktime, bfqd->last_budget_start); delta_usecs = ktime_to_us(delta_ktime); - /* Don't trust short/unrealistic values. */ + /* don't trust short/unrealistic values. 
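The code that follows replaces the old bandwidth estimation with a much simpler test, condensed in the sketch below: slots shorter than 1 ms are charged a fixed worst case (the hunk picks 2 ms or the configured idling time, depending on the device), and over slots longer than 20 ms a process is deemed slow when the service it received stayed below half of the maximum budget. The function name, the parameters and the single worst-case argument are simplified stand-ins for the real bfq_data fields.

#include <stdbool.h>
#include <stdint.h>

/*
 * delta_us: length of the just-finished service slot;
 * service:  amount of service (in sectors) the queue received in it.
 */
static bool queue_is_slow(uint64_t delta_us, int service, int max_budget,
			  bool seeky, unsigned int *charged_ms,
			  unsigned int worst_case_ms)
{
	bool slow = seeky;	/* too little data: fall back to seekiness */

	if (delta_us < 1000) {
		/* unrealistically short slot: charge a worst case instead */
		*charged_ms = worst_case_ms;
		return slow;
	}
	*charged_ms = (unsigned int)(delta_us / 1000);

	/* only slots longer than 20 ms are meaningful for the verdict */
	if (delta_us > 20000)
		slow = service < max_budget / 2;

	return slow;
}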
*/ if (delta_usecs < 1000 || delta_usecs >= LONG_MAX) { if (blk_queue_nonrot(bfqd->queue)) - *delta_ms = BFQ_MIN_TT; /* - * give same worst-case - * guarantees as - * idling for seeky - */ - else /* Charge at least one seek */ - *delta_ms = jiffies_to_msecs(bfq_slice_idle); + /* + * give same worst-case guarantees as idling + * for seeky + */ + *delta_ms = BFQ_MIN_TT / NSEC_PER_MSEC; + else /* charge at least one seek */ + *delta_ms = bfq_slice_idle / NSEC_PER_MSEC; + + bfq_log(bfqd, "bfq_bfqq_is_slow: unrealistic %u", delta_usecs); + return slow; } - delta_ms_tmp = delta_usecs; - do_div(delta_ms_tmp, NSEC_PER_MSEC); - *delta_ms = delta_ms_tmp; + *delta_ms = delta_usecs / USEC_PER_MSEC; /* - * Calculate the bandwidth for the last slice. We use a 64 bit - * value to store the peak rate, in sectors per usec in fixed - * point math. We do so to have enough precision in the estimate - * and to avoid overflows. - */ - bw = (u64)bfqq->entity.service << BFQ_RATE_SHIFT; - do_div(bw, (unsigned long)delta_usecs); - - bfq_log(bfqd, "measured bw = %llu sects/sec", - (1000000*bw)>>BFQ_RATE_SHIFT); - /* - * Use only long (> 20ms) intervals to filter out spikes for - * the peak rate estimation. + * Use only long (> 20ms) intervals to filter out excessive + * spikes in service rate estimation. */ if (delta_usecs > 20000) { - bool fully_sequential = bfqq->seek_history == 0; - /* - * Soft real-time queues are not good candidates for - * evaluating bw, as they are likely to be slow even - * if sequential. - */ - bool non_soft_rt = bfqq->wr_coeff == 1 || - bfqq->wr_cur_max_time != bfqd->bfq_wr_rt_max_time; - bool consumed_large_budget = - reason == BFQ_BFQQ_BUDGET_EXHAUSTED && - bfqq->entity.budget >= bfqd->bfq_max_budget * 2 / 3; - bool served_for_long_time = - reason == BFQ_BFQQ_BUDGET_TIMEOUT || - consumed_large_budget; - - BUG_ON(bfqq->seek_history == 0 && - hweight32(bfqq->seek_history) != 0); - - if (bw > bfqd->peak_rate || - (bfq_bfqq_sync(bfqq) && fully_sequential && non_soft_rt && - served_for_long_time)) { - /* - * To smooth oscillations use a low-pass filter with - * alpha=9/10, i.e., - * new_rate = (9/10) * old_rate + (1/10) * bw - */ - bwdiv10 = bw; - do_div(bwdiv10, 10); - if (bwdiv10 == 0) - return false; /* bw too low to be used */ - bfqd->peak_rate *= 9; - do_div(bfqd->peak_rate, 10); - bfqd->peak_rate += bwdiv10; - update = 1; - bfq_log(bfqd, "new peak_rate = %llu sects/sec", - (1000000*bfqd->peak_rate)>>BFQ_RATE_SHIFT); - } - - update |= bfqd->peak_rate_samples == BFQ_PEAK_RATE_SAMPLES - 1; - - if (bfqd->peak_rate_samples < BFQ_PEAK_RATE_SAMPLES) - bfqd->peak_rate_samples++; - - if (bfqd->peak_rate_samples == BFQ_PEAK_RATE_SAMPLES && - update) { - int dev_type = blk_queue_nonrot(bfqd->queue); - - if (bfqd->bfq_user_max_budget == 0) { - bfqd->bfq_max_budget = - bfq_calc_max_budget(bfqd); - bfq_log(bfqd, "new max_budget = %d", - bfqd->bfq_max_budget); - } - if (bfqd->device_speed == BFQ_BFQD_FAST && - bfqd->peak_rate < device_speed_thresh[dev_type]) { - bfqd->device_speed = BFQ_BFQD_SLOW; - bfqd->RT_prod = R_slow[dev_type] * - T_slow[dev_type]; - } else if (bfqd->device_speed == BFQ_BFQD_SLOW && - bfqd->peak_rate > device_speed_thresh[dev_type]) { - bfqd->device_speed = BFQ_BFQD_FAST; - bfqd->RT_prod = R_fast[dev_type] * - T_fast[dev_type]; - } - bfq_log(bfqd, - "dev_type %d dev_speed_class = %d (%d sects/sec), thresh %d setcs/sec", - dev_type, bfqd->device_speed, - bfqd->device_speed == BFQ_BFQD_FAST ? 
- (1000000*R_fast[dev_type])>>BFQ_RATE_SHIFT : - (1000000*R_slow[dev_type])>>BFQ_RATE_SHIFT, - (1000000*device_speed_thresh[dev_type])>> - BFQ_RATE_SHIFT); - } /* - * Caveat: processes doing IO in the slower disk zones - * tend to be slow(er) even if not seeky. In this - * respect, the estimated peak rate is likely to be an - * average over the disk surface. Accordingly, to not - * be too harsh with unlucky processes, a process is - * deemed slow only if its bw has been lower than half - * of the estimated peak rate. + * Caveat for rotational devices: processes doing I/O + * in the slower disk zones tend to be slow(er) even + * if not seeky. In this respect, the estimated peak + * rate is likely to be an average over the disk + * surface. Accordingly, to not be too harsh with + * unlucky processes, a process is deemed slow only if + * its rate has been lower than half of the estimated + * peak rate. */ - slow = bw < bfqd->peak_rate / 2; + slow = bfqq->entity.service < bfqd->bfq_max_budget / 2; + bfq_log(bfqd, "bfq_bfqq_is_slow: relative rate %d/%d", + bfqq->entity.service, bfqd->bfq_max_budget); } - bfq_log_bfqq(bfqd, bfqq, - "update_peak_rate: bw %llu sect/s, peak rate %llu, slow %d", - (1000000*bw)>>BFQ_RATE_SHIFT, - (1000000*bfqd->peak_rate)>>BFQ_RATE_SHIFT, - bw < bfqd->peak_rate / 2); + bfq_log_bfqq(bfqd, bfqq, "bfq_bfqq_is_slow: slow %d", slow); return slow; } @@ -2785,10 +3051,9 @@ static void bfq_bfqq_expire(struct bfq_data *bfqd, BUG_ON(bfqq != bfqd->in_service_queue); /* - * Update device peak rate for autotuning and check whether the - * process is slow (see bfq_update_peak_rate). + * Check whether the process is slow (see bfq_bfqq_is_slow). */ - slow = bfq_update_peak_rate(bfqd, bfqq, compensate, reason, &delta); + slow = bfq_bfqq_is_slow(bfqd, bfqq, compensate, reason, &delta); /* * Increase service_from_backlogged before next statement, @@ -3285,6 +3550,9 @@ static void bfq_update_wr_data(struct bfq_data *bfqd, struct bfq_queue *bfqq) struct bfq_entity *entity = &bfqq->entity; if (bfqq->wr_coeff > 1) { /* queue is being weight-raised */ + BUG_ON(bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time && + time_is_after_jiffies(bfqq->last_wr_start_finish)); + bfq_log_bfqq(bfqd, bfqq, "raising period dur %u/%u msec, old coeff %u, w %d(%d)", jiffies_to_msecs(jiffies - bfqq->last_wr_start_finish), @@ -3302,15 +3570,26 @@ static void bfq_update_wr_data(struct bfq_data *bfqd, struct bfq_queue *bfqq) * time has elapsed from the beginning of this * weight-raising period, then end weight raising. 
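The hunk that follows rewrites this expiry check: a queue in a large burst loses weight raising immediately, while a queue whose soft-rt period has run out either falls back to the interactive period it interrupted, dated from wr_start_at_switch_to_srt, or loses weight raising altogether if that interactive period would also be over by now. A condensed sketch of the decision on simplified stand-in fields (an approximation; bfq_bfqq_end_wr() does a little more bookkeeping than setting the coefficient to 1):

/* Simplified stand-in for the weight-raising fields involved (assumption). */
struct wr_expiry {
	unsigned int  wr_coeff;
	unsigned long wr_cur_max_time;
	unsigned long last_wr_start_finish;
	unsigned long wr_start_at_switch_to_srt;
	int           in_large_burst;
	int           prio_changed;
};

static void update_wr_on_expiry(struct wr_expiry *q, unsigned long now,
				unsigned long srt_max_time,
				unsigned int interactive_coeff,
				unsigned long interactive_duration)
{
	if (q->in_large_burst ||
	    (now - q->last_wr_start_finish > q->wr_cur_max_time &&
	     (q->wr_cur_max_time != srt_max_time ||
	      now - q->wr_start_at_switch_to_srt > interactive_duration))) {
		q->wr_coeff = 1;		/* end weight raising */
		q->prio_changed = 1;
	} else if (now - q->last_wr_start_finish > q->wr_cur_max_time) {
		/* soft-rt period over: resume the interrupted interactive wr */
		q->wr_coeff = interactive_coeff;
		q->wr_cur_max_time = interactive_duration;
		q->last_wr_start_finish = q->wr_start_at_switch_to_srt;
		q->prio_changed = 1;
	}
}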
*/ - if (bfq_bfqq_in_large_burst(bfqq) || - time_is_before_jiffies(bfqq->last_wr_start_finish + - bfqq->wr_cur_max_time)) { - bfqq->last_wr_start_finish = jiffies; - bfq_log_bfqq(bfqd, bfqq, - "wrais ending at %lu, rais_max_time %u", - bfqq->last_wr_start_finish, - jiffies_to_msecs(bfqq->wr_cur_max_time)); + if (bfq_bfqq_in_large_burst(bfqq)) bfq_bfqq_end_wr(bfqq); + else if (time_is_before_jiffies(bfqq->last_wr_start_finish + + bfqq->wr_cur_max_time)) { + if (bfqq->wr_cur_max_time != bfqd->bfq_wr_rt_max_time || + time_is_before_jiffies(bfqq->wr_start_at_switch_to_srt + + bfq_wr_duration(bfqd))) + bfq_bfqq_end_wr(bfqq); + else { + /* switch back to interactive wr */ + bfqq->wr_coeff = bfqd->bfq_wr_coeff; + bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); + bfqq->last_wr_start_finish = + bfqq->wr_start_at_switch_to_srt; + BUG_ON(time_is_after_jiffies( + bfqq->last_wr_start_finish)); + bfqq->entity.prio_changed = 1; + bfq_log_bfqq(bfqd, bfqq, + "back to interactive wr"); + } } } /* Update weight both if it must be raised and if it must be lowered */ @@ -3468,6 +3747,21 @@ static int bfq_dispatch_requests(struct request_queue *q, int force) if (unlikely(force)) return bfq_forced_dispatch(bfqd); + /* + * Force device to serve one request at a time if + * strict_guarantees is true. Forcing this service scheme is + * currently the ONLY way to guarantee that the request + * service order enforced by the scheduler is respected by a + * queueing device. Otherwise the device is free even to make + * some unlucky request wait for as long as the device + * wishes. + * + * Of course, serving one request at at time may cause loss of + * throughput. + */ + if (bfqd->strict_guarantees && bfqd->rq_in_driver > 0) + return 0; + bfqq = bfq_select_queue(bfqd); if (!bfqq) return 0; @@ -3701,9 +3995,11 @@ static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, bfqq->pid = pid; bfqq->wr_coeff = 1; - bfqq->last_wr_start_finish = bfq_smallest_from_now(); + bfqq->last_wr_start_finish = jiffies; + bfqq->wr_start_at_switch_to_srt = bfq_smallest_from_now(); bfqq->budget_timeout = bfq_smallest_from_now(); bfqq->split_time = bfq_smallest_from_now(); + /* * Set to the value for which bfqq will not be deemed as * soft rt when it becomes backlogged. 
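Earlier in this hunk, bfq_dispatch_requests() gains a gate that, with strict_guarantees set, refuses to dispatch while a request is still in the driver, so the device can never reorder across the scheduler's decisions. The sketch below shows the shape of that gate; the wrapper, its callback and the field names are illustrative assumptions, not the elevator interface.

#include <stdbool.h>

struct dispatcher {
	bool strict_guarantees;	/* sysfs knob: serve one request at a time */
	int  rq_in_driver;	/* requests already handed to the device */
};

/*
 * Returns the number of requests dispatched by this call.  With strict
 * guarantees, nothing new is dispatched until the request currently in
 * the driver has completed, trading throughput for ordering.
 */
static int dispatch_one(struct dispatcher *d,
			int (*pick_and_dispatch)(struct dispatcher *))
{
	if (d->strict_guarantees && d->rq_in_driver > 0)
		return 0;

	return pick_and_dispatch(d);
}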
@@ -3797,7 +4093,7 @@ static void bfq_update_io_thinktime(struct bfq_data *bfqd, struct bfq_ttime *ttime = &bic->ttime; u64 elapsed = ktime_get_ns() - bic->ttime.last_end_request; - elapsed = min(elapsed, 2UL * bfqd->bfq_slice_idle); + elapsed = min_t(u64, elapsed, 2 * bfqd->bfq_slice_idle); ttime->ttime_samples = (7*bic->ttime.ttime_samples + 256) / 8; ttime->ttime_total = div_u64(7*ttime->ttime_total + 256*elapsed, 8); @@ -3805,22 +4101,13 @@ static void bfq_update_io_thinktime(struct bfq_data *bfqd, ttime->ttime_samples); } - static void bfq_update_io_seektime(struct bfq_data *bfqd, struct bfq_queue *bfqq, struct request *rq) { - sector_t sdist = 0; - - if (bfqq->last_request_pos) { - if (bfqq->last_request_pos < blk_rq_pos(rq)) - sdist = blk_rq_pos(rq) - bfqq->last_request_pos; - else - sdist = bfqq->last_request_pos - blk_rq_pos(rq); - } - bfqq->seek_history <<= 1; - bfqq->seek_history |= (sdist > BFQQ_SEEK_THR); + bfqq->seek_history |= + get_sdist(bfqq->last_request_pos, rq) > BFQQ_SEEK_THR; } /* @@ -4013,6 +4300,8 @@ static void bfq_completed_request(struct request_queue *q, struct request *rq) { struct bfq_queue *bfqq = RQ_BFQQ(rq); struct bfq_data *bfqd = bfqq->bfqd; + u64 now_ns; + u32 delta_us; bfq_log_bfqq(bfqd, bfqq, "completed one req with %u sects left", blk_rq_sectors(rq)); @@ -4043,7 +4332,44 @@ static void bfq_completed_request(struct request_queue *q, struct request *rq) &bfqd->queue_weights_tree); } - RQ_BIC(rq)->ttime.last_end_request = ktime_get_ns(); + now_ns = ktime_get_ns(); + + RQ_BIC(rq)->ttime.last_end_request = now_ns; + + /* + * Using us instead of ns, to get a reasonable precision in + * computing rate in next check. + */ + delta_us = div_u64(now_ns - bfqd->last_completion, NSEC_PER_USEC); + + bfq_log(bfqd, "rq_completed: delta %uus/%luus max_size %u rate %llu/%llu", + delta_us, BFQ_MIN_TT/NSEC_PER_USEC, bfqd->last_rq_max_size, + (USEC_PER_SEC* + (u64)((bfqd->last_rq_max_size<>BFQ_RATE_SHIFT, + (USEC_PER_SEC*(u64)(1UL<<(BFQ_RATE_SHIFT-10)))>>BFQ_RATE_SHIFT); + + /* + * If the request took rather long to complete, and, according + * to the maximum request size recorded, this completion latency + * implies that the request was certainly served at a very low + * rate (less than 1M sectors/sec), then the whole observation + * interval that lasts up to this time instant cannot be a + * valid time interval for computing a new peak rate. 
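The check this comment introduces compares the rate implied by the completion latency of the largest request seen in the interval against roughly 1M sectors/sec, which is what 1 << (BFQ_RATE_SHIFT - 10) encodes in the sects/usec fixed-point format. A standalone rendering of that test, reconstructed from the description here rather than copied from the kernel source:

#include <stdbool.h>
#include <stdint.h>

#define BFQ_RATE_SHIFT	16
#define NSEC_PER_USEC	1000ULL
#define BFQ_MIN_TT_NS	(2 * 1000000ULL)	/* 2 ms think-time threshold */

/*
 * A completion is suspiciously late if it took longer than the
 * think-time threshold and, given the largest request size observed in
 * the current interval, implies a rate below ~1M sectors/sec, i.e.
 * below 1 << (BFQ_RATE_SHIFT - 10) in the fixed-point representation.
 */
static bool completion_too_late(uint64_t delta_us, uint32_t last_rq_max_size)
{
	return delta_us > BFQ_MIN_TT_NS / NSEC_PER_USEC &&
	       ((uint64_t)last_rq_max_size << BFQ_RATE_SHIFT) / delta_us <
		(1ULL << (BFQ_RATE_SHIFT - 10));
}

When it triggers, the current observation interval is discarded so that the bogus sample never reaches the peak-rate filter.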
Invoke + * bfq_update_rate_reset to have the following three steps + * taken: + * - close the observation interval at the last (previous) + * request dispatch or completion + * - compute rate, if possible, for that observation interval + * - reset to zero samples, which will trigger a proper + * re-initialization of the observation interval on next + * dispatch + */ + if (delta_us > BFQ_MIN_TT/NSEC_PER_USEC && + (bfqd->last_rq_max_size<last_completion = now_ns; /* * If we are waiting to discover whether the request pattern @@ -4729,14 +5055,6 @@ static ssize_t bfq_weights_store(struct elevator_queue *e, return count; } -static unsigned long bfq_estimated_max_budget(struct bfq_data *bfqd) -{ - if (bfqd->peak_rate_samples >= BFQ_PEAK_RATE_SAMPLES) - return bfq_calc_max_budget(bfqd); - else - return bfq_default_max_budget; -} - static ssize_t bfq_max_budget_store(struct elevator_queue *e, const char *page, size_t count) { @@ -4745,7 +5063,7 @@ static ssize_t bfq_max_budget_store(struct elevator_queue *e, int ret = bfq_var_store(&__data, (page), count); if (__data == 0) - bfqd->bfq_max_budget = bfq_estimated_max_budget(bfqd); + bfqd->bfq_max_budget = bfq_calc_max_budget(bfqd); else { if (__data > INT_MAX) __data = INT_MAX; @@ -4775,7 +5093,7 @@ static ssize_t bfq_timeout_sync_store(struct elevator_queue *e, bfqd->bfq_timeout = msecs_to_jiffies(__data); if (bfqd->bfq_user_max_budget == 0) - bfqd->bfq_max_budget = bfq_estimated_max_budget(bfqd); + bfqd->bfq_max_budget = bfq_calc_max_budget(bfqd); return ret; } @@ -4790,8 +5108,8 @@ static ssize_t bfq_strict_guarantees_store(struct elevator_queue *e, if (__data > 1) __data = 1; if (!bfqd->strict_guarantees && __data == 1 - && bfqd->bfq_slice_idle < msecs_to_jiffies(8)) - bfqd->bfq_slice_idle = msecs_to_jiffies(8); + && bfqd->bfq_slice_idle < 8 * NSEC_PER_MSEC) + bfqd->bfq_slice_idle = 8 * NSEC_PER_MSEC; bfqd->strict_guarantees = __data; @@ -4891,7 +5209,7 @@ static struct blkcg_policy blkcg_policy_bfq = { static int __init bfq_init(void) { int ret; - char msg[50] = "BFQ I/O-scheduler: v8r3"; + char msg[50] = "BFQ I/O-scheduler: v8r4"; #ifdef CONFIG_BFQ_GROUP_IOSCHED ret = blkcg_policy_register(&blkcg_policy_bfq); @@ -4904,14 +5222,22 @@ static int __init bfq_init(void) goto err_pol_unreg; /* - * Times to load large popular applications for the typical systems - * installed on the reference devices (see the comments before the - * definitions of the two arrays). + * Times to load large popular applications for the typical + * systems installed on the reference devices (see the + * comments before the definitions of the next two + * arrays). Actually, we use slightly slower values, as the + * estimated peak rate tends to be smaller than the actual + * peak rate. The reason for this last fact is that estimates + * are computed over much shorter time intervals than the long + * intervals typically used for benchmarking. Why? First, to + * adapt more quickly to variations. Second, because an I/O + * scheduler cannot rely on a peak-rate-evaluation workload to + * be run for a long time. 
*/ - T_slow[0] = msecs_to_jiffies(3500); - T_slow[1] = msecs_to_jiffies(1500); - T_fast[0] = msecs_to_jiffies(8000); - T_fast[1] = msecs_to_jiffies(3000); + T_slow[0] = msecs_to_jiffies(3500); /* actually 4 sec */ + T_slow[1] = msecs_to_jiffies(1000); /* actually 1.5 sec */ + T_fast[0] = msecs_to_jiffies(7000); /* actually 8 sec */ + T_fast[1] = msecs_to_jiffies(2500); /* actually 3 sec */ /* * Thresholds that determine the switch between speed classes diff --git a/block/bfq-sched.c b/block/bfq-sched.c index f8960a4e9..45d63d3ff 100644 --- a/block/bfq-sched.c +++ b/block/bfq-sched.c @@ -327,10 +327,26 @@ static void bfq_update_min(struct bfq_entity *entity, struct rb_node *node) static void bfq_update_active_node(struct rb_node *node) { struct bfq_entity *entity = rb_entry(node, struct bfq_entity, rb_node); + struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); entity->min_start = entity->start; bfq_update_min(entity, node->rb_right); bfq_update_min(entity, node->rb_left); + + if (bfqq) { + bfq_log_bfqq(bfqq->bfqd, bfqq, + "update_active_node: new min_start %llu", + ((entity->min_start>>10)*1000)>>12); +#ifdef CONFIG_BFQ_GROUP_IOSCHED + } else { + struct bfq_group *bfqg = + container_of(entity, struct bfq_group, entity); + + bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, + "update_active_node: new min_start %llu", + ((entity->min_start>>10)*1000)>>12); +#endif + } } /** @@ -1127,7 +1143,23 @@ static void bfq_update_vtime(struct bfq_service_tree *st) entry = rb_entry(node, struct bfq_entity, rb_node); if (bfq_gt(entry->min_start, st->vtime)) { + struct bfq_queue *bfqq = bfq_entity_to_bfqq(entry); st->vtime = entry->min_start; + + if (bfqq) + bfq_log_bfqq(bfqq->bfqd, bfqq, + "update_vtime: new vtime %llu %p", + ((st->vtime>>10)*1000)>>12, st); +#ifdef CONFIG_BFQ_GROUP_IOSCHED + else { + struct bfq_group *bfqg = + container_of(entry, struct bfq_group, entity); + + bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, + "update_vtime: new vtime %llu %p", + ((st->vtime>>10)*1000)>>12, st); + } +#endif bfq_forget_idle(st); } } diff --git a/block/bfq.h b/block/bfq.h index 00142fcce..ea1e7d852 100644 --- a/block/bfq.h +++ b/block/bfq.h @@ -1,5 +1,5 @@ /* - * BFQ-v8r3 for 4.8.0: data structures and common functions prototypes. + * BFQ-v8r4 for 4.8.0: data structures and common functions prototypes. * * Based on ideas and code from CFQ: * Copyright (C) 2003 Jens Axboe @@ -298,6 +298,10 @@ struct bfq_queue { * last transition from idle to backlogged. */ unsigned long service_from_backlogged; + /* + * Value of wr start time when switching to soft rt + */ + unsigned long wr_start_at_switch_to_srt; unsigned long split_time; /* time of last split */ }; @@ -352,6 +356,13 @@ struct bfq_io_cq { * with another cooperating queue. */ bool was_in_burst_list; + + /* + * Similar to previous fields: save wr information. 
+ */ + unsigned long saved_wr_coeff; + unsigned long saved_last_wr_start_finish; + unsigned long saved_wr_start_at_switch_to_srt; }; enum bfq_device_speed { @@ -431,14 +442,32 @@ struct bfq_data { /* on-disk position of the last served request */ sector_t last_position; + /* time of last request completion (ns) */ + u64 last_completion; + + /* time of first rq dispatch in current observation interval (ns) */ + u64 first_dispatch; + /* time of last rq dispatch in current observation interval (ns) */ + u64 last_dispatch; + /* beginning of the last budget */ ktime_t last_budget_start; /* beginning of the last idle slice */ ktime_t last_idling_start; - /* number of samples used to calculate @peak_rate */ + + /* number of samples in current observation interval */ int peak_rate_samples; - /* peak transfer rate observed for a budget */ - u64 peak_rate; + /* num of samples of seq dispatches in current observation interval */ + u32 sequential_samples; + /* total num of sectors transferred in current observation interval */ + u64 tot_sectors_dispatched; + /* max rq size seen during current observation interval (sectors) */ + u32 last_rq_max_size; + /* time elapsed from first dispatch in current observ. interval (us) */ + u64 delta_from_first; + /* current estimate of device peak rate */ + u32 peak_rate; + /* maximum budget allotted to a bfq_queue before rescheduling */ int bfq_max_budget; @@ -457,7 +486,7 @@ struct bfq_data { /* maximum allowed backward seek */ unsigned int bfq_back_max; /* maximum idling time */ - u64 bfq_slice_idle; + u32 bfq_slice_idle; /* last time CLASS_IDLE was served */ u64 bfq_class_idle_last_service; diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index fdcd5999d..f336dcbed 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -3042,7 +3042,6 @@ static struct request *cfq_check_fifo(struct cfq_queue *cfqq) if (ktime_get_ns() < rq->fifo_time) rq = NULL; - cfq_log_cfqq(cfqq->cfqd, cfqq, "fifo=%p", rq); return rq; } @@ -3420,6 +3419,9 @@ static bool cfq_may_dispatch(struct cfq_data *cfqd, struct cfq_queue *cfqq) { unsigned int max_dispatch; + if (cfq_cfqq_must_dispatch(cfqq)) + return true; + /* * Drain async requests before we start sync IO */ @@ -3511,15 +3513,20 @@ static bool cfq_dispatch_request(struct cfq_data *cfqd, struct cfq_queue *cfqq) BUG_ON(RB_EMPTY_ROOT(&cfqq->sort_list)); + rq = cfq_check_fifo(cfqq); + if (rq) + cfq_mark_cfqq_must_dispatch(cfqq); + if (!cfq_may_dispatch(cfqd, cfqq)) return false; /* * follow expired path, else get first next available */ - rq = cfq_check_fifo(cfqq); if (!rq) rq = cfqq->next_rq; + else + cfq_log_cfqq(cfqq->cfqd, cfqq, "fifo=%p", rq); /* * insert request into driver dispatch list @@ -4002,7 +4009,7 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq, * if the new request is sync, but the currently running queue is * not, let the sync request have priority. */ - if (rq_is_sync(rq) && !cfq_cfqq_sync(cfqq)) + if (rq_is_sync(rq) && !cfq_cfqq_sync(cfqq) && !cfq_cfqq_must_dispatch(cfqq)) return true; /* -- cgit v1.2.3
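For the cfq-iosched.c part of the patch: the FIFO is now inspected before the may-dispatch test, and a queue holding an expired FIFO request is marked must_dispatch, so that neither the async-drain limit nor preemption by a sync request can starve it. A condensed sketch of the reordered dispatch path with simplified stand-in types (the real functions operate on struct cfq_data and struct cfq_queue):

#include <stdbool.h>
#include <stddef.h>

struct sketch_queue {
	bool  must_dispatch;	/* set while an expired FIFO request waits */
	void *expired_fifo_rq;	/* stand-in for cfq_check_fifo()'s result */
	void *next_rq;		/* next request by position (stand-in) */
};

/* Stand-in for cfq_may_dispatch(): must_dispatch now short-circuits it. */
static bool may_dispatch(const struct sketch_queue *q, bool other_limits_ok)
{
	if (q->must_dispatch)
		return true;
	return other_limits_ok;
}

/* Reordered dispatch: look at the FIFO first, then apply the limits. */
static void *pick_request(struct sketch_queue *q, bool other_limits_ok)
{
	void *rq = q->expired_fifo_rq;

	if (rq)
		q->must_dispatch = true;	/* expired request must go out */

	if (!may_dispatch(q, other_limits_ok))
		return NULL;

	return rq ? rq : q->next_rq;
}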